单细胞测序—标准流程代码（1）

现在的单细胞测序很少是单个样本测序了，一般是多个样本。这里用ifnb.SeuratData包中的ifnb示例数据来模拟单细胞测序多样本分析流程。

首先需要额外安装ifnb.SeuratData包

代码语言：r复制

# InstallData('ifnb.SeuratData')
# 使用上面的代码下载，但是非常考验网络
# trying URL 'http://seurat.nygenome.org/src/contrib/ifnb.SeuratData_3.1.0.tar.gz'
# Content type 'application/octet-stream' length 413266233 bytes (394.1 MB)
# 如果网络不行，就想办法下载(394.1 MB)保存在自己的电脑里面，然后下面的代码安装
#install.packages(pkgs = '/home/data/t110558/R/ifnb.SeuratData_3.1.0.tar.gz',
#                  repos = NULL,
#                  type = "source" )
# 一定要看到这个包成功安装哦
# * DONE (ifnb.SeuratData)

step1:导入数据

代码语言：r复制

rm(list=ls())
options(stringsAsFactors = F) 
source('scRNA_scripts/lib.R')

#测试数据需要额外加载
library(SeuratData)
library(ifnb.SeuratData)
library(ReactomePA)
library(org.Hs.eg.db)
library(ggplot2)
data("ifnb")
ifnb
ifnb=UpdateSeuratObject(ifnb)
ifnb
table(ifnb$orig.ident)
## IMMUNE_CTRL IMMUNE_STIM 
##     6548        7451

为了后续分析方便所有的seurat统一命名为sce.all

代码语言：r复制

sce.all <- ifnb
as.data.frame(sce.all@assays$RNA$counts[1:10, 1:2])
head(sce.all@meta.data, 10)
table(sce.all@meta.data$orig.ident) 
## IMMUNE_CTRL IMMUNE_STIM 
##     6548        7451

另scRNA_scripts/lib.R为

代码语言：r复制

library(COSG)
# devtools::install_github('genecell/COSGR')
library(harmony)
library(ggsci)
library(dplyr) 
library(future)
library(Seurat)
library(clustree)
library(cowplot)
library(data.table)
library(dplyr)
library(ggplot2)
library(patchwork)
library(stringr)

step2:QC质控

将qc质控分装到basic_qc( )函数中，构建好seurat对象后直接调用即可

代码语言：r复制

dir.create("./1-QC")
setwd("./1-QC")
# 如果过滤的太狠，就需要去修改这个过滤代码
source('../scRNA_scripts/qc.R')
sce.all.filt = basic_qc(sce.all)
print(dim(sce.all))
print(dim(sce.all.filt))
setwd('../')
sp='human'

scRNA_scripts/qc.R

代码语言：r复制

basic_qc <- function(input_sce){
  #计算线粒体基因比例
  mito_genes=rownames(input_sce)[grep("^MT-", rownames(input_sce),ignore.case = T)] 
  print(mito_genes) #可能是13个线粒体基因
  #input_sce=PercentageFeatureSet(input_sce, "^MT-", col.name = "percent_mito")
  input_sce=PercentageFeatureSet(input_sce, features = mito_genes, col.name = "percent_mito")
  fivenum(input_sce@meta.data$percent_mito)
  
  #计算核糖体基因比例
  ribo_genes=rownames(input_sce)[grep("^Rp[sl]", rownames(input_sce),ignore.case = T)]
  print(ribo_genes)
  input_sce=PercentageFeatureSet(input_sce,  features = ribo_genes, col.name = "percent_ribo")
  fivenum(input_sce@meta.data$percent_ribo)
  
  #计算红血细胞基因比例
  Hb_genes=rownames(input_sce)[grep("^Hb[^(p)]", rownames(input_sce),ignore.case = T)]
  print(Hb_genes)
  input_sce=PercentageFeatureSet(input_sce,  features = Hb_genes,col.name = "percent_hb")
  fivenum(input_sce@meta.data$percent_hb)
  
  #可视化细胞的上述比例情况
  feats <- c("nFeature_RNA", "nCount_RNA", "percent_mito",
             "percent_ribo", "percent_hb")
  feats <- c("nFeature_RNA", "nCount_RNA")
  p1=VlnPlot(input_sce, group.by = "orig.ident", features = feats, pt.size = 0, ncol = 2)   
    NoLegend()
  p1 
  w=length(unique(input_sce$orig.ident))/3 5;w
  ggsave(filename="Vlnplot1.pdf",plot=p1,width = w,height = 5)
  
  feats <- c("percent_mito", "percent_ribo", "percent_hb")
  p2=VlnPlot(input_sce, group.by = "orig.ident", features = feats, pt.size = 0, ncol = 3, same.y.lims=T)   
    scale_y_continuous(breaks=seq(0, 100, 5))  
    NoLegend()
  p2	
  w=length(unique(input_sce$orig.ident))/2 5;w
  ggsave(filename="Vlnplot2.pdf",plot=p2,width = w,height = 5)
  
  p3=FeatureScatter(input_sce, "nCount_RNA", "nFeature_RNA", group.by = "orig.ident", pt.size = 0.5)
  ggsave(filename="Scatterplot.pdf",plot=p3)
  
  #根据上述指标，过滤低质量细胞/基因
  #过滤指标1:最少表达基因数的细胞&最少表达细胞数的基因
  # 一般来说，在CreateSeuratObject的时候已经是进行了这个过滤操作
  # 如果后期看到了自己的单细胞降维聚类分群结果很诡异，就可以回过头来看质量控制环节
  # 先走默认流程即可
  if(F){
    selected_c <- WhichCells(input_sce, expression = nFeature_RNA > 500)
    selected_f <- rownames(input_sce)[Matrix::rowSums(input_sce@assays$RNA$counts > 0 ) > 3]
    input_sce.filt <- subset(input_sce, features = selected_f, cells = selected_c)
    dim(input_sce) 
    dim(input_sce.filt) 
  }
  
  input_sce.filt =  input_sce
  
  # par(mar = c(4, 8, 2, 1))
  # 这里的C 这个矩阵，有一点大，可以考虑随抽样 
  C=subset(input_sce.filt,downsample=100)@assays$RNA$counts
  dim(C)
  C=Matrix::t(Matrix::t(C)/Matrix::colSums(C)) * 100

  most_expressed <- order(apply(C, 1, median), decreasing = T)[50:1]
  
  pdf("TOP50_most_expressed_gene.pdf",width=14)
  boxplot(as.matrix(Matrix::t(C[most_expressed, ])),
          cex = 0.1, las = 1, 
          xlab = "% total count per cell", 
          col = (scales::hue_pal())(50)[50:1], 
          horizontal = TRUE)
  dev.off()
  rm(C)
  
  #过滤指标2:线粒体/核糖体基因比例(根据上面的violin图)
  selected_mito <- WhichCells(input_sce.filt, expression = percent_mito < 25)
  selected_ribo <- WhichCells(input_sce.filt, expression = percent_ribo > 3)
  selected_hb <- WhichCells(input_sce.filt, expression = percent_hb < 1 )
  length(selected_hb)
  length(selected_ribo)
  length(selected_mito)
  
  input_sce.filt <- subset(input_sce.filt, cells = selected_mito)
  #input_sce.filt <- subset(input_sce.filt, cells = selected_ribo)
  input_sce.filt <- subset(input_sce.filt, cells = selected_hb)
  dim(input_sce.filt)
  
  table(input_sce.filt$orig.ident) 
  
  #可视化过滤后的情况
  feats <- c("nFeature_RNA", "nCount_RNA")
  p1_filtered=VlnPlot(input_sce.filt, group.by = "orig.ident", features = feats, pt.size = 0, ncol = 2)   
    NoLegend()
  w=length(unique(input_sce.filt$orig.ident))/3 5;w 
  ggsave(filename="Vlnplot1_filtered.pdf",plot=p1_filtered,width = w,height = 5)
  
  feats <- c("percent_mito", "percent_ribo", "percent_hb")
  p2_filtered=VlnPlot(input_sce.filt, group.by = "orig.ident", features = feats, pt.size = 0, ncol = 3)   
    NoLegend()
  w=length(unique(input_sce.filt$orig.ident))/2 5;w 
  ggsave(filename="Vlnplot2_filtered.pdf",plot=p2_filtered,width = w,height = 5) 
  return(input_sce.filt) 
  
}

解释

首先check下传入的seurat对象

代码语言：r复制

input_sce <- sce.all
View(input_sce@metadata)

分别计算线粒体、核糖体、血红细胞基因比例，并在原meda.data表中添加对应的新列，

以线粒体基因为例。

代码语言：r复制

#计算线粒体基因比例
mito_genes=rownames(input_sce)[grep("^MT-", rownames(input_sce),ignore.case = T)] 
print(mito_genes) #可能是13个线粒体基因
##character(0)
#input_sce=PercentageFeatureSet(input_sce, "^MT-", col.name = "percent_mito")
input_sce=PercentageFeatureSet(input_sce, features = mito_genes, col.name = "percent_mito")
fivenum(input_sce@meta.data$percent_mito)
[1] 0 0 0 0 0

执行完的meda.data表长这样

接下来的代码根据以上信息绘制了三张图，保存着在QC文件夹中

Vlnplot1.pdf

两个分组中的nFeature、nCount_RNA小提琴图

Vlnplot2.pdf

两个分组中的线粒体、核糖体、血红细胞基因比例的小提琴图

Scatterplot.pdf

两个分组中的nFeature、nCount_RNA相关性图

接下来根据以上画图制定过滤策略：

#过滤指标1:最少表达基因数的细胞&最少表达细胞数的基因

一般来说，在CreateSeuratObject的时候已经是进行了这个过滤操作，在这里进行了表达量最高的50个基因的可视化

代码语言：r复制

C=subset(input_sce.filt,downsample=100)@assays$RNA$counts
  dim(C)
  C=Matrix::t(Matrix::t(C)/Matrix::colSums(C)) * 100

  most_expressed <- order(apply(C, 1, median), decreasing = T)[50:1]
  
  pdf("TOP50_most_expressed_gene.pdf",width=14)
  boxplot(as.matrix(Matrix::t(C[most_expressed, ])),
          cex = 0.1, las = 1, 
          xlab = "% total count per cell", 
          col = (scales::hue_pal())(50)[50:1], 
          horizontal = TRUE)
  dev.off()
  rm(C)

这段代码的主要目的是通过下采样、归一化，识别单细胞RNA测序数据中表达量最高的50个基因，并将这些基因的表达分布以箱线图的形式保存为PDF文件。

以下是详细的解释：

提取数据并下采样

代码语言：r复制

C=subset(input_sce.filt, downsample=100)@assays$RNA$counts

subset(input_sce.filt, downsample=100)：对input_sce.filt对象中的细胞进行下采样，每个细胞随机保留100个UMI（Unique Molecular Identifier）。这样做的目的是减少计算量，尤其是对于大规模数据集。

downsample=100是指在对input_sce.filt对象进行子集选择时，对每个细胞中的UMI（Unique Molecular Identifier）计数进行下采样，每个细胞随机保留100个UMI。这个操作是通过Seurat包内部的函数实现的。

@assays$RNA$counts：提取下采样后的RNA测序数据的计数矩阵C，其中每一行代表一个基因，每一列代表一个细胞，矩阵中的值是基因在细胞中的表达量。

归一化基因表达数据

代码语言：r复制

C=Matrix::t(Matrix::t(C)/Matrix::colSums(C)) * 100

Matrix::colSums(C)：计算每个细胞的总表达量。
Matrix::t(C)/Matrix::colSums(C)：对计数矩阵进行列标准化，即每个基因的表达量除以该细胞的总表达量。这样可以消除不同细胞间测序深度的差异。
Matrix::t(...) * 100：将标准化后的表达量乘以100，转换为百分比形式。

识别表达量最高的50个基因

代码语言：r复制

most_expressed <- order(apply(C, 1, median), decreasing = T)[50:1]

apply(C, 1, median)：对矩阵C的每一行（即每个基因），计算其在所有细胞中的中位数表达量。
order(..., decreasing = T)50:1：将这些中位数按从高到低的顺序排列，并返回对应的基因索引,选择表达量最高的前50个基因的索引。

可视化表达量最高的50个基因

代码语言：r复制

pdf("TOP50_most_expressed_gene.pdf",width=14)
boxplot(as.matrix(Matrix::t(C[most_expressed, ])),
        cex = 0.1, las = 1, 
        xlab = "% total count per cell", 
        col = (scales::hue_pal())(50)[50:1], 
        horizontal = TRUE)
dev.off()

pdf("TOP50_most_expressed_gene.pdf",width=14)：打开一个PDF设备，准备将接下来的图像输出到TOP50_most_expressed_gene.pdf文件中，宽度为14个单位。
Matrix::t(Cmost_expressed, )：转置矩阵，使每一行代表一个细胞，每一列代表一个基因。
as.matrix(...)：将转置后的C转换为普通矩阵。
绘制箱线图，每个箱线图代表一个基因的表达分布。
- cex = 0.1：设置点的大小。
- las = 1：使刻度标签平行于轴。
- xlab = "% total count per cell"：设置x轴标签为“每个细胞的总计数百分比”。
- col = (scales::hue_pal())(50)50:1：为每个基因的箱线图分配不同的颜色，颜色从明到暗对应基因表达量从高到低。
- horizontal = TRUE：使箱线图水平显示。
dev.off()：关闭PDF设备，保存图像文件。
rm(C)：删除变量C，释放内存，尤其是对于大规模数据，释放不再需要的数据非常重要。

TOP50_most_expressed_gene.pdf

根据线粒体基因比例、核糖体基因比例和红细胞基因比例的阈值，逐步筛选出高质量细胞，去除了可能是死细胞、红细胞或其他质量较差的细胞。

代码语言：r复制

selected_mito <- WhichCells(input_sce.filt, expression = percent_mito < 25)
selected_ribo <- WhichCells(input_sce.filt, expression = percent_ribo > 3)
selected_hb <- WhichCells(input_sce.filt, expression = percent_hb < 1 )

这行代码筛选出input_sce.filt对象中线粒体基因比例低于25%的细胞。高线粒体基因比例通常表明细胞处于死亡或凋亡状态，因此需要去除这些细胞。
筛选出核糖体基因比例高于3%的细胞。核糖体基因高表达可能表明该细胞具有正常的蛋白质合成活动，因此这一步是为了保留这些细胞。
筛选出红细胞基因比例低于1%的细胞。红细胞基因高表达可能表示样本中混入了红细胞，因此需要去除这些细胞。

代码语言：r复制

input_sce.filt <- subset(input_sce.filt, cells = selected_mito)
input_sce.filt <- subset(input_sce.filt, cells = selected_ribo)
input_sce.filt <- subset(input_sce.filt, cells = selected_hb)

返回一个过滤后的input_sce.filt对象，以便进行下游分析。

接下来的代码可视化过滤后的情况，存在两张图

Vlnplot1_filtered.pdf

两个分组中过滤后的nFeature、nCount_RNA小提琴图

Vlnplot2_filtered.pdf

两个分组中过滤后的的线粒体、核糖体、血红细胞基因比例的小提琴图

总结：

QC质控总共出6张图：Vlnplot1.pdf、Vlnplot2.pdf、Scatterplot.pdf、TOP50_most_expressed_gene.pdf、Vlnplot1_filtered.pdf、Vlnplot2_filtered.pdf.

step3: harmony整合多个单细胞样品

harmony整合多个单细胞样品,即去批次。

同上，直接调用/scRNA_scripts/harmony.R代码即可

代码语言：r复制

if(T){
  dir.create("2-harmony")
  getwd()
  setwd("2-harmony")
  source('../scRNA_scripts/harmony.R')
  # 默认 ScaleData 没有添加"nCount_RNA", "nFeature_RNA"
  # 默认的
  sce.all.int = run_harmony(sce.all.filt)
  setwd('../')  
}

/scRNA_scripts/harmony.R

代码语言：r复制

run_harmony <- function(input_sce){
  
  print(dim(input_sce))
  input_sce <- NormalizeData(input_sce, 
                             normalization.method = "LogNormalize",
                             scale.factor = 1e4) 
  input_sce <- FindVariableFeatures(input_sce)
  input_sce <- ScaleData(input_sce)
  input_sce <- RunPCA(input_sce, features = VariableFeatures(object = input_sce))
  seuratObj <- RunHarmony(input_sce, "orig.ident")
  names(seuratObj@reductions)
  [1] "pca"     "harmony"
  seuratObj <- RunUMAP(seuratObj,  dims = 1:15, 
                       reduction = "harmony")
  
  # p = DimPlot(seuratObj,reduction = "umap",label=T ) 
  # ggsave(filename='umap-by-orig.ident-after-harmony',plot = p)
  
  input_sce=seuratObj
  input_sce <- FindNeighbors(input_sce, reduction = "harmony",
                             dims = 1:15) 
  input_sce.all=input_sce
  
  #设置不同的分辨率，观察分群效果(选择哪一个？)
  for (res in c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)) {
    input_sce.all=FindClusters(input_sce.all, #graph.name = "CCA_snn", 
                               resolution = res, algorithm = 1)
  }
  colnames(input_sce.all@meta.data)
  apply(input_sce.all@meta.data[,grep("RNA_snn",colnames(input_sce.all@meta.data))],2,table)
  
  
  #选择低分辨率可视化
  p1_dim=plot_grid(ncol = 3, DimPlot(input_sce.all, reduction = "umap", group.by = "RNA_snn_res.0.01")   
                     ggtitle("louvain_0.01"), DimPlot(input_sce.all, reduction = "umap", group.by = "RNA_snn_res.0.1")   
                     ggtitle("louvain_0.1"), DimPlot(input_sce.all, reduction = "umap", group.by = "RNA_snn_res.0.2")   
                     ggtitle("louvain_0.2"))
  ggsave(plot=p1_dim, filename="Dimplot_diff_resolution_low.pdf",width = 16,height = 7)
  
  
  #选择高分辨率可视化
  p1_dim=plot_grid(ncol = 3, DimPlot(input_sce.all, reduction = "umap", group.by = "RNA_snn_res.0.8")   
                     ggtitle("louvain_0.8"), DimPlot(input_sce.all, reduction = "umap", group.by = "RNA_snn_res.1")   
                     ggtitle("louvain_1"), DimPlot(input_sce.all, reduction = "umap", group.by = "RNA_snn_res.0.3")   
                     ggtitle("louvain_0.3"))
  ggsave(plot=p1_dim, filename="Dimplot_diff_resolution_high.pdf",width = 16,height = 7)
  
  
  #树状图绘制
  p2_tree=clustree(input_sce.all@meta.data, prefix = "RNA_snn_res.")
  ggsave(plot=p2_tree, filename="Tree_diff_resolution.pdf",width = 8,height = 12)
  table(input_sce.all@active.ident) 
  saveRDS(input_sce.all, "sce.all_int.rds")
  return(input_sce.all)
  
}

该函数包括数据的标准化、主成分分析（PCA）、批次效应校正、降维可视化、聚类分析，以及最终结果的保存。下面是对该函数每个步骤的详细解释：

代码语言：r复制

input_sce <- NormalizeData(...) 
input_sce <- FindVariableFeatures(...)
input_sce <- ScaleData(...)

NormalizeData：对数据进行标准化，采用对数归一化方法，使得每个细胞的总表达量为 1e4。这种标准化处理有助于消除测序深度的影响。
FindVariableFeatures：识别在所有细胞中表达变异度最大的基因，这些基因通常用于后续的降维分析（如PCA）。
ScaleData：对数据进行缩放，通常是将每个基因的表达值减去均值，再除以标准差，使得数据更适合PCA和其他线性模型的分析。

降维

代码语言：r复制

input_sce <- RunPCA(...)
seuratObj <- RunHarmony(...)
seuratObj <- RunUMAP

RunPCA：使用前面找到的高变基因进行PCA，降低数据维度，保留主要的变化趋势。PCA的结果存储在Seurat对象中。
RunHarmony：利用Harmony算法来校正数据中的批次效应（orig.ident表示不同的批次或实验条件）。该方法通过在PCA基础上整合不同批次的数据，使得数据更为一致。
RunUMAP：在校正后的数据上使用UMAP算法进行降维，以便可视化不同细胞群体的关系。这里UMAP基于Harmony的校正结果。

代码语言：r复制

input_sce <- FindNeighbors(input_sce, reduction = "harmony", dims = 1:15)

FindNeighbors：基于校正后的数据（Harmony结果），计算每个细胞的邻居（即相似度最高的细胞）。该步骤为后续聚类分析做准备。

聚类

代码语言：r复制

for (res in c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 0.8, 1)) {
    input_sce.all=FindClusters(input_sce.all, resolution = res, algorithm = 1)
}

FindClusters：对细胞进行聚类分析。resolution参数控制聚类的颗粒度，值越高，得到的聚类数目越多。在这里，对多个不同的 resolution值进行了尝试，以便找到最适合的数据聚类分辨率。

执行完这一步后，meta.data后增加了相应的resolution的列

代码语言：r复制

apply(input_sce.all@meta.data[,grep("RNA_snn",colnames(input_sce.all@meta.data))],2,table)

这行代码的作用是对不同分辨率下的聚类结果进行统计和展示，统计了每个分辨率（聚类分辨率RNA_snn_res.*）下每个聚类类别（即不同的细胞群体）的细胞数量。

接下来进行三张图的绘制，分别是查看低分辨率的聚类分群、高分辨率的聚类分群、以及生成一个树状图，展示不同分辨率下聚类结果之间的关系，帮助选择最适合的数据分辨率。

Dimplot_diff_resolution_low.pdf

Dimplot_diff_resolution_high.pdf

Tree_diff_resolution.pdf

代码语言：r复制

table(input_sce.all@active.ident) 
saveRDS(input_sce.all, "sce.all_int.rds")

table(input_sce.all@active.ident)：查看最终聚类的细胞群体分布。
saveRDS：将处理后的Seurat对象保存到文件中，以便后续使用。

注：

问：table(input_sce.all@active.ident)中如何确定最终聚类的细胞群体分布，即如何确定active.ident的？

答：在Seurat中，active.ident 是一个非常重要的字段，用于标识当前活跃的（即正在使用的）细胞群体标识符（cluster identity）。当你进行不同分辨率的聚类分析时，Seurat 会在 meta.data 中为每个分辨率生成一个新的列，例如 RNA_snn_res.0.1、RNA_snn_res.0.3 等。你可以选择其中一个分辨率作为最终的聚类结果，并将其设置为 active.ident。

如以上代码最后执行的分辨率是1，则active.ident就是分辨率为1的结果

问：如何重新选择active.ident？

答：一旦选择了一个适当的分辨率，可以将这个分辨率下的聚类结果设置为 active.ident，这样 Seurat 中的所有下游分析都会基于这个聚类结果。

代码语言：r复制

# 假设你选择了分辨率为0.5的聚类结果
input_sce.all$active.ident <- input_sce.all$RNA_snn_res.0.5

数据挖掘生物基因单细胞测序

0 人点赞