单细胞多数据集整合示例

有很多人有或多或少的原因并不会自己做单细胞实验送测序，加上目前单细胞转录组数据上传在公共数据库的数据也有不少了，大家会倾向于从公共数据集中筛选出多个数据集来做整合分析。所以这周推文打算从理论角度，跟大家分享一下多个数据集整合分析的代码。

举个例子，如果只关注某一肿瘤组织中的Tcell或Bcell或髓系细胞，选取该种肿瘤样本数据集，两到三个数据集合并分析，提取Tcell或Bcell或髓系细胞亚群，那么在分析这种数据的时候就会遇到一个问题。

「1. 是在对多个不同数据集分别降维分群后再提取各个数据集的Tcell或Bcell或髓系细胞亚群整合到一块？」

「2. 还是说在对单细胞数据集一开始分析的时候，就定义好分组整合到一块呢？」

「我个人觉得这个部分怎么分析并没有一个特定的答案或者规则，具体还是得看数据集适合以上提出的哪种分析顺序。」

例如之前有一篇文章中提到过用三个鼻咽癌单细胞转录组数据： GSE150825, GSE150430, GSE162025。不过这篇文献是对三组数据整合后，选取其中的一部分T细胞进行着重分析。

这里介绍其中一种方式：三个数据集分别降维分群，亚群细分后再进行整合。

篇幅有限，这里以GSE150430数据集为例。

以下代码比较详细的展示了第一次降维分群，提取髓系亚群后再次降维分群，定义细胞亚群。

最后部分还提供怎么把三组数据合并的代码。

代码语言：javascript复制

rm(list=ls())
options(stringsAsFactors = F) 
library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)

###### step1:导入数据 ######     
library(data.table)

group=fread('group.txt' ) 
group=as.data.frame(t(group)[-1,])
head(group) 
library(stringr)
colnames(group)='celltype'
group$patient=str_split(rownames(group),'_',simplify = T)[,1]
head(group)

Sys.time()
raw.data <- fread( 'GSE150430_npc_scRNA_hg19_processed_data.txt.gz', 
                       data.table = F,skip = 2)
Sys.time()

dim(raw.data) # 
raw.data[1:4,1:4]
rownames(raw.data)=raw.data[,1]
raw.data=raw.data[,-1]
head(group)
colnames(raw.data)=rownames(group)
head(colnames(raw.data))  

sce.all <- CreateSeuratObject(counts = raw.data) 
sce.all <- AddMetaData(object = sce.all, metadata = group)
 
table(sce.all@meta.data$orig.ident)
library(stringr)
sce.all@meta.data$group = substring( sce.all$orig.ident ,1,1) 
table(sce.all@meta.data$group)

「这里省略展示QC代码」

代码语言：javascript复制

dir.create("2-harmony")
getwd()
setwd("2-harmony")

sce=sce.all.filt 
sce
sce <- NormalizeData(sce, 
                         normalization.method = "LogNormalize",
                         scale.factor = 1e4) 
sce <- FindVariableFeatures(sce)
sce <- ScaleData(sce)
sce <- RunPCA(sce, features = VariableFeatures(object = sce))

library(harmony)
seuratObj <- RunHarmony(sce, "orig.ident")
names(seuratObj@reductions)
seuratObj <- RunUMAP(seuratObj,  dims = 1:15, 
                     reduction = "harmony")
DimPlot(seuratObj,reduction = "umap",label=T ) 

sce=seuratObj
sce <- FindNeighbors(sce, reduction = "harmony",
                     dims = 1:15) 
sce.all=sce
#设置不同的分辨率，观察分群效果(选择哪一个？)
for (res in c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)) {
  sce.all=FindClusters(sce.all, #graph.name = "CCA_snn", 
                       resolution = res, algorithm = 1)
}
colnames(sce.all@meta.data)
apply(sce.all@meta.data[,grep("RNA_snn",colnames(sce.all@meta.data))],2,table)

#接下来分析，按照分辨率为0.8进行 
sel.clust = "RNA_snn_res.0.8"
sce.all <- SetIdent(sce.all, value = sel.clust)
table(sce.all@active.ident) 
saveRDS(sce.all, "sce.all_int.rds")

setwd("../")

定义细胞亚群

代码语言：javascript复制

rm(list=ls())
options(stringsAsFactors = F)
library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)
getwd()
setwd('3-cell/')
sce.all=readRDS( "../2-harmony/sce.all_int.rds")
sce.all

# 需要自行看图，定细胞亚群： 
library(ggplot2) 
genes_to_check =list(
  CM=c("TTN","MYH7","MYH6","TNNT2") ,
  EC=c("VWF", "IFI27", "PECAM1","MGP"),
  FB=c("DCN", "C7" ,"LUM","FBLN1","COL1A2"),
  MP=c("CD163", "CCL4", "CXCL8","PTPRC"),
  SMC=c("ACTA2", "CALD1", "MYH11"),
  Tc=c("CD3D","CD3E"),
  peric=c("ABCC9","PDGFRB","RGS5")
)
p_all_markers=DotPlot(sce.all, 
                      features = genes_to_check,
                      scale = T,assay='RNA' ) 
  theme(axis.text.x=element_text(angle=45,hjust = 1))
p_all_markers

celltype=data.frame(ClusterID=0:21,
                    celltype= 0:21) 
#定义细胞亚群 
celltype[celltype$ClusterID %in% c(0,3,10),2]='Bcells'   
celltype[celltype$ClusterID %in% c( 7 ),2]='plasma'  
 

celltype[celltype$ClusterID %in% c( 8),2]='Cycl-T'  
celltype[celltype$ClusterID %in% c(1,17),2]='NKT'  
celltype[celltype$ClusterID %in% c(2),2]='Treg'  
celltype[celltype$ClusterID %in% c(4),2]='naive-T'  


celltype[celltype$ClusterID %in% c(6,11 ),2]='Mac'  
celltype[celltype$ClusterID %in% c(15 ),2]='DC' 
celltype[celltype$ClusterID %in% c(13),2]='pDC' 
 
celltype[celltype$ClusterID %in% c( 5,9,12,14,16,18,19,21 ),2]='epi' 
celltype[celltype$ClusterID %in% c(20 ),2]='stem-epi'  

head(celltype)
celltype
table(celltype$celltype)
sce.all@meta.data$celltype = "NA"
for(i in 1:nrow(celltype)){
  sce.all@meta.data[which(sce.all@meta.data$RNA_snn_res.0.8 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
table(sce.all@meta.data$celltype)

genes_to_check = c('PTPRC', 'CD3D', 'CD3E', 'CD4','CD8A', 
                   'CD68', 'CD163', 'CD14',  
                   'C1QA',  'C1QB',  # mac
                   'S100A9', 'S100A8', 'MMP19',# monocyte
                   'FCGR3A', 
                   'KLRB1','NCR1', # NK 
                   'FGF7','MME', 'ACTA2', ## fibo 
                   'DCN', 'LUM',  'GSN' , ## mouse PDAC fibo 
                   'ZEB1', 'ZEB2', 'PDGFRA' , 'SNAI2',
                   'CDKN1A','KLF4',
                   'MUC1' ,'EPCAM',
                   'MKI67' , 'TOP2A', 
                   'PECAM1', 'VWF',  ## endo 
                   'EPCAM' , 'KRT19', 'PROM1', 'ALDH1A1' )
genes_to_check= unique(genes_to_check)
th=theme(axis.text.x = element_text(angle = 45, 
                                    vjust = 0.5, hjust=0.5)) 
 
p <- DotPlot(sce.all, features = unique(genes_to_check),
             assay='RNA' ,group.by = 'celltype' )    coord_flip()   th

p
ggsave(plot=p, filename="check_marker_by_celltype.pdf",
       width = 7 ,height = 8)

library(patchwork)
p_all_markers=DotPlot(sce.all, features = genes_to_check,
                      assay='RNA' ,group.by = 'celltype' )    coord_flip() th
p_umap=DimPlot(sce.all, reduction = "umap", group.by = "celltype",label = T)
p_all_markers p_umap
ggsave('markers_umap_by_celltype.pdf',width = 12,height = 8)
phe=sce.all@meta.data
save(phe,file = 'phe-by-markers.Rdata')

image.png

之后提取了 GSE150825, GSE150430, GSE162025 每个数据集中的髓系进行亚群细分，还是以GSE150430为例

代码语言：javascript复制

library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)
library(patchwork)

getwd()
dir.create("./outputs")
setwd("./outputs/")

sce =readRDS( "../../2-harmony/sce.all_int.rds")
sce 
load('../../3-cell/phe-by-markers.Rdata')

colnames(sce@meta.data)
DimPlot(sce, reduction = "umap", label = T)   
  DimPlot(sce, reduction = "umap",group.by = 'orig.ident', label = T)
ggsave('umap_check_harmony.pdf',width = 15)

sce@meta.data = phe

table(sce$orig.ident) 
table(sce$celltype) 
table(sce$group)

sce=sce[,sce$celltype %in% c('DC','Mac','pDC')]
sce

sce=CreateSeuratObject(counts = sce@assays$RNA@counts,
                       meta.data = sce@meta.data[,1:9])
# sce@meta.data[,1:9]
save(sce,file = 'myeloids_sce.Rdata')
#load(file = 'myeloids_sce.Rdata')

降维分群

代码语言：javascript复制

sce <- NormalizeData(sce, normalization.method =  "LogNormalize",  
                     scale.factor = 1e4)
GetAssay(sce,assay = "RNA")
sce <- FindVariableFeatures(sce, 
                            selection.method = "vst", nfeatures = 2000)  
sce <- ScaleData(sce) 
sce <- RunPCA(object = sce, pc.genes = VariableFeatures(sce)) 
DimHeatmap(sce, dims = 1:12, cells = 100, balanced = TRUE)
ElbowPlot(sce) 

sce <- FindNeighbors(sce, dims = 1:15)
sce <- FindClusters(sce, resolution = 0.8)
table(sce@meta.data$RNA_snn_res.0.8)  

set.seed(123)
sce <- RunTSNE(object = sce, dims = 1:15, do.fast = TRUE)
DimPlot(sce,reduction = "tsne",label=T)
ggsave(filename = 'before_harmony_tsne.pdf') 

sce <- RunUMAP(object = sce, dims = 1:15, do.fast = TRUE)
DimPlot(sce,reduction = "umap",label=T) 
ggsave(filename = 'before_harmony_umap.pdf') 
head(sce@meta.data)
DimPlot(sce,reduction = "umap",label=T,group.by = 'orig.ident') 
ggsave(filename = 'before_harmony_umap_orig.ident.pdf') 


sce <- FindClusters(sce, resolution = 0.1)
DimPlot(sce,reduction = "umap",label=T) 
ggsave(filename = 'before_harmony_umap_recluster_by_0.1.pdf') 
DimPlot(sce,reduction = "umap",label=T,
        group.by = 'orig.ident') 
ggsave(filename = 'before_harmony_umap_sce_recluster_by_orig.ident.pdf')
 

tab.1=table(sce@meta.data$orig.ident,sce@meta.data$seurat_clusters)
colnames(sce@meta.data)   
library(gplots)
tab.1 
pro='before_harmony'
pdf(file = paste0(pro,'  seurat_clusters VS  orig.ident.pdf'),
    width = 9,height  = 8)
balloonplot(tab.1, main =" seurat_clusters VS  orig.ident ", xlab ="", ylab="",
            label = T, show.margins = F)
dev.off() 

colnames(sce@meta.data)  

library(harmony)
seuratObj <- RunHarmony(sce, "orig.ident")
names(seuratObj@reductions)
seuratObj <- RunUMAP(seuratObj,  dims = 1:15, 
                     reduction = "harmony")
DimPlot(seuratObj,reduction = "umap",label=T )  


sce=seuratObj
sce <- FindNeighbors(sce, reduction = "harmony",
                     dims = 1:15)
sce <- FindClusters(sce, resolution = 0.1)
table(sce@meta.data$seurat_clusters)
DimPlot(sce,reduction = "umap",label=T) 
ggsave(filename = 'harmony_umap_recluster_by_0.1.pdf') 
DimPlot(sce,reduction = "umap",label=T,
        group.by = 'orig.ident') 
ggsave(filename = 'harmony_umap_sce_recluster_by_orig.ident.pdf')  

tab.1=table(sce@meta.data$orig.ident,sce@meta.data$seurat_clusters)
colnames(sce@meta.data)   
library(gplots)
tab.1 
pro='after_harmony'
pdf(file = paste0(pro,'  seurat_clusters VS  orig.ident.pdf'),
    width = 8,height  = 8)
balloonplot(tab.1, main =" seurat_clusters VS  orig.ident ", 
            xlab ="", ylab="",
            label = T, show.margins = F)
dev.off() 


save(sce,file = 'first_sce.Rdata')

定义细胞亚群

代码语言：javascript复制

rm(list=ls())
library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)

getwd()
setwd("./outputs/")
load(file = 'first_sce.Rdata')
colnames(sce@meta.data)
table(sce$orig.ident)
table(sce$seurat_clusters)

sce <- FindClusters(sce, resolution = 0.8)
DimPlot(sce,reduction = "umap",label=T )  

sel.clust = "RNA_snn_res.0.8"
sce <- SetIdent(sce, value = sel.clust)
table(sce@active.ident)

Mac=c("CD14", "CD163", "APOE", "C1QA", "C1QB", "C1QC")
pDC=c("LILRA4", "IL3RA","TCF4","TCL1A","CLEC4C")
DC1=c("CLEC9A", "XCR1", "BATF3")  
DC2=c("CD1A", "FCER1A", "CD1C","CD1E","CLEC10A")
DC3=c("CCR7", "LAMP3", "FSCN1","CCL22","BIRC3")
Mono=c("VCVN", "FCN1", "S100A12", "S100A8", "S100A9",'FCGR3A')
genes_to_check =list(
  Mac=Mac,
  pDC=pDC,
  DC1=DC1, 
  DC2=DC2,
  DC3=DC3 ,
  Mono=Mono
)
library(stringr)
genes_to_check = lapply(genes_to_check, str_to_upper)
p_all_markers=DotPlot(sce , 
                      features = genes_to_check,
                      scale = T,assay='RNA' ) 
  theme(axis.text.x=element_text(angle=45,hjust = 1))
p_all_markers
ggsave('check_myeloids_markers.pdf',height = 11,width = 11)

celltype=data.frame(ClusterID=0:13,
                      celltype=0:13 )   
  
  celltype[celltype$ClusterID %in% c( 13 ),2]='mast'  
  celltype[celltype$ClusterID %in% c(  9),2]='DC1'  
  celltype[celltype$ClusterID %in% c( 2,10,11),2]='DC2'  
  celltype[celltype$ClusterID %in% c(  6),2]='DC3'
  celltype[celltype$ClusterID %in% c(  1),2]='pDC'
  
  celltype[celltype$ClusterID %in% c( 5),2]='Mac-mono'
  celltype[celltype$ClusterID %in% c( 4 ),2]='mono' 
  celltype[celltype$ClusterID %in% c( 0,3 ),2]='mac'   
  
  celltype[celltype$ClusterID %in% c( 8),2]='cycling'  
  celltype[celltype$ClusterID %in% c( 7,12),2]='B-meyiod' 
  
  
  head(celltype)
  celltype 
  table(celltype$celltype)
  
  sce@meta.data$celltype = "NA"
  table(sce@meta.data$RNA_snn_res.0.8 )
  
  for(i in 1:nrow(celltype)){
    sce@meta.data[which(sce@meta.data$RNA_snn_res.0.8 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
  table(sce@meta.data$celltype)
  
  table(sce@meta.data$celltype)
  DimPlot(sce, reduction = "umap", 
          group.by = "celltype",label = T,label.box = T) 
  ggsave('umap_bycelltype.pdf',height = 7,width = 8)

sce.all=sce
  save(sce.all,file = 'second_sce.Rdata')

在GSE150825, GSE150430, GSE162025 每个数据集中的髓系亚群细分结束后，会存成Rdata数据，之后将三组数据整合读取重新降维分群，再继续分析。

代码语言：javascript复制

library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)

load('./inputs/GSE150430/second_sce.Rdata')
GSE150430 = sce.all
GSE150430$study = 'GSE150430'

load('./inputs/GSE150825/second_sce2.Rdata')
GSE150825 = sce.all
GSE150825$study = 'GSE150825'

load('./inputs/GSE162025/second_sce.Rdata')
GSE162025 = sce 
GSE162025$study = 'GSE162025'

sce = merge(GSE150430,list(GSE150825,GSE162025),
            add.cell.ids = c('GSE150430','GSE150825','GSE162025'))
table(sce$study)
colnames(sce@meta.data)
sce=CreateSeuratObject(counts = sce@assays$RNA@counts,
                       meta.data = sce@meta.data )
sce
table(sce$group)
save(sce,file = 'input.Rdata')

data label meta table 数据

0 人点赞