有很多人有或多或少的原因并不会自己做单细胞实验送测序,加上目前单细胞转录组数据上传在公共数据库的数据也有不少了,大家会倾向于从公共数据集中筛选出多个数据集来做整合分析。所以这周推文打算从理论角度,跟大家分享一下多个数据集整合分析的代码。
举个例子,如果只关注某一肿瘤组织中的Tcell或Bcell或髓系细胞,选取该种肿瘤样本数据集,两到三个数据集合并分析,提取Tcell或Bcell或髓系细胞亚群,那么在分析这种数据的时候就会遇到一个问题。
「1. 是在对多个不同数据集分别降维分群后再提取各个数据集的Tcell或Bcell或髓系细胞亚群整合到一块?」
「2. 还是说在对单细胞数据集一开始分析的时候,就定义好分组整合到一块呢?」
「我个人觉得这个部分怎么分析并没有一个特定的答案或者规则,具体还是得看数据集适合以上提出的哪种分析顺序。」
例如之前有一篇文章中提到过用三个鼻咽癌单细胞转录组数据: GSE150825, GSE150430, GSE162025。不过这篇文献是对三组数据整合后,选取其中的一部分T细胞进行着重分析。
这里介绍其中一种方式:三个数据集分别降维分群,亚群细分后再进行整合。
篇幅有限,这里以GSE150430数据集为例。
以下代码比较详细的展示了第一次降维分群,提取髓系亚群后再次降维分群,定义细胞亚群。
最后部分还提供怎么把三组数据合并的代码。
代码语言:javascript复制rm(list=ls())
options(stringsAsFactors = F)
library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)
###### step1:导入数据 ######
library(data.table)
group=fread('group.txt' )
group=as.data.frame(t(group)[-1,])
head(group)
library(stringr)
colnames(group)='celltype'
group$patient=str_split(rownames(group),'_',simplify = T)[,1]
head(group)
Sys.time()
raw.data <- fread( 'GSE150430_npc_scRNA_hg19_processed_data.txt.gz',
data.table = F,skip = 2)
Sys.time()
dim(raw.data) #
raw.data[1:4,1:4]
rownames(raw.data)=raw.data[,1]
raw.data=raw.data[,-1]
head(group)
colnames(raw.data)=rownames(group)
head(colnames(raw.data))
sce.all <- CreateSeuratObject(counts = raw.data)
sce.all <- AddMetaData(object = sce.all, metadata = group)
table(sce.all@meta.data$orig.ident)
library(stringr)
sce.all@meta.data$group = substring( sce.all$orig.ident ,1,1)
table(sce.all@meta.data$group)
「这里省略展示QC代码」
代码语言:javascript复制dir.create("2-harmony")
getwd()
setwd("2-harmony")
sce=sce.all.filt
sce
sce <- NormalizeData(sce,
normalization.method = "LogNormalize",
scale.factor = 1e4)
sce <- FindVariableFeatures(sce)
sce <- ScaleData(sce)
sce <- RunPCA(sce, features = VariableFeatures(object = sce))
library(harmony)
seuratObj <- RunHarmony(sce, "orig.ident")
names(seuratObj@reductions)
seuratObj <- RunUMAP(seuratObj, dims = 1:15,
reduction = "harmony")
DimPlot(seuratObj,reduction = "umap",label=T )
sce=seuratObj
sce <- FindNeighbors(sce, reduction = "harmony",
dims = 1:15)
sce.all=sce
#设置不同的分辨率,观察分群效果(选择哪一个?)
for (res in c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)) {
sce.all=FindClusters(sce.all, #graph.name = "CCA_snn",
resolution = res, algorithm = 1)
}
colnames(sce.all@meta.data)
apply(sce.all@meta.data[,grep("RNA_snn",colnames(sce.all@meta.data))],2,table)
#接下来分析,按照分辨率为0.8进行
sel.clust = "RNA_snn_res.0.8"
sce.all <- SetIdent(sce.all, value = sel.clust)
table(sce.all@active.ident)
saveRDS(sce.all, "sce.all_int.rds")
setwd("../")
定义细胞亚群
代码语言:javascript复制rm(list=ls())
options(stringsAsFactors = F)
library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)
getwd()
setwd('3-cell/')
sce.all=readRDS( "../2-harmony/sce.all_int.rds")
sce.all
# 需要自行看图,定细胞亚群:
library(ggplot2)
genes_to_check =list(
CM=c("TTN","MYH7","MYH6","TNNT2") ,
EC=c("VWF", "IFI27", "PECAM1","MGP"),
FB=c("DCN", "C7" ,"LUM","FBLN1","COL1A2"),
MP=c("CD163", "CCL4", "CXCL8","PTPRC"),
SMC=c("ACTA2", "CALD1", "MYH11"),
Tc=c("CD3D","CD3E"),
peric=c("ABCC9","PDGFRB","RGS5")
)
p_all_markers=DotPlot(sce.all,
features = genes_to_check,
scale = T,assay='RNA' )
theme(axis.text.x=element_text(angle=45,hjust = 1))
p_all_markers
celltype=data.frame(ClusterID=0:21,
celltype= 0:21)
#定义细胞亚群
celltype[celltype$ClusterID %in% c(0,3,10),2]='Bcells'
celltype[celltype$ClusterID %in% c( 7 ),2]='plasma'
celltype[celltype$ClusterID %in% c( 8),2]='Cycl-T'
celltype[celltype$ClusterID %in% c(1,17),2]='NKT'
celltype[celltype$ClusterID %in% c(2),2]='Treg'
celltype[celltype$ClusterID %in% c(4),2]='naive-T'
celltype[celltype$ClusterID %in% c(6,11 ),2]='Mac'
celltype[celltype$ClusterID %in% c(15 ),2]='DC'
celltype[celltype$ClusterID %in% c(13),2]='pDC'
celltype[celltype$ClusterID %in% c( 5,9,12,14,16,18,19,21 ),2]='epi'
celltype[celltype$ClusterID %in% c(20 ),2]='stem-epi'
head(celltype)
celltype
table(celltype$celltype)
sce.all@meta.data$celltype = "NA"
for(i in 1:nrow(celltype)){
sce.all@meta.data[which(sce.all@meta.data$RNA_snn_res.0.8 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
table(sce.all@meta.data$celltype)
genes_to_check = c('PTPRC', 'CD3D', 'CD3E', 'CD4','CD8A',
'CD68', 'CD163', 'CD14',
'C1QA', 'C1QB', # mac
'S100A9', 'S100A8', 'MMP19',# monocyte
'FCGR3A',
'KLRB1','NCR1', # NK
'FGF7','MME', 'ACTA2', ## fibo
'DCN', 'LUM', 'GSN' , ## mouse PDAC fibo
'ZEB1', 'ZEB2', 'PDGFRA' , 'SNAI2',
'CDKN1A','KLF4',
'MUC1' ,'EPCAM',
'MKI67' , 'TOP2A',
'PECAM1', 'VWF', ## endo
'EPCAM' , 'KRT19', 'PROM1', 'ALDH1A1' )
genes_to_check= unique(genes_to_check)
th=theme(axis.text.x = element_text(angle = 45,
vjust = 0.5, hjust=0.5))
p <- DotPlot(sce.all, features = unique(genes_to_check),
assay='RNA' ,group.by = 'celltype' ) coord_flip() th
p
ggsave(plot=p, filename="check_marker_by_celltype.pdf",
width = 7 ,height = 8)
library(patchwork)
p_all_markers=DotPlot(sce.all, features = genes_to_check,
assay='RNA' ,group.by = 'celltype' ) coord_flip() th
p_umap=DimPlot(sce.all, reduction = "umap", group.by = "celltype",label = T)
p_all_markers p_umap
ggsave('markers_umap_by_celltype.pdf',width = 12,height = 8)
phe=sce.all@meta.data
save(phe,file = 'phe-by-markers.Rdata')
image.png
之后提取了 GSE150825, GSE150430, GSE162025 每个数据集中的髓系进行亚群细分,还是以GSE150430为例
代码语言:javascript复制library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)
library(patchwork)
getwd()
dir.create("./outputs")
setwd("./outputs/")
sce =readRDS( "../../2-harmony/sce.all_int.rds")
sce
load('../../3-cell/phe-by-markers.Rdata')
colnames(sce@meta.data)
DimPlot(sce, reduction = "umap", label = T)
DimPlot(sce, reduction = "umap",group.by = 'orig.ident', label = T)
ggsave('umap_check_harmony.pdf',width = 15)
sce@meta.data = phe
table(sce$orig.ident)
table(sce$celltype)
table(sce$group)
sce=sce[,sce$celltype %in% c('DC','Mac','pDC')]
sce
sce=CreateSeuratObject(counts = sce@assays$RNA@counts,
meta.data = sce@meta.data[,1:9])
# sce@meta.data[,1:9]
save(sce,file = 'myeloids_sce.Rdata')
#load(file = 'myeloids_sce.Rdata')
降维分群
代码语言:javascript复制sce <- NormalizeData(sce, normalization.method = "LogNormalize",
scale.factor = 1e4)
GetAssay(sce,assay = "RNA")
sce <- FindVariableFeatures(sce,
selection.method = "vst", nfeatures = 2000)
sce <- ScaleData(sce)
sce <- RunPCA(object = sce, pc.genes = VariableFeatures(sce))
DimHeatmap(sce, dims = 1:12, cells = 100, balanced = TRUE)
ElbowPlot(sce)
sce <- FindNeighbors(sce, dims = 1:15)
sce <- FindClusters(sce, resolution = 0.8)
table(sce@meta.data$RNA_snn_res.0.8)
set.seed(123)
sce <- RunTSNE(object = sce, dims = 1:15, do.fast = TRUE)
DimPlot(sce,reduction = "tsne",label=T)
ggsave(filename = 'before_harmony_tsne.pdf')
sce <- RunUMAP(object = sce, dims = 1:15, do.fast = TRUE)
DimPlot(sce,reduction = "umap",label=T)
ggsave(filename = 'before_harmony_umap.pdf')
head(sce@meta.data)
DimPlot(sce,reduction = "umap",label=T,group.by = 'orig.ident')
ggsave(filename = 'before_harmony_umap_orig.ident.pdf')
sce <- FindClusters(sce, resolution = 0.1)
DimPlot(sce,reduction = "umap",label=T)
ggsave(filename = 'before_harmony_umap_recluster_by_0.1.pdf')
DimPlot(sce,reduction = "umap",label=T,
group.by = 'orig.ident')
ggsave(filename = 'before_harmony_umap_sce_recluster_by_orig.ident.pdf')
tab.1=table(sce@meta.data$orig.ident,sce@meta.data$seurat_clusters)
colnames(sce@meta.data)
library(gplots)
tab.1
pro='before_harmony'
pdf(file = paste0(pro,' seurat_clusters VS orig.ident.pdf'),
width = 9,height = 8)
balloonplot(tab.1, main =" seurat_clusters VS orig.ident ", xlab ="", ylab="",
label = T, show.margins = F)
dev.off()
colnames(sce@meta.data)
library(harmony)
seuratObj <- RunHarmony(sce, "orig.ident")
names(seuratObj@reductions)
seuratObj <- RunUMAP(seuratObj, dims = 1:15,
reduction = "harmony")
DimPlot(seuratObj,reduction = "umap",label=T )
sce=seuratObj
sce <- FindNeighbors(sce, reduction = "harmony",
dims = 1:15)
sce <- FindClusters(sce, resolution = 0.1)
table(sce@meta.data$seurat_clusters)
DimPlot(sce,reduction = "umap",label=T)
ggsave(filename = 'harmony_umap_recluster_by_0.1.pdf')
DimPlot(sce,reduction = "umap",label=T,
group.by = 'orig.ident')
ggsave(filename = 'harmony_umap_sce_recluster_by_orig.ident.pdf')
tab.1=table(sce@meta.data$orig.ident,sce@meta.data$seurat_clusters)
colnames(sce@meta.data)
library(gplots)
tab.1
pro='after_harmony'
pdf(file = paste0(pro,' seurat_clusters VS orig.ident.pdf'),
width = 8,height = 8)
balloonplot(tab.1, main =" seurat_clusters VS orig.ident ",
xlab ="", ylab="",
label = T, show.margins = F)
dev.off()
save(sce,file = 'first_sce.Rdata')
定义细胞亚群
代码语言:javascript复制rm(list=ls())
library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)
getwd()
setwd("./outputs/")
load(file = 'first_sce.Rdata')
colnames(sce@meta.data)
table(sce$orig.ident)
table(sce$seurat_clusters)
sce <- FindClusters(sce, resolution = 0.8)
DimPlot(sce,reduction = "umap",label=T )
sel.clust = "RNA_snn_res.0.8"
sce <- SetIdent(sce, value = sel.clust)
table(sce@active.ident)
Mac=c("CD14", "CD163", "APOE", "C1QA", "C1QB", "C1QC")
pDC=c("LILRA4", "IL3RA","TCF4","TCL1A","CLEC4C")
DC1=c("CLEC9A", "XCR1", "BATF3")
DC2=c("CD1A", "FCER1A", "CD1C","CD1E","CLEC10A")
DC3=c("CCR7", "LAMP3", "FSCN1","CCL22","BIRC3")
Mono=c("VCVN", "FCN1", "S100A12", "S100A8", "S100A9",'FCGR3A')
genes_to_check =list(
Mac=Mac,
pDC=pDC,
DC1=DC1,
DC2=DC2,
DC3=DC3 ,
Mono=Mono
)
library(stringr)
genes_to_check = lapply(genes_to_check, str_to_upper)
p_all_markers=DotPlot(sce ,
features = genes_to_check,
scale = T,assay='RNA' )
theme(axis.text.x=element_text(angle=45,hjust = 1))
p_all_markers
ggsave('check_myeloids_markers.pdf',height = 11,width = 11)
celltype=data.frame(ClusterID=0:13,
celltype=0:13 )
celltype[celltype$ClusterID %in% c( 13 ),2]='mast'
celltype[celltype$ClusterID %in% c( 9),2]='DC1'
celltype[celltype$ClusterID %in% c( 2,10,11),2]='DC2'
celltype[celltype$ClusterID %in% c( 6),2]='DC3'
celltype[celltype$ClusterID %in% c( 1),2]='pDC'
celltype[celltype$ClusterID %in% c( 5),2]='Mac-mono'
celltype[celltype$ClusterID %in% c( 4 ),2]='mono'
celltype[celltype$ClusterID %in% c( 0,3 ),2]='mac'
celltype[celltype$ClusterID %in% c( 8),2]='cycling'
celltype[celltype$ClusterID %in% c( 7,12),2]='B-meyiod'
head(celltype)
celltype
table(celltype$celltype)
sce@meta.data$celltype = "NA"
table(sce@meta.data$RNA_snn_res.0.8 )
for(i in 1:nrow(celltype)){
sce@meta.data[which(sce@meta.data$RNA_snn_res.0.8 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
table(sce@meta.data$celltype)
table(sce@meta.data$celltype)
DimPlot(sce, reduction = "umap",
group.by = "celltype",label = T,label.box = T)
ggsave('umap_bycelltype.pdf',height = 7,width = 8)
sce.all=sce
save(sce.all,file = 'second_sce.Rdata')
在GSE150825, GSE150430, GSE162025 每个数据集中的髓系亚群细分结束后,会存成Rdata数据,之后将三组数据整合读取重新降维分群,再继续分析。
代码语言:javascript复制library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)
load('./inputs/GSE150430/second_sce.Rdata')
GSE150430 = sce.all
GSE150430$study = 'GSE150430'
load('./inputs/GSE150825/second_sce2.Rdata')
GSE150825 = sce.all
GSE150825$study = 'GSE150825'
load('./inputs/GSE162025/second_sce.Rdata')
GSE162025 = sce
GSE162025$study = 'GSE162025'
sce = merge(GSE150430,list(GSE150825,GSE162025),
add.cell.ids = c('GSE150430','GSE150825','GSE162025'))
table(sce$study)
colnames(sce@meta.data)
sce=CreateSeuratObject(counts = sce@assays$RNA@counts,
meta.data = sce@meta.data )
sce
table(sce$group)
save(sce,file = 'input.Rdata')