单细胞测序—PDA文章复现_单分组(Fig.1_Fig.2)
最近在学习复现Cellular heterogeneity during mouse pancreatic ductal adenocarcinoma progression at single-cell resolution,这篇文章的内容。这里记录下Fig1和Fig2的复现过程。
1 文章简介
文章的Fig1,Fig2只涉及GSE125588的前三个数据集,是基于0.6的resolution进行降维聚类,采用tsne降维方法。文章中并未提及细胞过滤等参数。因此我做出的结果会与文章中略微存在差异。
值得说明的是文章中并不是把三个数据集去除批次效应后整合在一起进行分析的,而是每个数据集单独进行分析。
GSM3577882 | Normal Pancreas |
---|---|
GSM3577883 | Early KIC |
GSM3577884 | Late KIC |
1.1 Fig1总览
1.2 Fig2总览
2 文章复现
GEO上下载整理好数据之后,不需要去批次整合数据,而是每个数据集单独读取,进行独立的分析
2.1 early-KIC
以step1-run-basic-early-KIC.R为例
代码语言:r复制rm(list=ls())
options(stringsAsFactors = F)
source('scRNA_scripts/lib.R')
###### step1:导入数据 ######
library(ReactomePA)
library(ggplot2)
dir.create("./early-KIC")
setwd('./early-KIC/')
sce.data <- Read10X("../raw_data/GSM3577883_early_KIC/")
sce.all <- CreateSeuratObject(counts = sce.data,
project = 'early-KIC',
min.cells = 5,
min.features = 300)
as.data.frame(sce.all@assays$RNA$counts[1:10, 1:2])
head(sce.all@meta.data, 10)
table(sce.all@meta.data$orig.ident)
###### step2:QC质控 ######
dir.create("./1-QC")
setwd("./1-QC")
# 如果过滤的太狠,就需要去修改这个过滤代码
source('../../scRNA_scripts/qc.R')
sce.all.filt = basic_qc(sce.all)
print(dim(sce.all))
print(dim(sce.all.filt))
setwd('../')
sp='mouse'
###### step3: 无需harmony整合多个单细胞样品 ######
###### 可用如下简单方法 ######
f = "obj.Rdata"
if(!file.exists(f)){
sce.all.filt = sce.all.filt %>%
NormalizeData() %>%
FindVariableFeatures() %>%
ScaleData(features = rownames(.)) %>%
RunPCA(features = VariableFeatures(.)) %>%
FindNeighbors(dims = 1:15) %>%
FindClusters(resolution = 0.6) %>%
RunTSNE(dims = 1:15)
save(sce.all.filt,file = f)
}
p1 <- DimPlot(sce.all.filt, reduction = "tsne",label = T) NoLegend();p1
###### 或标准流程 ######
sce.all.int <- sce.all.filt
table(Idents(sce.all.int))
table(sce.all.int$seurat_clusters)
table(sce.all.int$RNA_snn_res.0.6)
dir.create('check-by-0.6')
setwd('check-by-0.6')
sel.clust = "RNA_snn_res.0.6"
sce.all.int <- SetIdent(sce.all.int, value = sel.clust)
table(sce.all.int@active.ident)
source('../../scRNA_scripts/check-all-markers.R')
{
marker_paper <- c("Amy1", "Amy2a2", "Pyy", "Sst", "Ins1", "Ins2",
"Sox9", "Krt18", "Col1a1", "Col1a2", "Cd52", "Cd14",
"Cdh5", "Pecam1")
paper_marker <- DotPlot(sce.all.int , features = marker_paper )
coord_flip()
theme(axis.text.x=element_text(angle=45,hjust = 1))
p_tsne=DimPlot(sce.all.int, reduction = "tsne",raster = F,
label = T,repel = T)
paper_marker p_tsne
ggsave(paste('paper_marker_and_tsne.pdf'),width = 12,height = 10)
}
setwd('../')
getwd()
###### step5: 确定单细胞亚群生物学名字 ######
VlnPlot(sce.all.int, features = marker_paper ,pt.size = 0,ncol = 1)
ggsave("marker_paper_vio_0.6.pdf",width = 3.5,height = 20)
colnames(sce.all.int@meta.data)
table(sce.all.int$RNA_snn_res.0.6)
if(F){
sce.all.int
celltype=data.frame(ClusterID=0:13 ,
celltype= 0:13)
#定义细胞亚群
celltype[celltype$ClusterID %in% c( 8),2]='Endothelial cells'
celltype[celltype$ClusterID %in% c( 1,5,7 ),2]='Macrophages'
celltype[celltype$ClusterID %in% c( 13 ),2]='Fibroblasts-3'
celltype[celltype$ClusterID %in% c( 0,2,9),2]='Fibroblasts-2'
celltype[celltype$ClusterID %in% c( 6 ),2]='Fibroblasts-1'
celltype[celltype$ClusterID %in% c( 3 ),2]='Cancer cells'
celltype[celltype$ClusterID %in% c( 11),2]='Beta cells'
celltype[celltype$ClusterID %in% c( 10,12),2]='Islet cells'
celltype[celltype$ClusterID %in% c( 4),2]='Acinar cells'
head(celltype)
celltype
table(celltype$celltype)
sce.all.int@meta.data$celltype = "NA"
for(i in 1:nrow(celltype)){
sce.all.int@meta.data[which(sce.all.int@meta.data$RNA_snn_res.0.6 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
Idents(sce.all.int)=sce.all.int$celltype
}
DimPlot(sce.all.int, reduction = "tsne",raster = F,group.by = 'RNA_snn_res.0.6',
label = T,repel = T)
DimPlot(sce.all.int, reduction = "tsne",raster = F,group.by = 'celltype',
label = T,repel = T)
ggsave(paste('paper_anno_and_tsne.pdf'),width = 12,height = 10)
DimPlot(sce.all.int, reduction = "tsne",raster = F,group.by = 'celltype',
label = T,repel = T)
ggsave(paste('tsne.pdf'),width = 8,height = 6)
#marker可视化
VlnPlot(sce.all.int, features = marker_paper ,pt.size = 0,ncol = 1)
ggsave("marker_paper_vio_celltype.pdf",width = 3.5,height = 35)
#计算比例
df=data.frame(clu=names(table(sce.all.int$seurat_clusters)),
per=sprintf("%1.2f%%",100*table(sce.all.int$seurat_clusters)/length(sce.all.int$seurat_clusters)))
sce.all.int$per=df[match(sce.all.int$seurat_clusters,df$clu),2]
sce.all.int$new=paste0(sce.all.int$celltype,"(",sce.all.int$per,")")
table(sce.all.int$new)
DimPlot(sce.all.int,reduction='tsne',
group.by='new',
label.box=T,label=T,repel=T)
ggsave(paste('paper_anno_percentage.pdf'),width = 12,height = 10)
#可视化优化
PropPlot <- function(object, groupBy){
# (1)获取绘图数据
plot_data = object@meta.data %>%
dplyr::select(orig.ident, {{groupBy}}) %>%
dplyr::rename(group = as.name(groupBy))
# (2)绘图
figure = ggbarstats(data = plot_data,
x = group, y = orig.ident,
package = 'ggsci',
palette = 'category20c_d3',
results.subtitle = FALSE,
bf.message = FALSE,
proportion.test = FALSE,
label.args = list(size = 2,
fill = 'white',
alpha = 0.85,
family = 'sans',
fontface = 'bold'),
perc.k = 2,
title = '',
xlab = '',
legend.title = 'Seurat Cluster',
ggtheme = ggpubr::theme_pubclean())
theme(axis.ticks.x = element_blank(),
axis.ticks.y = element_line(color = 'black', lineend = 'round'),
legend.position = 'right',
axis.text.x = element_text(size = 15, color = 'black', family = 'Arial'),
axis.text.y = element_text(size = 15, color = 'black', family = 'Arial'),
legend.text = element_text(family = 'Arial', size = 10, color = 'black'),
legend.title = element_text(family = 'Arial', size = 13, color = 'black'))
# (3)去除柱子下面的样本量标识:
gginnards::delete_layers(x = figure, match_type = 'GeomText')
}
library(ggstatsplot)
library(gginnards)
# pdf('percentage.pdf',width = 5,height = 8)
# PropPlot(object = sce.all.int, groupBy = 'celltype')
# dev.off()
CairoPDF("percentage.pdf", width = 5, height = 8)
PropPlot(object = sce.all.int, groupBy = 'celltype')
dev.off()
# 如果前面成功的给各个细胞亚群命名了
# 就可以运行下面的代码
sce.all.int$celltype = as.character(sce.all.int$celltype)
if("celltype" %in% colnames(sce.all.int@meta.data ) ){
sel.clust = "celltype"
sce.all.int <- SetIdent(sce.all.int, value = sel.clust)
table(sce.all.int@active.ident)
dir.create('check-by-celltype')
setwd('check-by-celltype')
source('../../scRNA_scripts/check-all-markers.R')
setwd('../')
getwd()
phe=sce.all.int@meta.data
save(phe,file = 'phe.Rdata')
pdf('celltype-vs-orig.ident.pdf',width = 10)
gplots::balloonplot(table(sce.all.int$orig.ident,
sce.all.int$celltype))
dev.off()
}
###### FeaturePlot 基因可视化 ######
FeaturePlot(sce.all.int, features = c("Cdh1", "Epcam", "Gjb1",
"Cdh2", "Cd44", "Axl"),ncol = 3)
ggsave("FeaturePlot.pdf",width = 9,height = 4.5)
save(sce.all.int,file = "early-KIC.Rdata")
因为文章中给了mark基因,因此设置marker_paper <- c("Amy1",...),向量。做了同文章中一样的mark基因小提琴图,首先根据0.6的分辨率分的0-14簇分组进行绘图。
marker_paper_vio_0.6.pdf
对比文章中的图进行手动细胞注释
获得降维聚类分群的结果
paper_anno_and_tsne.pdf
接下来再定义和调用PropPlot函数,得到亚群的比例图。
左上为原文章细胞分群及比例。左下(tsne.pdf)和右图(percentage.pdf)为复现过程中的细胞分群与比例
Mark基因(右图为marker_paper_vio_celltype.pdf)对比
以上是early-KIC分组中Fig1的内容,Fig暂时只做了FeaturePlot.pdf部分
总体对比下
2.2 late-KIC
另外两个分组操作类似,这里不做赘述。