单细胞测序—PDA文章复现_单分组(Fig.1_Fig.2)

最近在学习复现Cellular heterogeneity during mouse pancreatic ductal adenocarcinoma progression at single-cell resolution，这篇文章的内容。这里记录下Fig1和Fig2的复现过程。

1 文章简介

文章的Fig1,Fig2只涉及GSE125588的前三个数据集，是基于0.6的resolution进行降维聚类，采用tsne降维方法。文章中并未提及细胞过滤等参数。因此我做出的结果会与文章中略微存在差异。

值得说明的是文章中并不是把三个数据集去除批次效应后整合在一起进行分析的，而是每个数据集单独进行分析。

GSM3577882	Normal Pancreas
GSM3577883	Early KIC
GSM3577884	Late KIC

1.1 Fig1总览

1.2 Fig2总览

2 文章复现

GEO上下载整理好数据之后，不需要去批次整合数据，而是每个数据集单独读取，进行独立的分析

2.1 early-KIC

以step1-run-basic-early-KIC.R为例

代码语言：r复制

rm(list=ls())
options(stringsAsFactors = F) 
source('scRNA_scripts/lib.R')

###### step1:导入数据 ######    
library(ReactomePA)
library(ggplot2)
dir.create("./early-KIC")
setwd('./early-KIC/')
sce.data <- Read10X("../raw_data/GSM3577883_early_KIC/")

sce.all <- CreateSeuratObject(counts = sce.data,
                              project = 'early-KIC',
                              min.cells = 5, 
                              min.features = 300)
as.data.frame(sce.all@assays$RNA$counts[1:10, 1:2])
head(sce.all@meta.data, 10)
table(sce.all@meta.data$orig.ident) 
 

###### step2:QC质控 ######
dir.create("./1-QC")
setwd("./1-QC")
# 如果过滤的太狠，就需要去修改这个过滤代码
source('../../scRNA_scripts/qc.R')
sce.all.filt = basic_qc(sce.all)
print(dim(sce.all))
print(dim(sce.all.filt))
setwd('../')


sp='mouse'

###### step3: 无需harmony整合多个单细胞样品 ######
###### 可用如下简单方法 ######
f = "obj.Rdata"
if(!file.exists(f)){
  sce.all.filt = sce.all.filt %>% 
    NormalizeData() %>%  
    FindVariableFeatures() %>%  
    ScaleData(features = rownames(.)) %>%  
    RunPCA(features = VariableFeatures(.))  %>%
    FindNeighbors(dims = 1:15) %>% 
    FindClusters(resolution = 0.6) %>% 
    RunTSNE(dims = 1:15)
  save(sce.all.filt,file = f)
}

p1 <- DimPlot(sce.all.filt, reduction = "tsne",label = T) NoLegend();p1

###### 或标准流程 ######
sce.all.int <- sce.all.filt

table(Idents(sce.all.int))
table(sce.all.int$seurat_clusters)
table(sce.all.int$RNA_snn_res.0.6) 


dir.create('check-by-0.6')
setwd('check-by-0.6')
sel.clust = "RNA_snn_res.0.6"
sce.all.int <- SetIdent(sce.all.int, value = sel.clust)
table(sce.all.int@active.ident) 
source('../../scRNA_scripts/check-all-markers.R')
{
  marker_paper <- c("Amy1", "Amy2a2", "Pyy", "Sst", "Ins1", "Ins2",
                    "Sox9", "Krt18", "Col1a1", "Col1a2", "Cd52", "Cd14", 
                    "Cdh5", "Pecam1")
  paper_marker <- DotPlot(sce.all.int , features = marker_paper )    
    coord_flip()   
    theme(axis.text.x=element_text(angle=45,hjust = 1))
  p_tsne=DimPlot(sce.all.int, reduction = "tsne",raster = F,
                 label = T,repel = T)

  paper_marker   p_tsne
  ggsave(paste('paper_marker_and_tsne.pdf'),width  = 12,height = 10)
  }
setwd('../') 
getwd()


###### step5: 确定单细胞亚群生物学名字 ######
 VlnPlot(sce.all.int, features = marker_paper ,pt.size = 0,ncol = 1)
 ggsave("marker_paper_vio_0.6.pdf",width = 3.5,height = 20)


colnames(sce.all.int@meta.data)
table(sce.all.int$RNA_snn_res.0.6)

if(F){
  sce.all.int
  celltype=data.frame(ClusterID=0:13 ,
                      celltype= 0:13) 
  #定义细胞亚群        
  celltype[celltype$ClusterID %in% c( 8),2]='Endothelial cells'   
  celltype[celltype$ClusterID %in% c( 1,5,7 ),2]='Macrophages' 
  celltype[celltype$ClusterID %in% c( 13 ),2]='Fibroblasts-3' 
  celltype[celltype$ClusterID %in% c( 0,2,9),2]='Fibroblasts-2'
  celltype[celltype$ClusterID %in% c( 6 ),2]='Fibroblasts-1'
  celltype[celltype$ClusterID %in% c( 3 ),2]='Cancer cells'
  celltype[celltype$ClusterID %in% c( 11),2]='Beta cells'
  celltype[celltype$ClusterID %in% c( 10,12),2]='Islet cells'
  celltype[celltype$ClusterID %in% c( 4),2]='Acinar cells'
  
  head(celltype)
  celltype
  table(celltype$celltype)
  sce.all.int@meta.data$celltype = "NA"
  
  for(i in 1:nrow(celltype)){
    sce.all.int@meta.data[which(sce.all.int@meta.data$RNA_snn_res.0.6 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
  Idents(sce.all.int)=sce.all.int$celltype
  
  
}

DimPlot(sce.all.int, reduction = "tsne",raster = F,group.by = 'RNA_snn_res.0.6',
        label = T,repel = T)  
  DimPlot(sce.all.int, reduction = "tsne",raster = F,group.by = 'celltype',
          label = T,repel = T) 
ggsave(paste('paper_anno_and_tsne.pdf'),width  = 12,height = 10)

DimPlot(sce.all.int, reduction = "tsne",raster = F,group.by = 'celltype',
        label = T,repel = T) 
ggsave(paste('tsne.pdf'),width  = 8,height = 6)
#marker可视化
VlnPlot(sce.all.int, features = marker_paper ,pt.size = 0,ncol = 1)
ggsave("marker_paper_vio_celltype.pdf",width = 3.5,height = 35)

#计算比例
df=data.frame(clu=names(table(sce.all.int$seurat_clusters)),
                per=sprintf("%1.2f%%",100*table(sce.all.int$seurat_clusters)/length(sce.all.int$seurat_clusters)))
sce.all.int$per=df[match(sce.all.int$seurat_clusters,df$clu),2]
sce.all.int$new=paste0(sce.all.int$celltype,"(",sce.all.int$per,")")
table(sce.all.int$new)
DimPlot(sce.all.int,reduction='tsne',
        group.by='new',
        label.box=T,label=T,repel=T)
ggsave(paste('paper_anno_percentage.pdf'),width  = 12,height = 10)

#可视化优化
PropPlot <- function(object, groupBy){
  # (1)获取绘图数据
  plot_data = object@meta.data %>% 
    dplyr::select(orig.ident, {{groupBy}}) %>% 
    dplyr::rename(group = as.name(groupBy))
  
  # (2)绘图
  figure = ggbarstats(data = plot_data, 
                      x = group, y = orig.ident,
                      package = 'ggsci',
                      palette = 'category20c_d3',
                      results.subtitle = FALSE,
                      bf.message = FALSE,
                      proportion.test = FALSE,
                      label.args = list(size = 2, 
                                        fill = 'white', 
                                        alpha = 0.85,
                                        family = 'sans',
                                        fontface = 'bold'),
                      perc.k = 2,
                      title = '',
                      xlab = '',
                      legend.title = 'Seurat Cluster',
                      ggtheme = ggpubr::theme_pubclean())  
    theme(axis.ticks.x = element_blank(),
          axis.ticks.y = element_line(color = 'black', lineend = 'round'),
          legend.position = 'right',
          axis.text.x = element_text(size = 15, color = 'black', family = 'Arial'),
          axis.text.y = element_text(size = 15, color = 'black', family = 'Arial'),
          legend.text = element_text(family = 'Arial', size = 10, color = 'black'),
          legend.title = element_text(family = 'Arial', size = 13, color = 'black')) 
  
  # (3)去除柱子下面的样本量标识：
  gginnards::delete_layers(x = figure, match_type = 'GeomText')
}
library(ggstatsplot)
library(gginnards)
# pdf('percentage.pdf',width = 5,height = 8)
# PropPlot(object  =  sce.all.int,  groupBy  =  'celltype')
# dev.off()
CairoPDF("percentage.pdf", width = 5, height = 8)
PropPlot(object = sce.all.int, groupBy = 'celltype')
dev.off()

# 如果前面成功的给各个细胞亚群命名了
# 就可以运行下面的代码
sce.all.int$celltype = as.character(sce.all.int$celltype)
if("celltype" %in% colnames(sce.all.int@meta.data ) ){
  
  sel.clust = "celltype"
  sce.all.int <- SetIdent(sce.all.int, value = sel.clust)
  table(sce.all.int@active.ident) 
  
  dir.create('check-by-celltype')
  setwd('check-by-celltype')
  source('../../scRNA_scripts/check-all-markers.R')
  setwd('../') 
  getwd()
  phe=sce.all.int@meta.data
  save(phe,file = 'phe.Rdata')
  pdf('celltype-vs-orig.ident.pdf',width = 10)
  gplots::balloonplot(table(sce.all.int$orig.ident,
                            sce.all.int$celltype))
  dev.off()
}

######  FeaturePlot 基因可视化 ######
FeaturePlot(sce.all.int, features = c("Cdh1", "Epcam", "Gjb1",
                               "Cdh2", "Cd44", "Axl"),ncol = 3)
ggsave("FeaturePlot.pdf",width = 9,height = 4.5)

save(sce.all.int,file = "early-KIC.Rdata")

因为文章中给了mark基因，因此设置marker_paper <- c("Amy1",...)，向量。做了同文章中一样的mark基因小提琴图，首先根据0.6的分辨率分的0-14簇分组进行绘图。

marker_paper_vio_0.6.pdf