单细胞代码解析-妇科癌症单细胞转录组及染色质可及性分析9

2022-08-30 17:37:38 浏览数 (3)

单细胞代码解析-妇科癌症单细胞转录组及染色质可及性分析1:https://cloud.tencent.com/developer/article/2055573

单细胞代码解析-妇科癌症单细胞转录组及染色质可及性分析2:https://cloud.tencent.com/developer/article/2072069

单细胞代码解析-妇科癌症单细胞转录组及染色质可及性分析3:https://cloud.tencent.com/developer/article/2078159

单细胞代码解析-妇科癌症单细胞转录组及染色质可及性分析4:https://cloud.tencent.com/developer/article/2078348

单细胞代码解析-妇科癌症单细胞转录组及染色质可及性分析5:https://cloud.tencent.com/developer/article/2084580

单细胞代码解析-妇科癌症单细胞转录组及染色质可及性分析6:https://cloud.tencent.com/developer/article/2085385

单细胞代码解析-妇科癌症单细胞转录组及染色质可及性分析7:https://cloud.tencent.com/developer/article/2085705

单细胞代码解析-妇科癌症单细胞转录组及染色质可及性分析8:https://cloud.tencent.com/developer/article/2086805

这节是11个样本来源的数据整合

代码解析

代码语言:javascript复制
#####################################################################
# Change to your library path
.libPaths('/home/regnerm/anaconda3/envs/scENDO_scOVAR/lib/R/library')
source("./rowr.R")
source("./stacked_violin.R")
#####################################################################
##R包的加载
library(scater)
library(dplyr)
library(Seurat)
library(patchwork)
library(SingleCellExperiment)
library(scater)
library(ComplexHeatmap)
library(ConsensusClusterPlus)
library(msigdbr)
library(fgsea)
library(dplyr)
library(tibble)
library(DoubletFinder)
library(Signac)
library(ggplot2)
library(stringr)
library(SingleR)
library(psych)
# Define filepaths/variables:
################################################################################
# PanglaoDB 
tsv=gzfile("./PanglaoDB_markers_27_Mar_2020.tsv.gz")  
panglaodb <- read.csv(tsv,header=T,sep = "t") 
##数据过滤
panglaodb <- dplyr::filter(panglaodb,species == "Hs" | species == "Mm Hs")# Human subset 
panglaodb <- dplyr::filter(panglaodb,organ == "Connective tissue" |
                             organ == "Epithelium" |
                             organ == "Immune system" |
                             organ == "Reproductive"|
                             organ == "Vasculature" |
                             organ == "Smooth muscle"
)

# split()函数是对字符串进行分割成列表
panglaodb <- split(as.character(panglaodb$official.gene.symbol), panglaodb$cell.type)
ESTIMATE.signatures <- "./ESTIMATE_signatures.csv"
SAMPLE.ID = "endo_ovar_All"

# Change to your working directory
output.dir <- "."
##################################################
setwd(output.dir)
##################################################

###########################################################
# Part 1: Merge individually processed scRNA-seq datasets
###########################################################
# list processed scRNA-seq Seurat objects made in the previous R scripts
datasets <- list.files(pattern = "*_processed.rds")
endo_3533EL <- readRDS(datasets[1])
endo_3533EL$SingleR <- endo_3533EL$SingleR.endo
endo_3533EL$Sample <- "3533EL"

endo_3571DL <- readRDS(datasets[2])
endo_3571DL$SingleR <- endo_3571DL$SingleR.endo
endo_3571DL$Sample <- "3571DL"

endo_36186L <- readRDS(datasets[3])
endo_36186L$SingleR <- endo_36186L$SingleR.endo
endo_36186L$Sample <- "36186L"

endo_36639L <- readRDS(datasets[4])
endo_36639L$SingleR <- endo_36639L$SingleR.endo
endo_36639L$Sample <- "36639L"

endo_366C5L <- readRDS(datasets[5])
endo_366C5L$SingleR <- endo_366C5L$SingleR.endo
endo_366C5L$Sample <- "366C5L"

endo_ovar_37EACL <- readRDS(datasets[6])
endo_ovar_37EACL$SingleR <- endo_ovar_37EACL$SingleR.ovar
endo_ovar_37EACL$Sample <- "37EACL"

ovar_38FE7L <- readRDS(datasets[7])
ovar_38FE7L$SingleR <- ovar_38FE7L$SingleR.ovar
ovar_38FE7L$Sample <- "38FE7L"

ovar_3BAE2L <- readRDS(datasets[8])
ovar_3BAE2L$SingleR <- ovar_3BAE2L$SingleR.ovar
ovar_3BAE2L$Sample <- "3BAE2L"

ovar_3CCF1L <- readRDS(datasets[9])
ovar_3CCF1L$SingleR <- ovar_3CCF1L$SingleR.ovar
ovar_3CCF1L$Sample <- "3CCF1L"

ovar_3E4D1L <- readRDS(datasets[10])
ovar_3E4D1L$SingleR <- ovar_3E4D1L$SingleR.ovar
ovar_3E4D1L$Sample <- "3E4D1L"

ovar_3E5CFL <- readRDS(datasets[11])
ovar_3E5CFL$SingleR <- ovar_3E5CFL$SingleR.ovar
ovar_3E5CFL$Sample <- "3E5CFL"

# Merge Seurat objects
## merge () 函数用于将 2 个有序序列合并为 1 个有序序列
rna <- merge(x = endo_3533EL,
             y = list(endo_3571DL,
                      endo_36186L,
                      endo_36639L,
                      endo_366C5L,
                      endo_ovar_37EACL,
                      ovar_38FE7L,
                      ovar_3BAE2L,
                      ovar_3CCF1L,
                      ovar_3E4D1L,
                      ovar_3E5CFL))

rna <- NormalizeData(rna, normalization.method = "LogNormalize", scale.factor = 10000)
rna <- FindVariableFeatures(rna, selection.method = "vst", nfeatures = 2000)
all.genes <- rownames(rna)
rna <- ScaleData(rna,features = all.genes)
rna <- RunPCA(rna)

# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
# PercentageFeatureSet:计算属于每个单元格的可能特征的一个集合的所有计数的百分比
rna[["percent.mt"]] <- PercentageFeatureSet(rna, pattern = "^MT-")

# Score cells for immune/stromal/fibroblast/endothelial signatures
############################################################################################
immune.stromal <- read.csv(ESTIMATE.signatures,header = F)

stromal <- immune.stromal$V1[1:141]
immune <- immune.stromal$V1[142:282]
fibroblast <- panglaodb$Fibroblasts
endothelial <- panglaodb$`Endothelial cells`
epithelial <- panglaodb$`Epithelial cells`
smooth <- panglaodb$`Smooth muscle cells`
plasma <- panglaodb$`Plasma cells`
feats <- list(stromal,immune,fibroblast,endothelial,epithelial,smooth,plasma)

##Seurat的打分函数AddMouduleScore
rna <- AddModuleScore(rna,features = feats,name = c("stromal.","immune.","fibroblast.","endothelial.",
                                                    "epithelial.","smooth.","plasma."),search = T)
#########################################################################

#######################################################################

# Add PC1 to metadata
rna@meta.data$PC1 <- rna@reductions$pca@cell.embeddings[,1]
count_cor_PC1 <- cor(rna$PC1,rna$nCount_RNA,method = "spearman")

# cor()只给出相关系数一个值
stromal.cor <- cor(rna$PC1,rna$stromal.1,method = "spearman")
immune.cor <- cor(rna$PC1,rna$immune.2,method = "spearman")
fibroblast.cor <- cor(rna$PC1,rna$fibroblast.3,method = "spearman")
endothelial.cor <- cor(rna$PC1,rna$endothelial.4,method = "spearman")
epithelial.cor <- cor(rna$PC1,rna$epithelial.5,method = "spearman")
smooth.cor <- cor(rna$PC1,rna$smooth.6,method = "spearman")
plasma.cor <- cor(rna$PC1,rna$plasma.7,method = "spearman")

# rna <- JackStraw(rna, num.replicate = 100,dims = 50)
# rna <- ScoreJackStraw(rna,dims = 1:50)
# JackStrawPlot(rna, dims = 1:50) ggsave("JackStraw.png")
# 

###########################################################
# Part 2: Reprocessing and CNV-read depth check
###########################################################

# If PC1 is correalted with read depth, check to see if biological variation is corralted to PC1
# ROUND函数是将某个数字四舍五入到指定的位数
if (round(abs(count_cor_PC1),2) > 0.5){
  
  if( round(abs(stromal.cor),2) >= 0.5 |
      round(abs(immune.cor),2) >= 0.5 |
      round(abs(fibroblast.cor),2) >= 0.5 |
      round(abs(endothelial.cor),2) >= 0.5 |
      round(abs(epithelial.cor),2) >= 0.5 |
      round(abs(smooth.cor),2) >= 0.5 |
      round(abs(plasma.cor),2) >= 0.5){
    rna <- FindNeighbors(rna,dims = 1:50)
    rna <- FindClusters(rna,resolution = 0.7)
    rna <- RunUMAP(rna,dims = 1:50)
    Idents(rna) <- "RNA_snn_res.0.7"
    
    rna$PC1.loading <- rna@reductions$pca@cell.embeddings[,1]
    rna$cell.barcode <- rownames(rna@meta.data)
    rna$CNV.Pos <- ifelse(rna$Total_CNVs > 0,TRUE,FALSE)
    
    boxplot.cnv <- ggplot(rna@meta.data,aes(x= RNA_snn_res.0.7,y=PC1.loading,color = as.factor(CNV.Pos))) geom_boxplot()
    boxplot.cnv ggsave("CNV_PC1_boxplot.png")
    
    data <- describeBy(boxplot.cnv$data$PC1.loading, boxplot.cnv$data$RNA_snn_res.0.7, mat = TRUE)
    
    wilcox <- wilcox.test(data = rna@meta.data,PC1.loading~CNV.Pos)
    
    if (wilcox$p.value < 0.05){
      rna <- rna
      saveRDS(rna,"./rna_PassedPC1Checks.rds")
    }else{
      all.genes <- rownames(rna)
      rna <- ScaleData(rna, features = all.genes,vars.to.regress = "nCount_RNA")
      rna <- RunPCA(rna)
      rna <- FindNeighbors(rna,dims = 1:50)
      rna <- FindClusters(rna,resolution = 0.7)
      rna <- RunUMAP(rna,dims = 1:50)
      Idents(rna) <- "RNA_snn_res.0.7"
      saveRDS(rna,"./rna_FailedCNVTest.rds")
    }
    
  }else{
    all.genes <- rownames(rna)
    rna <- ScaleData(rna, features = all.genes,vars.to.regress = "nCount_RNA")
    rna <- RunPCA(rna)
    rna <- FindNeighbors(rna,dims = 1:50)
    rna <- FindClusters(rna,resolution = 0.7)
    rna <- RunUMAP(rna,dims = 1:50)
    Idents(rna) <- "RNA_snn_res.0.7"
    saveRDS(rna,"./rna_FailedCorTest.rds")
  }
}else{
  rna <- FindNeighbors(rna,dims = 1:50)
  rna <- FindClusters(rna,resolution = 0.7)
  rna <- RunUMAP(rna,dims = 1:50)
  Idents(rna) <- "RNA_snn_res.0.7"
  saveRDS(rna,"./rna_SkipChecks.rds")
}

###########################################################
# Part 3: Cell type annotation of clusters 
###########################################################
Idents(rna) <- "RNA_snn_res.0.7"
# Visualize clusters and SingleR annotations 
##这个是比较通用的可视化的代码
DimPlot(rna,group.by = "Sample",label = T) ggsave("Patient_UMAP.pdf",width = 6,height = 4)
DimPlot(rna,group.by = "RNA_snn_res.0.7",label = T) ggsave("Cluster_UMAP.pdf",width = 6,height = 4)
DimPlot(rna,group.by = "SingleR",label = T) ggsave("SingleR_UMAP.pdf",width = 6,height = 4)
DimPlot(rna,group.by = "SingleR.HPCA",label = T) ggsave("HPCA_UMAP.pdf",width = 8,height = 4)
DimPlot(rna,group.by = "SingleR.BED",label = T) ggsave("BED_UMAP.pdf",width = 8,height = 4)

# Verify SingleR annotations and check for mast cells:
rna <- AddModuleScore(rna,features = list(panglaodb$`B cells`,
                                          panglaodb$`Plasma cells`,
                                          panglaodb$`Mast cells`,
                                          panglaodb$Macrophages,
                                          panglaodb$`Dendritic cells`,
                                          panglaodb$`T cells`,
                                          panglaodb$`NK cells`,
                                          panglaodb$`Endothelial cells`,
                                          panglaodb$Fibroblasts,
                                          panglaodb$`Epithelial cells`,
                                          panglaodb$`Smooth muscle cells`,
                                          c("TPSB2","TPSAB1","KIT")),#Three gene Mast signature
                      name = c("B.","Plasma.","Mast.","Macrophage.","DC.",
                               "T.","NK.","Endothelial.","Fibroblast.","Epithelial.","Smooth_muscle.","Mast_3_gene."),search = T)

# Visualize gene signatures in violin plots: 
##########################################################################################################
##堆叠小提琴图(StackedVlnPlot)
StackedVlnPlot(rna,features = c("B.1","Plasma.2","Mast.3","Macrophage.4",
                                "DC.5","T.6","NK.7","Endothelial.8","Fibroblast.9",
                                "Epithelial.10","Smooth_muscle.11")) ggsave("Panglaodb_Signatures_Violin.pdf",width = 8,height = 16)


StackedVlnPlot(rna,features = c("TPSB2","TPSAB1","KIT")) ggsave("Mast_Signatures_Violin.pdf",width = 8,height = 4)

######################################################################################################
# Assess Mast cell enrichment to potentially rename clusters
vln.df <- VlnPlot(rna,features = "Mast_3_gene.12")
library(psych)
data.mast <- describeBy(vln.df$data$Mast_3_gene.12, vln.df$data$ident, mat = TRUE)
data.mast <- dplyr::filter(data.mast,median > 0.225)

# Assess B cell enrichment to potentially rename clusters
vln.df <- VlnPlot(rna,features = "B.1")
library(psych)
data.B <- describeBy(vln.df$data$B.1, vln.df$data$ident, mat = TRUE)
data.B <- dplyr::filter(data.B,median > 0.225)

# Annotate mast/b cells
rna$mast.cell <- ifelse(rna$RNA_snn_res.0.7 %in% as.character(data.mast$group1),TRUE,FALSE)
rna$B.cell <- ifelse(rna$RNA_snn_res.0.7 %in% as.character(data.B$group1),TRUE,FALSE)

# Append SingleR annotations to cluster labels:
# The most common SingleR label in each cluster becomes the cluster label 

cells <- rna@meta.data %>% dplyr::group_by(RNA_snn_res.0.7) %>% dplyr::count(SingleR) 

## rep函数可以让不同的元素重复多次
cluster.ids <- rep("fill",length(levels(Idents(rna))))
names(cluster.ids) <- levels(Idents(rna))

for ( i in factor(cells$RNA_snn_res.0.7)){
  library(tidyr)
  cells.sub <- cells %>% dplyr::filter(RNA_snn_res.0.7 ==i) %>% arrange(desc(n))
  cluster.ids[[i]] <- cells.sub$SingleR[1]
  
}

# Rename cluster if median enrichment score is greater than 0.1  
## nrow ()函数用于返回指定矩阵的行数
if(nrow(data.mast) > 0){
  for (i in 1:nrow(data.mast)){
    cluster.ids[[data.mast$group1[i]]] <- "Mast cell"#Marker Mast cell cluster 
  }
}else{
  cluster.ids <- cluster.ids
}

# Rename cluster if median enrichment score is greater than 0.1  
if (nrow(data.B) >0 ){
  for (i in 1:nrow(data.B)){
    cluster.ids[[data.B$group1[i]]] <- "B cell"#Marker B cell cluster 
  }
}else{
  cluster.ids <- cluster.ids
}


cluster.ids <- as.data.frame(cluster.ids)

levels(Idents(rna)) <- cluster.ids$cluster.ids

rna$cell.type <- Idents(rna)

rna$cell.type <- paste0(rna$RNA_snn_res.0.7,"-",rna$cell.type)

Idents(rna) <- rna$cell.type

###############################################################################
DimPlot(rna,group.by = "cell.type",label = F) ggsave("cell_type_UMAP.pdf",width = 10,height = 4)

# Perform DEGs analysis with cell type annotated clusters 
# # Wilcox
Idents(rna) <- "cell.type"
Wilcox.markers <- FindAllMarkers(object =rna, min.pct = 0.25,only.pos = T,
                                 test.use = "wilcox")
saveRDS(Wilcox.markers,"./wilcox_DEGs.rds")


# Save Seurat object 
#date <- Sys.Date()
saveRDS(rna,paste0(SAMPLE.ID,"_scRNA_processed.rds"))

# ###########################################################################################################
# # Starting cells, PostQC cells, doublets, Post doublet/QC cells, Cluster #
# output.meta <- data.frame(TotalCells=length(colnames(rna)),
#                           NumClusters= length(levels(as.factor(Idents(rna)))),
#                           stringsAsFactors=FALSE)
# output <- as.data.frame(t(output.meta))
# colnames(output) <- SAMPLE.ID
# xlsx::write.xlsx(output, "scRNA_pipeline_summary.xlsx",
#                  row.names = T, col.names = TRUE)

##################################################################################
# END OF SCRIPT
###################################################################################

0 人点赞