单细胞代码解析-妇科癌症单细胞转录组及染色质可及性分析4

2022-08-21 13:03:37 浏览数 (1)

单细胞代码解析-妇科癌症单细胞转录组及染色质可及性分析1:https://cloud.tencent.com/developer/article/2055573

单细胞代码解析-妇科癌症单细胞转录组及染色质可及性分析2:https://cloud.tencent.com/developer/article/2072069

单细胞代码解析-妇科癌症单细胞转录组及染色质可及性分析3:https://cloud.tencent.com/developer/article/2078159

图片.png图片.png
代码语言:javascript复制
###########################################################
# Part 3: scRNA-seq processing after doublet removal
###########################################################
setwd(dir)

# Load the RNA dataset
counts.init <- Read10X_h5(filename = "./filtered_feature_bc_matrix.h5")
# Initialize the Seurat object with the raw (non-normalized data).
rna <- CreateSeuratObject(counts = counts.init, min.cells = 3)# genes not present in at least 3 cells are removed
rna

rna <- rna[,cells]
#这部分放入了以前鉴定得到的双细胞的结果;%in%:意思都是x的值是否在y里面
rna <- rna[,!(colnames(rna) %in% doublets)]
dim(rna)


#Normalize;标准换
rna <- NormalizeData(rna, normalization.method = "LogNormalize", scale.factor = 10000)

#Feature Selection;寻找高变基因
rna <- FindVariableFeatures(rna, selection.method = "vst", nfeatures = 2000)

#Scaling
all.genes <- rownames(rna)
rna <- ScaleData(rna, features = all.genes)
rna <- RunPCA(rna)


feats <- list(stromal,immune,fibroblast,endothelial,epithelial,smooth,plasma)
#利用seurat包的一个打分函数AddModuleScore,依据基因的平均表达水平进行分析
rna <- AddModuleScore(rna,features = feats,name = c("stromal.","immune.","fibroblast.","endothelial.",
                                                    "epithelial.","smooth.","plasma."),search = T)


# Add PC1 to metadata
##对前面提到的细胞类群进行相关性分析
rna@meta.data$PC1 <- rna@reductions$pca@cell.embeddings[,1]

count_cor_PC1 <- cor(rna$PC1,rna$nCount_RNA,method = "spearman")

stromal.cor <- cor(rna$PC1,rna$stromal.1,method = "spearman")
immune.cor <- cor(rna$PC1,rna$immune.2,method = "spearman")
fibroblast.cor <- cor(rna$PC1,rna$fibroblast.3,method = "spearman")
endothelial.cor <- cor(rna$PC1,rna$endothelial.4,method = "spearman")
epithelial.cor <- cor(rna$PC1,rna$epithelial.5,method = "spearman")
smooth.cor <- cor(rna$PC1,rna$smooth.6,method = "spearman")
plasma.cor <- cor(rna$PC1,rna$plasma.7,method = "spearman")


# Make JackStraw Plot
rna <- JackStraw(rna, num.replicate = 100,dims = 50)
rna <- ScoreJackStraw(rna,dims = 1:50)
JackStrawPlot(rna, dims = 1:50) ggsave("JackStraw_postdoublet.png")
##下面是循环分析
# If PC1 is correalted with read depth, check to see if biological variation is corralted to PC1
#首先确定亚群分析的resolution确定,将resolution确定为0.7
#round:它四舍五入到给定的位数,如果没有提供四舍五入的位数,它会将数字四舍五入到最接近的整数
if (round(abs(count_cor_PC1),2) > 0.5){
  
  if( round(abs(stromal.cor),2) >= 0.5 |
      round(abs(immune.cor),2) >= 0.5 |
      round(abs(fibroblast.cor),2) >= 0.5 |
      round(abs(endothelial.cor),2) >= 0.5 |
      round(abs(epithelial.cor),2) >= 0.5 |
      round(abs(smooth.cor),2) >= 0.5 |
      round(abs(plasma.cor),2) >= 0.5){
    
    rna <- FindNeighbors(rna,dims = 1:50)
    rna <- FindClusters(rna,resolution = 0.7)
    rna <- RunUMAP(rna,dims = 1:50)
    Idents(rna) <- "RNA_snn_res.0.7"
    
    
    # Verify with inferCNV: is PC1 correlated with CNV events/Malignancy?
    #########################################################################
    # inferCNV: does PC1 also correlated with CNV/malignancy status?
    #InferCNV是一个由broad研究所开发的,利用单细胞转录组数据分析肿瘤细胞拷贝数变异(CNV)的工具。基本思路是在整个基因组范围内,通过计算肿瘤细胞关于参考的“正常”细胞的相对表达,利用滑窗思想对邻近的基因相对表达,计算拷贝数。
    library(infercnv)
    library(stringr)
    library(Seurat)
    counts_matrix = GetAssayData(rna, slot="counts")
    
    # Identify immune clusters
    #######################################################
    # Find immune cells by relative enrichment of ESTIMATE immune signature
    #描述性统计 —— R 'psych' 表型数据的描述性统计,是对表型数据进行的基础分析,包括最大值、最小是、均值、方差、极差等
    library(psych)
    test <- VlnPlot(rna,features = "immune.2")
    #describeBy:分组计算
    data <- describeBy(test$data$immune.2, test$data$ident, mat = TRUE)
    data.immune <- dplyr::filter(data,median > 0.1)
    
    test <- VlnPlot(rna,features = "plasma.7")
    data <- describeBy(test$data$plasma.7, test$data$ident, mat = TRUE)
    data.plasma <- dplyr::filter(data,median > 0.1)
    
    immune.clusters <- intersect(data.immune$group1,levels(Idents(rna)))
    plasma.clusters <- intersect(data.plasma$group1,levels(Idents(rna)))
    
    immune.clusters <- unique(append(immune.clusters,plasma.clusters))
    
    for (i in 1:length(immune.clusters)){
      j <- which(levels(Idents(rna)) == immune.clusters[i])
      levels(Idents(rna))[j] <- paste0("immune.",immune.clusters[i])
    }
    rna@meta.data$postdoublet.idents <- Idents(rna)
    idents <- data.frame(rownames(rna@meta.data),rna@meta.data$postdoublet.idents)
    
    
    colnames(idents) <- c("V1","V2")
    saveRDS(rna,"./rna_postdoublet_preinferCNV.rds")
    # Make inferCNV inputs
    
    rownames(idents) <- NULL
    colnames(idents) <- NULL
    write.table(idents,"./sample_annotation_file_inferCNV.txt",sep = "t",row.names = FALSE)
    #R语言使用read.delim函数读取带分隔符的文本文件 
    ##构建注释文件
    idents <- read.delim("./sample_annotation_file_inferCNV.txt",header = F)
    gtf <- read.delim(GRCH38.annotations,header = F)
    library(EnsDb.Hsapiens.v86)
    convert.symbol = function(Data){
      ensembls <- Data$V1
      ensembls <- gsub("\.[0-9]*$", "", ensembls)
      geneIDs1 <- ensembldb::select(EnsDb.Hsapiens.v86, keys= ensembls, keytype = "GENEID", columns = "SYMBOL")
      #cbind.fill:可填充缺失值并适用于任意数据类型
      Data <- cbind.fill(Data, geneIDs1, fill = NA)
      #去除NA
      Data <- na.omit(Data)
      Data$feature <- Data$SYMBOL
      Data.new <- data.frame(Data$SYMBOL,Data$V2,Data$V3,Data$V4)
      Data.new$Data.V2 <- paste("chr",Data.new$Data.V2,sep = "")
      Data.new$Data.SYMBOL <- make.unique(Data.new$Data.SYMBOL)
      return(Data.new)
    }
    ##convert.symbol:基因id的转换
    gtf <- convert.symbol(gtf)
    head(gtf)
    
    write.table(gtf,"./Homo_sapiens.GRCh38.86.symbol.txt",sep = "t",row.names = FALSE,col.names = FALSE)
    
    #R 语言中的 length () 函数用于获取或设置向量 (列表)或其他对象的长度。
    #CreateInfercnvObject () 函数构建infercnv对象,此处必须设置gene_order_file参数,其输入是一个基因的染色体位置信息文件,以制表符分隔
    num.immune.clusters = length(immune.clusters)
    # create the infercnv object
    if ( num.immune.clusters == 1) {
      infercnv_obj = CreateInfercnvObject(raw_counts_matrix=as.matrix(counts_matrix),
                                          annotations_file="./sample_annotation_file_inferCNV.txt",
                                          gene_order_file="./Homo_sapiens.GRCh38.86.symbol.txt",
                                          ref_group_names=paste0("immune.",immune.clusters[1]))
      
    } else if (num.immune.clusters == 2) {
      infercnv_obj = CreateInfercnvObject(raw_counts_matrix=as.matrix(counts_matrix),
                                          annotations_file="./sample_annotation_file_inferCNV.txt",
                                          gene_order_file="./Homo_sapiens.GRCh38.86.symbol.txt",
                                          ref_group_names=c(paste0("immune.",immune.clusters[1]),
                                                            paste0("immune.",immune.clusters[2])))
      
    } else if ( num.immune.clusters == 3 ) {
      infercnv_obj = CreateInfercnvObject(raw_counts_matrix=as.matrix(counts_matrix),
                                          annotations_file="./sample_annotation_file_inferCNV.txt",
                                          gene_order_file="./Homo_sapiens.GRCh38.86.symbol.txt",
                                          ref_group_names=c(paste0("immune.",immune.clusters[1]),
                                                            paste0("immune.",immune.clusters[2]),
                                                            paste0("immune.",immune.clusters[3])))
    } else if (num.immune.clusters == 4) {
      infercnv_obj = CreateInfercnvObject(raw_counts_matrix=as.matrix(counts_matrix),
                                          annotations_file="./sample_annotation_file_inferCNV.txt",
                                          gene_order_file="./Homo_sapiens.GRCh38.86.symbol.txt",
                                          ref_group_names=c(paste0("immune.",immune.clusters[1]),
                                                            paste0("immune.",immune.clusters[2]),
                                                            paste0("immune.",immune.clusters[3]),
                                                            paste0("immune.",immune.clusters[4])))
    } else if (num.immune.clusters == 5) {
      infercnv_obj = CreateInfercnvObject(raw_counts_matrix=as.matrix(counts_matrix),
                                          annotations_file="./sample_annotation_file_inferCNV.txt",
                                          gene_order_file="./Homo_sapiens.GRCh38.86.symbol.txt",
                                          ref_group_names=c(paste0("immune.",immune.clusters[1]),
                                                            paste0("immune.",immune.clusters[2]),
                                                            paste0("immune.",immune.clusters[3]),
                                                            paste0("immune.",immune.clusters[4]),
                                                            paste0("immune.",immune.clusters[5])))
    } else if (num.immune.clusters == 6) {
      infercnv_obj = CreateInfercnvObject(raw_counts_matrix=as.matrix(counts_matrix),
                                          annotations_file="./sample_annotation_file_inferCNV.txt",
                                          gene_order_file="./Homo_sapiens.GRCh38.86.symbol.txt",
                                          ref_group_names=c(paste0("immune.",immune.clusters[1]),
                                                            paste0("immune.",immune.clusters[2]),
                                                            paste0("immune.",immune.clusters[3]),
                                                            paste0("immune.",immune.clusters[4]),
                                                            paste0("immune.",immune.clusters[5]),
                                                            paste0("immune.",immune.clusters[6])))
    }else if (num.immune.clusters == 7) {
      infercnv_obj = CreateInfercnvObject(raw_counts_matrix=as.matrix(counts_matrix),
                                          annotations_file="./sample_annotation_file_inferCNV.txt",
                                          gene_order_file="./Homo_sapiens.GRCh38.86.symbol.txt",
                                          ref_group_names=c(paste0("immune.",immune.clusters[1]),
                                                            paste0("immune.",immune.clusters[2]),
                                                            paste0("immune.",immune.clusters[3]),
                                                            paste0("immune.",immune.clusters[4]),
                                                            paste0("immune.",immune.clusters[5]),
                                                            paste0("immune.",immune.clusters[6]),
                                                            paste0("immune.",immune.clusters[7])))
    }else if (num.immune.clusters == 8) {
      infercnv_obj = CreateInfercnvObject(raw_counts_matrix=as.matrix(counts_matrix),
                                          annotations_file="./sample_annotation_file_inferCNV.txt",
                                          gene_order_file="./Homo_sapiens.GRCh38.86.symbol.txt",
                                          ref_group_names=c(paste0("immune.",immune.clusters[1]),
                                                            paste0("immune.",immune.clusters[2]),
                                                            paste0("immune.",immune.clusters[3]),
                                                            paste0("immune.",immune.clusters[4]),
                                                            paste0("immune.",immune.clusters[5]),
                                                            paste0("immune.",immune.clusters[6]),
                                                            paste0("immune.",immune.clusters[7]),
                                                            paste0("immune.",immune.clusters[8])))
    }else if (num.immune.clusters == 9) {
      infercnv_obj = CreateInfercnvObject(raw_counts_matrix=as.matrix(counts_matrix),
                                          annotations_file="./sample_annotation_file_inferCNV.txt",
                                          gene_order_file="./Homo_sapiens.GRCh38.86.symbol.txt",
                                          ref_group_names=c(paste0("immune.",immune.clusters[1]),
                                                            paste0("immune.",immune.clusters[2]),
                                                            paste0("immune.",immune.clusters[3]),
                                                            paste0("immune.",immune.clusters[4]),
                                                            paste0("immune.",immune.clusters[5]),
                                                            paste0("immune.",immune.clusters[6]),
                                                            paste0("immune.",immune.clusters[7]),
                                                            paste0("immune.",immune.clusters[8]),
                                                            paste0("immune.",immune.clusters[9])))
    }else if (num.immune.clusters == 10) {
      infercnv_obj = CreateInfercnvObject(raw_counts_matrix=as.matrix(counts_matrix),
                                          annotations_file="./sample_annotation_file_inferCNV.txt",
                                          gene_order_file="./Homo_sapiens.GRCh38.86.symbol.txt",
                                          ref_group_names=c(paste0("immune.",immune.clusters[1]),
                                                            paste0("immune.",immune.clusters[2]),
                                                            paste0("immune.",immune.clusters[3]),
                                                            paste0("immune.",immune.clusters[4]),
                                                            paste0("immune.",immune.clusters[5]),
                                                            paste0("immune.",immune.clusters[6]),
                                                            paste0("immune.",immune.clusters[7]),
                                                            paste0("immune.",immune.clusters[8]),
                                                            paste0("immune.",immune.clusters[9]),
                                                            paste0("immune.",immune.clusters[10])))
    }
    #根据cluster的数量,进行else if的处理,去输出判断拷贝数的结果
    # perform infercnv operations to reveal cnv signal
    #infercnv::run():两步法进行拷贝数分析,不需要一步一步进行尝试,参考来源:[https://www.jianshu.com/p/70f7a168fe62](https://www.jianshu.com/p/70f7a168fe62)
    infercnv_obj = infercnv::run(infercnv_obj,
                                 cutoff=0.1,  # use 1 for smart-seq, 0.1 for 10x-genomics
                                 out_dir="./output_dir_CNV_postdoublet_PassedPC1Checks",  # dir is auto-created for storing outputs
                                 cluster_by_groups=T,   # cluster
                                 denoise=T,scale_data = T,
                                 HMM=T,HMM_type = "i6",analysis_mode = "samples",min_cells_per_gene = 10,
                                 BayesMaxPNormal = 0.4, num_threads = 8
                                 
    )
    
  
    regions <- read.delim("./output_dir_CNV_postdoublet_PassedPC1Checks/HMM_CNV_predictions.HMMi6.hmm_mode-samples.Pnorm_0.4.pred_cnv_regions.dat")
    probs <- read.delim("./output_dir_CNV_postdoublet_PassedPC1Checks/BayesNetOutput.HMMi6.hmm_mode-samples/CNV_State_Probabilities.dat")
    #R 编程语言中的 as.data.frame() 函数用于将对象转换为数据框
    probs <- as.data.frame(t(probs[3,]))
    colnames(probs) <- "Prob.Normal"
    probs <- dplyr::filter(probs,Prob.Normal < 0.05)
    cnvs <- rownames(probs)
    #gsub ()可以用于字段的删减、增补、替换和切割,可以处理一个字段也可以处理由字段组成的向量
    cnvs <- gsub("\.","-",cnvs)
    
    regions <- regions[regions$cnv_name %in% cnvs, ]
    
    #sub R语言中的函数用于替换字符串中模式的第一个匹配项
    cnv.groups <- sub("\..*", "", regions$cell_group_name)
      
    length(which(rownames(rna@reductions$pca@cell.embeddings) == rownames(rna@meta.data)))
    rna$PC1.loading <- rna@reductions$pca@cell.embeddings[,1]
    rna$cell.barcode <- rownames(rna@meta.data)
    #ifelse()中的条件判断中可以得到多个逻辑结果,有多少个逻辑结果,ifelse()的返回值就有多少个元素
    rna$CNV.Pos <- ifelse(as.character(rna$postdoublet.idents) %in% cnv.groups,1,0)
    
    cnv.freq <- data.frame(table(regions$cell_group_name))
    cnv.freq$Var1 <- sub("\..*", "", cnv.freq$Var1)
    
    rna$Total_CNVs <- ifelse(as.character(rna$postdoublet.idents) %in% cnv.freq$Var1,cnv.freq$Freq,0)
    
    boxplot.cnv <- ggplot(rna@meta.data,aes(x= postdoublet.idents,y=PC1.loading,color = as.factor(CNV.Pos))) geom_boxplot()
    boxplot.cnv ggsave("postdoublet_CNV_PC1_boxplot.png")
    #describeBy:分组计算
    data <- describeBy(boxplot.cnv$data$PC1.loading, boxplot.cnv$data$postdoublet.idents, mat = TRUE)
    data$CNV <- ifelse(data$group1 %in% cnv.groups,1,0)
    
    wilcox <- wilcox.test(data = rna@meta.data,PC1.loading~CNV.Pos)
    ##根据pvalue进行判断不同的数据属性
    if (wilcox$p.value < 0.05){
      rna <- rna
      library(stringr)
      #str_remove:删除字符串中的匹配模式。
      levels(Idents(rna)) <- str_remove(levels(Idents(rna)),"immune.")
      saveRDS(rna,"./rna_postdoublet_PassedPC1Checks.rds")
    }else{
      all.genes <- rownames(rna)
      rna <- ScaleData(rna, features = all.genes,vars.to.regress = "nCount_RNA")
      rna <- FindNeighbors(rna,dims = 1:50)
      rna <- FindClusters(rna,resolution = 0.7)
      rna <- RunUMAP(rna,dims = 1:50)
      Idents(rna) <- "RNA_snn_res.0.7"
      
      library(infercnv)
      library(stringr)
      library(Seurat)
      counts_matrix = GetAssayData(rna, slot="counts")
   

总结

作者大大的代码也太长了吧,根本一篇都弄不完。上面分析的主要思路是前期通过对细胞类型鉴定,然后筛选出了双细胞结果,根据细胞类型进行下面的分析,这次加入了肿瘤变异之间的拷贝数分析,感觉自己的分析中也可以应用到这个内容。然后作者的代码里有一个提取注释文件的方法也很好,也是可以保留下来,以后提取注释文件用到的。还有作者经常用到的else if语句也很多,可以减少后面的不停调试的方法。

0 人点赞