视频地址:http://mpvideo.qpic.cn/0b2ewiaakaaahmalygztmbrvbmwdawzaabia.f10002.mp4?
参考文章:
【0代码】单基因泛癌分析教程
视频中的代码:
代码语言:javascript复制# setwd("H:/MedBioInfoCloud/analysis/TCGA/new/conventionalAnalysis")
options(stringsAsFactors = F)
library(TCGAbiolinks)
library(reshape2)
library(dplyr)
library(ggplot2)
library(ggpubr)
library(ggrepel)
library(RColorBrewer)
library(ggsignif)
FilePath <- dir("H:/MedBioInfoCloud/analysis/TCGA/new/processedTCGAdata/TCGA-STAR_Exp/",
"STARdata.Rdata$",full.names = T)
##可以是编码蛋白或非编码蛋白的基因
genes <- c("PTEN","EGFR","TP53","ATF1")
opt <- "output/001-RNASeq_ExpInNorAndTur/"
record_files <- dir(opt,".txt$",full.names = T)
ifelse(length(record_files)>0,file.remove(record_files),
"There is no record file that can be removed")
source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/filterGeneTypeExpr.R")
source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/del_dup_sample.R")
###TCGA数据库中33中癌症类型
project <- getGDCprojects()$project_id
project <- project[grep("TCGA-",project)]
# ###在TCGA中没有正常样本,但可以匹配GTEx数据库中的正常样本的癌症类型
# proj4 <- c("TCGA-ACC","TCGA-LGG","TCGA-OV","TCGA-SKCM","TCGA-TGCT","TCGA-UCS" )
# ###在TCGA中正常样本大于0小于10,但可以匹配GTEx数据库中的正常样本的癌症类型
# proj5 <- c("TCGA-CESC","TCGA-GBM","TCGA-PAAD")
#
# ###在TCGA中正常样本大于0小于10,在GTEx数据库中也没有正常样本的癌症类型
# proj6 <- c("TCGA-PCPG","TCGA-SARC","TCGA-CHOL","TCGA-THYM")
# proj = "TCGA-BLCA"
norn <- 10 #正常样本数最小数量
geneexpdata <- data.frame()
for(proj in project){
load(FilePath[grep(proj,FilePath)])#STARdata
tpm <- STARdata[["tpm"]]
##正常组织样本ID
SamN <- TCGAquery_SampleTypes(barcode = colnames(tpm)[-c(1:3)],
typesample = c("NT","NB","NBC","NEBV","NBM"))
if(length(SamN) >= norn){
###只要mRNA表达的数据
tpm <- filterGeneTypeExpr(expr = tpm,
fil_col = "gene_type",
filter = FALSE)
##过滤不表达的基因
tpm <- tpm[apply(tpm,1,var) !=0,]
##判断基因是否在表达矩阵中
gs <- intersect(genes,rownames(tpm))
if(length(gs) != 0) {
##肿瘤组织样本ID
SamT <- setdiff(colnames(tpm),SamN)
###去除重复样本
nor_exp <- del_dup_sample(tpm[,SamN],col_rename = T)
tur_exp <- del_dup_sample(tpm[,SamT],col_rename = T)
###============非配对样本==
##构建数据框
nor_gsExp <- log2(data.frame(nor_exp[gs,],check.names = T) 1) %>% t() %>%
as.data.frame() %>% mutate(Sample = rep(paste0("Normal(n=",ncol(nor_exp),")"),ncol(nor_exp)),
.before = 1)%>% melt(id.vars = "Sample")
tur_gsExp <- log2(data.frame(tur_exp[gs,],check.names = T) 1) %>% t() %>%
as.data.frame() %>% mutate(Sample = rep(paste0("Tumor(n=",ncol(tur_exp),")"),ncol(tur_exp)),
.before = 1)%>% melt(id.vars = "Sample")
data <- rbind(nor_gsExp,tur_gsExp)
head(data)
colnames(data) <- c("Sample","gene","exp")
compaired <- list(unique(data$Sample))
data <- mutate(data,cancer = proj,.before = 1)
geneexpdata <- rbind(geneexpdata,data)
##======绘图===
##输出路径
# opt <- "output/001-RNASeq_ExpInNorAndTur/"
fp_boxplot <- paste0(opt,proj,"/boxplot")
ifelse(dir.exists(fp_boxplot),"The folder already exists.",dir.create(fp_boxplot,recursive = T))
###==============多个基因在同一个图中
if(length(gs)> 1){
p <- ggplot(data, aes(x= gene, y = exp,color = Sample))
#geom_jitter(size = 0.5,aes(x=variable, y=value,color = Group))
geom_boxplot(aes(color = Sample),alpha =1,
lwd = 0.1, outlier.size = 1,
outlier.colour = "white") #color = c("red", "blue"),
# geom_violin(aes(fill = Sample),trim = FALSE)
# geom_boxplot(width = 0.1,fill = "white")
theme_bw()
labs(y= expression(log[2](TPM 1)))
scale_color_manual(values = c(brewer.pal(3,"Dark2"))) #c("#3300CC","#CC0000")
stat_compare_means(label = "p.signif")
#labs(title = 'ImmuneCellAI')
theme(legend.position = "top",
axis.text.x = element_text(face = "bold",colour = "#1A1A1A"),
axis.text.y = element_text(face = "bold",colour = "#1A1A1A"),
axis.title = element_text(size = 12,face = "bold", colour = "#1A1A1A"),
legend.title = element_blank(),
axis.title.x = element_blank(),
legend.text = element_text(size = 12, face = "bold",colour = "#1A1A1A")
)
ggsave(filename = paste0(fp_boxplot,"/all_gene.pdf"),
plot = p,height=5,width= length(gs)*0.9)
}
# g <- gs[1]
####================单一绘制==============
lapply(gs, function(g){
ldat <- data[data$gene == g,]
p = ggplot(ldat,aes(Sample,exp,fill= Sample))
geom_boxplot(aes(fill = Sample),notch = FALSE,
position = position_dodge(width =0.01,preserve = "single"),
outlier.alpha =1,width=0.4)
scale_fill_manual(values=c(brewer.pal(5,"Set1")))
geom_signif(comparisons = compaired,
step_increase = 0.1,
map_signif_level = T,
margin_top = 0.2,
test = "wilcox.test")
labs(y= paste0("The expression of ",g,"nlog2(TPM 1)"),title= proj)
theme_classic()
theme(panel.background=element_rect(fill="white",colour="black",size=0.25),
plot.title = element_text(hjust = 0.5),
axis.line=element_line(colour="black",size=0.25),
axis.title=element_text(size=10,face="plain",color="black"),
axis.text.x = element_text(angle = 45,face = "plain",colour = "black",hjust=1,vjust=1),
axis.title.x=element_text(size=12,face="plain",color="black"),
axis.title.y=element_text(size=12,face="plain",color="black"),
axis.text = element_text(size=12,color="black"),
legend.position="none"
)
#p
ggsave(filename = paste0(fp_boxplot,"/",g,"-Unpaired .pdf"),plot = p,width = 2.5,height = 5)
})
###===============
fp_violin <- paste0(opt,proj,"/violin")
ifelse(dir.exists(fp_violin),"The folder already exists.",dir.create(fp_violin,recursive = T))
lapply(gs, function(g){
ldat <- data[data$gene == g,]
p = ggplot(ldat, aes(Sample,exp,fill= Sample))
geom_violin(aes(fill = Sample),trim = FALSE)
geom_signif(comparisons = compaired,
step_increase = 0.1,
map_signif_level = T,
margin_top=0.2,
tip_length =0.02,
test = "wilcox.test")
geom_boxplot(width = 0.1,fill = "white")
scale_fill_manual(values=c(brewer.pal(3,"Dark2")))
theme_classic()
labs(y= paste0("The expression of ",g,"nlog2(TPM 1)"),title= proj)
theme(panel.background=element_rect(fill="white",colour="black",size=0.25),
plot.title = element_text(hjust = 0.5),
axis.line=element_line(colour="black",size=0.25),
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 45,face = "plain",colour = "black",hjust=1,vjust=1),
axis.text = element_text(size=12,face="plain",color="black"),
legend.position="none"
)
p
ggsave(filename = paste0(fp_violin,"/",g,"-Unpaired .pdf"),plot = p,width = 3,height = 5)
})
###===========配对样本=========
id <- intersect(colnames(nor_exp),colnames(tur_exp))
if(length(id) > 3){
paired_nor_gsExp <- nor_exp[gs,id]
paired_tur_gsExp <- tur_exp[gs,id]
paired_nor_gsExp <- log2(data.frame(paired_nor_gsExp,check.names = T) 1) %>% t() %>%
as.data.frame() %>% mutate(Sample = rep("Normal",ncol(paired_nor_gsExp)),
id = id,
.before = 1)%>% melt(id.vars = c("Sample","id"))
paired_tur_gsExp <- log2(data.frame(paired_tur_gsExp,check.names = T) 1) %>% t() %>%
as.data.frame() %>% mutate(Sample = rep("Tumor",ncol(paired_tur_gsExp)),
id = id,
.before = 1)%>% melt(id.vars =c("Sample","id"))
paired_data <- rbind(paired_nor_gsExp, paired_tur_gsExp)
head(paired_data)
colnames(paired_data) <- c("Sample","id","gene","exp")
paired_compaired <- list(unique(paired_data$Sample))
###===============
lapply(gs, function(g){
ldat <- paired_data[paired_data$gene == g,]
#为了防止配对样本信息错乱,先构造一个配对样本的数据集
dat1_paried <- reshape2::dcast(ldat[c('Sample', 'id', 'exp')], Sample~id)
rownames(dat1_paried) <- dat1_paried$Sample
dat1_paried <- t(dat1_paried[,-1] )%>%as.data.frame()
head(dat1_paried)
#配对 t 检验,双侧检验
p.value <- t.test(dat1_paried$Normal, dat1_paried$Tumor, paired = TRUE, alternative = 'two.sided', conf.level = 0.95)
pv <- p.value[["p.value"]]
stasig <- ifelse(pv >= 0.001,paste0('p value = ',round(pv,3)),"p value < 0.01")
p1 <- ggplot(ldat, aes(x = Sample, y = exp,colour = Sample))
# geom_boxplot(aes(fill = Sample), show.legend = FALSE, width = 0.6) #绘制箱线图展示肿瘤组织和正常组织的两组基因表达整体分布
geom_point(size = 3) #绘制散点表示单个样本的基因表达信息
geom_line(aes(group = id), color = 'black', lwd = 0.05) #绘制样本连线,通过 aes(group) 参数指定配对样本信息
scale_fill_manual(values = c('#FE7280', '#AC88FF')) #以下是颜色分配、主题更改等,不再多说
theme_classic()
labs(y= paste0("The expression of ",g,"nlog2(TPM 1)"),title= proj)
theme(panel.background=element_rect(fill="white",colour="black",size=0.25),
plot.title = element_text(hjust = 0.5),
axis.line=element_line(colour="black",size=0.25),
axis.title.x = element_blank(),
axis.text.x = element_text(face = "plain",colour = "black"),
axis.text = element_text(size=12,face="plain",color="black"),
legend.position="none"
) annotate('text',
x= ifelse(min(dat1_paried$Normal) > min(dat1_paried$Tumor),1,2),
y= min(ldat$exp 0.5),
label = stasig ,size = 4)
ggsave(filename = paste0(fp_boxplot,"/",g,"-paired .pdf"),plot = p1,width = 3,height = 3)
})
}
##记录分析的样本信息
opinfo <- paste0(proj,":nt","Normal(n)=",length(SamN),";Tumor(n)=",length(SamT))
output <- file(paste0(opt,"annotation.txt"), open="a b")
writeLines(opinfo, con=output)
close(output)
}
}
else if(length(SamN) == 0){
output <- file(paste0(opt," zeroNormalSamples_CancerType.txt"), open="a b")
writeLines(proj, con=output)
close(output)
}else {
opinfo <- paste0(proj,":nt","Normal(n)=",length(SamN))
output <- file(paste0(opt,"lessNormalSamples_CancerType.txt"), open="a b")
writeLines(opinfo, con=output)
close(output)
}
}
###============单基因泛癌=============
###===============
fp_pan <- paste0(opt,"/pan-cancer")
ifelse(dir.exists(fp_pan),"The folder already exists.",dir.create(fp_pan,recursive = T))
lapply(unique(geneexpdata$gene), function(g){
# g = unique(geneexpdata$gene)[1]
pan_exp <- geneexpdata[geneexpdata$gene == g,]
head(pan_exp)
pan_exp$Sample[grep("Normal",pan_exp$Sample)] <- "Normal"
pan_exp$Sample[grep("Tumor",pan_exp$Sample)] <- "Tumor"
# c(brewer.pal(3,"Dark2"))
p <- ggplot(pan_exp, aes(x=cancer, y = exp,color = Sample))
#geom_jitter(size = 0.5,aes(x=variable, y=value,color = Group))
geom_boxplot(aes(color = Sample),alpha =1,
lwd = 0.1, outlier.size = 1,
outlier.colour = "white") #color = c("red", "blue"),
# geom_violin(aes(fill = Sample),trim = FALSE)
# geom_boxplot(width = 0.1,fill = "white")
theme_bw()
labs(y= paste0("The expression of ",g,"nlog2(TPM 1)"))
scale_color_manual(values = c(brewer.pal(3,"Dark2"))) #c("#3300CC","#CC0000")
stat_compare_means(label = "p.signif")
#labs(title = 'ImmuneCellAI')
theme(legend.position = "top",
axis.text.x = element_text(angle = 45,face = "bold",colour = "#1A1A1A",hjust=1,vjust=1),
axis.text.y = element_text(face = "bold",colour = "#1A1A1A"),
axis.title = element_text(size = 12,face = "bold", colour = "#1A1A1A"),
legend.title = element_blank(),
axis.title.x = element_blank(),
legend.text = element_text(size = 12, face = "bold",colour = "#1A1A1A")
)
ggsave(filename = paste0(fp_pan,"/",g,"_in_pan-cancer.pdf"),
plot = p,height=5,width=9)
})
filterGeneTypeExpr函数代码:
代码语言:javascript复制#filter :"protein_coding","lncRNA","miRNA","misc_RNA","snRNA","miRNA","scRNA",....
filterGeneTypeExpr <- function(expr,fil_col = "gene_type",filter = FALSE){
##Delete unexpressed genes(rows)in all samples from expr.
dat <- expr[apply(expr[,-c(1:3)], 1, var)!=0,]
##rowSums
dat <- dplyr::mutate(dat,Sums = rowSums(dat[,-c(1:3)]),.before = 4)
dat <- dplyr::arrange(dat,gene_name,desc(Sums))
dat <- dat[!duplicated(dat$gene_name),]
rownames(dat) <- dat$gene_name
if (filter == FALSE) {
return(dat[,-c(1:4)])
}else{
dat <- dat[dat[,fil_col] == filter,-c(1:4)]
return(dat)
}
}
del_dup_sample函数代码:
代码语言:javascript复制##TCGA肿瘤样本去重
del_dup_sample <- function(data,col_rename =T){
data <- data[,sort(colnames(data))]
pid = gsub("(.*?)-(.*?)-(.*?)-.*","\1-\2-\3",colnames(data))
library(dplyr)
reps = pid[duplicated(pid)]##重复样本
if(length(reps)== 0 ){
message("There were no duplicate patient samples for this data")
}else{
data <- data[,!duplicated(pid)]
}
if(col_rename == T){
colnames(data) <- gsub("(.*?)-(.*?)-(.*?)-.*","\1-\2-\3",colnames(data))
}
return(data)
}