R基础:生信分析的R语言基础教程都在这里了,包括语法,绘图和数据分析。
生物信息数据分析教程视频——01-TCGA数据库RNAseq数据下载与整理
生物信息数据分析教程视频——02-TCGA数据库miRNA数据下载与整理
生物信息数据分析教程视频——03-有关TCGA数据库临床数据的问题
生物信息数据分析教程视频——04-TCGA数据库中SNV和CNV数据的下载
生物信息数据分析教程视频——05-TCGA数据库中甲基化数据的下载和整理
生物信息数据分析教程视频——06-GEO数据库中芯片数据的下载和整理
生物信息数据分析教程视频——07-TCGA数据库:基因的表达探索
生物信息数据分析教程视频——08-TCGA GTEx数据库的数据整理
生物信息数据分析教程视频——09-TCGA GTEx数据库联合表达分析
生物信息数据分析教程视频——10-TCGA数据库:miRNA的表达探索
生物信息数据分析教程视频——11-筛选相关性基因
生物信息数据分析教程视频——12-基因之间的相关性分析及可视化
生物信息数据分析教程视频——13-3种R包(DESeq2、edgeR和limma)进行RNAseq的差异表达分析与比较
生物信息数据分析教程视频——14-芯片数据的表达差异分析
生物信息数据分析教程视频——15-clusterProfiler包 ClueGO做富集分析
生物信息数据分析教程视频——16-单样本基因集富集分析(ssGSEA)用于肿瘤相关免疫细胞浸润水平评估 http://mpvideo.qpic.cn/0b2ezeadqaaanaafvdb5cjrvbsodhdeqaoaa.f10002.mp4?dis_k=434c024ce3cb08025f1257e30ff4b4a4&dis_t=1671187731&vid=wxv_2606760913223974912&format_id=10002&support_redirect=0&mmversion=false
参考视频: http://mpvideo.qpic.cn/0bf2nyabqaaa3iabiz7dnnqfa3wddbxaagaa.f10002.mp4?dis_k=dc98b18769e0ad4768ae32e8f35e4b2e&dis_t=1671187731&vid=wxv_1944485866878402561&format_id=10002&support_redirect=0&mmversion=false
实际应用:
【0代码】单基因泛癌分析教程
单基因泛癌分析视频教程补充——基于R语言
estimate代码:
代码语言:javascript复制
# https://mp.weixin.qq.com/s/X2kybql--KPgjUumWX4z9Q
# setwd("H:/MedBioInfoCloud/analysis/TCGA/new/conventionalAnalysis")
options(stringsAsFactors = F)
library(TCGAbiolinks)
library(clusterProfiler)
library(org.Hs.eg.db)
library(ggplot2)
library(ggpubr)
library(ggrepel)
library(corrplot)
library(pheatmap)
library(estimate)
FilePath <- dir("H:/MedBioInfoCloud/analysis/TCGA/new/processedTCGAdata/TCGA-STAR_Exp/",
"STARdata.Rdata$",full.names = T)
opt <- "output/009-CellFractions/estimate/"
ifelse(dir.exists(opt),FALSE,dir.create(opt,recursive = T))
record_files <- dir(opt,"RecordInformation.txt$",full.names = T)
ifelse(length(record_files)>0,file.remove(record_files),
"There is no record file that can be removed")
source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/filterGeneTypeExpr.R")
source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/del_dup_sample.R")
###TCGA数据库中33中癌症类型
project <- getGDCprojects()$project_id
project <- project[grep("TCGA-",project)]
# proj = "TCGA-LUAD"
for(proj in project[1]){
message("===============================")
message(proj)
load(FilePath[grep(proj,FilePath)])#STARdata
tpm <- STARdata[["tpm"]]
tpm <- filterGeneTypeExpr(expr = tpm,
fil_col = "gene_type",
filter = FALSE)
##过滤不表达的基因
tpm <- tpm[apply(tpm,1,var) !=0,]
##正常组织样本ID
SamN <- TCGAquery_SampleTypes(barcode = colnames(tpm),
typesample = c("NT","NB","NBC","NEBV","NBM"))
##肿瘤组织样本ID
SamT <- setdiff(colnames(tpm),SamN)
###去除重复样本
tur_exp <- del_dup_sample(tpm[,SamT],col_rename = T)
# ###long2转换
# tur_exp <- log2(tur_exp 1)
# dat=log2(edgeR::cpm(pd_mat) 1)
# scores = estimate_RNAseq(dat,pro)
diffgene2ent <- bitr(rownames(tur_exp),
fromType="SYMBOL", toType="ENTREZID",
OrgDb="org.Hs.eg.db")
tur_exp <- tur_exp[diffgene2ent$SYMBOL,]
rownames(tur_exp) <- diffgene2ent$ENTREZID
colnames(tur_exp) <- gsub("-","_",colnames(tur_exp))
head(tur_exp)[,1:3]
###输出表达数据,作为filterCommonGenes的输入
input.f <- paste0(opt,"sample_input.txt")
write.table(tur_exp,file = input.f,row.names = T,sep = "t",quote = F)
output.f <- paste0(opt,"estimate_input.gct")##定义输出
filterCommonGenes(input.f= input.f, output.f=output.f, id="EntrezID")#过滤掉细胞中共有的基因
### 计算免疫浸润打分 ###
estimate_path <- paste0(opt,"sample_estimate.gct")
estimateScore(output.f, estimate_path, platform="illumina")#计算肿瘤微环境的评分
# ### 可视化 ###
# # 绘制单个样本
# output.dir <- paste0(opt,"figures/")##定义输出
# plotPurity(scores= estimate_path, samples=colnames(tur_exp)[1],
# platform="illumina",
# output.dir = opt)
# # 绘制多个样本
# plotPurity(scores= estimate_path,
# platform="illumina", output.dir = opt_path)
estimate_scores = read.table(estimate_path,skip = 2,header = T)
rownames(estimate_scores)=estimate_scores[,1]
estimate_scores= t(estimate_scores[,3:ncol(estimate_scores)]) %>% as.data.frame()
head(estimate_scores)
estimate_scores$Tumour_purity <- cos(0.6049872018 0.0001467884*estimate_scores$ESTIMATEScore)
save(estimate_scores,file = paste0(opt,proj,"-estimate.Rdata"))
file.remove(dir(opt,"(.gct)|(.txt)$",full.names = T))
}
###============老版本数据
message("===============================")
message(proj)
load("H:/data_analysis/TCGA/old/TCGA/RNASeqTPM/TCGA-LUAD-protein_coding-TPM.Rdata")
tpm <- filterGeneTypeExpr(expr = tpmdata,
fil_col = "gene_type",
filter = FALSE)
##过滤不表达的基因
tpm <- tpm[apply(tpm,1,var) !=0,]
##正常组织样本ID
SamN <- TCGAquery_SampleTypes(barcode = colnames(tpm),
typesample = c("NT","NB","NBC","NEBV","NBM"))
##肿瘤组织样本ID
SamT <- setdiff(colnames(tpm),SamN)
###去除重复样本
tur_exp <- del_dup_sample(tpm[,SamT],col_rename = T)
# ###long2转换
# tur_exp <- log2(tur_exp 1)
diffgene2ent <- bitr(rownames(tur_exp),
fromType="SYMBOL", toType="ENTREZID",
OrgDb="org.Hs.eg.db")
tur_exp <- tur_exp[diffgene2ent$SYMBOL,]
rownames(tur_exp) <- diffgene2ent$ENTREZID
colnames(tur_exp) <- gsub("-","_",colnames(tur_exp))
head(tur_exp)[,1:3]
###输出表达数据,作为filterCommonGenes的输入
input.f <- paste0(opt,"sample_input.txt")
write.table(tur_exp,file = input.f,row.names = T,sep = "t",quote = F)
output.f <- paste0(opt,"estimate_input.gct")##定义输出
filterCommonGenes(input.f= input.f, output.f=output.f, id="EntrezID")#过滤掉细胞中共有的基因
### 计算免疫浸润打分 ###
estimate_path <- paste0(opt,"sample_estimate.gct")
estimateScore(output.f, estimate_path, platform="illumina")#计算肿瘤微环境的评分
estimate_scores = read.table(estimate_path,skip = 2,header = T)
rownames(estimate_scores)=estimate_scores[,1]
estimate_scores= t(estimate_scores[,3:ncol(estimate_scores)]) %>% as.data.frame()
head(estimate_scores)
estimate_scores$Tumour_purity <- cos(0.6049872018 0.0001467884*estimate_scores$ESTIMATEScore)
save(estimate_scores,file = paste0(opt,proj,"-estimate.Rdata"))
file.remove(dir(opt,"(.gct)|(.txt)$",full.names = T))
CIBERSORTx代码:
代码语言:javascript复制
# https://mp.weixin.qq.com/s/X2kybql--KPgjUumWX4z9Q
# setwd("H:/MedBioInfoCloud/analysis/TCGA/new/conventionalAnalysis")
options(stringsAsFactors = F)
library(TCGAbiolinks)
library(clusterProfiler)
library(org.Hs.eg.db)
library(ggplot2)
library(ggpubr)
library(ggrepel)
library(pheatmap)
#https://rdrr.io/github/singha53/amritr/src/R/supportFunc_cibersort.R
# install.packages("devtools")
# devtools::install_github("BioInfoCloud/csCIBERSORTx")
library(csCIBERSORTx)
FilePath <- dir("H:/MedBioInfoCloud/analysis/TCGA/new/processedTCGAdata/TCGA-STAR_Exp/",
"STARdata.Rdata$",full.names = T)
opt <- "output/009-CellFractions/CIBERSORT/"
ifelse(dir.exists(opt),FALSE,dir.create(opt,recursive = T))
record_files <- dir(opt,"RecordInformation.txt$",full.names = T)
ifelse(length(record_files)>0,file.remove(record_files),
"There is no record file that can be removed")
source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/filterGeneTypeExpr.R")
source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/del_dup_sample.R")
###TCGA数据库中33中癌症类型
project <- getGDCprojects()$project_id
project <- project[grep("TCGA-",project)]
# proj = "TCGA-LUAD"
for(proj in project[2:33]){
message("===============================")
message(proj)
load(FilePath[grep(proj,FilePath)])#STARdata
tpm <- STARdata[["tpm"]]
tpm <- filterGeneTypeExpr(expr = tpm,
fil_col = "gene_type",
filter = FALSE)
##过滤不表达的基因
tpm <- tpm[apply(tpm,1,var) !=0,]
##正常组织样本ID
SamN <- TCGAquery_SampleTypes(barcode = colnames(tpm),
typesample = c("NT","NB","NBC","NEBV","NBM"))
##肿瘤组织样本ID
SamT <- setdiff(colnames(tpm),SamN)
###去除重复样本
tur_exp <- del_dup_sample(tpm[,SamT],col_rename = T)
# ###long2转换,该方法不需要log转化内部函数会有判断
# tur_exp <- log2(tur_exp 1)
# head(tur_exp)[,1:3]
file <- paste0(opt,proj,"-CIBERSORT-Result.txt")
cibr <- CIBERSORT2(LM22_matrix = LM22,
mixture = tur_exp,
opt_path = file,
perm=100,
QN=TRUE)
# 提取CIBERSORT的打分结果
CIBERSORT_Score <- cibr[,1:(ncol(cibr)-3)]#因为绘图不需要最后三列,所以去掉最后三列,取22列
# 生成标准化函数normalize,构建一个function
normalize <- function(x){# (x-x的最小值)/(X的最大值-X的最小值)
return((x-min(x))/(max(x)-min(x)))}
# 使用normalize标准化CIBERSORT打分
CIBERSORT_Score <- normalize(CIBERSORT_Score)
# 将处理后CIBERSORT的打分写出到文件
save(CIBERSORT_Score, file = paste0(opt,proj,"-CIBERSORT-Result.Rdata"))
# # 构建热图注释数据框
# annotation <- data.frame(rownames(CIBERSORT_Score))
# colnames(annotation) <- "sample"#设定列名
# rownames(annotation) <- rownames(CIBERSORT_Score)#设定行名
# annotation_row <- cbind(annotation, cibr[,(ncol(cibr)-2):ncol(cibr)])#合并列,CIBERSORT_Result的最后三列
#
# # 绘制热图
# pheatmap(CIBERSORT_Score,
# show_colnames = T,
# cluster_rows = F,
# cluster_cols = F,#cluster_cols = F 对列不进行聚类
# annotation_row = annotation_row,#注释数据框放在行
# cellwidth=10,
# cellheight=40,
# fontsize=5,
# fontsize_row=5,
# scale="row",
# fontsize_col=3)
}
MCPcounter代码:
代码语言:javascript复制
# https://mp.weixin.qq.com/s/X2kybql--KPgjUumWX4z9Q
# setwd("H:/MedBioInfoCloud/analysis/TCGA/new/conventionalAnalysis")
options(stringsAsFactors = F)
library(TCGAbiolinks)
library(clusterProfiler)
library(org.Hs.eg.db)
library(ggplot2)
library(ggpubr)
library(ggrepel)
library(pheatmap)
# devtools::install_github('ebecht/MCPcounter',ref='master', subdir='Source')
library(MCPcounter)
library(preprocessCore)
# https://github.com/ebecht/MCPcounter
FilePath <- dir("H:/MedBioInfoCloud/analysis/TCGA/new/processedTCGAdata/TCGA-STAR_Exp/",
"STARdata.Rdata$",full.names = T)
opt <- "output/009-CellFractions/MCPcounter/"
ifelse(dir.exists(opt),FALSE,dir.create(opt,recursive = T))
record_files <- dir(opt,"RecordInformation.txt$",full.names = T)
ifelse(length(record_files)>0,file.remove(record_files),
"There is no record file that can be removed")
source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/filterGeneTypeExpr.R")
source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/del_dup_sample.R")
###TCGA数据库中33中癌症类型
project <- getGDCprojects()$project_id
project <- project[grep("TCGA-",project)]
# proj = "TCGA-LUAD"
# 读取探针注释结果
probesets = read.table(curl('http://raw.githubusercontent.com/ebecht/MCPcounter/master/Signatures/probesets.txt'),
sep='t',stringsAsFactors=FALSE,colClasses='character')
# 读取基因名注释结果
genes = read.table(curl('http://raw.githubusercontent.com/ebecht/MCPcounter/master/Signatures/genes.txt'),
sep='t',
stringsAsFactors=FALSE,
header=TRUE,
colClasses='character',
check.names=FALSE)
library(csGeneset)
# Probe annotation.
probesets <- annoMCPcounter[["probesets"]]
# gene annotation
genes <- annoMCPcounter[["genes"]]
?MCPcounter.estimate
for(proj in project[1]){
message("===============================")
message(proj)
load(FilePath[grep(proj,FilePath)])#STARdata
tpm <- STARdata[["tpm"]]
tpm <- filterGeneTypeExpr(expr = tpm,
fil_col = "gene_type",
filter = FALSE)
##过滤不表达的基因
tpm <- tpm[apply(tpm,1,var) !=0,]
##正常组织样本ID
SamN <- TCGAquery_SampleTypes(barcode = colnames(tpm),
typesample = c("NT","NB","NBC","NEBV","NBM"))
##肿瘤组织样本ID
SamT <- setdiff(colnames(tpm),SamN)
###去除重复样本
tur_exp <- del_dup_sample(tpm[,SamT],col_rename = T)
# ###long2转换
# tur_exp <- log2(tur_exp 1)
#
head(tur_exp)[,1:3]
# 进行MCPcounter分析
MCPcounterScore <- MCPcounter.estimate(tur_exp, featuresType = "HUGO_symbols",#featuresType就是根据input来变化的
probesets = probesets,genes = genes)
# 生成标准化函数normalize,构建一个function
normalize <- function(x){# (x-x的最小值)/(X的最大值-X的最小值)
return((x-min(x))/(max(x)-min(x)))}
# 使用normalize函数进行标准化
MCPcounterScore <- normalize(data.matrix(MCPcounterScore))
# 将结果写出到文件
write.table(MCPcounterScore, file = "MCPcounterScore.txt",sep="t", quote=F, col.names=T)
# 构建包含注释信息的数据框
annotation_col <- data.frame(colnames(MCPcounterScore))
colnames(annotation_col) <- "sample"
rownames(annotation_col) <- colnames(MCPcounterScore)
# 绘制热图
pheatmap(MCPcounterScore,show_colnames = F,cluster_rows = F,
cluster_cols = T,annotation_col = annotation_col,#annotation_col 注释放在列, cluster_cols = T对列也就是样本进行聚类
cellwidth=15,cellheight=15,fontsize=5,
filename = 'figures/MCPcounter-heatmap.tiff')
}
xCell代码:
代码语言:javascript复制
# https://mp.weixin.qq.com/s/X2kybql--KPgjUumWX4z9Q
# setwd("H:/MedBioInfoCloud/analysis/TCGA/new/conventionalAnalysis")
options(stringsAsFactors = F)
library(TCGAbiolinks)
library(clusterProfiler)
library(org.Hs.eg.db)
library(ggplot2)
library(ggpubr)
library(ggrepel)
library(pheatmap)
# devtools::install_github('dviraran/xCell')
library(xCell)
FilePath <- dir("H:/MedBioInfoCloud/analysis/TCGA/new/processedTCGAdata/TCGA-STAR_Exp/",
"STARdata.Rdata$",full.names = T)
opt <- "output/009-CellFractions/xCell/"
ifelse(dir.exists(opt),FALSE,dir.create(opt,recursive = T))
source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/filterGeneTypeExpr.R")
source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/del_dup_sample.R")
###TCGA数据库中33中癌症类型
project <- getGDCprojects()$project_id
project <- project[grep("TCGA-",project)]
# proj = "TCGA-LUAD"
for(proj in project){
message("===============================")
message(proj)
load(FilePath[grep(proj,FilePath)])#STARdata
tpm <- STARdata[["tpm"]]
tpm <- filterGeneTypeExpr(expr = tpm,
fil_col = "gene_type",
filter = FALSE)
##过滤不表达的基因
tpm <- tpm[apply(tpm,1,var) !=0,]
##正常组织样本ID
SamN <- TCGAquery_SampleTypes(barcode = colnames(tpm),
typesample = c("NT","NB","NBC","NEBV","NBM"))
##肿瘤组织样本ID
SamT <- setdiff(colnames(tpm),SamN)
###去除重复样本
tur_exp <- del_dup_sample(tpm[,SamT],col_rename = T)
# 进行xCell分析
xCell <- xCellAnalysis(tur_exp)
dim(xCell)
rownames(xCell)
## 如果是RNA-seq数据,则
xCell_RNAseq <- xCellAnalysis(tur_exp,rnaseq = T)
dim(xCell_RNAseq)
# 将结果写出到文件
save(xCell_RNAseq, file = paste0(opt,proj,"-xCell.Rdata"),sep="t", quote=F, col.names=T)
}
ImmuCellAI代码:
代码语言:javascript复制# https://mp.weixin.qq.com/s/X2kybql--KPgjUumWX4z9Q
# setwd("H:/MedBioInfoCloud/analysis/TCGA/new/conventionalAnalysis")
options(stringsAsFactors = F)
library(TCGAbiolinks)
# devtools::install_github("lydiaMyr/ImmuCellAI@main")
# https://github.com/lydiaMyr/ImmuCellAI
library(ImmuCellAI)
# https://github.com/lydiaMyr/ImmuCellAI-mouse
# devtools::install_github("lydiaMyr/ImmuCellAI-mouse@main")
data("train_data")
data("train_tag")
data("marker_exp")
data("marker_exp_T")
data("paper_marker")
data("train_data")
data("train_tag")
data("compensation_matrix")
data("immune_infiltate_marker")
FilePath <- dir("H:/MedBioInfoCloud/analysis/TCGA/new/processedTCGAdata/TCGA-STAR_Exp/",
"STARdata.Rdata$",full.names = T)
opt <- "output/009-CellFractions/ImmuCellAI/"
ifelse(dir.exists(opt),FALSE,dir.create(opt,recursive = T))
source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/filterGeneTypeExpr.R")
source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/del_dup_sample.R")
###TCGA数据库中33中癌症类型
project <- getGDCprojects()$project_id
project <- project[grep("TCGA-",project)]
# proj = "TCGA-LUAD"
for(proj in project){
message("===============================")
message(proj)
load(FilePath[grep(proj,FilePath)])#STARdata
tpm <- STARdata[["tpm"]]
tpm <- filterGeneTypeExpr(expr = tpm,
fil_col = "gene_type",
filter = FALSE)
##过滤不表达的基因
tpm <- tpm[apply(tpm,1,var) !=0,]
##正常组织样本ID
SamN <- TCGAquery_SampleTypes(barcode = colnames(tpm),
typesample = c("NT","NB","NBC","NEBV","NBM"))
##肿瘤组织样本ID
SamT <- setdiff(colnames(tpm),SamN)
###去除重复样本
tur_exp <- del_dup_sample(tpm[,SamT],col_rename = T)
dim(tur_exp)
# group <- data.frame(group = rep(c("High","low"),each = 258)) %>% t()
# colnames(group) <- colnames(tur_exp)
# tur_exp <- rbind(group,tur_exp)
#The prediction process of ImmuCellAI is updated, which simulated the flow cytometry process to predict cell type abundance by hierarchical strategy. All 24 cell types were divided into two layers, layer1:DC, B cell, Monocyte, Macrophage, NK, Neutrophil, CD4 T, CD8 T, NKT, Tgd; layer2:CD4 naive, CD8 naive, Tc, Tex, Tr1, nTreg, iTreg, Th1, Th2, Th17, Tfh, Tcm, Tem, MAIT.
#Parameters
#sample_expression: Sample expression profile in FPKM, TPM format by RNA-seq or log2-transformed signal by microarray.
#datatype: One of "rnaseq" and "microarray"
#group_tag: One of 0 and 1, if there is the need to perform the comparision between different groups. If the value is 1, users need to add a group tag row in the input epxression matrix to explain the group of each sample.
#response_tag: One of 0 and 1, if there is the need to predict the ICB response of each sample.
ImmuCellAIscorelist <- ImmuCellAI_new(tur_exp,"rnaseq",group_tag =0,0)
ImmuCellAIscore <- ImmuCellAIscorelist[["Sample_abundance"]]
#output
# The output of the function is a list with three variables including sample immune cell abundance, group comparison result and ICB response prediction result.
# 将结果写出到文件
save(ImmuCellAIscore, file = paste0(opt,proj,"-ImmuCellAI.Rdata"),sep="t", quote=F, col.names=T)
}