转录组GSE105789_小鼠数据下游分析注意事项
简单记录下GSE105789小鼠数据的下游分析的主要事项,与human的数据分析的主要区别是在进行id转换、kegg、go、gsea时,需要注意数据库和物种信息,应该选择小鼠。
数据是简单的2分组,count矩阵并没有下自网站,而是来自我做的上游分析。参考文章转录组上游分析—使用iseq下载原始数据、小鼠基因组、单端测序数据处理。
step1 load-from-pipeline
注意行名从ENSEMBL转化为SYMBOL时的数据库为org.Mm.eg.db。
代码语言:r复制rm(list = ls())#清空当前的工作环境
options(stringsAsFactors = F)#不以因子变量读取
options(scipen = 20)#不以科学计数法显示
library(data.table)
library(tinyarray)
tbl <- as.matrix(data.table::fread("raw_counts.txt", header=T, colClasses="integer"), rownames=1)
dim(tbl)
mat=as.data.frame(tbl)
mat[1:4,1:4]
keep_feature <- rowSums (mat > 1) > 1 ;table(keep_feature)
ensembl_matrix <- mat[keep_feature, ]
rownames(ensembl_matrix)=rownames(mat)[keep_feature]
ensembl_matrix[1:4,1:4]
#行名转换
library(AnnoProbe)
library(org.Mm.eg.db)
k<-AnnotationDbi::keys(org.Mm.eg.db,keytype = "ENSEMBL")
e2s<-AnnotationDbi::select(org.Mm.eg.db,
keys= rownames(ensembl_matrix),
columns="SYMBOL",
keytype = "ENSEMBL")
head(e2s)
ids = na.omit(e2s)
ids=ids[!duplicated(ids$SYMBOL),]
ids=ids[!duplicated(ids$ENSEMBL),]
head(ids)
symbol_matrix= ensembl_matrix[match(ids$ENSEMBL,rownames(ensembl_matrix)),]
rownames(symbol_matrix) = ids$SYMBOL
symbol_matrix[1:4,1:4]
colnames(symbol_matrix)
library(AnnoProbe)
head(rownames(symbol_matrix))
#注意更改为小鼠信息
ids=annoGene(rownames(symbol_matrix),'SYMBOL','mouse')
head(ids)
tail(sort(table(ids$biotypes)))
ids=ids[ids$biotypes=='protein_coding',]
symbol_matrix=symbol_matrix[rownames(symbol_matrix) %in% ids$SYMBOL,]
#分组信息
colnames(symbol_matrix) <- c("F4/80neg_rep1","F4/80neg_rep2",
"CD301b_mphage_rep1","CD301b_mphage_rep2")
colnames(symbol_matrix)
group_list=ifelse(grepl('F4/80neg', colnames(symbol_matrix) ),
'control','case' )
group_list
group_list = factor(group_list,levels = c('control','case' ))
save(symbol_matrix,group_list,file = 'symbol_matrix.Rdata')
dat=log(edgeR::cpm(symbol_matrix) 1)
save(dat,group_list,file = 'step1-output.Rdata')
step2 deg-comp
质控、差异分析等与人类数据分析并无不同,这里直接略过。
代码语言:r复制rm(list = ls()) ## 魔幻操作,一键清空~
options(stringsAsFactors = F)
library(AnnoProbe)
library(GEOquery)
library(ggplot2)
library(ggstatsplot)
library(patchwork)
library(reshape2)
library(stringr)
getOption('timeout')
options(timeout=10000)
load('./symbol_matrix.Rdata')
symbol_matrix[1:4,1:4]
#设置工作目录
gse_number <- 'deg'
gse_number
dir.create(gse_number)
setwd( gse_number )
getwd()
save(symbol_matrix,group_list,file = 'symbol_matrix.Rdata')
#质控和差异分析和人类并无不同
source('../scripts/step2-qc-counts.R')
source('../scripts/step3-deg-deseq2.R')
source('../scripts/step3-deg-edgeR.R')
source('../scripts/step3-deg-limma-voom.R')
source('../scripts/step4-qc-for-deg.R') # 需要调整logFC和p值的阈值
#重点介绍下
source('../scripts/step5-anno-by-GSEA.R')
source('../scripts/step5-anno-by-ORA.R') # 需要调整logFC和p值的阈值
setwd('../')
getwd()
重点介绍下step5-anno-by-GSEA.R、step5-anno-by-ORA.R
step5-anno-by-GSEA.R
1.注意SYMBOL转ENTREZID时候,R包选择org.Mm.eg.db;
2.注意gseKEGG()函数中,物种选择mmu;
3.注意setReadable()函数中,OrgDb='org.Mm.eg.db';
代码语言:r复制rm(list = ls())
load( file = 'DEG_deseq2.Rdata' )
load( file = 'DEG_limma_voom.Rdata' )
load( file = 'DEG_edgeR.Rdata' )
colnames(DEG_deseq2)
colnames(DEG_limma_voom)
# FoxO signaling pathway
nrDEG=DEG_deseq2[,c("log2FoldChange", "padj")]
nrDEG=DEG_limma_voom[,c("logFC", "adj.P.Val" )]
head(nrDEG)
colnames(nrDEG)=c('logFC','P.Value')
library(org.Mm.eg.db)
library(clusterProfiler)
gene <- bitr(rownames(nrDEG), fromType = "SYMBOL",
toType = "ENTREZID",
OrgDb = org.Mm.eg.db)
head(gene)
head(nrDEG)
nrDEG = nrDEG[rownames(nrDEG) %in% gene$SYMBOL,]
nrDEG$ENTREZID = gene$ENTREZID[match(rownames(nrDEG) , gene$SYMBOL)]
# https://www.ncbi.nlm.nih.gov/gene/?term=SPARCL1
geneList=nrDEG$logFC
names(geneList)=nrDEG$ENTREZID
geneList=sort(geneList,decreasing = T)
head(geneList)
library(clusterProfiler)
# 一本书,很多数据库,很多可视化
#~~~这里需要替换物种~~~
#mmu
#mmu
str(geneList)
kk_gse <- gseKEGG(geneList = geneList,
organism = 'mmu',#按需替换
nPerm = 1000,
minGSSize = 10,
pvalueCutoff = 0.99,
verbose = FALSE)
tmp=kk_gse@result
kk=DOSE::setReadable(kk_gse, OrgDb='org.Mm.eg.db',
keyType='ENTREZID')#按需替换
kk@result$Description=gsub(' - Mus musculus \(house mouse\)',
'',kk@result$Description )
head(kk@result$Description)
tmp=kk@result
pro='test'
write.csv(kk@result,file = file.path(paste0(pro,'_kegg.gsea.csv')))
save(kk,file = file.path( 'gsea_kk.Rdata'))
# 假如这里找不到显著下调的通路,可以选择调整阈值,或者其它。
# down_kegg<-kk_gse[kk_gse$pvalue<0.01 & kk_gse$enrichmentScore < -0.6,];down_kegg$group=-1
# up_kegg<-kk_gse[kk_gse$pvalue<0.01 & kk_gse$enrichmentScore > 0.3,];up_kegg$group=1
# down_k <- down_kegg[head(order(down_kegg$pvalue,decreasing = F)),]
# up_k <- up_kegg[head(order(up_kegg$pvalue,decreasing = F)),]
#~~~取前6个上调通路和6个下调通路~~~
up_k <- kk[head(order(kk$enrichmentScore,decreasing = T)),];up_k$group=1
down_k <- kk[tail(order(kk$enrichmentScore,decreasing = T)),];down_k$group=-1
dat=rbind(up_k,down_k)
colnames(dat)
dat$pvalue = -log10(dat$pvalue)
dat$pvalue=dat$pvalue*dat$group
dat=dat[order(dat$pvalue,decreasing = F),]
p7 <- ggplot(dat, aes(x=reorder(Description,order(pvalue, decreasing = F)), y=pvalue, fill=group))
geom_bar(stat="identity")
scale_fill_gradient(low="#34bfb5",high="#ff6633",guide = FALSE)
scale_x_discrete(name ="Pathway names")
scale_y_continuous(name ="log10P-value")
coord_flip()
theme_ggstatsplot()
theme(plot.title = element_text(size = 15,hjust = 0.5),
axis.text = element_text(size = 12,face = 'bold'),
panel.grid = element_blank())
ggtitle("Pathway Enrichment")
p7
ggsave(p7,filename = 'kegg_top_gsea.png' ,width = 8,height = 4)
#ggsave(p7,filename = 'step5_kegg_gsea.pdf',width = 8,height = 4)
# geneSetID对应表格的id,排序后取前6个和后六个
p8up <- enrichplot::gseaplot2( kk, geneSetID = head( rownames(up_k)) )
pdf(file=file.path("step5_kegg_up_gseaplot.pdf"),width = 7,height = 6)
print(p8up)
dev.off()
# ggsave(p8up[[1]],filename = 'step5_kegg_up_gseaplot.png',width = 7,height = 4)#直接p8up 保存是不行的
p8down <- enrichplot::gseaplot2(kk, geneSetID = head( rownames(down_k)))
pdf(file=file.path("step5_kegg_top_down_gseaplot.pdf"),width = 7,height = 6)
print(p8down)
dev.off()
step6-anno-by-ORA.R
同gsea,注意go、kegg时,R包选择org.Mm.eg.db。
代码语言:r复制rm(list = ls())
load( file = 'DEG_deseq2.Rdata' )
load( file = 'DEG_limma_voom.Rdata' )
load( file = 'DEG_edgeR.Rdata' )
colnames(DEG_deseq2)
colnames(DEG_limma_voom)
if(T){
nrDEG=DEG_deseq2[,c("log2FoldChange", "padj")]
#nrDEG=DEG_limma_voom[,c("logFC", "adj.P.Val" )]
head(nrDEG)
colnames(nrDEG)=c('logFC','P.Value')
# 凡是阈值,都是可以自定义
logFC_t <- 0.58
pvalue_t <- 0.1
gene_up= rownames( nrDEG[with(nrDEG,
logFC > logFC_t & P.Value < pvalue_t),])
gene_down=rownames( nrDEG[with(nrDEG,
logFC < -logFC_t & P.Value < pvalue_t),])
}
# 特殊情况下,如果阈值不管用,就需要人为的指定top100上下调基因
if(F){
nrDEG=DEG_deseq2[,c("log2FoldChange", "padj")]
nrDEG=DEG_limma_voom[,c("logFC", "adj.P.Val" )]
head(nrDEG)
colnames(nrDEG)=c('logFC','P.Value')
x=nrDEG$logFC #deg取logFC这列并将其重新赋值给x
names(x)=rownames(nrDEG) #deg取probe_id这列,并将其作为名字给x
cg=c(names(head(sort(x),100)),#对x进行从小到大排列,取前100及后100,并取其对应的探针名,作为向量赋值给cg
names(tail(sort(x),100)))
gene_up = names(head(sort(x),200))
gene_down = names(tail(sort(x),200))
}
length(gene_up);length(gene_down)
head(gene_up);head(gene_down)
write.table(gene_up,file = 'gene_up.txt',quote = F,col.names = F,row.names = F)
write.table(gene_down,file = 'gene_down.txt',quote = F,col.names = F,row.names = F)
## 2.3 pathway----
library(org.Mm.eg.db)
library(clusterProfiler)
# 一本书,很多数据库,很多可视化
library(ggplot2)
library(stringr)
gene_up=as.character(na.omit(AnnotationDbi::select(org.Mm.eg.db,keys = gene_up,columns = 'ENTREZID',keytype = 'SYMBOL')[,2]))
gene_down=as.character(na.omit(AnnotationDbi::select(org.Mm.eg.db,keys = gene_down,columns = 'ENTREZID',keytype = 'SYMBOL')[,2]))
length(gene_up);length(gene_down)
head(gene_up);head(gene_down)
## 2.3.1 KEGG----
{
kk.up <- enrichKEGG(gene = gene_up,
organism = 'mmu',
#universe = gene_all,
pvalueCutoff = 0.9,
qvalueCutoff =0.9)
head(kk.up)[,1:6]
dotplot(kk.up)
kk.up=DOSE::setReadable(kk.up, OrgDb='org.Mm.eg.db',
keyType='ENTREZID')#按需替换
tmp = kk.up@result
kk.down <- enrichKEGG(gene = gene_down,
organism = 'mmu',
#universe = gene_all,
pvalueCutoff = 0.999,
qvalueCutoff =0.999)
head(kk.down)[,1:6]
dotplot(kk.down)
kk.down=DOSE::setReadable(kk.down, OrgDb='org.Mm.eg.db',
keyType='ENTREZID')#按需替换
gene_diff=c(gene_down,gene_up)
kk.diff <- enrichKEGG(gene = gene_diff,
organism = 'mmu',
#universe = gene_all,
pvalueCutoff = 0.9,
qvalueCutoff =0.9)
head(kk.diff)[,1:6]
dotplot(kk.diff)
kk.diff=DOSE::setReadable(kk.diff, OrgDb='org.Mm.eg.db',
keyType='ENTREZID')#按需替换
kegg_diff_dt <- as.data.frame(kk.diff)
kegg_down_dt <- as.data.frame(kk.down)
kegg_up_dt <- as.data.frame(kk.up)
down_kegg<-kegg_down_dt[kegg_down_dt$pvalue<0.01,];down_kegg$group=-1
up_kegg<-kegg_up_dt[kegg_up_dt$pvalue<0.01,];up_kegg$group=1
kegg_plot <- function(up_kegg,down_kegg){
dat=rbind(up_kegg,down_kegg)
colnames(dat)
dat$pvalue = -log10(dat$pvalue)
dat$pvalue=dat$pvalue*dat$group
dat=dat[order(dat$pvalue,decreasing = F),]
dat$Description=gsub(' - Mus musculus \(house mouse\)',
'', dat$Description )
g_kegg <- ggplot(dat, aes(x=reorder(Description,order(pvalue, decreasing = F)), y=pvalue, fill=group))
geom_bar(stat="identity")
scale_fill_gradient(low="#34bfb5",high="#ff6633",guide = 'none')
scale_x_discrete(name ="Pathway names")
scale_y_continuous(name ="log10P-value")
ggtitle("KEGG Enrichment")
coord_flip() theme_bw()
theme(plot.title = element_text(size = 15,face = 'bold',hjust = 0.5),
axis.text = element_text(size = 12,face = 'bold'),
panel.grid = element_blank())
}
g_kegg=kegg_plot(up_kegg,down_kegg)
print(g_kegg)
scale_y_discrete(labels=function(x) str_wrap(x, width=20))
ggsave('kegg_up_down.pdf',height = 12,width = 10)
}
## 2.3.2 GO----
## up
go <- enrichGO(gene_up, OrgDb = "org.Mm.eg.db", ont="all",
pvalueCutoff = 0.999,
qvalueCutoff =0.999)
go_up=DOSE::setReadable(go, OrgDb='org.Mm.eg.db',keyType='ENTREZID')
## down
go <- enrichGO(gene_down, OrgDb = "org.Mm.eg.db", ont="all",
pvalueCutoff = 0.999,
qvalueCutoff =0.999)
go_down=DOSE::setReadable(go, OrgDb='org.Mm.eg.db',keyType='ENTREZID')
save(kk.up,kk.down,go_up,go_down,file = 'ORA_results.Rdata')
barplot(go_up, split="ONTOLOGY",font.size =10)
facet_grid(ONTOLOGY~., scale="free")
scale_y_discrete(labels=function(x) str_wrap(x, width=50))
ggsave('go_up.pdf',height = 12,width = 10)
## down
barplot(go_down, split="ONTOLOGY",font.size =10)
facet_grid(ONTOLOGY~., scale="free")
scale_y_discrete(labels=function(x) str_wrap(x, width=50))
ggsave('go_down.pdf',height = 12,width = 10)