GEO数据挖掘5
sunqi
2020/7/13
GEO数据挖掘5
概述
GO和KEGG富集分析
KEGG全称 Kyoto Encyclopedia of Genes and Genomes,由日本京都大学生物信息学中心的Kanahisa 实验室于1995年建立根据基因组中的信息,原理是用计算机计算或者预测出比较复杂的细胞中的通路或者生物的复杂行为。数据库能够把基因及表达信息作为一个整体的网络进行研究,通俗点讲就是通过基因寻找通路
GO全称为gene ontology,由基因本体联合会(Gene Ontology Consortium)建立的数据库,数据库对基因和蛋白功能进行限定和描述
GEO数据挖掘离不来富集分析,单纯的差异表达基因不能说明什么问题,只有对基因根据现有知识做定义定位分类,这样才能在生物学上解释这个差异,也就是故事才能讲顺了
注释:GO和KEGG的具体作用不再赘述,等代码实现完成之后后续再学习理论知识
另外,KEGG和GO分析可以通过软件实现,具体参考官网
数据预处理
用到的数据集为差异分析后得到的数据集deg,详情见上章
代码语言:javascript复制rm(list = ls())
load(file = 'deg.Rdata')
## 设置差异阈值
logFC_t=1.5
deg$g=ifelse(deg$P.Value>0.05,'stable',
ifelse( deg$logFC > logFC_t,'UP',
ifelse( deg$logFC < -logFC_t,'DOWN','stable') )
)
# 可以看到有167个下调和196个上调基因
table(deg$g)
代码语言:javascript复制##
## DOWN stable UP
## 167 18458 196
代码语言:javascript复制# 建立基因列
deg$symbol=rownames(deg)
library(ggplot2)
# 如果需要通过下述方式下载包
# BiocManager::install("clusterProfiler")
# BiocManager::install("enrichplot")
# install.packages("tibble")
library(clusterProfiler)# 富集包
library(org.Hs.eg.db)# 基因注释包
# bitr将基因symbol转换为ENTREZID,方便后续比较
df <- bitr(unique(deg$symbol), fromType = "SYMBOL",
toType = c( "ENTREZID"),
OrgDb = org.Hs.eg.db)
head(df)
代码语言:javascript复制## SYMBOL ENTREZID
## 1 CD36 948
## 2 DUSP6 1848
## 3 DCT 1638
## 4 SPRY2 10253
## 5 MOXD1 26002
## 6 ETV4 2118
代码语言:javascript复制DEG=deg
head(DEG)
代码语言:javascript复制## logFC AveExpr t P.Value adj.P.Val B g
## CD36 5.780170 7.370282 79.74600 1.231803e-16 2.318376e-12 26.74346 UP
## DUSP6 -4.212683 9.106625 -62.45995 1.832302e-15 1.347565e-11 24.99635 DOWN
## DCT 5.633027 8.763220 61.56738 2.147971e-15 1.347565e-11 24.88294 UP
## SPRY2 -3.801663 9.726468 -53.97054 9.191088e-15 4.324637e-11 23.79519 DOWN
## MOXD1 3.263063 10.171635 47.09629 4.129552e-14 1.325237e-10 22.58201 UP
## ETV4 -3.843247 9.667077 -46.99899 4.224762e-14 1.325237e-10 22.56297 DOWN
## symbol
## CD36 CD36
## DUSP6 DUSP6
## DCT DCT
## SPRY2 SPRY2
## MOXD1 MOXD1
## ETV4 ETV4
代码语言:javascript复制# 通过基因名将转换后的enterz id 加入到差异比较结果中
DEG=merge(DEG,df,by.y='SYMBOL',by.x='symbol')
# 保存
# save(DEG,file = 'anno_DEG.Rdata')
# 提取上调和下调基因
gene_up= DEG[DEG$g == 'UP','ENTREZID']
gene_down=DEG[DEG$g == 'DOWN','ENTREZID']
# 合并为差异数据gene_diff
gene_diff=c(gene_up,gene_down)
# 确保所有entrez id为字符串
gene_all=as.character(DEG[ ,'ENTREZID'] )
# 获得差异比较的数值
geneList=DEG$logFC
# 添加entrez id
names(geneList)=DEG$ENTREZID
# 按照差异大小,从大到小排列
geneList=sort(geneList,decreasing = T)
KEGG 分析
kegg普通富集分析 Over-Representation Analysis
代码语言:javascript复制# 上调富集分析,参数可以自己指定,特别是p值
kk.up <- enrichKEGG(gene = gene_up,
organism = 'hsa',
universe = gene_all,
pvalueCutoff = 0.9,
qvalueCutoff =0.9)
head(kk.up)[,1:6]
代码语言:javascript复制## ID Description GeneRatio
## hsa00140 hsa00140 Steroid hormone biosynthesis 3/82
## hsa04512 hsa04512 ECM-receptor interaction 4/82
## hsa00982 hsa00982 Drug metabolism - cytochrome P450 3/82
## hsa04620 hsa04620 Toll-like receptor signaling pathway 4/82
## hsa00980 hsa00980 Metabolism of xenobiotics by cytochrome P450 3/82
## hsa04390 hsa04390 Hippo signaling pathway 5/82
## BgRatio pvalue p.adjust
## hsa00140 45/7299 0.01381704 0.6462342
## hsa04512 85/7299 0.01505774 0.6462342
## hsa00982 51/7299 0.01932907 0.6462342
## hsa04620 92/7299 0.01959465 0.6462342
## hsa00980 55/7299 0.02358691 0.6462342
## hsa04390 151/7299 0.02707377 0.6462342
代码语言:javascript复制# 绘制富集分析的图
dotplot(kk.up )
代码语言:javascript复制# ggsave('kk.up.dotplot.png')
# 下调富集分析,参数需要自己指定
kk.down <- enrichKEGG(gene = gene_down,
organism = 'hsa',
universe = gene_all,
pvalueCutoff = 0.9,
qvalueCutoff =0.9)
head(kk.down)[,1:6]
代码语言:javascript复制## ID Description GeneRatio BgRatio pvalue
## hsa04110 hsa04110 Cell cycle 17/83 122/7299 1.615131e-14
## hsa03030 hsa03030 DNA replication 8/83 36/7299 4.636777e-09
## hsa05322 hsa05322 Systemic lupus erythematosus 9/83 107/7299 2.970025e-06
## hsa03440 hsa03440 Homologous recombination 6/83 38/7299 3.718118e-06
## hsa03460 hsa03460 Fanconi anemia pathway 6/83 48/7299 1.509512e-05
## hsa05206 hsa05206 MicroRNAs in cancer 12/83 252/7299 2.414870e-05
## p.adjust
## hsa04110 2.261183e-12
## hsa03030 3.245744e-07
## hsa05322 1.301341e-04
## hsa03440 1.301341e-04
## hsa03460 4.226633e-04
## hsa05206 5.634697e-04
代码语言:javascript复制# 绘制图
dotplot(kk.down )
代码语言:javascript复制# ggsave('kk.down.dotplot.png')
## 上调和下调的富集
kk.diff <- enrichKEGG(gene = gene_diff,
organism = 'hsa',
pvalueCutoff = 0.05)
head(kk.diff)[,1:6]
代码语言:javascript复制## ID Description GeneRatio BgRatio
## hsa04110 hsa04110 Cell cycle 17/165 124/8034
## hsa03030 hsa03030 DNA replication 8/165 36/8034
## hsa03440 hsa03440 Homologous recombination 6/165 41/8034
## hsa05202 hsa05202 Transcriptional misregulation in cancer 12/165 192/8034
## hsa03460 hsa03460 Fanconi anemia pathway 6/165 54/8034
## pvalue p.adjust
## hsa04110 4.660099e-10 1.011241e-07
## hsa03030 4.954487e-07 5.375619e-05
## hsa03440 1.698195e-04 1.228361e-02
## hsa05202 5.701353e-04 3.092984e-02
## hsa03460 7.827963e-04 3.397336e-02
代码语言:javascript复制# 绘制图片
dotplot(kk.diff )
代码语言:javascript复制# ggsave('kk.diff.dotplot.png')
browseKEGG(kk.up, 'hsa04640')
# 转换为数据框
kegg_diff_dt <- as.data.frame(kk.diff)
kegg_down_dt <- as.data.frame(kk.down)
kegg_up_dt <- as.data.frame(kk.up)
# 绘制条图
down_kegg<-kegg_down_dt[kegg_down_dt$pvalue<0.05,]
down_kegg$group=-1
up_kegg<-kegg_up_dt[kegg_up_dt$pvalue<0.05,]
up_kegg$group=1
# 绘图函数
kegg_plot <- function(up_kegg,down_kegg){
dat=rbind(up_kegg,down_kegg)
colnames(dat)
dat$pvalue = -log10(dat$pvalue)
dat$pvalue=dat$pvalue*dat$group
dat=dat[order(dat$pvalue,decreasing = F),]
g_kegg<- ggplot(dat, aes(x=reorder(Description,order(pvalue, decreasing = F)), y=pvalue, fill=group))
geom_bar(stat="identity")
scale_fill_gradient(low="blue",high="red",guide = FALSE)
scale_x_discrete(name ="Pathway names")
scale_y_continuous(name ="log10P-value")
coord_flip() theme_bw() theme(plot.title = element_text(hjust = 0.5))
ggtitle("Pathway Enrichment")
}
g_kegg=kegg_plot(up_kegg,down_kegg)
print(g_kegg)
代码语言:javascript复制# ggsave(g_kegg,filename = 'kegg_up_down.png')
kegg GSEA分析
代码语言:javascript复制kk_gse <- gseKEGG(geneList = geneList,
organism = 'hsa',
nPerm = 1000,
minGSSize = 120,
pvalueCutoff = 0.9,
verbose = FALSE)
gseaplot(kk_gse, geneSetID = rownames(kk_gse[1,]))
代码语言:javascript复制# 结果解释,备查https://blog.csdn.net/weixin_43569478/article/details/83745105
# 结果解读需要重新学习
down_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore < 0,];down_kegg$group=-1
up_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore > 0,];up_kegg$group=1
g_kegg=kegg_plot(up_kegg,down_kegg)
print(g_kegg)
代码语言:javascript复制# ggsave(g_kegg,filename = 'kegg_up_down_gsea.png')
GO database analysis
电脑配置不行,计算较慢,以后想办法连接服务器算
代码语言:javascript复制g_list=list(gene_up=gene_up,
gene_down=gene_down,
gene_diff=gene_diff)
# 通过lapply函数,同时实现三组基因的GO分析
go_enrich_results <- lapply( g_list , function(gene) {
lapply( c('BP','MF','CC') , function(ont) {
cat(paste('Now process ',ont ))
ego <- enrichGO(gene = gene,
universe = gene_all,
OrgDb = org.Hs.eg.db,
ont = ont ,
pAdjustMethod = "BH",
pvalueCutoff = 0.99,
qvalueCutoff = 0.99,
readable = TRUE)
return(ego)
})
})
代码语言:javascript复制## Now process BPNow process MFNow process CCNow process BPNow process MFNow process CCNow process BPNow process MFNow process CC
代码语言:javascript复制# 保存结果,下次不算了
save(go_enrich_results,file = 'go_enrich_results.Rdata')
# 导入go分析的数据集
load(file = 'go_enrich_results.Rdata')
n1= c('gene_up','gene_down','gene_diff')
n2= c('BP','MF','CC')
# 通过循环绘制气泡图
for (i in 1:3){
for (j in 1:3){
fn=paste0('dotplot_',n1[i],'_',n2[j],'.png')
cat(paste0(fn,'n'))
png(fn,res=150,width = 1080)
print( dotplot(go_enrich_results[[i]][[j]] ))
dev.off()
}
}
代码语言:javascript复制## dotplot_gene_up_BP.png
代码语言:javascript复制## dotplot_gene_up_MF.png
代码语言:javascript复制## dotplot_gene_up_CC.png
代码语言:javascript复制## dotplot_gene_down_BP.png
代码语言:javascript复制## dotplot_gene_down_MF.png
代码语言:javascript复制## dotplot_gene_down_CC.png
代码语言:javascript复制## dotplot_gene_diff_BP.png
代码语言:javascript复制## dotplot_gene_diff_MF.png
代码语言:javascript复制## dotplot_gene_diff_CC.png
代码语言:javascript复制# 结果解读(略)
结束语
从这里开始,有点力不从心的感觉,生物学背景少,结果解读有点困难,不过这样整一遍,后面针对性的再学,效率也高,另外电脑需要个cpu好点的电脑,不然等的太累。
love&peace