GEO数据挖掘7
sunqi
2020/7/13
概述
GSVA分析,gene Set Variation Analysis,被称为基因集变异分析,是一种非参数的无监督分析方法,用来评估芯片核转录组的基因集富集结果。 思路
GSVA将表达矩阵转换成通路富集分数(ES)矩阵 ,再借用limma包的 lmFit 分析得到差异通路。
进行GSVA分析
代码语言:javascript复制rm(list = ls())
library(ggplot2)
library(clusterProfiler)
代码语言:javascript复制##
代码语言:javascript复制## clusterProfiler v3.16.0 For help: https://guangchuangyu.github.io/software/clusterProfiler
##
## If you use clusterProfiler in published research, please cite:
## Guangchuang Yu, Li-Gen Wang, Yanyan Han, Qing-Yu He. clusterProfiler: an R package for comparing biological themes among gene clusters. OMICS: A Journal of Integrative Biology. 2012, 16(5):284-287.
代码语言:javascript复制##
## Attaching package: 'clusterProfiler'
代码语言:javascript复制## The following object is masked from 'package:stats':
##
## filter
代码语言:javascript复制library(org.Hs.eg.db)
代码语言:javascript复制## Loading required package: AnnotationDbi
代码语言:javascript复制## Loading required package: stats4
代码语言:javascript复制## Loading required package: BiocGenerics
代码语言:javascript复制## Loading required package: parallel
代码语言:javascript复制##
## Attaching package: 'BiocGenerics'
代码语言:javascript复制## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
代码语言:javascript复制## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
代码语言:javascript复制## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which, which.max, which.min
代码语言:javascript复制## Loading required package: Biobase
代码语言:javascript复制## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
代码语言:javascript复制## Loading required package: IRanges
代码语言:javascript复制## Loading required package: S4Vectors
代码语言:javascript复制##
## Attaching package: 'S4Vectors'
代码语言:javascript复制## The following object is masked from 'package:clusterProfiler':
##
## rename
代码语言:javascript复制## The following object is masked from 'package:base':
##
## expand.grid
代码语言:javascript复制##
## Attaching package: 'IRanges'
代码语言:javascript复制## The following object is masked from 'package:clusterProfiler':
##
## slice
代码语言:javascript复制## The following object is masked from 'package:grDevices':
##
## windows
代码语言:javascript复制##
## Attaching package: 'AnnotationDbi'
代码语言:javascript复制## The following object is masked from 'package:clusterProfiler':
##
## select
代码语言:javascript复制##
代码语言:javascript复制# 主要输入文件为表达量的矩阵和基因集的文件
### 对 MigDB中的全部基因集 做GSVA分析。
load(file = 'step1-output.Rdata')
# 表达矩阵
X=dat
# 分组情况
table(group_list)
代码语言:javascript复制## group_list
## Control Vemurafenib
## 3 3
代码语言:javascript复制##导入MigDB数据集名,需要去官网下载
d='MSigDB/symbols/'
gmts=list.files(d,pattern = 'all')
gmts
代码语言:javascript复制## [1] "c1.all.v6.2.symbols.gmt" "c2.all.v6.2.symbols.gmt"
## [3] "c3.all.v6.2.symbols.gmt" "c4.all.v6.2.symbols.gmt"
## [5] "c5.all.v6.2.symbols.gmt" "c6.all.v6.2.symbols.gmt"
## [7] "c7.all.v6.2.symbols.gmt" "h.all.v6.2.symbols.gmt"
代码语言:javascript复制# 安装GSVA包
# BiocManager::install('GSVA')
library(GSEABase)
代码语言:javascript复制## Loading required package: annotate
代码语言:javascript复制## Loading required package: XML
代码语言:javascript复制## Loading required package: graph
代码语言:javascript复制##
## Attaching package: 'graph'
代码语言:javascript复制## The following object is masked from 'package:XML':
##
## addNode
代码语言:javascript复制library(GSVA)
# 通过lappy对MSIGDB的每个系列进行导入,并分析
# 结果导出为es.max
es_max <- lapply(gmts, function(gmtfile){
geneset <- getGmt(file.path(d,gmtfile))
es.max <- gsva(X, geneset,
mx.diff=FALSE, verbose=FALSE,
parallel.sz=1)
return(es.max)
})
代码语言:javascript复制## Warning in .gsva(expr, mapped.gset.idx.list, method, kcdf, rnaseq,
## abs.ranking, : Some gene sets have size one. Consider setting 'min.sz > 1'.
代码语言:javascript复制## Warning in getGmt(file.path(d, gmtfile)): 1 record(s) contain duplicate ids:
## QUINTENS_EMBRYONIC_BRAIN_RESPONSE_TO_IR
代码语言:javascript复制## Warning in .gsva(expr, mapped.gset.idx.list, method, kcdf, rnaseq,
## abs.ranking, : Some gene sets have size one. Consider setting 'min.sz > 1'.
## Warning in .gsva(expr, mapped.gset.idx.list, method, kcdf, rnaseq,
## abs.ranking, : Some gene sets have size one. Consider setting 'min.sz > 1'.
代码语言:javascript复制# 设置阈值
adjPvalueCutoff <- 0.001
logFCcutoff <- log2(2)
# 在ex_max的基础上进行差异分析
es_deg <- lapply(es_max, function(es.max){
# 分组矩阵
design <- model.matrix(~0 factor(group_list))
colnames(design)=levels(factor(group_list))
rownames(design)=colnames(es.max)
# limma计算差异表达基因
library(limma)
# 比较矩阵
contrast.matrix<-makeContrasts(paste0(unique(group_list),collapse = "-"),
levels = design)
# 差异比较的函数
deg = function(es.max,design,contrast.matrix){
##step1
fit <- lmFit(es.max,design)
##step2
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)
##step3
res <- decideTests(fit2, p.value=adjPvalueCutoff)
tempOutput = topTable(fit2, coef=1, n=Inf)
nrDEG = na.omit(tempOutput)
return(nrDEG)
}
re = deg(es.max,design,contrast.matrix)
nrDEG=re
return(nrDEG)
})
代码语言:javascript复制##
## Attaching package: 'limma'
代码语言:javascript复制## The following object is masked from 'package:BiocGenerics':
##
## plotMA
代码语言:javascript复制## 保存结果
save(es_max,es_deg,file='gsva_msigdb.Rdata')
load(file='gsva_msigdb.Rdata')
绘制差异热图
代码语言:javascript复制library(pheatmap)
lapply(1:length(es_deg), function(i){
dat=es_max[[i]]
df=es_deg[[i]]
# 提取有意义的绘图
df=df[df$P.Value<0.01 & abs(df$logFC) > 0.3,]
if(nrow(df)>5){
n=rownames(df)
# 与差异比较取子集
dat=dat[match(n,rownames(dat)),]
ac=data.frame(g=group_list)
rownames(ac)=colnames(dat)
rownames(dat)=substring(rownames(dat),1,50)
# 绘图,并保存为PDF
pheatmap::pheatmap(dat,
fontsize_row = 8,height = 11,
annotation_col = ac,show_colnames = F,
filename = paste0('gsva_',strsplit(gmts[i],'[.]')[[1]][1],'.pdf'))
}
})
除了第一个图,其他的都丑的一批
代码语言:javascript复制##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
代码语言:javascript复制# 保存计算结果
df=do.call(rbind ,es_deg)
es_matrix=do.call(rbind ,es_max)
df=df[df$P.Value<0.01 & abs(df$logFC) > 0.5,]
write.csv(df,file = 'GSVA_DEG.csv')
结束语
至此,GEO数据分析的基础基本介绍完毕,后面计划解读一些geo数据挖掘的文章 love&peace