转录组GSE105789_小鼠数据下游分析注意事项

2024-08-21 15:57:59 浏览数 (2)

转录组GSE105789_小鼠数据下游分析注意事项

简单记录下GSE105789小鼠数据的下游分析的主要事项,与human的数据分析的主要区别是在进行id转换、kegg、go、gsea时,需要注意数据库和物种信息,应该选择小鼠。

数据是简单的2分组,count矩阵并没有下自网站,而是来自我做的上游分析。参考文章转录组上游分析—使用iseq下载原始数据、小鼠基因组、单端测序数据处理。

step1 load-from-pipeline

注意行名从ENSEMBL转化为SYMBOL时的数据库为org.Mm.eg.db

代码语言:r复制
rm(list = ls())#清空当前的工作环境
options(stringsAsFactors = F)#不以因子变量读取
options(scipen = 20)#不以科学计数法显示
library(data.table)
library(tinyarray)


tbl <- as.matrix(data.table::fread("raw_counts.txt", header=T, colClasses="integer"), rownames=1)
dim(tbl)
mat=as.data.frame(tbl) 
mat[1:4,1:4]
keep_feature <- rowSums (mat > 1) > 1 ;table(keep_feature)
ensembl_matrix <- mat[keep_feature, ]  
rownames(ensembl_matrix)=rownames(mat)[keep_feature]
ensembl_matrix[1:4,1:4]

#行名转换
library(AnnoProbe)
library(org.Mm.eg.db)
k<-AnnotationDbi::keys(org.Mm.eg.db,keytype = "ENSEMBL")
e2s<-AnnotationDbi::select(org.Mm.eg.db,
                           keys= rownames(ensembl_matrix),
                           columns="SYMBOL",
                           keytype = "ENSEMBL")
head(e2s)
ids = na.omit(e2s)
ids=ids[!duplicated(ids$SYMBOL),]
ids=ids[!duplicated(ids$ENSEMBL),]
head(ids)
symbol_matrix= ensembl_matrix[match(ids$ENSEMBL,rownames(ensembl_matrix)),]
rownames(symbol_matrix) = ids$SYMBOL
symbol_matrix[1:4,1:4] 
colnames(symbol_matrix)

library(AnnoProbe)
head(rownames(symbol_matrix))
#注意更改为小鼠信息
ids=annoGene(rownames(symbol_matrix),'SYMBOL','mouse')
head(ids)
tail(sort(table(ids$biotypes)))
ids=ids[ids$biotypes=='protein_coding',]
symbol_matrix=symbol_matrix[rownames(symbol_matrix) %in% ids$SYMBOL,]

#分组信息
colnames(symbol_matrix) <- c("F4/80neg_rep1","F4/80neg_rep2",
                             "CD301b_mphage_rep1","CD301b_mphage_rep2")
colnames(symbol_matrix) 
group_list=ifelse(grepl('F4/80neg',  colnames(symbol_matrix) ),
                  'control','case' )

group_list
group_list = factor(group_list,levels = c('control','case' )) 
save(symbol_matrix,group_list,file = 'symbol_matrix.Rdata')

dat=log(edgeR::cpm(symbol_matrix) 1)
save(dat,group_list,file = 'step1-output.Rdata')

step2 deg-comp

质控、差异分析等与人类数据分析并无不同,这里直接略过。

代码语言:r复制
rm(list = ls())  ## 魔幻操作,一键清空~
options(stringsAsFactors = F)

library(AnnoProbe)
library(GEOquery) 
library(ggplot2)
library(ggstatsplot)
library(patchwork)
library(reshape2)
library(stringr)
getOption('timeout')
options(timeout=10000) 
load('./symbol_matrix.Rdata') 
symbol_matrix[1:4,1:4]

#设置工作目录
gse_number <- 'deg'
gse_number
dir.create(gse_number)
setwd( gse_number )
getwd()
save(symbol_matrix,group_list,file = 'symbol_matrix.Rdata')

#质控和差异分析和人类并无不同
source('../scripts/step2-qc-counts.R') 
source('../scripts/step3-deg-deseq2.R')
source('../scripts/step3-deg-edgeR.R')
source('../scripts/step3-deg-limma-voom.R')  
source('../scripts/step4-qc-for-deg.R') # 需要调整logFC和p值的阈值  

#重点介绍下
source('../scripts/step5-anno-by-GSEA.R')
source('../scripts/step5-anno-by-ORA.R')  # 需要调整logFC和p值的阈值

setwd('../')
getwd()

重点介绍下step5-anno-by-GSEA.R、step5-anno-by-ORA.R

step5-anno-by-GSEA.R

1.注意SYMBOL转ENTREZID时候,R包选择org.Mm.eg.db;

2.注意gseKEGG()函数中,物种选择mmu;

3.注意setReadable()函数中,OrgDb='org.Mm.eg.db';

代码语言:r复制
rm(list = ls()) 
load( file =  'DEG_deseq2.Rdata' )
load( file =  'DEG_limma_voom.Rdata' )
load( file =  'DEG_edgeR.Rdata' ) 
colnames(DEG_deseq2)
colnames(DEG_limma_voom)
# FoxO signaling pathway
nrDEG=DEG_deseq2[,c("log2FoldChange",   "padj")]
nrDEG=DEG_limma_voom[,c("logFC", "adj.P.Val"  )]
head(nrDEG) 
colnames(nrDEG)=c('logFC','P.Value')

library(org.Mm.eg.db)
library(clusterProfiler)
gene <- bitr(rownames(nrDEG), fromType = "SYMBOL",
             toType =  "ENTREZID",
             OrgDb = org.Mm.eg.db)
head(gene)
head(nrDEG)
nrDEG = nrDEG[rownames(nrDEG) %in% gene$SYMBOL,]
nrDEG$ENTREZID = gene$ENTREZID[match(rownames(nrDEG) , gene$SYMBOL)]
  # https://www.ncbi.nlm.nih.gov/gene/?term=SPARCL1

geneList=nrDEG$logFC
names(geneList)=nrDEG$ENTREZID
geneList=sort(geneList,decreasing = T)
head(geneList)

library(clusterProfiler)
# 一本书,很多数据库,很多可视化
#~~~这里需要替换物种~~~
#mmu
#mmu
str(geneList)
kk_gse <- gseKEGG(geneList     = geneList,
                  organism     = 'mmu',#按需替换
                  nPerm        = 1000,
                  minGSSize    = 10,
                  pvalueCutoff = 0.99,
                  verbose      = FALSE)
tmp=kk_gse@result
kk=DOSE::setReadable(kk_gse, OrgDb='org.Mm.eg.db',
                     keyType='ENTREZID')#按需替换
kk@result$Description=gsub(' - Mus musculus \(house mouse\)',
                           '',kk@result$Description )
head(kk@result$Description)
tmp=kk@result 
pro='test'
write.csv(kk@result,file = file.path(paste0(pro,'_kegg.gsea.csv')))
save(kk,file = file.path( 'gsea_kk.Rdata'))

# 假如这里找不到显著下调的通路,可以选择调整阈值,或者其它。
# down_kegg<-kk_gse[kk_gse$pvalue<0.01 & kk_gse$enrichmentScore < -0.6,];down_kegg$group=-1
# up_kegg<-kk_gse[kk_gse$pvalue<0.01 & kk_gse$enrichmentScore > 0.3,];up_kegg$group=1
# down_k <- down_kegg[head(order(down_kegg$pvalue,decreasing = F)),]
# up_k <- up_kegg[head(order(up_kegg$pvalue,decreasing = F)),]

#~~~取前6个上调通路和6个下调通路~~~
up_k <- kk[head(order(kk$enrichmentScore,decreasing = T)),];up_k$group=1
down_k <- kk[tail(order(kk$enrichmentScore,decreasing = T)),];down_k$group=-1

dat=rbind(up_k,down_k)
colnames(dat)
dat$pvalue = -log10(dat$pvalue)
dat$pvalue=dat$pvalue*dat$group 
dat=dat[order(dat$pvalue,decreasing = F),]

p7 <- ggplot(dat, aes(x=reorder(Description,order(pvalue, decreasing = F)), y=pvalue, fill=group))   
  geom_bar(stat="identity")   
  scale_fill_gradient(low="#34bfb5",high="#ff6633",guide = FALSE)   
  scale_x_discrete(name ="Pathway names")  
  scale_y_continuous(name ="log10P-value")  
  coord_flip()   
  theme_ggstatsplot() 
  theme(plot.title = element_text(size = 15,hjust = 0.5),  
        axis.text = element_text(size = 12,face = 'bold'),
        panel.grid = element_blank()) 
  ggtitle("Pathway Enrichment") 
p7
ggsave(p7,filename = 'kegg_top_gsea.png' ,width = 8,height = 4)

#ggsave(p7,filename = 'step5_kegg_gsea.pdf',width = 8,height = 4)

# geneSetID对应表格的id,排序后取前6个和后六个
p8up <- enrichplot::gseaplot2( kk, geneSetID = head( rownames(up_k)) )  
pdf(file=file.path("step5_kegg_up_gseaplot.pdf"),width = 7,height = 6)
print(p8up)
dev.off()
# ggsave(p8up[[1]],filename = 'step5_kegg_up_gseaplot.png',width = 7,height = 4)#直接p8up 保存是不行的

p8down <-   enrichplot::gseaplot2(kk, geneSetID = head( rownames(down_k)))
pdf(file=file.path("step5_kegg_top_down_gseaplot.pdf"),width = 7,height = 6)
print(p8down)
dev.off()

step6-anno-by-ORA.R

同gsea,注意go、kegg时,R包选择org.Mm.eg.db

代码语言:r复制
rm(list = ls()) 
load( file =  'DEG_deseq2.Rdata' )
load( file =  'DEG_limma_voom.Rdata' )
load( file =  'DEG_edgeR.Rdata' ) 
colnames(DEG_deseq2)
colnames(DEG_limma_voom)

if(T){
  nrDEG=DEG_deseq2[,c("log2FoldChange",   "padj")]
  #nrDEG=DEG_limma_voom[,c("logFC", "adj.P.Val"  )]
  head(nrDEG) 
  colnames(nrDEG)=c('logFC','P.Value')
  
  # 凡是阈值,都是可以自定义 
  logFC_t <- 0.58
  pvalue_t <- 0.1
  gene_up= rownames( nrDEG[with(nrDEG,
                                logFC > logFC_t & P.Value < pvalue_t),])
  gene_down=rownames( nrDEG[with(nrDEG,
                                 logFC < -logFC_t & P.Value < pvalue_t),]) 
}
# 特殊情况下,如果阈值不管用,就需要人为的指定top100上下调基因
if(F){
  nrDEG=DEG_deseq2[,c("log2FoldChange",   "padj")]
  nrDEG=DEG_limma_voom[,c("logFC", "adj.P.Val"  )]
  head(nrDEG) 
  colnames(nrDEG)=c('logFC','P.Value')
  x=nrDEG$logFC #deg取logFC这列并将其重新赋值给x
  names(x)=rownames(nrDEG) #deg取probe_id这列,并将其作为名字给x
  cg=c(names(head(sort(x),100)),#对x进行从小到大排列,取前100及后100,并取其对应的探针名,作为向量赋值给cg
       names(tail(sort(x),100)))
  gene_up = names(head(sort(x),200))
  gene_down = names(tail(sort(x),200))
}
length(gene_up);length(gene_down)
head(gene_up);head(gene_down)
write.table(gene_up,file = 'gene_up.txt',quote = F,col.names = F,row.names = F)
write.table(gene_down,file = 'gene_down.txt',quote = F,col.names = F,row.names = F)


## 2.3 pathway----
library(org.Mm.eg.db)
library(clusterProfiler)
# 一本书,很多数据库,很多可视化
library(ggplot2)
library(stringr)

gene_up=as.character(na.omit(AnnotationDbi::select(org.Mm.eg.db,keys = gene_up,columns = 'ENTREZID',keytype = 'SYMBOL')[,2]))
gene_down=as.character(na.omit(AnnotationDbi::select(org.Mm.eg.db,keys = gene_down,columns = 'ENTREZID',keytype = 'SYMBOL')[,2]))
length(gene_up);length(gene_down)
head(gene_up);head(gene_down)

## 2.3.1 KEGG----
{
  
  kk.up <- enrichKEGG(gene         = gene_up,
                      organism     = 'mmu',
                      #universe     = gene_all,
                      pvalueCutoff = 0.9,
                      qvalueCutoff =0.9)
  head(kk.up)[,1:6]
  dotplot(kk.up)
  
  kk.up=DOSE::setReadable(kk.up, OrgDb='org.Mm.eg.db',
                          keyType='ENTREZID')#按需替换
  tmp = kk.up@result
  
  kk.down <- enrichKEGG(gene      = gene_down,
                        organism     = 'mmu',
                        #universe     = gene_all,
                        pvalueCutoff = 0.999,
                        qvalueCutoff =0.999)
  head(kk.down)[,1:6]
  dotplot(kk.down)
  
  kk.down=DOSE::setReadable(kk.down, OrgDb='org.Mm.eg.db',
                            keyType='ENTREZID')#按需替换
 
  gene_diff=c(gene_down,gene_up)
  kk.diff <- enrichKEGG(gene      = gene_diff,
                        organism     = 'mmu',
                        #universe     = gene_all,
                        pvalueCutoff = 0.9,
                        qvalueCutoff =0.9)
  head(kk.diff)[,1:6]
  dotplot(kk.diff)


  kk.diff=DOSE::setReadable(kk.diff, OrgDb='org.Mm.eg.db',
                            keyType='ENTREZID')#按需替换

  kegg_diff_dt <- as.data.frame(kk.diff)
  
  kegg_down_dt <- as.data.frame(kk.down)
  kegg_up_dt <- as.data.frame(kk.up)
  down_kegg<-kegg_down_dt[kegg_down_dt$pvalue<0.01,];down_kegg$group=-1
  up_kegg<-kegg_up_dt[kegg_up_dt$pvalue<0.01,];up_kegg$group=1
 
  
  kegg_plot <- function(up_kegg,down_kegg){
    dat=rbind(up_kegg,down_kegg)
    colnames(dat)
    dat$pvalue = -log10(dat$pvalue)
    dat$pvalue=dat$pvalue*dat$group 
    
    dat=dat[order(dat$pvalue,decreasing = F),]
    dat$Description=gsub(' - Mus musculus \(house mouse\)',
                               '', dat$Description ) 
    g_kegg <- ggplot(dat, aes(x=reorder(Description,order(pvalue, decreasing = F)), y=pvalue, fill=group))   
      geom_bar(stat="identity")   
      scale_fill_gradient(low="#34bfb5",high="#ff6633",guide = 'none')   
      scale_x_discrete(name ="Pathway names")  
      scale_y_continuous(name ="log10P-value")  
      ggtitle("KEGG Enrichment")  
      coord_flip()   theme_bw() 
      theme(plot.title = element_text(size = 15,face = 'bold',hjust = 0.5),  
            axis.text = element_text(size = 12,face = 'bold'),
            panel.grid = element_blank())
    
  }
  g_kegg=kegg_plot(up_kegg,down_kegg)
  print(g_kegg)  
    scale_y_discrete(labels=function(x) str_wrap(x, width=20)) 
  ggsave('kegg_up_down.pdf',height = 12,width = 10)
}


## 2.3.2 GO----
## up
go <- enrichGO(gene_up, OrgDb = "org.Mm.eg.db", ont="all",  
               pvalueCutoff = 0.999,
               qvalueCutoff =0.999)  
go_up=DOSE::setReadable(go, OrgDb='org.Mm.eg.db',keyType='ENTREZID') 
## down
go <- enrichGO(gene_down, OrgDb = "org.Mm.eg.db", ont="all",
               pvalueCutoff = 0.999,
               qvalueCutoff =0.999)  
go_down=DOSE::setReadable(go, OrgDb='org.Mm.eg.db',keyType='ENTREZID')
save(kk.up,kk.down,go_up,go_down,file = 'ORA_results.Rdata')


barplot(go_up, split="ONTOLOGY",font.size =10)  
  facet_grid(ONTOLOGY~., scale="free")   
  scale_y_discrete(labels=function(x) str_wrap(x, width=50)) 
ggsave('go_up.pdf',height = 12,width = 10) 

## down 
barplot(go_down, split="ONTOLOGY",font.size =10)  
  facet_grid(ONTOLOGY~., scale="free")   
  scale_y_discrete(labels=function(x) str_wrap(x, width=50)) 
ggsave('go_down.pdf',height = 12,width = 10) 

0 人点赞