RNA-seq 231023

2023-10-31 21:28:02 浏览数 (1)


title: "RNAseq231026"

output: html_document

date: "2023-10-31"


参考https://www.zhihu.com/people/gu_chen/posts?page=2

R Markdown

代码语言:text复制
###环境设置
rm(list=ls())
options(stringsAsFactors = F) 
library(tidyverse) # ggplot2 stringer dplyr tidyr readr purrr  tibble forcats
library(data.table) #多核读取文件
setwd("D:/huage1/rnaseq1026")
#### 对counts进行处理筛选得到表达矩阵 ####
a1 <- fread('./counts.txt',
            header = T,data.table = F)#载入counts,第一列设置为列名
colnames(a1)
代码语言:txt复制
##  [1] "Geneid"                                                                      
##  [2] "Chr"                                                                         
##  [3] "Start"                                                                       
##  [4] "End"                                                                         
##  [5] "Strand"                                                                      
##  [6] "Length"                                                                      
##  [7] "/public/home/jiezhang_gibh/hqn1/rnaseq1026/test/align/SRR12207279_sorted.bam"
##  [8] "/public/home/jiezhang_gibh/hqn1/rnaseq1026/test/align/SRR12207280_sorted.bam"
##  [9] "/public/home/jiezhang_gibh/hqn1/rnaseq1026/test/align/SRR12207283_sorted.bam"
## [10] "/public/home/jiezhang_gibh/hqn1/rnaseq1026/test/align/SRR12207284_sorted.bam"
代码语言:text复制
counts <- a1[,7:ncol(a1)] #截取样本基因表达量的counts部分作为counts 
rownames(counts) <- a1$Geneid
colnames(counts) <- gsub('/public/home/jiezhang_gibh/hqn1/rnaseq1026/test/align/','', #删除样品名前缀
                         gsub('_sorted.bam','',  colnames(counts))) #删除样品名后缀


#### 导入或构建样本信息,  进行列样品名的重命名和分组####
b <- read.csv('./SraRunTable.txt')
b
代码语言:txt复制
##           Run Assay.Type AvgSpotLen      Bases  BioProject
## 1 SRR12207279    RNA-Seq        295 4512474269 PRJNA645812
## 2 SRR12207280    RNA-Seq        296 6542603004 PRJNA645812
## 3 SRR12207283    RNA-Seq        300 4108168500 PRJNA645812
## 4 SRR12207284    RNA-Seq        300 4536921000 PRJNA645812
##      BioSample      Bytes cell_type_source Center.Name Consent
## 1 SAMN15517122 1855257229       stem cells         GEO  public
## 2 SAMN15517121 2616595592       stem cells         GEO  public
## 3 SAMN15517118 1777556258       stem cells         GEO  public
## 4 SAMN15517117 1699519747       stem cells         GEO  public
##   DATASTORE.filetype DATASTORE.provider               DATASTORE.region
## 1   sra,run.zq,fastq         s3,gs,ncbi gs.US,s3.us-east-1,ncbi.public
## 2   sra,run.zq,fastq         s3,gs,ncbi s3.us-east-1,ncbi.public,gs.US
## 3   sra,fastq,run.zq         gs,s3,ncbi s3.us-east-1,gs.US,ncbi.public
## 4   sra,fastq,run.zq         ncbi,s3,gs gs.US,ncbi.public,s3.us-east-1
##   Experiment genotype GEO_Accession..exp.  Instrument LibraryLayout
## 1 SRX8718088       WT          GSM4668511 HiSeq X Ten        PAIRED
## 2 SRX8718089       WT          GSM4668512 HiSeq X Ten        PAIRED
## 3 SRX8718092       WT          GSM4668515 HiSeq X Ten        PAIRED
## 4 SRX8718093       WT          GSM4668516 HiSeq X Ten        PAIRED
##   LibrarySelection  LibrarySource     Organism Platform
## 1             cDNA TRANSCRIPTOMIC Mus musculus ILLUMINA
## 2             cDNA TRANSCRIPTOMIC Mus musculus ILLUMINA
## 3             cDNA TRANSCRIPTOMIC Mus musculus ILLUMINA
## 4             cDNA TRANSCRIPTOMIC Mus musculus ILLUMINA
##            ReleaseDate          create_date version Sample.Name
## 1 2021-02-19T00:00:00Z 2020-07-13T09:00:00Z       1  GSM4668511
## 2 2021-02-19T00:00:00Z 2020-07-13T09:01:00Z       1  GSM4668512
## 3 2021-02-19T00:00:00Z 2020-07-13T08:58:00Z       1  GSM4668515
## 4 2021-02-19T00:00:00Z 2020-07-13T09:03:00Z       1  GSM4668516
##   source_name SRA.Study chip_antibody
## 1   RNA_mESCs SRP271532            NA
## 2   RNA_mESCs SRP271532            NA
## 3  RNA_EpiSCs SRP271532            NA
## 4  RNA_EpiSCs SRP271532            NA
代码语言:text复制
name_list <- b$source_name[match(colnames(counts),b$Run)]; name_list  #选择所需要的样品信息列
代码语言:txt复制
## [1] "RNA_mESCs"  "RNA_mESCs"  "RNA_EpiSCs" "RNA_EpiSCs"
代码语言:text复制
class(name_list)
代码语言:txt复制
## [1] "character"
代码语言:text复制
##以上做完
nlgl <- data.frame(row.names=colnames(counts),
                   name_list=c("mESCs_1","mESCs_2","EpiSCs_1","EpiSCs_2"),
                   group_list=c("naive","naive","primed","primed"))
name_list <- nlgl$name_list
group_list <- nlgl$group_list
gl <- data.frame(row.names=colnames(counts), #构建样品名与分组对应的数据框
                 group_list=group_list)
class(nlgl)
代码语言:txt复制
## [1] "data.frame"
代码语言:text复制
geneid_efflen <- subset(a1,select = c("Geneid","Length"))
colnames(geneid_efflen) <- c("geneid","efflen")  
efflen <- geneid_efflen[match(rownames(counts),
                              geneid_efflen$geneid),
                        "efflen"]
class(efflen)
代码语言:txt复制
## [1] "integer"
代码语言:text复制
counts2TPM <- function(count=count, efflength=efflen){
  RPK <- count/(efflength/1000)   #每千碱基reads (Reads Per Kilobase) 长度标准化
  PMSC_rpk <- sum(RPK)/1e6        #RPK的每百万缩放因子 (“per million” scaling factor ) 深度标准化
  RPK/PMSC_rpk              
}  

tpm <- as.data.frame(apply(counts,2,counts2TPM))
colSums(tpm)
代码语言:txt复制
## SRR12207279 SRR12207280 SRR12207283 SRR12207284 
##       1e 06       1e 06       1e 06       1e 06
代码语言:text复制
##新方法 合并
library(tidyverse) # ggplot2 stringer dplyr tidyr readr purrr  tibble forcats
library(data.table) #多核读取文件

g2s <- fread('g2s_vm33_gencode.txt',header = F,data.table = F) #载入从gencode的gtf文件中提取的信息文件
colnames(g2s) <- c("geneid","symbol")

symbol <- g2s[match(rownames(counts),g2s$geneid),"symbol"] #匹配counts行名对应的symbol
table(duplicated(symbol))
代码语言:txt复制
## 
## FALSE  TRUE 
## 56791   150
代码语言:text复制
#FALSE  TRUE 
#55492   770 

##使用aggregate根据symbol列中的相同基因进行合并
代码语言:text复制
counts <- aggregate(counts, by=list(symbol), FUN=sum)
counts <- column_to_rownames(counts,'Group.1')
#在 aggregate 函数的使用过程中, Group.1 列是通过对 counts 数据框中的计数进行求和而生成的临时列。 
dim(counts)
代码语言:txt复制
## [1] 56791     4
代码语言:text复制
#[1] 55492     4
tpm <- aggregate(tpm, by=list(symbol), FUN=sum)
tpm <- column_to_rownames(tpm,'Group.1')
#在第一句代码 tpm <- aggregate(tpm, by=list(symbol), FUN=sum) 中, aggregate 函数会根据 symbol 列对 tpm 数据框进行聚合操作,并将每个组的TPM值求和。但是,聚合操作会生成一个新的数据框,其中 symbol 列被重命名为 Group.1 ,而原始的行名可能会丢失。 
#为了保留原始的行名,第二句代码 tpm <- column_to_rownames(tpm,'Group.1') 将 Group.1 列的值作为新的行名,以确保每个基因的标识信息仍然保留在结果中。 
#因此,两句代码的组合将按照 symbol 分组并计算总和,然后使用 Group.1 列的值作为新的行名。 

#初步过滤低表达基因与保存counts数据
keep_feature <- rowSums(counts>1) >= 2
table(keep_feature)
代码语言:txt复制
## keep_feature
## FALSE  TRUE 
## 42689 14102
代码语言:text复制
counts_filt <- counts[keep_feature, ] 
tpm_filt <- tpm[keep_feature, ]
counts_raw=counts
counts=counts_filt
tpm=tpm_filt
save(counts_raw,counts,tpm,group_list,gl,file='./1.counts.Rdata')

#四、差异分析前的准备工作
#1.数据预处理,使样本间具有可比性
options(stringsAsFactors = F)
library(FactoMineR)
library(factoextra)  
library(tidyverse) # ggplot2 stringer dplyr tidyr readr purrr  tibble forcats
library(pheatmap)
library(DESeq2)
library(RColorBrewer)
#数据预处理  # (任选以下一种作为dat即可,主要是进行样本间归一化,使得样本具有可比性)
dat <- log2(tpm 1)
colnames(dat) <- c("mESCs_1","mESCs_2","EpiSCs_1","EpiSCs_2")
#2.boxplot查看样本的基因整体表达情况
### boxplot 查看样本的基因整体表达情况
p=boxplot(dat, col="blue", ylab="dat", main=" normalized data ",
        outline = F, notch = F)
代码语言:text复制
dev.off()
代码语言:txt复制
## null device 
##           1
代码语言:text复制
#3查看聚类情况 hclust 图、距离热图、PCA图、差异基因热图、相关性热图
#聚类
sampleDists <- dist(t(dat))   #dist默认计算矩阵行与行的距离, 因此需要转置
sampleDistMatrix <- as.matrix(sampleDists)  
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)  #选取热图的颜色
p0 <- pheatmap::pheatmap(sampleDistMatrix,
                         fontsize=7,
                         clustering_distance_rows=sampleDists,
                         clustering_distance_cols=sampleDists,
                         angle_col=45,
                         col=colors)
代码语言:text复制
ggsave(p0,filename = 'check_dist.png',width = 7.5,height =6)
dev.off()
代码语言:txt复制
## null device 
##           1
代码语言:text复制
#PCA图
dat.pca <- PCA(t(dat) , graph = F) 
percentVar <- get_eigenvalue(dat.pca)
pca <- fviz_pca_ind(dat.pca,
                    title = "Principal Component Analysis",
                    legend.title = "Groups",
                    geom.ind = c("point", "text"), 
                    pointsize = 1.5,
                    labelsize = 3,
                    col.ind = group_list, # 分组上色
                    axes.linetype=NA,  # remove axeslines
                    mean.point=F#去除分组中心点
)   
  coord_fixed(ratio = 1)   #坐标轴的纵横比
  xlab(paste0("PC1 (",round(percentVar[1,'variance.percent'],1),"%)"))  
  ylab(paste0("PC2 (",round(percentVar[2,'variance.percent'],1),"%)"))
pca
代码语言:text复制
#heatmap检测——取500高表达基因 看一下样本相关性 
dat_500 <- dat[names(sort(apply(dat,1,mad),decreasing = T)[1:500]),]#取高表达量前500基因
M <- cor(dat_500)
gl$name_list <-nlgl$name_list
rownames(gl)<-gl$name_list
p2 <-pheatmap::pheatmap(M,
                        show_rownames = T,
                        angle_col=45,
                        fontsize=7,
                        annotation_col = gl) 
p2
代码语言:text复制
ggsave(p2,filename = 'check_cor_top500new.png',width = 7.5,height =6)
代码语言:text复制
#
####################### heatmap检测——取500差异大的基因 差异基因热图##########################################
cg <- names(tail(sort(apply(dat,1,sd)),500)) #取每一行的方差,从小到大排序,取最大的500个
p1 <- pheatmap::pheatmap(dat[cg, ],
                         show_colnames =T,show_rownames = F,
                         fontsize=7,
                         legend_breaks = -3:3,
                         #scale = "row",
                         angle_col=45,
                         annotation_col=gl) 
代码语言:text复制
ggsave(p1,filename = 'check_heatmap_top500_sdnew.png',width = 7.5,height =6)
代码语言:text复制
dev.off()
代码语言:txt复制
## null device 
##           1
代码语言:text复制
#差异分析 DESeq2
exp="primed"
ctr="naive"
colnames(counts)=c("mESCs_1","mESCs_2","EpiSCs_1","EpiSCs_2")
gl1 <-gl$group_list
group_list <- gl$name_list
dds <- DESeqDataSetFromMatrix(countData = counts,
                              colData = gl,
                              design = ~ group_list)

dds$group_list <- relevel(dds$group_list, ref = ctr)
keep <- rowSums(counts(dds)) >= 1.5*ncol(counts)
dds <- dds[keep,]
dds <- DESeq(dds,quiet = F) 
res <- results(dds,contrast=c("group_list", exp, ctr))
resOrdered <- res[order(res$padj),]
class(resOrdered)
代码语言:txt复制
## [1] "DESeqResults"
## attr(,"package")
## [1] "DESeq2"
代码语言:text复制
tempDEG <- as.data.frame(resOrdered)
DEG_DEseq2 <- na.omit(tempDEG)

#差异分析绘制 火山图和热图
#1.火山图的绘制
library(ggplot2)
library(pheatmap)
##筛选条件设置
log2FC_cutoff = log2(10)
padj_cutoff = 0.001
##选取差异分析结果
need_DEG <- DEG_DEseq2[,c(2,6)] #选取log2FoldChange, padj信息
colnames(need_DEG) <- c('log2FoldChange','padj') 

need_DEG$significance  <- as.factor(ifelse(need_DEG$padj < padj_cutoff & abs(need_DEG$log2FoldChange) > log2FC_cutoff,
                                           ifelse(need_DEG$log2FoldChange > log2FC_cutoff ,'UP','DOWN'),'NOT'))

title <- paste0(' Up :  ',nrow(need_DEG[need_DEG$significance =='UP',]) ,
                'n Down : ',nrow(need_DEG[need_DEG$significance =='DOWN',]),
                'n FoldChange >= ',round(2^log2FC_cutoff,3))

g <- ggplot(data=need_DEG, 
            aes(x=log2FoldChange, y=-log10(padj), 
                color=significance))  
  #点和背景
  geom_point(alpha=0.4, size=1)  
  theme_classic()  #无网格线
  #坐标轴
  xlab("log2 ( FoldChange )")   
  ylab("-log10 ( P.adjust )")  
  #标题文本
  ggtitle( title )  
  #分区颜色                  
  scale_colour_manual(values = c('blue','grey','red'))  
  #辅助线
  geom_vline(xintercept = c(-log2FC_cutoff,log2FC_cutoff),lty=4,col="grey",lwd=0.8)  
  geom_hline(yintercept = -log10(padj_cutoff),lty=4,col="grey",lwd=0.8)  
  #图例标题间距等设置
  theme(plot.title = element_text(hjust = 0.5), 
        plot.margin=unit(c(2,2,2,2),'lines'), #上右下左
        legend.title = element_blank(), #不显示图例标题
        legend.position="right")  #图例位置

ggsave(g,filename = 'volcano_padj.png',width =8,height =7.5)
代码语言:text复制
#2.热图的绘制
gene_up <- rownames(need_DEG[with(need_DEG,log2FoldChange>log2FC_cutoff & padj<padj_cutoff),])
gene_down <- rownames(need_DEG[with(need_DEG,log2FoldChange< -log2FC_cutoff & padj<padj_cutoff),])
cg1 <- c(head(gene_up, 50),   #取前50 padj上下调基因名
        head(gene_down, 50))
cg2 <- na.omit(match(cg1,rownames(dat))) 
cg2
代码语言:txt复制
##   [1]  3740 10121   131  2383  8147  8036  2357  9217  1899  3244  5806
##  [12]  5732  8893  4012  4852  8462 13189  6102  4017  5444  1409 12960
##  [23] 13436 10641  3974 10090 12177  7734  7873 10060 10127   588   889
##  [34] 13432  3576  8306  2405 14063 13735 12994  7930  3363  2389  6126
##  [45] 11482 10539  8824  7887  2394 13154  8254  9052  1243 12776  1067
##  [56]  1076  9567 11852  2080  3805  3050  2180   482  1537  9713  8113
##  [67] 10540  2994  9188   470  7715   242 13609  3232  9276   878  2107
##  [78]  8736 10270   562 11424  1848 12751   931  9172 12255  3542  3726
##  [89]  8097   380 12296  7640  4097  2818  3838  3921 13630  8898  5832
## [100] 13874
代码语言:text复制
pheatmap::pheatmap(dat[cg2,],
                   show_colnames =T,
                   show_rownames = F,
                   #scale = "row",
                   fontsize = 7 ,
                   cluster_cols = T,
                   annotation_col=gl,
                   filename = 'heatmap_top50up&down_DEG.png')
代码语言:text复制
#3.GO KEGG富集分析与enrichplot 设定pvalue是防止其被过度矫正,能够提供更多关于富集结果显著性的信息
log2FC_cutoff = log2(10)
pvalue_cutoff = 0.001
padj_cutoff = 0.001
library(clusterProfiler)
library(enrichplot)
library(tidyverse)
library(ggstatsplot)
library(ggnewscale)
# library(org.Hs.eg.db)
#BiocManager::install("org.Mm.eg.db")
library(org.Mm.eg.db)
library(DOSE)
library(pathview) 
class(DEG_DEseq2)
代码语言:txt复制
## [1] "data.frame"
代码语言:text复制
need_DEG18 <- DEG_DEseq2[,c(2,5,6)]  
head(need_DEG18)
代码语言:txt复制
##               log2FoldChange        pvalue          padj
## Flnc                8.985284 3.700584e-168 4.733047e-164
## Peg10               6.585819 2.187517e-153 1.398917e-149
## 2010310C07Rik       7.431042 7.463596e-130 3.181980e-126
## Col1a2              9.694994 9.271884e-120 2.964685e-116
## Kcnj10              6.039922 3.968897e-118 1.015244e-114
## Klf4               -6.287716 1.096622e-114 2.337633e-111
代码语言:text复制
colnames(need_DEG18) <- c('log2FoldChange','pvalue','padj')
gene_up=rownames(need_DEG18[with(need_DEG18,log2FoldChange > log2FC_cutoff & pvalue<pvalue_cutoff & padj<padj_cutoff),])
gene_down=rownames(need_DEG18[with(need_DEG18,log2FoldChange < -log2FC_cutoff & pvalue<pvalue_cutoff & padj<padj_cutoff),])

#### 转化基因名为entrez ID ####
gene_up_entrez <- as.character(na.omit(bitr(gene_up, #数据集
                                            fromType="SYMBOL", #输入格式
                                            toType="ENTREZID", # 转为ENTERZID格式
                                            OrgDb="org.Mm.eg.db")[,2])) #"org.Hs.eg.db" "org.Mm.eg.db"
gene_down_entrez <- as.character(na.omit(bitr(gene_down, #数据集
                                              fromType="SYMBOL", #输入格式
                                              toType="ENTREZID", # 转为ENTERZID格式
                                              OrgDb="org.Mm.eg.db")[,2])) #"org.Hs.eg.db" "org.Mm.eg.db"
gene_diff_entrez <- unique(c(gene_up_entrez ,gene_down_entrez ))
#利用clusterProfiler进行KEGG与GO富集
kegg_enrich_results <- enrichKEGG(gene  = gene_up_entrez,
                                  organism  = "mmu", #物种人类 hsa 小鼠mmu
                                  pvalueCutoff = 0.05,
                                  qvalueCutoff = 0.2)
kk_read <- DOSE::setReadable(kegg_enrich_results, 
                             OrgDb="org.Mm.eg.db", 
                             keyType='ENTREZID')#ENTREZID to gene Symbol
write.csv(kk_read@result,'KEGG_gene_up_enrichresults.csv') 
save(kegg_enrich_results, file ='KEGG_gene_up_enrichresults.Rdata')
go_enrich_results <- enrichGO(gene = gene_up_entrez,
                              OrgDb = "org.Mm.eg.db",
                              ont   = "ALL"  ,     #One of "BP", "MF"  "CC"  "ALL" 
                              pvalueCutoff  = 0.05,
                              qvalueCutoff  = 0.2,
                              readable      = TRUE)
write.csv(go_enrich_results@result, 'GO_gene_up_BP_enrichresults.csv') 
save(go_enrich_results, file ='GO_gene_up_enrichresults.Rdata')

genelist <- as.numeric(DEG_DEseq2[,2]) 
names(genelist) <- row.names(DEG_DEseq2)
# genelist ID转化
genelist_entrez <- genelist
names(genelist_entrez) <- bitr(names(genelist),
                               fromType="SYMBOL",toType="ENTREZID", 
                               OrgDb="org.Mm.eg.db")[,2]  
genelist_entrez <- genelist_entrez[!is.na(names(genelist_entrez))]

##查看与选择所需通路
kk_read@result$Description[1:10] #查看前10通路
代码语言:txt复制
##  [1] "Proteoglycans in cancer - Mus musculus (house mouse)"         
##  [2] "Protein digestion and absorption - Mus musculus (house mouse)"
##  [3] "Focal adhesion - Mus musculus (house mouse)"                  
##  [4] "ECM-receptor interaction - Mus musculus (house mouse)"        
##  [5] "Amphetamine addiction - Mus musculus (house mouse)"           
##  [6] "Dopaminergic synapse - Mus musculus (house mouse)"            
##  [7] "PI3K-Akt signaling pathway - Mus musculus (house mouse)"      
##  [8] "Hedgehog signaling pathway - Mus musculus (house mouse)"      
##  [9] "Relaxin signaling pathway - Mus musculus (house mouse)"       
## [10] "JAK-STAT signaling pathway - Mus musculus (house mouse)"
代码语言:text复制
i=3 
select_pathway <- kk_read@result$ID[i] #选择所需通路的ID号

pathview(gene.data     = genelist_entrez,
         pathway.id    = select_pathway,
         species       = 'mmu' ,      # 人类hsa 小鼠mmu 
         kegg.native   = T,# TRUE输出完整pathway的png文件,F输出基因列表的pdf文件 
         new.signature = F, #pdf是否显示pathway标注
         limit         = list(gene=2.5, cpd=1) #图例color bar范围调整 
)
代码语言:text复制
pathview(gene.data     = genelist_entrez,
         pathway.id    = select_pathway,
         species       = 'mmu' ,      # 人类hsa 小鼠mmu 
         kegg.native   = F,# TRUE输出完整pathway的png文件,F输出基因列表的pdf文件 
         new.signature = F, #pdf是否显示pathway标注
         limit         = list(gene=2.5, cpd=1) #图例color bar范围调整 
)
# kegg.native = T : 这个参数设置为 TRUE ,表示输出为KEGG原生的可视化图像,通常是PNG格式的图像。 
#kegg.native = F : 这个参数设置为 FALSE ,表示输出为基因列表的PDF文件,而不是完整的pathway的PNG文件。
代码语言:text复制
#dotplot与barplot
barp <- barplot(go_enrich_results, font.size=14, showCategory=12) 
  theme(plot.margin=unit(c(1,1,1,1),'lines')) 
barp
代码语言:text复制
ggsave(barp,filename = 'KEGGnew.png', width=10, height=10)
代码语言:text复制
barp1 <- barplot(go_enrich_results, split= "ONTOLOGY", font.size =12,showCategory=6) 
  facet_grid(ONTOLOGY~., scale="free")      
  theme(plot.margin=unit(c(1,1,1,1),'lines'))
barp1
代码语言:text复制
ggsave(barp1,filename = 'GO.png', width=10, height=10)
dev.off
代码语言:txt复制
## function (which = dev.cur()) 
## {
##     if (which == 1) 
##         stop("cannot shut down device 1 (the null device)")
##     .External(C_devoff, as.integer(which))
##     dev.cur()
## }
## <bytecode: 0x000001f5738bcfe0>
## <environment: namespace:grDevices>
代码语言:text复制
#
dotp <- enrichplot::dotplot(go_enrich_results,font.size =14,showCategory = 12) 
  theme(legend.key.size = unit(10, "pt"),#调整图例大小
        plot.margin=unit(c(1,1,1,1),'lines'))#调整四周留白大小
dotp
代码语言:text复制
ggsave(dotp,filename = 'KEGGdot.png', width=10, height=10)
代码语言:text复制
dotp1 <- enrichplot::dotplot(go_enrich_results,font.size =12,split = 'ONTOLOGY',showCategory = 6) 
  facet_grid(ONTOLOGY~., scale="free")      
  theme(legend.key.size = unit(10, "pt"),#调整图例大小
        plot.margin=unit(c(1,1,1,1),'lines'))#调整四周留白大小
ggsave(dotp1,filename = 'GOdot.png', width=10, height=10)
代码语言:text复制
#构建各通路下的基因网络
genelist <- as.numeric(DEG_DEseq2[,2]) 
names(genelist) <- row.names(DEG_DEseq2)

cnetp1 <- cnetplot(go_enrich_results,  foldChange = genelist,
                   showCategory = 8,
                   colorEdge = T,
                   node_label = 'all',
                   color_category ='steelblue') 
                   labs(x = "") 
                   labs(y = "") 
                   theme_bw() 
                    theme(panel.grid=element_blank()) 
                    theme(panel.border = element_blank()) 
                    theme(axis.text = element_blank())     ## 删去刻度标签
                    theme(axis.ticks = element_blank())     ## 删去刻度线
                    theme(panel.border = element_blank())   ## 删去外层边框
代码语言:text复制
cnetp2 <- cnetplot(go_enrich_results,  foldChange = genelist,
                   showCategory = 8,
                   node_label = 'gene',
                   circular = T, 
                   colorEdge = T) 
                   labs(x = "") 
                   labs(y = "") 
                   theme_bw() 
                   theme(panel.grid=element_blank()) 
                   theme(panel.border = element_blank()) 
                   theme(axis.text = element_blank())     ## 删去刻度标签
                   theme(axis.ticks = element_blank())     ## 删去刻度线
                   theme(panel.border = element_blank())
ggsave(cnetp1,filename ='cnetplot.png', width =12,height =10)
ggsave(cnetp2,filename = 'cnetplot_cir.png', width =15,height =10)
代码语言:text复制
#emapplot-Enrichment Map
pt <- pairwise_termsim(go_enrich_results)
emapp <- emapplot(pt,
                  showCategory = 30, 
                  node_label   = 'category')  # 'category', 'group', 'all' and 'none'.)
ggsave(emapp,filename = 'emapplot.png',width =10,height =10)
代码语言:text复制
#GSEA分析
library(org.Mm.eg.db)
library(clusterProfiler)
library(enrichplot)
library(tidyverse)
library(ggstatsplot)
## 物种设置
organism = 'mmu'    #  人类'hsa' 小鼠'mmu'   
OrgDb = 'org.Mm.eg.db'#人类"org.Hs.eg.db" 小鼠"org.Mm.eg.db"

#### 按照需要可选择不同的DEG方法数据集 ####
need_DEG <- DEG_DEseq2
need_DEG <- need_DEG[,c(2,5)] #选择log2FoldChange和pvalue(凑成数据框)

colnames(need_DEG) <- c('log2FoldChange','pvalue')
need_DEG$SYMBOL <- rownames(need_DEG)

##### 创建gsea分析的geneList(包含从大到小排列的log2FoldChange和ENTREZID信息)####
#转化id  
df <- bitr(rownames(need_DEG), 
           fromType = "SYMBOL",
           toType =  "ENTREZID",
           OrgDb = OrgDb) #人数据库org.Hs.eg.db 小鼠org.Mm.eg.db
need_DEG <- merge(need_DEG, df, by='SYMBOL')  #按照SYMBOL合并注释信息
geneList <- need_DEG$log2FoldChange
names(geneList) <- need_DEG$ENTREZID
geneList <- sort(geneList, decreasing = T)   #从大到小排序

##### gsea富集 ####
KEGG_kk_entrez <- gseKEGG(geneList     = geneList,
                          organism     = organism, #人hsa 鼠mmu
                          pvalueCutoff = 0.25)  #实际为padj阈值,可调整 
KEGG_kk <- DOSE::setReadable(KEGG_kk_entrez, 
                             OrgDb=OrgDb,
                             keyType='ENTREZID')#转化id             

GO_kk_entrez <- gseGO(geneList     = geneList,
                      ont          = "ALL",  # "BP"、"MF"和"CC"或"ALL"
                      OrgDb        = OrgDb,#人类org.Hs.eg.db 鼠org.Mm.eg.db
                      keyType      = "ENTREZID",
                      pvalueCutoff = 0.25)   #实际为padj阈值可调整
GO_kk <- DOSE::setReadable(GO_kk_entrez, 
                           OrgDb=OrgDb,
                           keyType='ENTREZID')#转化id 

save(KEGG_kk_entrez, GO_kk_entrez, file = "GSEA_result.RData")
代码语言:text复制
##选取富集结果
kk_gse <- KEGG_kk
kk_gse_entrez <- KEGG_kk_entrez

###条件筛选 
#一般认为|NES|>1,NOM pvalue<0.05,FDR(padj)<0.25的通路是显著富集的
kk_gse_cut <- kk_gse[kk_gse$pvalue<0.05 & kk_gse$p.adjust<0.25 & abs(kk_gse$NES)>1]
kk_gse_cut_down <- kk_gse_cut[kk_gse_cut$NES < 0,]
kk_gse_cut_up <- kk_gse_cut[kk_gse_cut$NES > 0,]

#选择展现NES前几个通路 
down_gsea <- kk_gse_cut_down[tail(order(kk_gse_cut_down$NES,decreasing = T),10),]
up_gsea <- kk_gse_cut_up[head(order(kk_gse_cut_up$NES,decreasing = T),10),]
diff_gsea <- kk_gse_cut[head(order(abs(kk_gse_cut$NES),decreasing = T),10),]


#### 经典的GSEA图 
up_gsea$Description
代码语言:txt复制
## [1] "ECM-receptor interaction - Mus musculus (house mouse)"        
## [2] "Protein digestion and absorption - Mus musculus (house mouse)"
## [3] "Hedgehog signaling pathway - Mus musculus (house mouse)"      
## [4] "Focal adhesion - Mus musculus (house mouse)"                  
## [5] "Proteoglycans in cancer - Mus musculus (house mouse)"         
## [6] "PI3K-Akt signaling pathway - Mus musculus (house mouse)"      
## [7] "Human papillomavirus infection - Mus musculus (house mouse)"
代码语言:text复制
i=2
gseap1 <- gseaplot2(kk_gse,
                    up_gsea$ID[i],#富集的ID编号
                    title = up_gsea$Description[i],#标题
                    color = "red", #GSEA线条颜色
                    base_size = 18,#基础字体大小
                    rel_heights = c(1.5, 0.5, 1),#副图的相对高度
                    subplots = 1:3,   #要显示哪些副图 如subplots=c(1,3) #只要第一和第三个图
                    ES_geom = "line", #enrichment score用线还是用点"dot"
                    pvalue_table = T) #显示pvalue等信息
ggsave(gseap1, filename = 'GSEA_up_1.pdf', width =10, height =8)
代码语言:text复制
#### 合并 GSEA通路 
gseap2 <- gseaplot2(kk_gse,
                    up_gsea$ID,#富集的ID编号
                    title = "UP_GSEA_all",#标题
                    color = "red",#GSEA线条颜色
                    base_size = 18,#基础字体大小
                    rel_heights = c(1.4, 0.5, 1),#副图的相对高度
                    subplots = 1:3, #要显示哪些副图 如subplots=c(1,3) #只要第一和第三个图
                    ES_geom = "line",#enrichment score用线还是用点"dot"
                    pvalue_table = T) #显示pvalue等信息
ggsave(gseap2, filename = "GSEA_up_all.png",width =13,height =12)

0 人点赞