title: "RNAseq231026"
output: html_document
date: "2023-10-31"
参考https://www.zhihu.com/people/gu_chen/posts?page=2
R Markdown
代码语言:text复制###环境设置
rm(list=ls())
options(stringsAsFactors = F)
library(tidyverse) # ggplot2 stringer dplyr tidyr readr purrr tibble forcats
library(data.table) #多核读取文件
setwd("D:/huage1/rnaseq1026")
#### 对counts进行处理筛选得到表达矩阵 ####
a1 <- fread('./counts.txt',
header = T,data.table = F)#载入counts,第一列设置为列名
colnames(a1)
代码语言:txt复制## [1] "Geneid"
## [2] "Chr"
## [3] "Start"
## [4] "End"
## [5] "Strand"
## [6] "Length"
## [7] "/public/home/jiezhang_gibh/hqn1/rnaseq1026/test/align/SRR12207279_sorted.bam"
## [8] "/public/home/jiezhang_gibh/hqn1/rnaseq1026/test/align/SRR12207280_sorted.bam"
## [9] "/public/home/jiezhang_gibh/hqn1/rnaseq1026/test/align/SRR12207283_sorted.bam"
## [10] "/public/home/jiezhang_gibh/hqn1/rnaseq1026/test/align/SRR12207284_sorted.bam"
代码语言:text复制counts <- a1[,7:ncol(a1)] #截取样本基因表达量的counts部分作为counts
rownames(counts) <- a1$Geneid
colnames(counts) <- gsub('/public/home/jiezhang_gibh/hqn1/rnaseq1026/test/align/','', #删除样品名前缀
gsub('_sorted.bam','', colnames(counts))) #删除样品名后缀
#### 导入或构建样本信息, 进行列样品名的重命名和分组####
b <- read.csv('./SraRunTable.txt')
b
代码语言:txt复制## Run Assay.Type AvgSpotLen Bases BioProject
## 1 SRR12207279 RNA-Seq 295 4512474269 PRJNA645812
## 2 SRR12207280 RNA-Seq 296 6542603004 PRJNA645812
## 3 SRR12207283 RNA-Seq 300 4108168500 PRJNA645812
## 4 SRR12207284 RNA-Seq 300 4536921000 PRJNA645812
## BioSample Bytes cell_type_source Center.Name Consent
## 1 SAMN15517122 1855257229 stem cells GEO public
## 2 SAMN15517121 2616595592 stem cells GEO public
## 3 SAMN15517118 1777556258 stem cells GEO public
## 4 SAMN15517117 1699519747 stem cells GEO public
## DATASTORE.filetype DATASTORE.provider DATASTORE.region
## 1 sra,run.zq,fastq s3,gs,ncbi gs.US,s3.us-east-1,ncbi.public
## 2 sra,run.zq,fastq s3,gs,ncbi s3.us-east-1,ncbi.public,gs.US
## 3 sra,fastq,run.zq gs,s3,ncbi s3.us-east-1,gs.US,ncbi.public
## 4 sra,fastq,run.zq ncbi,s3,gs gs.US,ncbi.public,s3.us-east-1
## Experiment genotype GEO_Accession..exp. Instrument LibraryLayout
## 1 SRX8718088 WT GSM4668511 HiSeq X Ten PAIRED
## 2 SRX8718089 WT GSM4668512 HiSeq X Ten PAIRED
## 3 SRX8718092 WT GSM4668515 HiSeq X Ten PAIRED
## 4 SRX8718093 WT GSM4668516 HiSeq X Ten PAIRED
## LibrarySelection LibrarySource Organism Platform
## 1 cDNA TRANSCRIPTOMIC Mus musculus ILLUMINA
## 2 cDNA TRANSCRIPTOMIC Mus musculus ILLUMINA
## 3 cDNA TRANSCRIPTOMIC Mus musculus ILLUMINA
## 4 cDNA TRANSCRIPTOMIC Mus musculus ILLUMINA
## ReleaseDate create_date version Sample.Name
## 1 2021-02-19T00:00:00Z 2020-07-13T09:00:00Z 1 GSM4668511
## 2 2021-02-19T00:00:00Z 2020-07-13T09:01:00Z 1 GSM4668512
## 3 2021-02-19T00:00:00Z 2020-07-13T08:58:00Z 1 GSM4668515
## 4 2021-02-19T00:00:00Z 2020-07-13T09:03:00Z 1 GSM4668516
## source_name SRA.Study chip_antibody
## 1 RNA_mESCs SRP271532 NA
## 2 RNA_mESCs SRP271532 NA
## 3 RNA_EpiSCs SRP271532 NA
## 4 RNA_EpiSCs SRP271532 NA
代码语言:text复制name_list <- b$source_name[match(colnames(counts),b$Run)]; name_list #选择所需要的样品信息列
代码语言:txt复制## [1] "RNA_mESCs" "RNA_mESCs" "RNA_EpiSCs" "RNA_EpiSCs"
代码语言:text复制class(name_list)
代码语言:txt复制## [1] "character"
代码语言:text复制##以上做完
nlgl <- data.frame(row.names=colnames(counts),
name_list=c("mESCs_1","mESCs_2","EpiSCs_1","EpiSCs_2"),
group_list=c("naive","naive","primed","primed"))
name_list <- nlgl$name_list
group_list <- nlgl$group_list
gl <- data.frame(row.names=colnames(counts), #构建样品名与分组对应的数据框
group_list=group_list)
class(nlgl)
代码语言:txt复制## [1] "data.frame"
代码语言:text复制geneid_efflen <- subset(a1,select = c("Geneid","Length"))
colnames(geneid_efflen) <- c("geneid","efflen")
efflen <- geneid_efflen[match(rownames(counts),
geneid_efflen$geneid),
"efflen"]
class(efflen)
代码语言:txt复制## [1] "integer"
代码语言:text复制counts2TPM <- function(count=count, efflength=efflen){
RPK <- count/(efflength/1000) #每千碱基reads (Reads Per Kilobase) 长度标准化
PMSC_rpk <- sum(RPK)/1e6 #RPK的每百万缩放因子 (“per million” scaling factor ) 深度标准化
RPK/PMSC_rpk
}
tpm <- as.data.frame(apply(counts,2,counts2TPM))
colSums(tpm)
代码语言:txt复制## SRR12207279 SRR12207280 SRR12207283 SRR12207284
## 1e 06 1e 06 1e 06 1e 06
代码语言:text复制##新方法 合并
library(tidyverse) # ggplot2 stringer dplyr tidyr readr purrr tibble forcats
library(data.table) #多核读取文件
g2s <- fread('g2s_vm33_gencode.txt',header = F,data.table = F) #载入从gencode的gtf文件中提取的信息文件
colnames(g2s) <- c("geneid","symbol")
symbol <- g2s[match(rownames(counts),g2s$geneid),"symbol"] #匹配counts行名对应的symbol
table(duplicated(symbol))
代码语言:txt复制##
## FALSE TRUE
## 56791 150
代码语言:text复制#FALSE TRUE
#55492 770
##使用aggregate根据symbol列中的相同基因进行合并
代码语言:text复制counts <- aggregate(counts, by=list(symbol), FUN=sum)
counts <- column_to_rownames(counts,'Group.1')
#在 aggregate 函数的使用过程中, Group.1 列是通过对 counts 数据框中的计数进行求和而生成的临时列。
dim(counts)
代码语言:txt复制## [1] 56791 4
代码语言:text复制#[1] 55492 4
tpm <- aggregate(tpm, by=list(symbol), FUN=sum)
tpm <- column_to_rownames(tpm,'Group.1')
#在第一句代码 tpm <- aggregate(tpm, by=list(symbol), FUN=sum) 中, aggregate 函数会根据 symbol 列对 tpm 数据框进行聚合操作,并将每个组的TPM值求和。但是,聚合操作会生成一个新的数据框,其中 symbol 列被重命名为 Group.1 ,而原始的行名可能会丢失。
#为了保留原始的行名,第二句代码 tpm <- column_to_rownames(tpm,'Group.1') 将 Group.1 列的值作为新的行名,以确保每个基因的标识信息仍然保留在结果中。
#因此,两句代码的组合将按照 symbol 分组并计算总和,然后使用 Group.1 列的值作为新的行名。
#初步过滤低表达基因与保存counts数据
keep_feature <- rowSums(counts>1) >= 2
table(keep_feature)
代码语言:txt复制## keep_feature
## FALSE TRUE
## 42689 14102
代码语言:text复制counts_filt <- counts[keep_feature, ]
tpm_filt <- tpm[keep_feature, ]
counts_raw=counts
counts=counts_filt
tpm=tpm_filt
save(counts_raw,counts,tpm,group_list,gl,file='./1.counts.Rdata')
#四、差异分析前的准备工作
#1.数据预处理,使样本间具有可比性
options(stringsAsFactors = F)
library(FactoMineR)
library(factoextra)
library(tidyverse) # ggplot2 stringer dplyr tidyr readr purrr tibble forcats
library(pheatmap)
library(DESeq2)
library(RColorBrewer)
#数据预处理 # (任选以下一种作为dat即可,主要是进行样本间归一化,使得样本具有可比性)
dat <- log2(tpm 1)
colnames(dat) <- c("mESCs_1","mESCs_2","EpiSCs_1","EpiSCs_2")
#2.boxplot查看样本的基因整体表达情况
### boxplot 查看样本的基因整体表达情况
p=boxplot(dat, col="blue", ylab="dat", main=" normalized data ",
outline = F, notch = F)
代码语言:text复制dev.off()
代码语言:txt复制## null device
## 1
代码语言:text复制#3查看聚类情况 hclust 图、距离热图、PCA图、差异基因热图、相关性热图
#聚类
sampleDists <- dist(t(dat)) #dist默认计算矩阵行与行的距离, 因此需要转置
sampleDistMatrix <- as.matrix(sampleDists)
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255) #选取热图的颜色
p0 <- pheatmap::pheatmap(sampleDistMatrix,
fontsize=7,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
angle_col=45,
col=colors)
代码语言:text复制ggsave(p0,filename = 'check_dist.png',width = 7.5,height =6)
dev.off()
代码语言:txt复制## null device
## 1
代码语言:text复制#PCA图
dat.pca <- PCA(t(dat) , graph = F)
percentVar <- get_eigenvalue(dat.pca)
pca <- fviz_pca_ind(dat.pca,
title = "Principal Component Analysis",
legend.title = "Groups",
geom.ind = c("point", "text"),
pointsize = 1.5,
labelsize = 3,
col.ind = group_list, # 分组上色
axes.linetype=NA, # remove axeslines
mean.point=F#去除分组中心点
)
coord_fixed(ratio = 1) #坐标轴的纵横比
xlab(paste0("PC1 (",round(percentVar[1,'variance.percent'],1),"%)"))
ylab(paste0("PC2 (",round(percentVar[2,'variance.percent'],1),"%)"))
pca
代码语言:text复制#heatmap检测——取500高表达基因 看一下样本相关性
dat_500 <- dat[names(sort(apply(dat,1,mad),decreasing = T)[1:500]),]#取高表达量前500基因
M <- cor(dat_500)
gl$name_list <-nlgl$name_list
rownames(gl)<-gl$name_list
p2 <-pheatmap::pheatmap(M,
show_rownames = T,
angle_col=45,
fontsize=7,
annotation_col = gl)
p2
代码语言:text复制ggsave(p2,filename = 'check_cor_top500new.png',width = 7.5,height =6)
代码语言:text复制#
####################### heatmap检测——取500差异大的基因 差异基因热图##########################################
cg <- names(tail(sort(apply(dat,1,sd)),500)) #取每一行的方差,从小到大排序,取最大的500个
p1 <- pheatmap::pheatmap(dat[cg, ],
show_colnames =T,show_rownames = F,
fontsize=7,
legend_breaks = -3:3,
#scale = "row",
angle_col=45,
annotation_col=gl)
代码语言:text复制ggsave(p1,filename = 'check_heatmap_top500_sdnew.png',width = 7.5,height =6)
代码语言:text复制dev.off()
代码语言:txt复制## null device
## 1
代码语言:text复制#差异分析 DESeq2
exp="primed"
ctr="naive"
colnames(counts)=c("mESCs_1","mESCs_2","EpiSCs_1","EpiSCs_2")
gl1 <-gl$group_list
group_list <- gl$name_list
dds <- DESeqDataSetFromMatrix(countData = counts,
colData = gl,
design = ~ group_list)
dds$group_list <- relevel(dds$group_list, ref = ctr)
keep <- rowSums(counts(dds)) >= 1.5*ncol(counts)
dds <- dds[keep,]
dds <- DESeq(dds,quiet = F)
res <- results(dds,contrast=c("group_list", exp, ctr))
resOrdered <- res[order(res$padj),]
class(resOrdered)
代码语言:txt复制## [1] "DESeqResults"
## attr(,"package")
## [1] "DESeq2"
代码语言:text复制tempDEG <- as.data.frame(resOrdered)
DEG_DEseq2 <- na.omit(tempDEG)
#差异分析绘制 火山图和热图
#1.火山图的绘制
library(ggplot2)
library(pheatmap)
##筛选条件设置
log2FC_cutoff = log2(10)
padj_cutoff = 0.001
##选取差异分析结果
need_DEG <- DEG_DEseq2[,c(2,6)] #选取log2FoldChange, padj信息
colnames(need_DEG) <- c('log2FoldChange','padj')
need_DEG$significance <- as.factor(ifelse(need_DEG$padj < padj_cutoff & abs(need_DEG$log2FoldChange) > log2FC_cutoff,
ifelse(need_DEG$log2FoldChange > log2FC_cutoff ,'UP','DOWN'),'NOT'))
title <- paste0(' Up : ',nrow(need_DEG[need_DEG$significance =='UP',]) ,
'n Down : ',nrow(need_DEG[need_DEG$significance =='DOWN',]),
'n FoldChange >= ',round(2^log2FC_cutoff,3))
g <- ggplot(data=need_DEG,
aes(x=log2FoldChange, y=-log10(padj),
color=significance))
#点和背景
geom_point(alpha=0.4, size=1)
theme_classic() #无网格线
#坐标轴
xlab("log2 ( FoldChange )")
ylab("-log10 ( P.adjust )")
#标题文本
ggtitle( title )
#分区颜色
scale_colour_manual(values = c('blue','grey','red'))
#辅助线
geom_vline(xintercept = c(-log2FC_cutoff,log2FC_cutoff),lty=4,col="grey",lwd=0.8)
geom_hline(yintercept = -log10(padj_cutoff),lty=4,col="grey",lwd=0.8)
#图例标题间距等设置
theme(plot.title = element_text(hjust = 0.5),
plot.margin=unit(c(2,2,2,2),'lines'), #上右下左
legend.title = element_blank(), #不显示图例标题
legend.position="right") #图例位置
ggsave(g,filename = 'volcano_padj.png',width =8,height =7.5)
代码语言:text复制#2.热图的绘制
gene_up <- rownames(need_DEG[with(need_DEG,log2FoldChange>log2FC_cutoff & padj<padj_cutoff),])
gene_down <- rownames(need_DEG[with(need_DEG,log2FoldChange< -log2FC_cutoff & padj<padj_cutoff),])
cg1 <- c(head(gene_up, 50), #取前50 padj上下调基因名
head(gene_down, 50))
cg2 <- na.omit(match(cg1,rownames(dat)))
cg2
代码语言:txt复制## [1] 3740 10121 131 2383 8147 8036 2357 9217 1899 3244 5806
## [12] 5732 8893 4012 4852 8462 13189 6102 4017 5444 1409 12960
## [23] 13436 10641 3974 10090 12177 7734 7873 10060 10127 588 889
## [34] 13432 3576 8306 2405 14063 13735 12994 7930 3363 2389 6126
## [45] 11482 10539 8824 7887 2394 13154 8254 9052 1243 12776 1067
## [56] 1076 9567 11852 2080 3805 3050 2180 482 1537 9713 8113
## [67] 10540 2994 9188 470 7715 242 13609 3232 9276 878 2107
## [78] 8736 10270 562 11424 1848 12751 931 9172 12255 3542 3726
## [89] 8097 380 12296 7640 4097 2818 3838 3921 13630 8898 5832
## [100] 13874
代码语言:text复制pheatmap::pheatmap(dat[cg2,],
show_colnames =T,
show_rownames = F,
#scale = "row",
fontsize = 7 ,
cluster_cols = T,
annotation_col=gl,
filename = 'heatmap_top50up&down_DEG.png')
代码语言:text复制#3.GO KEGG富集分析与enrichplot 设定pvalue是防止其被过度矫正,能够提供更多关于富集结果显著性的信息
log2FC_cutoff = log2(10)
pvalue_cutoff = 0.001
padj_cutoff = 0.001
library(clusterProfiler)
library(enrichplot)
library(tidyverse)
library(ggstatsplot)
library(ggnewscale)
# library(org.Hs.eg.db)
#BiocManager::install("org.Mm.eg.db")
library(org.Mm.eg.db)
library(DOSE)
library(pathview)
class(DEG_DEseq2)
代码语言:txt复制## [1] "data.frame"
代码语言:text复制need_DEG18 <- DEG_DEseq2[,c(2,5,6)]
head(need_DEG18)
代码语言:txt复制## log2FoldChange pvalue padj
## Flnc 8.985284 3.700584e-168 4.733047e-164
## Peg10 6.585819 2.187517e-153 1.398917e-149
## 2010310C07Rik 7.431042 7.463596e-130 3.181980e-126
## Col1a2 9.694994 9.271884e-120 2.964685e-116
## Kcnj10 6.039922 3.968897e-118 1.015244e-114
## Klf4 -6.287716 1.096622e-114 2.337633e-111
代码语言:text复制colnames(need_DEG18) <- c('log2FoldChange','pvalue','padj')
gene_up=rownames(need_DEG18[with(need_DEG18,log2FoldChange > log2FC_cutoff & pvalue<pvalue_cutoff & padj<padj_cutoff),])
gene_down=rownames(need_DEG18[with(need_DEG18,log2FoldChange < -log2FC_cutoff & pvalue<pvalue_cutoff & padj<padj_cutoff),])
#### 转化基因名为entrez ID ####
gene_up_entrez <- as.character(na.omit(bitr(gene_up, #数据集
fromType="SYMBOL", #输入格式
toType="ENTREZID", # 转为ENTERZID格式
OrgDb="org.Mm.eg.db")[,2])) #"org.Hs.eg.db" "org.Mm.eg.db"
gene_down_entrez <- as.character(na.omit(bitr(gene_down, #数据集
fromType="SYMBOL", #输入格式
toType="ENTREZID", # 转为ENTERZID格式
OrgDb="org.Mm.eg.db")[,2])) #"org.Hs.eg.db" "org.Mm.eg.db"
gene_diff_entrez <- unique(c(gene_up_entrez ,gene_down_entrez ))
#利用clusterProfiler进行KEGG与GO富集
kegg_enrich_results <- enrichKEGG(gene = gene_up_entrez,
organism = "mmu", #物种人类 hsa 小鼠mmu
pvalueCutoff = 0.05,
qvalueCutoff = 0.2)
kk_read <- DOSE::setReadable(kegg_enrich_results,
OrgDb="org.Mm.eg.db",
keyType='ENTREZID')#ENTREZID to gene Symbol
write.csv(kk_read@result,'KEGG_gene_up_enrichresults.csv')
save(kegg_enrich_results, file ='KEGG_gene_up_enrichresults.Rdata')
go_enrich_results <- enrichGO(gene = gene_up_entrez,
OrgDb = "org.Mm.eg.db",
ont = "ALL" , #One of "BP", "MF" "CC" "ALL"
pvalueCutoff = 0.05,
qvalueCutoff = 0.2,
readable = TRUE)
write.csv(go_enrich_results@result, 'GO_gene_up_BP_enrichresults.csv')
save(go_enrich_results, file ='GO_gene_up_enrichresults.Rdata')
genelist <- as.numeric(DEG_DEseq2[,2])
names(genelist) <- row.names(DEG_DEseq2)
# genelist ID转化
genelist_entrez <- genelist
names(genelist_entrez) <- bitr(names(genelist),
fromType="SYMBOL",toType="ENTREZID",
OrgDb="org.Mm.eg.db")[,2]
genelist_entrez <- genelist_entrez[!is.na(names(genelist_entrez))]
##查看与选择所需通路
kk_read@result$Description[1:10] #查看前10通路
代码语言:txt复制## [1] "Proteoglycans in cancer - Mus musculus (house mouse)"
## [2] "Protein digestion and absorption - Mus musculus (house mouse)"
## [3] "Focal adhesion - Mus musculus (house mouse)"
## [4] "ECM-receptor interaction - Mus musculus (house mouse)"
## [5] "Amphetamine addiction - Mus musculus (house mouse)"
## [6] "Dopaminergic synapse - Mus musculus (house mouse)"
## [7] "PI3K-Akt signaling pathway - Mus musculus (house mouse)"
## [8] "Hedgehog signaling pathway - Mus musculus (house mouse)"
## [9] "Relaxin signaling pathway - Mus musculus (house mouse)"
## [10] "JAK-STAT signaling pathway - Mus musculus (house mouse)"
代码语言:text复制i=3
select_pathway <- kk_read@result$ID[i] #选择所需通路的ID号
pathview(gene.data = genelist_entrez,
pathway.id = select_pathway,
species = 'mmu' , # 人类hsa 小鼠mmu
kegg.native = T,# TRUE输出完整pathway的png文件,F输出基因列表的pdf文件
new.signature = F, #pdf是否显示pathway标注
limit = list(gene=2.5, cpd=1) #图例color bar范围调整
)
代码语言:text复制pathview(gene.data = genelist_entrez,
pathway.id = select_pathway,
species = 'mmu' , # 人类hsa 小鼠mmu
kegg.native = F,# TRUE输出完整pathway的png文件,F输出基因列表的pdf文件
new.signature = F, #pdf是否显示pathway标注
limit = list(gene=2.5, cpd=1) #图例color bar范围调整
)
# kegg.native = T : 这个参数设置为 TRUE ,表示输出为KEGG原生的可视化图像,通常是PNG格式的图像。
#kegg.native = F : 这个参数设置为 FALSE ,表示输出为基因列表的PDF文件,而不是完整的pathway的PNG文件。
代码语言:text复制#dotplot与barplot
barp <- barplot(go_enrich_results, font.size=14, showCategory=12)
theme(plot.margin=unit(c(1,1,1,1),'lines'))
barp
代码语言:text复制ggsave(barp,filename = 'KEGGnew.png', width=10, height=10)
代码语言:text复制barp1 <- barplot(go_enrich_results, split= "ONTOLOGY", font.size =12,showCategory=6)
facet_grid(ONTOLOGY~., scale="free")
theme(plot.margin=unit(c(1,1,1,1),'lines'))
barp1
代码语言:text复制ggsave(barp1,filename = 'GO.png', width=10, height=10)
dev.off
代码语言:txt复制## function (which = dev.cur())
## {
## if (which == 1)
## stop("cannot shut down device 1 (the null device)")
## .External(C_devoff, as.integer(which))
## dev.cur()
## }
## <bytecode: 0x000001f5738bcfe0>
## <environment: namespace:grDevices>
代码语言:text复制#
dotp <- enrichplot::dotplot(go_enrich_results,font.size =14,showCategory = 12)
theme(legend.key.size = unit(10, "pt"),#调整图例大小
plot.margin=unit(c(1,1,1,1),'lines'))#调整四周留白大小
dotp
代码语言:text复制ggsave(dotp,filename = 'KEGGdot.png', width=10, height=10)
代码语言:text复制dotp1 <- enrichplot::dotplot(go_enrich_results,font.size =12,split = 'ONTOLOGY',showCategory = 6)
facet_grid(ONTOLOGY~., scale="free")
theme(legend.key.size = unit(10, "pt"),#调整图例大小
plot.margin=unit(c(1,1,1,1),'lines'))#调整四周留白大小
ggsave(dotp1,filename = 'GOdot.png', width=10, height=10)
代码语言:text复制#构建各通路下的基因网络
genelist <- as.numeric(DEG_DEseq2[,2])
names(genelist) <- row.names(DEG_DEseq2)
cnetp1 <- cnetplot(go_enrich_results, foldChange = genelist,
showCategory = 8,
colorEdge = T,
node_label = 'all',
color_category ='steelblue')
labs(x = "")
labs(y = "")
theme_bw()
theme(panel.grid=element_blank())
theme(panel.border = element_blank())
theme(axis.text = element_blank()) ## 删去刻度标签
theme(axis.ticks = element_blank()) ## 删去刻度线
theme(panel.border = element_blank()) ## 删去外层边框
代码语言:text复制cnetp2 <- cnetplot(go_enrich_results, foldChange = genelist,
showCategory = 8,
node_label = 'gene',
circular = T,
colorEdge = T)
labs(x = "")
labs(y = "")
theme_bw()
theme(panel.grid=element_blank())
theme(panel.border = element_blank())
theme(axis.text = element_blank()) ## 删去刻度标签
theme(axis.ticks = element_blank()) ## 删去刻度线
theme(panel.border = element_blank())
ggsave(cnetp1,filename ='cnetplot.png', width =12,height =10)
ggsave(cnetp2,filename = 'cnetplot_cir.png', width =15,height =10)
代码语言:text复制#emapplot-Enrichment Map
pt <- pairwise_termsim(go_enrich_results)
emapp <- emapplot(pt,
showCategory = 30,
node_label = 'category') # 'category', 'group', 'all' and 'none'.)
ggsave(emapp,filename = 'emapplot.png',width =10,height =10)
代码语言:text复制#GSEA分析
library(org.Mm.eg.db)
library(clusterProfiler)
library(enrichplot)
library(tidyverse)
library(ggstatsplot)
## 物种设置
organism = 'mmu' # 人类'hsa' 小鼠'mmu'
OrgDb = 'org.Mm.eg.db'#人类"org.Hs.eg.db" 小鼠"org.Mm.eg.db"
#### 按照需要可选择不同的DEG方法数据集 ####
need_DEG <- DEG_DEseq2
need_DEG <- need_DEG[,c(2,5)] #选择log2FoldChange和pvalue(凑成数据框)
colnames(need_DEG) <- c('log2FoldChange','pvalue')
need_DEG$SYMBOL <- rownames(need_DEG)
##### 创建gsea分析的geneList(包含从大到小排列的log2FoldChange和ENTREZID信息)####
#转化id
df <- bitr(rownames(need_DEG),
fromType = "SYMBOL",
toType = "ENTREZID",
OrgDb = OrgDb) #人数据库org.Hs.eg.db 小鼠org.Mm.eg.db
need_DEG <- merge(need_DEG, df, by='SYMBOL') #按照SYMBOL合并注释信息
geneList <- need_DEG$log2FoldChange
names(geneList) <- need_DEG$ENTREZID
geneList <- sort(geneList, decreasing = T) #从大到小排序
##### gsea富集 ####
KEGG_kk_entrez <- gseKEGG(geneList = geneList,
organism = organism, #人hsa 鼠mmu
pvalueCutoff = 0.25) #实际为padj阈值,可调整
KEGG_kk <- DOSE::setReadable(KEGG_kk_entrez,
OrgDb=OrgDb,
keyType='ENTREZID')#转化id
GO_kk_entrez <- gseGO(geneList = geneList,
ont = "ALL", # "BP"、"MF"和"CC"或"ALL"
OrgDb = OrgDb,#人类org.Hs.eg.db 鼠org.Mm.eg.db
keyType = "ENTREZID",
pvalueCutoff = 0.25) #实际为padj阈值可调整
GO_kk <- DOSE::setReadable(GO_kk_entrez,
OrgDb=OrgDb,
keyType='ENTREZID')#转化id
save(KEGG_kk_entrez, GO_kk_entrez, file = "GSEA_result.RData")
代码语言:text复制##选取富集结果
kk_gse <- KEGG_kk
kk_gse_entrez <- KEGG_kk_entrez
###条件筛选
#一般认为|NES|>1,NOM pvalue<0.05,FDR(padj)<0.25的通路是显著富集的
kk_gse_cut <- kk_gse[kk_gse$pvalue<0.05 & kk_gse$p.adjust<0.25 & abs(kk_gse$NES)>1]
kk_gse_cut_down <- kk_gse_cut[kk_gse_cut$NES < 0,]
kk_gse_cut_up <- kk_gse_cut[kk_gse_cut$NES > 0,]
#选择展现NES前几个通路
down_gsea <- kk_gse_cut_down[tail(order(kk_gse_cut_down$NES,decreasing = T),10),]
up_gsea <- kk_gse_cut_up[head(order(kk_gse_cut_up$NES,decreasing = T),10),]
diff_gsea <- kk_gse_cut[head(order(abs(kk_gse_cut$NES),decreasing = T),10),]
#### 经典的GSEA图
up_gsea$Description
代码语言:txt复制## [1] "ECM-receptor interaction - Mus musculus (house mouse)"
## [2] "Protein digestion and absorption - Mus musculus (house mouse)"
## [3] "Hedgehog signaling pathway - Mus musculus (house mouse)"
## [4] "Focal adhesion - Mus musculus (house mouse)"
## [5] "Proteoglycans in cancer - Mus musculus (house mouse)"
## [6] "PI3K-Akt signaling pathway - Mus musculus (house mouse)"
## [7] "Human papillomavirus infection - Mus musculus (house mouse)"
代码语言:text复制i=2
gseap1 <- gseaplot2(kk_gse,
up_gsea$ID[i],#富集的ID编号
title = up_gsea$Description[i],#标题
color = "red", #GSEA线条颜色
base_size = 18,#基础字体大小
rel_heights = c(1.5, 0.5, 1),#副图的相对高度
subplots = 1:3, #要显示哪些副图 如subplots=c(1,3) #只要第一和第三个图
ES_geom = "line", #enrichment score用线还是用点"dot"
pvalue_table = T) #显示pvalue等信息
ggsave(gseap1, filename = 'GSEA_up_1.pdf', width =10, height =8)
代码语言:text复制#### 合并 GSEA通路
gseap2 <- gseaplot2(kk_gse,
up_gsea$ID,#富集的ID编号
title = "UP_GSEA_all",#标题
color = "red",#GSEA线条颜色
base_size = 18,#基础字体大小
rel_heights = c(1.4, 0.5, 1),#副图的相对高度
subplots = 1:3, #要显示哪些副图 如subplots=c(1,3) #只要第一和第三个图
ES_geom = "line",#enrichment score用线还是用点"dot"
pvalue_table = T) #显示pvalue等信息
ggsave(gseap2, filename = "GSEA_up_all.png",width =13,height =12)