#RSEM定量后直接生成FPKM,无需标准化
#RNA-seq下游-1有些混乱,重新整理
#与原文存在差异的原因是原文mRNA-seq要对注释gtf文件对进行过滤甲基化区域和polyA尾以及原文用的hg19 74gtf 本文用的hg38.
title: "RNAseq-下游分析-2"
output: html_document
date: "2023-10-26"
R Markdown
代码语言:text复制#RSEM定量后直接生成FPKM,无需标准化
#RNA-seq下游-1有些混乱,重新整理
#与原文存在差异的原因是mRNA-seq要对gtf文件去
database <- read.table(file = "D:/huage1/rnaseq1014/xiaohe/output.matrix", sep = "t", header = T, row.names = 1)
library(magrittr)
library(dplyr)
database$ensembl_id<-rownames(database)
database$ensembl_id <- as.character(database$ensembl_id)
filtered_database <- database %>%
filter(!grepl("_PAR_Y$", database$ensembl_id))
names(filtered_database)[names(filtered_database) == 'ensembl_id'] <- 'v1'
library(tidyr)
database1 <- separate(filtered_database,v1,into = "ensembl_id",sep = "[.]")
names(database1)[5] <- "ensembl_id"
rownames(database1)=database1$ensembl_id
database2=database1[,c(1,2,3,4)]
colnames(database2) <- c("BHLHE40-rep1","BHLHE40-rep2","Control-rep1","Control-rep2")
library(DESeq2)
library(stringr)
library(dbplyr)
#BiocManager::install("DESeq2")
#BiocManager::install("dbplyr")
database <- round(as.matrix(database2))
condition <- factor(c(rep("treat",2),rep("control",2)))
coldata <- data.frame(row.names =colnames(database),condition)
library(tidyr)
dds <- DESeqDataSetFromMatrix(countData = database,colData = coldata,design = ~condition)
##countData用于说明数据来源,colData用于说明不同组数据的实验操作类型,design用于声明自变量,即谁和谁进行对比
nrow(dds)
代码语言:txt复制## [1] 61498
代码语言:text复制dds2 <- dds[rowSums(counts(dds))>10,]
nrow(dds2)
代码语言:txt复制## [1] 18084
代码语言:text复制#上面这步操作的目的是为了筛选数据。这步操作可以删除那些在所有样本中的表达计数总和不大于1的基因,因为这些基因的表达水平可能过于微弱,对于后续的分析可能不具有重要性或者可靠性。通过这种方式,可以减少噪声和潜在的误差,提高数据分析的准确性。
dds3 <- DESeq(dds2)#用于对dds数据进行运算及分析
res <- results(dds3)
res=as.data.frame(res)
class(res)
代码语言:txt复制## [1] "data.frame"
代码语言:text复制res1 <- res[order(res$padj, res$log2FoldChange, decreasing = c(FALSE, TRUE)), ]
#依次按照pvalue值log2FoldChange值进行排序
res1_up<- res1[which(res1$log2FoldChange >= 1 & res1$padj < 0.05),]
res1_down<- res1[which(res1$log2FoldChange <= -1 & res1$padj < 0.05),]
res1_total <- rbind(res1_up,res1_down)
#1.热图
library(pheatmap)
df <- database[intersect(rownames(database),rownames(res1_total)),]
df2<- as.matrix(df)
class(df2)
代码语言:txt复制## [1] "matrix" "array"
代码语言:text复制df3<-df2[, c(1,2,3,4)]
library(pheatmap)
pheatmap(df3,
show_rownames = F,
show_colnames = T,
cluster_cols = F,
cluster_rows=T,
height=10,
scale = "row",
frontsize = 10,
angle_col=45,
color =colorRampPalette(c("blue", "white","red"))(100))
library(pheatmap)
代码语言:text复制#2.火山图
res11 <- data.frame(res1, stringsAsFactors = FALSE, check.names = FALSE)
genes<- res11
class(genes)
代码语言:txt复制## [1] "data.frame"
代码语言:text复制library(ggplot2)
library(ggrepel)
genes$significant="stable"
genes$significant[genes$log2FoldChange>= 1 & genes$pvalue <0.05]="up"
genes$significant[genes$log2FoldChange<= -1 & genes$pvalue <0.05]="down"
p=ggplot(genes,aes(x=log2FoldChange,y=-1*log10(padj))) xlim(-3,3) ylim(0,9)
geom_point(aes(color=significant),size=0.8) theme_classic()
scale_color_manual(values = c("#2a9d8f","#f8961e","#EE7AE9"))
geom_hline(yintercept = 1.3,linetype=4,size=0.3)
geom_vline(xintercept = c(-1.5,1.5),linetype=4,size=0.3)
theme(title=element_text(size = 18),text = element_text(size=18))
labs(x="log2 fold change ",y="-log10(p_value)")
p
代码语言:text复制#3.富集分析
library(dplyr)
library(tidyverse)
library(clusterProfiler)
library(enrichplot)
library(tidyverse)
library(ggstatsplot)
library(ggnewscale)
library(statsExpressions)
#install.packages("statsExpressions")
library(ggstatsplot)
options(pkgType="binary")
library(org.Hs.eg.db)
keytypes(org.Hs.eg.db)
代码语言:txt复制## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT"
## [5] "ENSEMBLTRANS" "ENTREZID" "ENZYME" "EVIDENCE"
## [9] "EVIDENCEALL" "GENENAME" "GENETYPE" "GO"
## [13] "GOALL" "IPI" "MAP" "OMIM"
## [17] "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM"
## [21] "PMID" "PROSITE" "REFSEQ" "SYMBOL"
## [25] "UCSCKG" "UNIPROT"
代码语言:text复制res1_up$ensembl_id=rownames(res1_up)
ego_cc<-enrichGO(gene = res1_up$ensembl_id,
OrgDb = org.Hs.eg.db,
keyType = 'ENSEMBL',
ont = "CC",
pAdjustMethod = "BH",
pvalueCutoff = 0.3,
qvalueCutoff = 0.3)
barplot(ego_cc,showCategory = 10)
代码语言:text复制dotplot(ego_cc,showCategory = 10)
代码语言:text复制library(ggplot2)
dotplot(ego_cc,showCategory = 10)
代码语言:text复制ego_MF = enrichGO(gene =res1_up$ensembl_id,
OrgDb = org.Hs.eg.db,
keyType = "ENSEMBL",
ont = "MF",
pvalueCutoff = 0.3,
qvalueCutoff = 0.15)
dotplot(ego_MF,showCategory = 10)
代码语言:text复制barplot(ego_MF,showCategory = 10)
代码语言:text复制ego_BP = enrichGO(gene =res1_up$ensembl_id,
OrgDb = org.Hs.eg.db,
keyType = "ENSEMBL",
ont = "BP",
pvalueCutoff = 0.25,
qvalueCutoff = 0.1)
dotplot(ego_BP,showCategory = 7)
代码语言:text复制barplot(ego_BP,showCategory = 7)
](figure/unnamed-chunk-1-9.png)
代码语言:text复制library(clusterProfiler)
gene.df<-bitr(res1_up$ensembl_id, fromType = "ENSEMBL",
toType = c("SYMBOL","ENTREZID"),
OrgDb = org.Hs.eg.db)
kk<-enrichKEGG(gene = as.numeric(gene.df$ENTREZID),
organism = 'hsa',
pvalueCutoff = 0.2,
qvalueCutoff =0.2)
dotplot(kk)
代码语言:text复制barplot(kk)
代码语言:text复制library(VennDiagram)
dev.off()
代码语言:txt复制## png
## 4
代码语言:text复制res1_down$ensembl_id=rownames(res1_down)
venn_list <- list(group1 = res1_up$ensembl_id, group2 = res1_down$ensembl_id)
class(res1_up$ensembl_id)
代码语言:txt复制## [1] "character"
代码语言:text复制a=venn.diagram(venn_list, filename = 'vennzhongji.png', imagetype = 'png',
fill = c('red', 'blue'), alpha = 0.50, cat.col = rep('black', 2),
col = 'black', cex = 1,fontfamily = 'serif',
cat.cex = 1, cat.fontfamily = 'serif')
a
代码语言:txt复制## [1] 1
代码语言:text复制class(venn_list)
代码语言:txt复制## [1] "list"
代码语言:text复制grid.force("venn2.png")
ggsave("venn2.png")
library(ggplot2)
#聚类图
vsd <- vst(dds2, blind = FALSE)
sampleDists <- dist(t(assay(vsd)))
hc <- hclust(sampleDists, method = "ward.D2")
plot(hc, hang = -1)
代码语言:text复制#vsd <- vst(dds2, blind = FALSE) 是在对一个数据集 dds2 进行标准化处理。vst 是一个函数,用于进行标准化处理,其中 blind = FALSE 表示不是盲标准化,即默认情况下,每个特征(基因)都被除以各自的方差进行标准化。
#sampleDists <- dist(t(assay(vsd))) 是计算样本间的距离。dist 函数用于计算欧几里得距离,t 函数用于转置矩阵,assay 函数用于提取数据集 vsd 的样本。
#hc <- hclust(sampleDists, method = "ward.D2") 是进行层次聚类。hclust 函数用于进行层次聚类,其中 method = "ward.D2" 表示使用最小方差法(ward's method)进行聚类,并且计算的是D2距离。
#plot(hc, hang = -1) 是绘制树状图。plot 函数用于绘制树状图,其中 hang = -1 表示树状图的横轴从底部开始绘制。
#BiocManager::install("factoextra")
library(factoextra)
#美观一下
#install.packages("ggplot2")
ress <- hcut(sampleDists, k = 2, stand = TRUE)
# Visualize 聚类
fviz_dend(ress,
# 加边框
rect = TRUE,
# 边框颜色
rect_border="cluster",
# 边框线条类型
rect_lty=2,
# 边框线条粗细
lwd=1.2,
# 边框填充
rect_fill = T,
# 字体大小
cex = 1,
# 字体颜色
color_labels_by_k=T,
# 平行放置
horiz=T)
代码语言:text复制#PCA图
vsd <- vst(dds2, blind = FALSE)
plotPCA(vsd, intgroup = "condition")
代码语言:text复制#看一下基因是否发生变化
plotCounts(dds3, gene = "ENSG00000000003", intgroup=c("condition"))
代码语言:text复制#美化一下图
plotdata <- plotCounts(dds3, gene = "ENSG00000000003", intgroup=c("condition"),returnData = T)
library(ggplot2)
ggplot(plotdata,aes(x=condition,y=count,col=condition))
geom_point()
theme_bw()
代码语言:text复制library(DESeq2)
#MA图
dds4 <- results(dds3,contrast = c("condition","treat","control"),alpha = 0.05)
plotMA(dds4, ylim=c(-2,2))
代码语言:text复制#我们发现在左侧,有很多counts很小的基因,发生了很大的变化,但是没有明显意义。他们的counts很小,但波动性很大,对logFC产生了很大的影响。
#矫正后的MA图 在这句代码中,dd2 <- lfcShrink(dds, contrast=contrast, res=dd1),lfcShrink是一个函数,它对数据集dds进行某种形式的"收缩"处理。这种处理可能涉及到统计假设检验中的标准化或者归一化等步骤。
#数据收缩:lfcShrink:两种数据特别需要:低表达量占比高的 & 数据特别分散的:
#normal 是DESeq2包原始的收缩估计量(shrikage estimator),自适应正态先验分布(adaptive normal prior)
#apeglm是apeglm包中的收缩估计量,自适应t先验分布(adaptive t prior)
#ashr是ashr包中的收缩估计量。
#BiocManager::install("apeglm")
library(apeglm)
#第一种校正方法
#resAsh <- lfcShrink(dds, coef="group_list_1_vs_0", type="ashr")
#plotMA(resAsh, ylim=c(-3,3))
##第二种
#resLFC <- lfcShrink(dds3, coef=2)
#plotMA(resLFC, ylim=c(-3,3))
#适合本文校正方法
resNorm <- lfcShrink(dds3, coef=2, type="normal")
plotMA(resNorm, ylim=c(-3,3))
代码语言:text复制#获取与数据集 dds3 相关的结果或结果的名称
resultsNames(dds3)
代码语言:txt复制## [1] "Intercept" "condition_treat_vs_control"
代码语言:text复制summary(resNorm, alpha = 0.05)
代码语言:txt复制##
## out of 18084 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 49, 0.27%
## LFC < 0 (down) : 36, 0.2%
## outliers [1] : 0, 0%
## low counts [2] : 3156, 17%
## (mean count < 11)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
代码语言:text复制#把差异分析的结果转化成data.frame的格式
library(dplyr)
library(tibble)
res2 <- resNorm %>%
data.frame() %>%
rownames_to_column("ensembl_id")
#基因id转换
library(AnnotationDbi)
library(org.Hs.eg.db)
res2$symbol <- mapIds(org.Hs.eg.db,
keys=res2$ensembl_id,
column="SYMBOL",
keytype="ENSEMBL",
multiVals="first")
res2$entrez <- mapIds(org.Hs.eg.db,
keys=res2$ensembl_id,
column="ENTREZID",
keytype="ENSEMBL",
multiVals="first")
#制作genelist
library(dplyr)
gene_df <- res2 %>%
dplyr::select(ensembl_id,log2FoldChange,symbol,entrez) %>%
## 去掉NA
filter(entrez!="NA") %>%
## 去掉重复
distinct(entrez,.keep_all = T)
geneList <- gene_df$log2FoldChange
names(geneList) = gene_df$entrez
geneList = sort(geneList, decreasing = TRUE)
head(geneList)
代码语言:txt复制## 10628 4210 23581 56667 720 8322
## 1.279122 1.261769 1.122895 1.068193 1.053421 1.044479
代码语言:text复制#GSEA分析
library(clusterProfiler)
gseaKEGG <- gseKEGG(geneList = geneList,
organism = 'hsa',
nPerm = 1000,
minGSSize = 20,
pvalueCutoff = 0.05,
verbose = FALSE)
library(ggplot2)
dotplot(gseaKEGG,showCategory=4,split=".sign") facet_grid(~.sign)
代码语言:text复制gseaKEGG_results <- gseaKEGG@result
library(enrichplot)
pathway.id = "hsa04060"
gseaplot2(gseaKEGG,
color = "red",
geneSetID = pathway.id,
pvalue_table = T)
代码语言:text复制library(pathview)
class(geneList)
代码语言:txt复制## [1] "numeric"
代码语言:text复制geneList_df <- data.frame(geneList)
pv.out <- pathview(gene.data = geneList,
pathway.id = "hsa05322",
species = "hsa",
same.layer = T)
pv.out2 <- pathview(gene.data = geneList,
pathway.id = "hsa05133",
species = "hsa",
same.layer = T,
kegg.native = F)
#我们现在知道cell cycle是被抑制的,如果还想看一下这个通路里面的基因是如何变化的,应该怎么办呢,pathview 可以帮到我们。