转录组GSE157718_Tpm与Count差异分析的比较
在尝试复现GSE157718数据集的时候,发现网站同时提供了表达矩阵tpm形式与count形式,因此分别用这两种形式进行基因差异与富集分析,再进行对比。
注:有count矩阵就用count矩阵
1 Count形式
以count给出的表达矩阵是我们最为熟悉的形式,这里只稍加记录下数据整理的代码,具体的差异富集分析,与其他的流程并无不同。
1 以fread函数导入的数据形式为data.table,设置行名很麻烦,这里先转化为data.frame形式
2 行名或(GeneID列)为ENTREZID,需要转化为SYMBOL
3 归根结底是表达矩阵的形式需要行名为基因名,列为数据集,所有的操作往这个方向努力就行
表达矩阵exp
代码语言:r复制library(data.table)
library(tinyarray)
dat = fread("GSE157718_raw_counts_GRCh38.p13_NCBI.tsv.gz")
dim(dat)
#> [1] 39376 7
colnames(dat)
#> [1] "GeneID" "GSM4773688" "GSM4773689" "GSM4773690" "GSM4773691"
#> [6] "GSM4773692" "GSM4773693"
#data.table转化为data.frame
dat <- as.data.frame(dat)
class(dat)
#> [1] "data.frame"
#ENTREZID转化为SYMBOL
library(org.Hs.eg.db)
library(clusterProfiler)
output <- bitr(dat$GeneID,
fromType = 'ENTREZID',
toType = 'SYMBOL',
OrgDb = 'org.Hs.eg.db')
#> Warning in bitr(dat$GeneID, fromType = "ENTREZID", toType = "SYMBOL", OrgDb =
#> "org.Hs.eg.db"): 4.28% of input gene IDs are fail to map...
colnames(output)
#> [1] "ENTREZID" "SYMBOL"
exp <- merge(dat, output, by.x = "GeneID", by.y = "ENTREZID")
exp <- exp[!duplicated(exp$SYMBOL),]
rownames(exp) <- exp$SYMBOL
exp <- exp[,c(-1,-8)]
exp = as.matrix(exp)
基因过滤与分组信息
代码语言:r复制#基因过滤
#分组信息
colnames(exp) <- c("NS1","NS2","NS3","ES1","ES2","ES3")
library(stringr)
Group = ifelse(str_detect(colnames(exp),"ES"),"ES","NS")
Group = factor(Group,levels = c("ES","NS"))
table(Group)
#> Group
#> ES NS
#> 3 3
data.frame(colnames(exp),Group)
#> colnames.exp. Group
#> 1 NS1 NS
#> 2 NS2 NS
#> 3 NS3 NS
#> 4 ES1 ES
#> 5 ES2 ES
#> 6 ES3 ES
以logFC_t = 2,pvalue_t = 0.05为阈值,以DEseq2,edgeR,limma三个R包分别进行差异分析,最好再去交集进行富集分析的结果如下
2 Tpm形式
Tpm也可以勉强进行差异分析,但是只能取log后,用limma做差异分析
fpkm、rpkm需先转换为Tpm形式,用limma做差异分析
limma差异分析参考基于芯片的分析流程
表达矩阵exp
这次需要将ENSEMBL转换为SYMBOL
代码语言:r复制proj = "GSE157718"
library(data.table)
library(tinyarray)
dat = fread("GSE157718_gene_tpm_matrix.txt")
#data.table转化为data.frame
dat <- as.data.frame(dat)
class(dat)
#> [1] "data.frame"
rownames(dat) <- dat$gene_id
dat <- dat[,-1]
exp = as.matrix(dat)
exp = trans_exp_new(exp,species = "human")
#> Warning in AnnoProbe::annoGene(rownames(exp), ID_type = "ENSEMBL", species =
#> species): 0.13% of input IDs are fail to annotate...
基因过滤与分组信息
重点是基因过滤后(或之前)执行exp <- log(exp 1)
!!!!
代码语言:r复制nrow(exp)
#> [1] 60512
exp = exp[apply(exp, 1, function(x) sum(x > 0) > 0.5*ncol(exp)), ]
nrow(exp)
#> [1] 20938
range(exp)
#> [1] 0.00 35719.68
exp <- log(exp 1)
#分组信息
colnames(exp)
#> [1] "ES1" "ES2" "ES3" "NS1" "NS2" "NS3"
library(stringr)
Group = ifelse(str_detect(colnames(exp),"ES"),"ES","NS")
Group = factor(Group,levels = c("ES","NS"))
table(Group)
#> Group
#> ES NS
#> 3 3
data.frame(colnames(exp),Group)
#> colnames.exp. Group
#> 1 ES1 ES
#> 2 ES2 ES
#> 3 ES3 ES
#> 4 NS1 NS
#> 5 NS2 NS
#> 6 NS3 NS
基因差异分析与可视化
用limma包分析类似基于芯片平台的分析
代码语言:r复制rm(list = ls())
load("GSE157718.Rdata")
table(Group)
#> Group
#> ES NS
#> 3 3
range(exp)
#> [1] 0.00000 10.48349
# 使用limma包进行差异分析
library(limma)
library(dplyr)
design = model.matrix(~Group)
fit = lmFit(exp,design)
fit = eBayes(fit)
deg = topTable(fit,coef = 2,number = Inf)
# 使用阈值
logFC_t = 1
p_t = 0.05
k1 = (deg$P.Value < p_t)&(deg$logFC < -logFC_t)
k2 = (deg$P.Value < p_t)&(deg$logFC > logFC_t)
deg = mutate(deg,change = ifelse(k1,"Down",ifelse(k2,"Up","Not")))
#Down Not Up
# 158 20605 175
table(deg$change)
#>
#> Down Not Up
#> 158 20605 175
DEG = mutate(deg,symbol = rownames(deg))
save(DEG,Group,file = paste0(proj,"_tpm_DEG.Rdata"))
#可视化
library(ggplot2)
this_title <- paste0('Threshold of logFC is 1',
'nThe number of up gene is ',nrow(deg[deg$change == 'Up',]) ,
'nThe number of down gene is ',nrow(deg[deg$change == 'Down',])
)
p1 <- ggplot(data = DEG, aes(x = logFC, y = -log10(P.Value)))
geom_point(alpha=0.4, size=3.5, aes(color=change))
scale_color_manual(values=c("blue", "grey","red"))
geom_hline(yintercept = -log10(p_t),lty=4,col="black",linewidth=0.8)
theme_bw()
ggtitle(this_title )
theme(panel.grid = element_blank(),
plot.title = element_text(size=8,hjust = 0.5),
legend.title = element_blank(),
legend.text = element_text(size=8))
ggsave("DEG_tpm.png",plot = p1,height = 15,width = 13)
富集分析
代码语言:r复制rm(list = ls())
library(clusterProfiler)
library(ggthemes)
library(org.Hs.eg.db)
library(dplyr)
library(ggplot2)
library(stringr)
library(enrichplot)
#(1)输入数据
load("GSE157718_tpm_DEG.Rdata")
library(tinyarray)
g <- rownames(DEG)[DEG$change!="Not"]
output <- bitr(g,
fromType = 'SYMBOL',
toType = 'ENTREZID',
OrgDb = 'org.Hs.eg.db')
gene_diff = output$ENTREZID
#(2)富集
ekk <- enrichKEGG(gene = gene_diff,
organism = 'hsa')
ekk <- setReadable(ekk,
OrgDb = org.Hs.eg.db,
keyType = "ENTREZID")
dotplot(ekk)
3 Count形式 Vs Tpm形式
代码语言:r复制rm(list = ls())
library(data.table)
load("GSE157718_count_DEG.Rdata")
#DEG3为limma包分析的结果
DEG_count <- DEG3
rm(DEG1);rm(DEG2);rm(DEG3)
load("GSE157718_tpm_DEG.Rdata")
DEG_tpm <- DEG;
rm(DEG)
ids=intersect(rownames(DEG_count),
rownames(DEG_tpm))
df1= data.frame(
DEG_count1 = DEG_count[ids,"logFC"],
DEG_tpm1 = DEG_tpm[ids,"logFC"]
)
library(ggpubr)
comp_between_DEG1 <- ggscatter(df1, x = "DEG_count1", y = "DEG_tpm1",
color = "black", shape = 21, size = 3, # Points color, shape and size
add = "reg.line", # Add regressin line
add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
conf.int = TRUE, # Add confidence interval
cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor
cor.coeff.args = list(method = "pearson", label.sep = "n")
)
ggsave("comp_between_count_tpm.png",width = 15,height = 13)
代码语言:r复制df2= data.frame(
DEG_count2 = DEG_count[ids,"change"],
DEG_tpm2 = DEG_tpm[ids,"change"]
)
p2 <- gplots::balloonplot(table(df2))
ggsave("balloon.png",plot = p2,width = 15,height = 13)
代码语言:r复制symbols_list = split(ids,paste(df2[,1],df2[,2]))
library(clusterProfiler)
library(org.Hs.eg.db)
library(ReactomePA)
library(ggplot2)
library(stringr)
# 首先全部的symbol 需要转为 entrezID
gcSample = lapply(symbols_list, function(y){
y=as.character(na.omit(AnnotationDbi::select(org.Hs.eg.db,
keys = y,
columns = 'ENTREZID',
keytype = 'SYMBOL')[,2])
)
y
})
#gcSample
pro='comp'
# 第1个注释是 KEGG
xx <- compareCluster(gcSample, fun="enrichKEGG",
organism="hsa", pvalueCutoff=0.3)
dotplot(xx) theme(axis.text.x=element_text(angle=45,hjust = 1))
scale_y_discrete(labels=function(x) str_wrap(x, width=50))
ggsave(paste0(pro,'comp_kegg.pdf'),width = 10,height = 8)
由此可见,同一个数据集采用Count与Tpm形式做出来的差异与富集分析结果还是有较大差别的,这里的Tpm logFC的阈值为1(设置为2的话分析出来的差异基因只有30左右),同Count 的logFC的阈值为2相比,富集的通路类型反而少了很多。