转录组差异分析—基本流程
1 背景知识
抓住主要矛盾
只需要认准count数据即可
自己的数据、公共数据、数据库、背景知识均不影响差异分析
2 读取与整理
2.1 表达矩阵
读取RawCounts.csv文件,其文件形式如下图行名为ensembleid,列名为样本名称。
代码语言:r复制dat = read.csv("RawCounts.csv",
check.names = F,
row.names = 1)
#read.csv读取后是数据框,需要转换为矩阵
exp = as.matrix(dat)
本次是采用自己的数据作为测试的,如果从GEO上下载数据可以参考如下:
代码语言:r复制library(tinyarray)
get_count_txt("GSE193861")
dat = read.delim("GSE193861_raw_counts_GRCh38.p13_NCBI.tsv.gz",row.names = 1)
dat[1:4,1:4]
GSM5822748 GSM5822749 GSM5822750 GSM5822751
100287102 3 2 1 2
653635 53 39 15 45
102466751 4 1 0 6
107985730 0 0 0 1
# 转换为整数矩阵
exp = round(dat)
# 进行ID转化
exp = trans_entrezexp(exp)
exp[1:4,1:4]
GSM5822748 GSM5822749 GSM5822750 GSM5822751
DDX11L1 3 2 1 2
WASH7P 53 39 15 45
MIR6859-1 4 1 0 6
MIR1302-2HG 0 0 0 1
还有一种常见的格式是每一个GSM数据集被单独做成一个txt.gz文件(如GSE193861)
代码语言:r复制r1 = function(b){
read.delim(paste0("GSE193861_RAW/",b),header = F,row.names = 1)
}
bs = dir("GSE193861_RAW/")
[1] "GSM5822748_con1.txt.gz" "GSM5822749_con2.txt.gz" "GSM5822750_con3.txt.gz"
[4] "GSM5822751_con4.txt.gz" "GSM5822752_con5.txt.gz" "GSM5822753_DOX1.txt.gz"
[7] "GSM5822754_DOX2.txt.gz" "GSM5822755_DOX3.txt.gz" "GSM5822756_DOX4.txt.gz"
[10] "GSM5822757_DOX6.txt.gz"
#lapply返回的是一个列表
dat = lapply(bs, r1)
#新函数 do.call 对列表进行批量操作,对dat中每个元素按照列拼接在一起
exp = do.call(cbind,dat)
在额外添加列名,获得完整的表达矩阵
代码语言:r复制library(stringr)
colnames(exp) = str_split_i(bs,"_|\.",2)
2.2 临床信息
代码语言:r复制clinical = 1 #有临床信息就读取,没有临床信息就写1,占位置,不用改后面代码
2.3 表达矩阵行名ID转换
注意:trans_exp_new ()仅用于ensemble 转symbol
代码语言:r复制library(tinyarray)
exp = trans_exp_new(exp,species = "human")#注意物种
exp[1:4,1:4]
#输出,检查数据
KO1-H1299-SETDB1-KO1 KO2-H1299-SETDB1-KO2 KO3-H1299-SETDB1-KO3
DDX11L1 37 37 45
WASH7P 200 242 286
AL627309.1 27 22 16
AL627309.6 219 222 227
KOC1-H1299-KO-control1
DDX11L1 23
WASH7P 247
AL627309.1 22
AL627309.6 289
2.4 基因过滤
需要过滤一下那些在很多样本里表达量都为0或者表达量很低的基因。过滤标准不唯一。这里使用常用过滤标准2。
代码语言:r复制#过滤之前基因数量
nrow(exp)
#常用过滤标准1:仅去除在所有样本里表达量都为零的基因
exp1 = exp[rowSums(exp)>0,]
nrow(exp1)
# 常用过滤标准2(推荐):仅保留在一半以上样本里表达的基因
exp = exp[apply(exp, 1, function(x) sum(x > 0) > 0.5*ncol(exp)), ]
nrow(exp)
2.5 分组信息的获取
TCGA的数据,直接用make_tcga_group给样本分组(tumor和normal),其他地方的数据分组方式参考芯片数据。
代码语言:r复制colnames(exp)
library(stringr)
Group = ifelse(str_detect(colnames(exp),"control"),"control","KO")
Group = factor(Group,levels = c("control","KO"))
table(Group)
Group
control KO
3 3
#保存为Rdata
#proj <- pip
save(exp,Group,proj,clinical,file = paste0(proj,".Rdata"))
3 基因差异分析
使用三种包差异分析后取交集
代码语言:r复制rm(list = ls())
load("pip.Rdata")
3.1 DEG2
代码语言:r复制library(DESeq2)
colData <- data.frame(row.names =colnames(exp),
condition=Group)
#注意事项:如果多次运行,表达矩阵改动过的话,需要从工作目录下删除下面if括号里的文件
if(!file.exists(paste0(proj,"_dd.Rdata"))){
dds <- DESeqDataSetFromMatrix(
countData = exp,
colData = colData,
design = ~ condition)
dds <- DESeq(dds)
save(dds,file = paste0(proj,"_dd.Rdata"))
}
load(file = paste0(proj,"_dd.Rdata"))
class(dds)
res <- results(dds, contrast = c("condition",rev(levels(Group))))
#constrast
c("condition",rev(levels(Group)))
class(res)
DEG1 <- as.data.frame(res)
library(dplyr)
DEG1 <- arrange(DEG1,pvalue)
DEG1 = na.omit(DEG1)
head(DEG1)
#添加change列标记基因上调下调
logFC_t = 2
pvalue_t = 0.05
k1 = (DEG1$pvalue < pvalue_t)&(DEG1$log2FoldChange < -logFC_t);table(k1)
FALSE TRUE
18213 225
k2 = (DEG1$pvalue < pvalue_t)&(DEG1$log2FoldChange > logFC_t);table(k2)
FALSE TRUE
17941 497
DEG1$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
table(DEG1$change)
DOWN NOT UP
225 17716 497
head(DEG1)
baseMean log2FoldChange lfcSE stat pvalue padj change
ALCAM 4577.774 2.330250 0.06006937 38.79265 0 0 UP
MYO10 2236.271 4.048245 0.07620441 53.12350 0 0 UP
MAP1B 1975.327 -2.601941 0.06879953 -37.81917 0 0 DOWN
HSPB1 6026.809 2.555337 0.06596062 38.74034 0 0 UP
LAMC3 1162.593 3.512105 0.08905396 39.43794 0 0 UP
AL161431.1 2288.839 5.465526 0.10476661 52.16859 0 0 UP
3.2 edgeR
代码语言:r复制library(edgeR)
dge <- DGEList(counts=exp,group=Group)
dge$samples$lib.size <- colSums(dge$counts)
dge <- calcNormFactors(dge)
design <- model.matrix(~Group)
dge <- estimateGLMCommonDisp(dge, design)
dge <- estimateGLMTrendedDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)
fit <- glmFit(dge, design)
fit <- glmLRT(fit)
DEG2=topTags(fit, n=Inf)
class(DEG2)
DEG2=as.data.frame(DEG2)
head(DEG2)
k1 = (DEG2$PValue < pvalue_t)&(DEG2$logFC < -logFC_t);table(k1)
FALSE TRUE
18219 219
k2 = (DEG2$PValue < pvalue_t)&(DEG2$logFC > logFC_t);table(k2)
FALSE TRUE
17947 491
DEG2$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
table(DEG2$change)
DOWN NOT UP
219 17728 491
#pValue太小,显示为0了
head(DEG2)
logFC logCPM LR PValue FDR change
FSTL5 10.057039 4.222769 2564.032 0 0 UP
BST2 9.350844 3.520750 2140.220 0 0 UP
CTCFL 9.228345 6.052543 4721.104 0 0 UP
PCDH15 8.658005 2.835447 1507.044 0 0 UP
FLRT2 8.595937 4.851412 3462.456 0 0 UP
PRSS21 8.017133 3.502738 2164.292 0 0 UP
3.3 limma
代码语言:r复制library(limma)
dge <- edgeR::DGEList(counts=exp)
dge <- edgeR::calcNormFactors(dge)
design <- model.matrix(~Group)
v <- voom(dge,design, normalize="quantile")
fit <- lmFit(v, design)
fit= eBayes(fit)
DEG3 = topTable(fit, coef=2, n=Inf)
DEG3 = na.omit(DEG3)
k1 = (DEG3$P.Value < pvalue_t)&(DEG3$logFC < -logFC_t)
k2 = (DEG3$P.Value < pvalue_t)&(DEG3$logFC > logFC_t)
DEG3$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
table(DEG3$change)
DOWN NOT UP
309 17726 403
head(DEG3)
logFC AveExpr t P.Value adj.P.Val B change
DPEP3 4.631296 5.555757 60.13280 7.628686e-21 5.507727e-17 37.62656 UP
NEFH 4.326803 4.906266 60.14049 7.612615e-21 5.507727e-17 37.38817 UP
MYO10 4.045039 5.019841 59.54911 8.961483e-21 5.507727e-17 37.37704 UP
FHL1 2.930173 5.822318 49.83428 1.690593e-19 5.195192e-16 34.98868 UP
NTSR1 3.379769 5.688196 49.88634 1.661755e-19 5.195192e-16 34.97232 UP
DHRS2 6.873437 4.445805 51.64176 9.398283e-20 4.332139e-16 33.89399 UP
三种方法做差异分析得出来的结果对比
代码语言:r复制tj = data.frame(deseq2 = as.integer(table(DEG1$change)),
edgeR = as.integer(table(DEG2$change)),
limma_voom = as.integer(table(DEG3$change)),
row.names = c("down","not","up")
);tj
deseq2 edgeR limma_voom
down 225 219 309
not 17716 17728 17726
up 497 491 403
save(DEG1,DEG2,DEG3,Group,tj,file = paste0(proj,"_DEG.Rdata"))
4 差异基因可视化
在这一步做个处理去除文库大小的影响
代码语言:r复制library(ggplot2)
library(tinyarray)
exp[1:4,1:4]
# cpm 去除文库大小的影响
dat = log2(cpm(exp) 1)
pca.plot = draw_pca(dat,Group);pca.plot
save(pca.plot,file = paste0(proj,"_pcaplot.Rdata"))
代码语言:r复制cg1 = rownames(DEG1)[DEG1$change !="NOT"]
cg2 = rownames(DEG2)[DEG2$change !="NOT"]
cg3 = rownames(DEG3)[DEG3$change !="NOT"]
h1 = draw_heatmap(dat[cg1,],Group)
h2 = draw_heatmap(dat[cg2,],Group)
h3 = draw_heatmap(dat[cg3,],Group)
v1 = draw_volcano(DEG1,pkg = 1,logFC_cutoff = logFC_t)
v2 = draw_volcano(DEG2,pkg = 2,logFC_cutoff = logFC_t)
v3 = draw_volcano(DEG3,pkg = 3,logFC_cutoff = logFC_t)
library(patchwork)
(h1 h2 h3) / (v1 v2 v3) plot_layout(guides = 'collect') &theme(legend.position = "none")
ggsave(paste0(proj,"_heat_vo.png"),width = 15,height = 10)
代码语言:r复制UP=function(df){
rownames(df)[df$change=="UP"]
}
DOWN=function(df){
rownames(df)[df$change=="DOWN"]
}
up = intersect(intersect(UP(DEG1),UP(DEG2)),UP(DEG3))
down = intersect(intersect(DOWN(DEG1),DOWN(DEG2)),DOWN(DEG3))
dat = log2(cpm(exp) 1)
hp = draw_heatmap(dat[c(up,down),],Group)
#上调、下调基因分别画维恩图
up_genes = list(Deseq2 = UP(DEG1),
edgeR = UP(DEG2),
limma = UP(DEG3))
down_genes = list(Deseq2 = DOWN(DEG1),
edgeR = DOWN(DEG2),
limma = DOWN(DEG3))
up.plot <- draw_venn(up_genes,"UPgene")
down.plot <- draw_venn(down_genes,"DOWNgene")
#维恩图拼图
library(patchwork)
#up.plot down.plot
# 拼图
pca.plot hp up.plot down.plot plot_layout(guides = "collect")
ggsave(paste0(proj,"_heat_ve_pca.png"),width = 15,height = 10)
代码语言:r复制#分组聚类的热图
library(ComplexHeatmap)
library(circlize)
col_fun = colorRamp2(c(-2, 0, 2), c("#2fa1dd", "white", "#f87669"))
top_annotation = HeatmapAnnotation(
cluster = anno_block(gp = gpar(fill = c("#f87669","#2fa1dd")),
labels = levels(Group),
labels_gp = gpar(col = "white", fontsize = 12)))
m = Heatmap(t(scale(t(exp[c(up,down),]))),name = " ",
col = col_fun,
top_annotation = top_annotation,
column_split = Group,
show_heatmap_legend = T,
border = F,
show_column_names = F,
show_row_names = F,
use_raster = F,
cluster_column_slices = F,
column_title = NULL)
m