表观调控13张图之三。。。

2019-12-26 15:40:24 浏览数 (2)

本章节我们的视频审查员(刘博-二货潜)将继续带领大家学习视频,并且复现附件中Figure S13的一张图,如下:

本文的目录如下:

  1. DESeq2 寻找差异基因
  2. 可视化

(1)MAplot:图a和图b

(2)差异基因韦恩图:UpSetR/VennDiagram

(3)两样本 log2FC 相关性散点图

DESeq2寻找差异基因

万事开头前先看包的说明书:DESeq2 manual ,里面是讲的很详细很全面的,所以下面与文章不相关的图就不展示出来了。

代码语言:javascript复制
rm(list = ls())
options(stringsAsFactors = F)
a = read.table('../figure-01-check-gene-expression/all.counts.id.txt', header = T)
dim(a)
dat = a[,7:16]

# 第一列为基因名,将其赋值给行名, 做韦恩图需要
rownames(dat)=a[, 1]

# 查看前四行和前四列信息
dat[1:4, 1:4]
library(stringr)

# 要擅用 str_split 切割字符串,表示按照下划线 "_" 对列名进行切割,取第一列;即样本名
# 三组,每个样品一组,即 PhoKO、SppsKO、WT
group_list = str_split(colnames(dat), '_', simplify = T)[, 1] 
table(group_list)

######################################################################
###################      Firstly for DEseq2      #####################
######################################################################
# 一步运行
# 这里我们主要是得到差异基因中间部分就不细说
if(T){
  # 赋值表达矩阵和分组信息为一个新的变量,方便以后直接调用
  exprSet = dat
  suppressMessages(library(DESeq2)) # 加载包

  (colData <- data.frame(row.names = colnames(exprSet), 
                         group_list = group_list) )

  # 构建一个表达矩阵
  dds <- DESeqDataSetFromMatrix(countData = exprSet,
                                colData = colData,
                                design = ~ group_list)
  dds <- DESeq(dds)
  png("qc_dispersions.png", 1000, 1000, pointsize = 20)
  plotDispEsts(dds, main="Dispersion plot")
  dev.off()


  rld <- rlogTransformation(dds)
  exprMatrix_rlog = assay(rld) 
  #write.csv(exprMatrix_rlog,'exprMatrix.rlog.csv' )

  normalizedCounts1 <- t( t(counts(dds)) / sizeFactors(dds) )
  # normalizedCounts2 <- counts(dds, normalized=T) # it's the same for the tpm value
  # we also can try cpm or rpkm from edgeR pacage
  exprMatrix_rpm = as.data.frame(normalizedCounts1) 
  head(exprMatrix_rpm)
  #write.csv(exprMatrix_rpm,'exprMatrix.rpm.csv' )

  png("DEseq_RAWvsNORM.png", height = 800, width = 800)
  par(cex = 0.7)
  n.sample = ncol(exprSet)
  if(n.sample > 40) par(cex = 0.5)
  cols <- rainbow(n.sample*1.2)
  par(mfrow = c(2, 2))
  boxplot(exprSet, col = cols,main = "expression value", las = 2)
  boxplot(exprMatrix_rlog, col = cols, main = "expression value", las = 2)
  hist(as.matrix(exprSet))
  hist(exprMatrix_rlog)
  dev.off()

  library(RColorBrewer)
  (mycols <- brewer.pal(8, "Dark2")[1:length(unique(group_list))])
  cor(as.matrix(exprSet))
  # Sample distance heatmap
  sampleDists <- as.matrix(dist(t(exprMatrix_rlog)))
  #install.packages("gplots",repos = "http://cran.us.r-project.org")
  library(gplots)

  png("qc-heatmap-samples.png", w = 1000, h = 1000, pointsize = 20)
  heatmap.2(as.matrix(sampleDists), key = F, trace = "none",
            col = colorpanel(100, "black", "white"),
            ColSideColors = mycols[group_list], RowSideColors = mycols[group_list],
            margin = c(10, 10), main="Sample Distance Matrix")
  dev.off()

  cor(exprMatrix_rlog) 

  table(group_list)
  res <- results(dds, 
                 contrast=c("group_list", "SppsKO", "WT"))
  resOrdered <- res[order(res$padj),]
  head(resOrdered)
  DEG_SppsKO = as.data.frame(resOrdered)
  DEG_SppsKO = na.omit(DEG_SppsKO)

  table(group_list)
  res <- results(dds, 
                 contrast = c("group_list","PhoKO","WT"))
  resOrdered <- res[order(res$padj),]
  head(resOrdered)
  DEG_PhoKO = as.data.frame(resOrdered)
  DEG_PhoKO = na.omit(DEG_PhoKO)
  save(DEG_PhoKO, DEG_SppsKO,file = 'deg_output.Rdata')
}

简化一下,如果不需要中间的信息,我们只需要差异基因的话,那么只需要运行以下代码:

代码语言:javascript复制
if(T){
  # 赋值表达矩阵和分组信息为一个新的变量,方便以后直接调用
  exprSet = dat
  suppressMessages(library(DESeq2)) # 加载包

  (colData <- data.frame(row.names = colnames(exprSet), 
                         group_list = group_list) )

  # 构建一个表达矩阵
  dds <- DESeqDataSetFromMatrix(countData = exprSet,
                                colData = colData,
                                design = ~ group_list)
  dds <- DESeq(dds)

# 下面我们得到 Spps 突变后的差异基因    
  res <- results(dds, 
                 contrast=c("group_list", "SppsKO", "WT")) 
# 注意这里是前面比后面,别把顺序搞反了,到时候一不注意结果就是反的。所以拿差异倍数对着原始 reads 比较一下。

  resOrdered <- res[order(res$padj),] # 按照 padj 值进行排序
  head(resOrdered)
  DEG_SppsKO = as.data.frame(resOrdered) # 将数据转变为 data.frame 数据框
  DEG_SppsKO = na.omit(DEG_SppsKO) # 去除包含 NA 值的行

# 下面我们得到 Pho 突变后的差异基因 
  table(group_list)
  res <- results(dds, 
                 contrast = c("group_list","PhoKO","WT"))
  resOrdered <- res[order(res$padj),]
  head(resOrdered)
  DEG_PhoKO = as.data.frame(resOrdered)
  DEG_PhoKO = na.omit(DEG_PhoKO)

# 将关键结果以 Rdata 形式保存到本地,下次如有需要就不需要重新用 DESeq2 计算了    
  save(DEG_PhoKO, DEG_SppsKO,file = 'deg_output.Rdata')
}

好了,上面我们就得到了差异基因。


可视化

1

MAplot: 图 a 和 图 b

什么是 MAplot ?DESeq2 包说明书中的一段话:

In DESeq2, the function plotMA shows the log2 fold changes attributable to a given variable over the mean of normalized counts for all the samples in the DESeqDataSet. Points will be colored red if the adjusted p value is less than 0.1. Points which fall out of the window are plotted as open triangles pointing either up or down.

也就是说表示的是变化倍数 log2(Fold change) 与所有样本标准化后的 counts 数的平均值之间的关系。那么怎么画呢 ?如果看过 DESeq2 说明书就知道这是 MA-plot 部分。由于我们这里是将三组都放在一个 dds 中,所以我们得分别挑出 Pho 和 Spps 进行可视化。

首先查看 dds 中的分组情况:

代码语言:javascript复制
resultsNames(dds)

[1] "Intercept"                  "group_list_SppsKO_vs_PhoKO" "group_list_WT_vs_PhoKO" 

lfcShrink 收缩 FC 三种方法如下( 这里直接放原文):

  • normal is the the original DESeq2 shrinkage estimator, an adaptive Normal distribution as prior. This is currently the default, although the default will likely change to apeglm in the October 2018 release given apeglm’s superior performance.
  • apeglm is the adaptive t prior shrinkage estimator from the apeglm package (Zhu, Ibrahim, and Love 2018).
  • ashr is the adaptive shrinkage estimator from the ashr package (Stephens 2016). Here DESeq2 uses the ashr option to fit a mixture of Normal distributions to form the prior, with method="shrinkage".

绘制 Spps

代码语言:javascript复制
# apeglm 方法需要安装 apeglm 包
# BiocManager::install("apeglm")

# ashr 方法同样需要安装额外依赖的包
# BiocManager::install("ashr")

Spps_resLFC <- lfcShrink(dds, coef  = "group_list_SppsKO_vs_PhoKO", type = "apeglm")
Spps_resNorm <- lfcShrink(dds, coef = "group_list_SppsKO_vs_PhoKO", type = "normal")
Spps_resAsh <- lfcShrink(dds, coef  = "group_list_SppsKO_vs_PhoKO", type = "ashr")

par(mfrow=c(1,3), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-3,3)

plotMA(Spps_resLFC, xlim  = xlim, ylim = ylim, main = "apeglm")
plotMA(Spps_resNorm, xlim = xlim, ylim = ylim, main = "normal")
plotMA(Spps_resAsh, xlim  = xlim, ylim = ylim, main = "ashr")

dev.off()

绘制 Pho

代码语言:javascript复制
Pho_resLFC <- lfcShrink(dds, coef  = "group_list_WT_vs_PhoKO", type = "apeglm")
Pho_resNorm <- lfcShrink(dds, coef = "group_list_WT_vs_PhoKO", type = "normal")
Pho_resAsh <- lfcShrink(dds, coef  = "group_list_WT_vs_PhoKO", type = "ashr")

par(mfrow=c(1,3), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-3,3)

plotMA(Pho_resLFC, xlim  = xlim, ylim = ylim, main = "apeglm")
plotMA(Pho_resNorm, xlim = xlim, ylim = ylim, main = "normal")
plotMA(Pho_resAsh, xlim  = xlim, ylim = ylim, main = "ashr")
dev.off()

好了,我们选取 normal ( 开心就好,你选哪个 ),来绘制图 a 和 b

代码语言:javascript复制
par(mfrow=c(1,2), mar=c(4,4,2,1))
plotMA(Spps_resNorm, xlim = xlim, ylim = ylim, main = "Spps_normal")
plotMA(Pho_resNorm, xlim = xlim, ylim = ylim, main = "Pho_normal")
dev.off()

2

差异基因韦恩图:UpSetR/VennDiagram

我们下面将用两种方式来展示交集

第一种:我们使用 R 包 UpSetR 来绘制差异基因之间的韦恩图( 多组时候,这种更加一目了然 )

代码语言:javascript复制
library(UpSetR)
load(file = 'deg_output.Rdata')

colnames(DEG_PhoKO)
DEG_PhoKO$change = ifelse(DEG_PhoKO$padj > 0.05,'stable',
                          ifelse(DEG_PhoKO$log2FoldChange > 0, 'up', 'down'))

DEG_SppsKO$change=ifelse(DEG_SppsKO$padj>0.05, 'stable',
                         ifelse(DEG_SppsKO$log2FoldChange > 0, 'up', 'down'))

SppsKO_up   = rownames(DEG_SppsKO[DEG_SppsKO$change == 'up',])
SppsKO_down = rownames(DEG_SppsKO[DEG_SppsKO$change == 'down',])
PhoKO_up    = rownames(DEG_PhoKO[DEG_PhoKO$change == 'up',])
PhoKO_down  = rownames(DEG_PhoKO[DEG_PhoKO$change == 'down',])

allG = unique(c(SppsKO_up, SppsKO_down, PhoKO_up, PhoKO_down))

df = data.frame(allG = allG,
              SppsKO_up   = as.numeric(allG %in% SppsKO_up),
              SppsKO_down = as.numeric(allG %in% SppsKO_down),
              PhoKO_up    = as.numeric(allG %in% PhoKO_up),
              PhoKO_down  = as.numeric(allG %in% PhoKO_down))

upset(df)
第二种:我们使用 VennDiagram来展示,也是就是文中那种图
代码语言:javascript复制
# 这里直接 copy 琪同学的
# 链接: https://mp.weixin.qq.com/s/Pg0mjz7mD73atMnHz7jv1A

# 导入本地字体,不然 `Arial` 字体会警告
library("extrafont")
font_import()
loadfonts()

load(file = 'deg_output.Rdata')

colnames(DEG_PhoKO)
DEG_PhoKO$change = ifelse(DEG_PhoKO$padj > 0.05,'stable',
                          ifelse(DEG_PhoKO$log2FoldChange > 0, 'up', 'down'))

DEG_SppsKO$change=ifelse(DEG_SppsKO$padj>0.05, 'stable',
                         ifelse(DEG_SppsKO$log2FoldChange > 0, 'up', 'down'))

SppsKO_up   = rownames(DEG_SppsKO[DEG_SppsKO$change == 'up',])
SppsKO_down = rownames(DEG_SppsKO[DEG_SppsKO$change == 'down',])
PhoKO_up    = rownames(DEG_PhoKO[DEG_PhoKO$change == 'up',])
PhoKO_down  = rownames(DEG_PhoKO[DEG_PhoKO$change == 'down',])

library(VennDiagram)
venn.diagram(
  x = list(
    "Up in phoKO"    = PhoKO_up,
    "Down in phoKO"  = PhoKO_down,
    "Up in SppsKO"   = SppsKO_up,
    "Down in SppsKO" = SppsKO_down
    ),
  filename       = "Venn.png", # 保存到当前目录
  # stroke 颜色 形式 粗细
  col            = "#000000", lwd = 3, lty = 1,
  # 填充 透明度
  # 颜色可以参考前一篇,使用 takecolor 自己取色
  fill           = c("#D3E7F0", "#9FBEDB", "#A0D191", "#6DAE8A"),
  alpha          = 0.50,
  # 里外字体大小形式
  cex            = 1.5,
  fontfamily     = "Arial",
  fontface       = "bold",
  cat.cex        = 1.5,
  cat.dist       = 0.2,
  cat.fontfamily = "Arial",
  # 图像整体位置 分辨率
  margin         = 0.2,
  resolution     = 300)

与文章趋势基本一致。可以看出 SppsPho 共同调控许多基因,说明这两基因有一定的关系。

3

两样本 log2FC 相关性散点图

代码语言:javascript复制
load(file = 'deg_output.Rdata')
library(ggpubr)
DEG_PhoKO = DEG_PhoKO[rownames(DEG_SppsKO),]
po = data.frame(SppsKO = DEG_SppsKO$log2FoldChange,
              PhoKO = DEG_PhoKO$log2FoldChange)

sp <- ggscatter(po, 'SppsKO', 'PhoKO',
                add        = "reg.line",  # Add regressin line
                add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
                conf.int   = TRUE # Add confidence interval
)
# Add correlation coefficient
sp   stat_cor(method  = "pearson", 
              label.x = -10, label.y = 10) # 相关系数显示位置

从上图可以看出,两者的相关系数高达0.53,这在两个基因间是算具有很强的相关的相关性了,更加佐证了上图的韦恩图的结果。

好了,此部分就到这里了。

0 人点赞