之前的推文中我们介绍了如何缩小基因集范围,拿到表达矩阵,这时想要初步查看所挑选基因集在分组中是否有差异,我们用箱线图和热图尝试一下。
处理表达矩阵和生存数据
我们可以接着之前TCGA的XENA的转录组测序表达量矩阵预处理中,id转换之后的矩阵继续进行处理:
代码语言:javascript复制n_t_exp = dat
dim(n_t_exp)
#[1] 38953 151
n_t_exp[1:4,1:4]
# TCGA-AB-2949-03B TCGA-AB-2918-03A TCGA-AB-2943-03A TCGA-AB-2851-03A
#ZZZ3 3290 2830 4059 2087
#ZZEF1 7002 7297 6663 7688
#ZYX 10026 15461 10270 27210
#ZYG11B 1810 1645 1919 1718
tmp = as.numeric( substring(colnames(n_t_exp),14,15)) < 10 #用TCGA样本名确定tumor/normal
table(tmp)
#tmp
#TRUE
# 151
gp = ifelse( tmp ,'tumor','normal')
table(gp)
#gp
#tumor
# 151
#此时TCGA中LAML的数据是没有normal样本的,不过我们还是把去除normal样本的代码走一遍
#去除normal样本,后面做的是生存结局的分组,而不是tumor-normal的
dat <- dat[,tmp]
dim(dat)
#[1] 38953 151
gp = c(rep('tumor',ncol(dat)))
table(gp)
#gp
#tumor
# 151
n_t_exp = dat
colnames(n_t_exp) = substring(colnames(n_t_exp),1,15)
save(n_t_exp,gp,file = 'output/rdata/0.expr.all.Rdata')
然后处理临床信息:
代码语言:javascript复制#读入临床信息
org_surv <- fread('input/AML_survival.txt',data.table = F)
org_surv[1:4,1:4]
# sample _PATIENT OS OS.time
#1 TCGA-AB-2802-03 TCGA-AB-2802 1 365
#2 TCGA-AB-2803-03 TCGA-AB-2803 1 792
#3 TCGA-AB-2804-03 TCGA-AB-2804 0 2557
#4 TCGA-AB-2805-03 TCGA-AB-2805 1 577
colnames(org_surv)
# [1] "sample" "_PATIENT" "OS" "OS.time" "DSS" "DSS.time" "DFI" "DFI.time" "PFI"
#[10] "PFI.time" "Redaction"
mean(org_surv$OS.time,na.rm = T)
#[1] 560.8441
org_surv$OS.time <- org_surv$OS.time/365
mean(org_surv$OS.time,na.rm = T)
#[1] 1.536559
org_surv[1:4,1:4]
# sample _PATIENT OS OS.time
#1 TCGA-AB-2802-03 TCGA-AB-2802 1 1.000000
#2 TCGA-AB-2803-03 TCGA-AB-2803 1 2.169863
#3 TCGA-AB-2804-03 TCGA-AB-2804 0 7.005479
#4 TCGA-AB-2805-03 TCGA-AB-2805 1 1.580822
rownames(org_surv) <- org_surv$sample
org_surv[1:4,1:4]
# sample _PATIENT OS OS.time
#TCGA-AB-2802-03 TCGA-AB-2802-03 TCGA-AB-2802 1 1.000000
#TCGA-AB-2803-03 TCGA-AB-2803-03 TCGA-AB-2803 1 2.169863
#TCGA-AB-2804-03 TCGA-AB-2804-03 TCGA-AB-2804 0 7.005479
#TCGA-AB-2805-03 TCGA-AB-2805-03 TCGA-AB-2805 1 1.580822
survdata <- org_surv[,c(3,4)]
head(survdata)
# OS OS.time
#TCGA-AB-2802-03 1 1.0000000
#TCGA-AB-2803-03 1 2.1698630
#TCGA-AB-2804-03 0 7.0054795
#TCGA-AB-2805-03 1 1.5808219
#TCGA-AB-2806-03 1 2.5890411
#TCGA-AB-2807-03 1 0.4958904
colnames(survdata) <- c('event', 'time')
kp <- survdata$time > 0
table(kp)
#kp
#FALSE TRUE
# 13 173
head(survdata)
# event time
#TCGA-AB-2802-03 1 1.0000000
#TCGA-AB-2803-03 1 2.1698630
#TCGA-AB-2804-03 0 7.0054795
#TCGA-AB-2805-03 1 1.5808219
#TCGA-AB-2806-03 1 2.5890411
#TCGA-AB-2807-03 1 0.4958904
survdata = survdata[kp,]
survdata = survdata[substring(colnames(dat),1,15),]
survdata = na.omit(survdata)
dim(survdata)
#[1] 130 2
save( survdata,file = 'output/rdata/0.survival.Rdata')
因为有na值,所以重新再筛选一下表达矩阵,过滤样本:
代码语言:javascript复制if(F){ #如果没有na可以跳过
load( file = 'output/rdata/0.lncRNA_target_expression_data.Rdata') #lncRNA和代谢基因的表达矩阵,也就是我们要检查的基因,前几篇有讲
load( file = 'output/rdata/0.expr.all.Rdata')
load( file = 'output/rdata/0.survival.Rdata')
lnc_dat[1:4, 1:4]
# TCGA-AB-2949-03 TCGA-AB-2851-03 TCGA-AB-2822-03 TCGA-AB-2939-03
#ZSWIM8-AS1 15 14 21 11
#ZSCAN16-AS1 277 163 214 170
#ZRANB2-AS1 36 26 100 15
#ZNRD1-AS1 1880 929 1909 1663
target_dat[1:4, 1:4] #代谢基因
# TCGA-AB-2949-03 TCGA-AB-2851-03 TCGA-AB-2822-03 TCGA-AB-2939-03
#ZACN 104 117 137 93
#XYLT2 919 1305 1525 1214
#XDH 1 1 1 5
#VKORC1L1 1578 954 1818 1473
colnames(lnc_dat) =substring(colnames(lnc_dat),1,15)
colnames(target_dat)=substring(colnames(target_dat),1,15)
lnc_dat <- lnc_dat[,rownames(survdata) ]
target_dat <- target_dat[,rownames(survdata)]
n_t_exp <- n_t_exp[,rownames(survdata)]
gp = c(rep('tumor',ncol(n_t_exp)))
table(gp)
#gp
#tumor
# 130
save(lnc_dat, target_dat,
file = 'output/rdata/0.lncRNA_target_expression_data.Rdata')
save(n_t_exp,gp,file = 'output/rdata/0.expr.all.Rdata')
}
箱线图生存分组差异检查
首先检查数据:
代码语言:javascript复制load('output/rdata/0.expr.all.Rdata')
dim(n_t_exp)
#[1] 38953 130
n_t_exp[1:4,1:4]
# TCGA-AB-2949-03 TCGA-AB-2851-03 TCGA-AB-2822-03 TCGA-AB-2939-03
#ZZZ3 3290 2087 3233 1757
#ZZEF1 7002 7688 6292 7461
#ZYX 10026 27210 13788 16016
#ZYG11B 1810 1718 2133 1939
dat=log(edgeR::cpm(n_t_exp) 1)
load(file = 'output/rdata/0.survival.Rdata')
head(rownames(survdata))
head(colnames(n_t_exp))
identical(rownames(survdata),colnames(n_t_exp))
#[1] TRUE
survdata=survdata[colnames(n_t_exp),]
identical(rownames(survdata),colnames(n_t_exp))
#[1] TRUE
gp=as.character(survdata$event)
table(gp)
#gp
# 0 1
#52 78
作图:
代码语言:javascript复制load(file = 'output/rdata/0.lncRNA_target_expression_data.Rdata')
lnc_dat[1:4, 1:4]
# TCGA-AB-2949-03 TCGA-AB-2851-03 TCGA-AB-2822-03 TCGA-AB-2939-03
#ZSWIM8-AS1 15 14 21 11
#ZSCAN16-AS1 277 163 214 170
#ZRANB2-AS1 36 26 100 15
#ZNRD1-AS1 1880 929 1909 1663
target_dat[1:4, 1:4]
# TCGA-AB-2949-03 TCGA-AB-2851-03 TCGA-AB-2822-03 TCGA-AB-2939-03
#ZACN 104 117 137 93
#XYLT2 919 1305 1525 1214
#XDH 1 1 1 5
#VKORC1L1 1578 954 1818 1473
target_g = rownames(target_dat)
cg_dat = dat[target_g,] #检查代谢基因
library(reshape2)
cgDat=as.data.frame(melt(cg_dat)) #取cpm以后的数据
head(cgDat)
# Var1 Var2 value
#1 ZACN TCGA-AB-2949-03 1.29216243
#2 XYLT2 TCGA-AB-2949-03 3.19188324
#3 XDH TCGA-AB-2949-03 0.02507388
#4 VKORC1L1 TCGA-AB-2949-03 3.71519992
#5 VDAC2 TCGA-AB-2949-03 4.15482557
#6 VDAC1 TCGA-AB-2949-03 4.51626321
colnames(cgDat)=c('gene','sample','expression')
# 分组
cgDat$group = as.character(survdata[cgDat$sample,'event'])
table(cgDat$group )
# 0 1
#41704 62556
length(table(cgDat$gene)) #802个基因
#[1] 802
如果画全部的基因:
代码语言:javascript复制library(ggpubr)
ggboxplot(cgDat, "gene", "expression", color = "group" ) ylab('expression value by log(CPM)')
stat_compare_means(aes(group = group ), label = "p.signif",hide.ns = T)
theme(axis.text.x = element_text(angle = 45, hjust = 1))
pro2='expression-of-choose-genes-boxplot'
ggsave(filename = 'output/plot/step1_cgDif.pdf',
width = 10,height = 5)
直接是看不清的,选50个基因看看:
代码语言:javascript复制# 画图
a = rownames(cg_dat)[1:50]
cgDat1 = cgDat[cgDat$gene%in%a,]
ggboxplot(cgDat1, "gene", "expression", color = "group" ) ylab('expression value by log(CPM)')
stat_compare_means(aes(group = group ), label = "p.signif",hide.ns = T)
theme(axis.text.x = element_text(angle = 45, hjust = 1))
pro2='expression-of-choose-genes-boxplot'
ggsave(filename = 'output/plot/step1_cgDif.pdf',
width = 10,height = 5)
热图生存分组差异检查
检查数据:
代码语言:javascript复制load('output/rdata/0.expr.all.Rdata')
dim(n_t_exp)
#[1] 38953 130
n_t_exp[1:4,1:4]
# TCGA-AB-2949-03 TCGA-AB-2851-03 TCGA-AB-2822-03 TCGA-AB-2939-03
#ZZZ3 3290 2087 3233 1757
#ZZEF1 7002 7688 6292 7461
#ZYX 10026 27210 13788 16016
#ZYG11B 1810 1718 2133 1939
dat=log(edgeR::cpm(n_t_exp) 1)
table(gp)
#gp
#tumor
# 130
load(file = 'output/rdata/0.lncRNA_target_expression_data.Rdata')
lnc_dat[1:4, 1:4]
# TCGA-AB-2949-03 TCGA-AB-2851-03 TCGA-AB-2822-03 TCGA-AB-2939-03
#ZSWIM8-AS1 15 14 21 11
#ZSCAN16-AS1 277 163 214 170
#ZRANB2-AS1 36 26 100 15
#ZNRD1-AS1 1880 929 1909 1663
target_dat[1:4, 1:4]
# TCGA-AB-2949-03 TCGA-AB-2851-03 TCGA-AB-2822-03 TCGA-AB-2939-03
#ZACN 104 117 137 93
#XYLT2 919 1305 1525 1214
#XDH 1 1 1 5
#VKORC1L1 1578 954 1818 1473
identical(colnames(lnc_dat), colnames(target_dat))
#[1] TRUE
lnc_g = rownames(lnc_dat)
prg_g = rownames(target_dat)
head(prg_g)
#[1] "ZACN" "XYLT2" "XDH" "VKORC1L1" "VDAC2" "VDAC1"
cg_dat = dat[prg_g,]
load(file = 'output/rdata/0.survival.Rdata')
identical(rownames(survdata),colnames(n_t_exp))
#[1] TRUE
survdata=survdata[colnames(n_t_exp),]
identical(rownames(survdata),colnames(n_t_exp))
#[1] TRUE
#把分组调整好,前为0后为1,画热图才能看到差异
library(tidyverse)
survdata = arrange(survdata,event)
cg_dat = cg_dat[,rownames(survdata)]
identical(rownames(survdata),colnames(cg_dat))
gp=as.character(survdata$event)
table(gp)
#gp
# 0 1
#52 78
作图:
代码语言:javascript复制library(pheatmap)
ac=data.frame(group=gp)
rownames(ac)=colnames(cg_dat)
pheatmap(cg_dat,scale = 'row',show_colnames = F,show_rownames = F,
annotation_col = ac,cluster_cols = F,
breaks = seq(-2,2,length.out = 100))
pheatmap(cg_dat,scale = 'row',
show_colnames = F,show_rownames = F,
annotation_col = ac,cluster_cols = F,
breaks = seq(-2,2,length.out = 100),
filename = 'output/plot/step1.choose-genes-heatmap-diff1.png')
# 因为同时设置了 `scale` and `breaks`,所以下面的scale并不是从-2到2而是-4到4
dev.off()
可以看到直接画802个基因的热图是不会有显著差异的,我们后续拿做了差异分析以后的代谢基因再来看。