基因集的生存分组差异检查

2022-10-31 17:41:49 浏览数 (1)

之前的推文中我们介绍了如何缩小基因集范围,拿到表达矩阵,这时想要初步查看所挑选基因集在分组中是否有差异,我们用箱线图和热图尝试一下。

处理表达矩阵和生存数据

我们可以接着之前TCGA的XENA的转录组测序表达量矩阵预处理中,id转换之后的矩阵继续进行处理:

代码语言:javascript复制
n_t_exp = dat
dim(n_t_exp)
#[1] 38953   151
n_t_exp[1:4,1:4]
#       TCGA-AB-2949-03B TCGA-AB-2918-03A TCGA-AB-2943-03A TCGA-AB-2851-03A
#ZZZ3               3290             2830             4059             2087
#ZZEF1              7002             7297             6663             7688
#ZYX               10026            15461            10270            27210
#ZYG11B             1810             1645             1919             1718

tmp = as.numeric( substring(colnames(n_t_exp),14,15)) < 10 #用TCGA样本名确定tumor/normal
table(tmp) 
#tmp
#TRUE 
# 151 
gp = ifelse( tmp ,'tumor','normal')
table(gp)
#gp
#tumor 
#  151 

#此时TCGA中LAML的数据是没有normal样本的,不过我们还是把去除normal样本的代码走一遍
#去除normal样本,后面做的是生存结局的分组,而不是tumor-normal的
dat <- dat[,tmp]
dim(dat)
#[1] 38953   151
gp = c(rep('tumor',ncol(dat)))
table(gp)
#gp
#tumor 
#  151 
n_t_exp = dat
colnames(n_t_exp) = substring(colnames(n_t_exp),1,15)
save(n_t_exp,gp,file = 'output/rdata/0.expr.all.Rdata')

然后处理临床信息:

代码语言:javascript复制
#读入临床信息
org_surv <- fread('input/AML_survival.txt',data.table = F)
org_surv[1:4,1:4]
#           sample     _PATIENT OS OS.time
#1 TCGA-AB-2802-03 TCGA-AB-2802  1     365
#2 TCGA-AB-2803-03 TCGA-AB-2803  1     792
#3 TCGA-AB-2804-03 TCGA-AB-2804  0    2557
#4 TCGA-AB-2805-03 TCGA-AB-2805  1     577
colnames(org_surv)
# [1] "sample"    "_PATIENT"  "OS"        "OS.time"   "DSS"       "DSS.time"  "DFI"       "DFI.time"  "PFI"      
#[10] "PFI.time"  "Redaction"

mean(org_surv$OS.time,na.rm = T)
#[1] 560.8441
org_surv$OS.time <- org_surv$OS.time/365
mean(org_surv$OS.time,na.rm = T)
#[1] 1.536559
org_surv[1:4,1:4]
#           sample     _PATIENT OS  OS.time
#1 TCGA-AB-2802-03 TCGA-AB-2802  1 1.000000
#2 TCGA-AB-2803-03 TCGA-AB-2803  1 2.169863
#3 TCGA-AB-2804-03 TCGA-AB-2804  0 7.005479
#4 TCGA-AB-2805-03 TCGA-AB-2805  1 1.580822

rownames(org_surv) <- org_surv$sample
org_surv[1:4,1:4]
#                         sample     _PATIENT OS  OS.time
#TCGA-AB-2802-03 TCGA-AB-2802-03 TCGA-AB-2802  1 1.000000
#TCGA-AB-2803-03 TCGA-AB-2803-03 TCGA-AB-2803  1 2.169863
#TCGA-AB-2804-03 TCGA-AB-2804-03 TCGA-AB-2804  0 7.005479
#TCGA-AB-2805-03 TCGA-AB-2805-03 TCGA-AB-2805  1 1.580822

survdata <- org_surv[,c(3,4)]
head(survdata)
#                OS   OS.time
#TCGA-AB-2802-03  1 1.0000000
#TCGA-AB-2803-03  1 2.1698630
#TCGA-AB-2804-03  0 7.0054795
#TCGA-AB-2805-03  1 1.5808219
#TCGA-AB-2806-03  1 2.5890411
#TCGA-AB-2807-03  1 0.4958904

colnames(survdata) <- c('event', 'time')
kp <- survdata$time > 0
table(kp)
#kp
#FALSE  TRUE 
#   13   173 

head(survdata)
#                event      time
#TCGA-AB-2802-03     1 1.0000000
#TCGA-AB-2803-03     1 2.1698630
#TCGA-AB-2804-03     0 7.0054795
#TCGA-AB-2805-03     1 1.5808219
#TCGA-AB-2806-03     1 2.5890411
#TCGA-AB-2807-03     1 0.4958904

survdata = survdata[kp,]
survdata = survdata[substring(colnames(dat),1,15),]
survdata = na.omit(survdata)
dim(survdata) 
#[1] 130   2

save( survdata,file = 'output/rdata/0.survival.Rdata')

因为有na值,所以重新再筛选一下表达矩阵,过滤样本:

代码语言:javascript复制
if(F){  #如果没有na可以跳过
    
  load( file = 'output/rdata/0.lncRNA_target_expression_data.Rdata') #lncRNA和代谢基因的表达矩阵,也就是我们要检查的基因,前几篇有讲
  load( file = 'output/rdata/0.expr.all.Rdata')
  load( file = 'output/rdata/0.survival.Rdata')
  lnc_dat[1:4, 1:4]
  #            TCGA-AB-2949-03 TCGA-AB-2851-03 TCGA-AB-2822-03 TCGA-AB-2939-03
  #ZSWIM8-AS1               15              14              21              11
  #ZSCAN16-AS1             277             163             214             170
  #ZRANB2-AS1               36              26             100              15
  #ZNRD1-AS1              1880             929            1909            1663

  target_dat[1:4, 1:4] #代谢基因
  #         TCGA-AB-2949-03 TCGA-AB-2851-03 TCGA-AB-2822-03 TCGA-AB-2939-03
  #ZACN                 104             117             137              93
  #XYLT2                919            1305            1525            1214
  #XDH                    1               1               1               5
  #VKORC1L1            1578             954            1818            1473
    
  colnames(lnc_dat) =substring(colnames(lnc_dat),1,15)
  colnames(target_dat)=substring(colnames(target_dat),1,15)
  
  lnc_dat <- lnc_dat[,rownames(survdata) ]
  target_dat <- target_dat[,rownames(survdata)]
  n_t_exp <- n_t_exp[,rownames(survdata)]
  
  gp = c(rep('tumor',ncol(n_t_exp)))
  table(gp)
  #gp
  #tumor 
  #  130
  save(lnc_dat, target_dat,
       file = 'output/rdata/0.lncRNA_target_expression_data.Rdata')
  
  save(n_t_exp,gp,file = 'output/rdata/0.expr.all.Rdata')
  
}
箱线图生存分组差异检查

首先检查数据:

代码语言:javascript复制
load('output/rdata/0.expr.all.Rdata')
dim(n_t_exp)
#[1] 38953   130
n_t_exp[1:4,1:4] 
#       TCGA-AB-2949-03 TCGA-AB-2851-03 TCGA-AB-2822-03 TCGA-AB-2939-03
#ZZZ3              3290            2087            3233            1757
#ZZEF1             7002            7688            6292            7461
#ZYX              10026           27210           13788           16016
#ZYG11B            1810            1718            2133            1939
dat=log(edgeR::cpm(n_t_exp) 1)

load(file = 'output/rdata/0.survival.Rdata')
head(rownames(survdata))
head(colnames(n_t_exp))
identical(rownames(survdata),colnames(n_t_exp))
#[1] TRUE

survdata=survdata[colnames(n_t_exp),]
identical(rownames(survdata),colnames(n_t_exp))
#[1] TRUE

gp=as.character(survdata$event)
table(gp)  
#gp
# 0  1 
#52 78

作图:

代码语言:javascript复制
load(file = 'output/rdata/0.lncRNA_target_expression_data.Rdata')
lnc_dat[1:4, 1:4]
#            TCGA-AB-2949-03 TCGA-AB-2851-03 TCGA-AB-2822-03 TCGA-AB-2939-03
#ZSWIM8-AS1               15              14              21              11
#ZSCAN16-AS1             277             163             214             170
#ZRANB2-AS1               36              26             100              15
#ZNRD1-AS1              1880             929            1909            1663

target_dat[1:4, 1:4]
#         TCGA-AB-2949-03 TCGA-AB-2851-03 TCGA-AB-2822-03 TCGA-AB-2939-03
#ZACN                 104             117             137              93
#XYLT2                919            1305            1525            1214
#XDH                    1               1               1               5
#VKORC1L1            1578             954            1818            1473
target_g = rownames(target_dat)
cg_dat = dat[target_g,] #检查代谢基因
library(reshape2)
cgDat=as.data.frame(melt(cg_dat))  #取cpm以后的数据
head(cgDat)
#      Var1            Var2      value
#1     ZACN TCGA-AB-2949-03 1.29216243
#2    XYLT2 TCGA-AB-2949-03 3.19188324
#3      XDH TCGA-AB-2949-03 0.02507388
#4 VKORC1L1 TCGA-AB-2949-03 3.71519992
#5    VDAC2 TCGA-AB-2949-03 4.15482557
#6    VDAC1 TCGA-AB-2949-03 4.51626321
colnames(cgDat)=c('gene','sample','expression')

# 分组 
cgDat$group = as.character(survdata[cgDat$sample,'event'])
table(cgDat$group )
#    0     1 
#41704 62556 

length(table(cgDat$gene)) #802个基因
#[1] 802

如果画全部的基因:

代码语言:javascript复制
library(ggpubr)
ggboxplot(cgDat, "gene", "expression", color = "group" )   ylab('expression value by log(CPM)')   
  stat_compare_means(aes(group =  group ),  label = "p.signif",hide.ns = T)  
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  
pro2='expression-of-choose-genes-boxplot'
ggsave(filename = 'output/plot/step1_cgDif.pdf',
       width = 10,height = 5)

直接是看不清的,选50个基因看看:

代码语言:javascript复制
# 画图
a = rownames(cg_dat)[1:50]
cgDat1 = cgDat[cgDat$gene%in%a,]

ggboxplot(cgDat1, "gene", "expression", color = "group" )   ylab('expression value by log(CPM)')   
  stat_compare_means(aes(group =  group ),  label = "p.signif",hide.ns = T)  
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  
pro2='expression-of-choose-genes-boxplot'
ggsave(filename = 'output/plot/step1_cgDif.pdf',
       width = 10,height = 5)
热图生存分组差异检查

检查数据:

代码语言:javascript复制
load('output/rdata/0.expr.all.Rdata')
dim(n_t_exp)
#[1] 38953   130
n_t_exp[1:4,1:4] 
#       TCGA-AB-2949-03 TCGA-AB-2851-03 TCGA-AB-2822-03 TCGA-AB-2939-03
#ZZZ3              3290            2087            3233            1757
#ZZEF1             7002            7688            6292            7461
#ZYX              10026           27210           13788           16016
#ZYG11B            1810            1718            2133            1939
dat=log(edgeR::cpm(n_t_exp) 1)
table(gp)
#gp
#tumor 
#  130

load(file = 'output/rdata/0.lncRNA_target_expression_data.Rdata')
lnc_dat[1:4, 1:4]
#            TCGA-AB-2949-03 TCGA-AB-2851-03 TCGA-AB-2822-03 TCGA-AB-2939-03
#ZSWIM8-AS1               15              14              21              11
#ZSCAN16-AS1             277             163             214             170
#ZRANB2-AS1               36              26             100              15
#ZNRD1-AS1              1880             929            1909            1663

target_dat[1:4, 1:4]
#         TCGA-AB-2949-03 TCGA-AB-2851-03 TCGA-AB-2822-03 TCGA-AB-2939-03
#ZACN                 104             117             137              93
#XYLT2                919            1305            1525            1214
#XDH                    1               1               1               5
#VKORC1L1            1578             954            1818            1473

identical(colnames(lnc_dat), colnames(target_dat))
#[1] TRUE
lnc_g = rownames(lnc_dat)
prg_g = rownames(target_dat)
head(prg_g)
#[1] "ZACN"     "XYLT2"    "XDH"      "VKORC1L1" "VDAC2"    "VDAC1"  

cg_dat = dat[prg_g,] 

load(file = 'output/rdata/0.survival.Rdata')
identical(rownames(survdata),colnames(n_t_exp))
#[1] TRUE
survdata=survdata[colnames(n_t_exp),]
identical(rownames(survdata),colnames(n_t_exp))
#[1] TRUE

#把分组调整好,前为0后为1,画热图才能看到差异
library(tidyverse)
survdata = arrange(survdata,event)
cg_dat = cg_dat[,rownames(survdata)]
identical(rownames(survdata),colnames(cg_dat))

gp=as.character(survdata$event)
table(gp)  
#gp
# 0  1 
#52 78

作图:

代码语言:javascript复制
library(pheatmap)
ac=data.frame(group=gp)
rownames(ac)=colnames(cg_dat)

pheatmap(cg_dat,scale = 'row',show_colnames = F,show_rownames = F,
         annotation_col = ac,cluster_cols = F,
         breaks = seq(-2,2,length.out = 100))

pheatmap(cg_dat,scale = 'row',
         show_colnames = F,show_rownames = F,
         annotation_col = ac,cluster_cols = F,
         breaks = seq(-2,2,length.out = 100),
         filename = 'output/plot/step1.choose-genes-heatmap-diff1.png')
# 因为同时设置了 `scale` and `breaks`,所以下面的scale并不是从-2到2而是-4到4
dev.off()

可以看到直接画802个基因的热图是不会有显著差异的,我们后续拿做了差异分析以后的代谢基因再来看。

na

0 人点赞