单基因生信分析流程(3)一文解决生存分析和临床参数相关分析

2019-05-15 10:43:31 浏览数 (1)

本文目的

(1)绘制生存分析图 (2)临床参数相关分析

  • 加载所必须的包
代码语言:javascript复制
# ===============================================================

#load  package

# ===============================================================

library(TCGAbiolinks)
library(SummarizedExperiment)
setwd('D:\train\single_gene')
library(dplyr)
library(survival)
library(survminer)

rm(list=ls())
  • 通过TCGAbiolinks下载TCGA生存相关信息(将其中的OS和OStime留作后用)
代码语言:javascript复制
# ===============================================================

#download survival data

# ===============================================================

query <- GDCquery(project = "TCGA-KIRC", 
                  data.category = "Clinical", 
                  file.type = "xml")
GDCdownload(query)
clinical <- GDCprepare_clinic(query, clinical.info = "patient")



clinical_trait <- clinical  %>%
  dplyr::select(bcr_patient_barcode,gender,vital_status,                            
                days_to_death,days_to_last_followup,race_list,
                person_neoplasm_cancer_status,age_at_initial_pathologic_diagnosis,
                laterality,neoplasm_histologic_grade,stage_event_pathologic_stage,             
                stage_event_tnm_categories  ) %>%
  distinct( bcr_patient_barcode, .keep_all = TRUE)  


#整理死亡患者的临床信息
dead_patient <- clinical_trait  %>%
  dplyr::filter(vital_status == 'Dead') %>%
  dplyr::select(-days_to_last_followup) %>%
  reshape::rename(c(bcr_patient_barcode = 'Barcode',
                    gender = 'Gender',
                    vital_status = 'OS',
                    days_to_death='OS.Time',
                    race_list = 'Race',
                    person_neoplasm_cancer_status='cancer_status',
                    age_at_initial_pathologic_diagnosis = 'Age',
                    neoplasm_histologic_grade = 'Grade',
                    stage_event_pathologic_stage = 'Stage',
                    stage_event_tnm_categories = 'TNM' )) %>%
  mutate(OS=ifelse(OS=='Dead',1,0))%>%
  mutate(OS.Time=OS.Time/365)



#整理生存患者的临床信息
alive_patient <- clinical_trait %>%
  dplyr::filter(vital_status == 'Alive') %>%
  dplyr::select(-days_to_death) %>%
  reshape::rename(c(bcr_patient_barcode = 'Barcode',
                    gender = 'Gender',
                    vital_status = 'OS',
                    days_to_last_followup='OS.Time',
                    race_list = 'Race',
                    person_neoplasm_cancer_status='cancer_status',
                    age_at_initial_pathologic_diagnosis = 'Age',
                    neoplasm_histologic_grade = 'Grade',
                    stage_event_pathologic_stage = 'Stage',
                    stage_event_tnm_categories = 'TNM' )) %>%
  mutate(OS=ifelse(OS=='Dead',1,0))%>%
  mutate(OS.Time=OS.Time/365)

#合并两类患者,得到肾透明细胞癌的临床信息
survival_data <- rbind(dead_patient,alive_patient)

write.csv(survival_data , file = 'KIRC_survival.csv')



survival_data <- subset(survival_data,select=c(Barcode,OS ,OS.Time))
  • 读取表达量数据(前文所得)
代码语言:javascript复制
# ===============================================================

#load expression data

# ===============================================================



data <- read.csv('KIRC_mRNA_exprSet.csv',header = T,row.names = 1)

marker <- 'ERBB2'


data1 <- data[which(rownames(data) %in% marker),]

data1 <- as.data.frame(t(data1))

row.names(data1) <- substr(row.names(data1),start = 1,stop = 12)
row.names(data1) <- chartr(old='.',new = '-',x=row.names(data1))
data1$Barcode <- row.names(data1)


data1 <- subset(data1,select= c(ERBB2,Barcode))


dt <- merge(data1,survival_data ,by='Barcode')
  • 将患者根据ERBB2表达量分为高低两组(高于中位值和不高于中位值),通过KM法绘制生存曲线
代码语言:javascript复制
# ===============================================================
#plot survival figure
# ===============================================================

dt_OS <-  dt %>%
  dplyr::select(OS,OS.Time,ERBB2)
dt_OS <- na.omit(dt_OS)
dt_OS['ERBB2'] <- ifelse(dt_OS['ERBB2'] > median(dt_OS[,'ERBB2']), 'high', 'low' )
fit <- survfit(Surv(OS.Time,OS)~ERBB2,data=dt_OS)
getwd()
pdf("ERBB2_os.pdf",height = 8,width = 8)
ggpar( ggsurvplot(
  fit,   data = dt_OS,   
  pval = TRUE,           
  conf.int = TRUE, 
  legend.title = "ERBB2",
  xlim = c(0,5),  
  xlab = "Time in years",  
  break.time.by = 1,  
  pval.size = 8,
  risk.table = "absolute", 
  risk.table.y.text.col = T,
  risk.table.y.text = FALSE,
  risk.table.fontsize = 5,
  legend.labs =  c("high", "Low"),   
  palette = c("#E41A1C","#377EB8")),
  font.y  = c(16, "bold"), 
  font.x  = c(16, "bold"),
  legend = "top",
  font.legend = c(16, "bold"))
dev.off()
  • 临床参数相关分析
代码语言:javascript复制
# ===============================================================

#clinical parameters

# ===============================================================

survival_data <- rbind(dead_patient,alive_patient)



data <- merge(data1,survival_data,by='Barcode')



library(stringr)


data$Te2  <-   str_extract(data$TNM, "T\d")


data$Te1  <-   str_extract(data$TNM, "T\d[a-z] ")
data$Te1 <- ifelse(is.na(data$Te1),data$Te2,data$Te1)


data$N  <-   str_extract(data$TNM, "N\d")
data$N  <-   str_extract(data$N, "\d")

data$M <-   str_extract(data$TNM, "M\d")
data$M <-   str_extract(data$M, "\d")


data$Age <- ifelse(data$Age > 60, 'old','yong')



data$stage1  <- str_trim(str_extract(data$Stage, "\s[H-Z] "),
                         side = c("both", "left", "right"))






data$stage1 


data$stage2  <- str_extract(data$Stage, "\s[A-Z] ")
data$stage2

data$expression <- data$ERBB2 


#Age
data1 <- na.omit(data)
t.test(expression~Age,data = data1)

table(data$Age)

mean(na.omit(data[data$Age=='old','expression']))

sd(na.omit(data[data$Age=='old','expression']))


mean(na.omit(data[data$Age=='yong','expression']))
sd(na.omit(data[data$Age=='yong','expression']))



#Gender
t.test(expression~Gender,data = data)

mean(na.omit(data[data$Gender=='FEMALE','expression']))
sd(na.omit(data[data$Gender =='FEMALE','expression']))


mean(na.omit(data[data$Gender=='MALE','expression']))
sd(na.omit(data[data$Gender =='MALE','expression']))



#Tumor location


data1 <- subset(data,data$Location == 'Central Lung' | data$Location=='Peripheral Lung')
table(data1$Location)

t.test(expression~Location,data = data1)

mean(na.omit(data[data$Location=='Central Lung','expression']))
sd(na.omit(data[data$Location =='Central Lung','expression']))


mean(na.omit(data[data$Location=='Peripheral Lung','expression']))
sd(na.omit(data[data$Location =='Peripheral Lung','expression']))






#stage
table(data$stage1)

table(data$stage1)
data$stage1[data$stage1  == "I"] <-  'Stage I-II'
data$stage1[data$stage1  == "II"] <-  'Stage I-II'
data$stage1[data$stage1  == "III"] <-  'Stage III-IV'
data$stage1[data$stage1  == "IV"] <-  'Stage III-IV'
table(data$stage1)


data1 <- subset(data,data$stage1=='Stage I-II' | data$stage1 =='Stage III-IV')
table(data$stage1)



t.test(expression~stage1,data = data1)

mean( na.omit(data[data$stage1=='Stage I-II','expression']))
sd( na.omit(data[data$stage1 =='Stage I-II','expression']))


mean( na.omit(data[data$stage1=='Stage III-IV','expression']))
sd( na.omit(data[data$stage1 =='Stage III-IV','expression']))



#T
table(data$Te1)
table(data$Te2)


data$Te2[data$Te2  == "T1"] <-  'T1-T2'
data$Te2[data$Te2  == "T2"] <-  'T1-T2'
data$Te2[data$Te2  == "T3"] <-  'T3-T4'
data$Te2[data$Te2  == "T4"] <-  'T3-T4'
table(data$Te2)



t.test(expression~Te2,data = data)

mean(na.omit(data[data$Te2=='T1-T2','expression']))
sd( na.omit(data[data$Te2 =='T1-T2','expression']))
mean(na.omit(data[data$Te2=='T3-T4','expression']))
sd( na.omit(data[data$Te2 =='T3-T4','expression']))




#N
table(data$N)


data$N[data$N == "0"] <-  'NO'
data$N[data$N  == "1"] <-  'Yes'
data$N[data$N  == "2"] <-  'Yes'
data$N[data$N  == "3"] <-  'Yes'
table(data$N)



t.test(expression~N,data = data)

mean(na.omit(data[data$N=='NO','expression']))
sd( na.omit(data[data$N =='NO','expression']))
mean(na.omit(data[data$N=='Yes','expression']))
sd( na.omit(data[data$N =='Yes','expression']))




#M
table(data$M)


data$M[data$M== "0"] <-  'NO'
data$M[data$M  == "1"] <-  'Yes'

table(data$M)



t.test(expression~M,data = data)

mean(na.omit(data[data$M  =='NO','expression']))
sd(na.omit(data[data$M    =='NO','expression']))
mean(na.omit(data[data$M  =='Yes','expression']))
sd(na.omit(data[data$M    =='Yes','expression']))

基于上述的计算过程,我们得到这张表格所需的所有信息:

0 人点赞