本文目的
(1)绘制生存分析图 (2)临床参数相关分析
- 加载所必须的包
# ===============================================================
#load package
# ===============================================================
library(TCGAbiolinks)
library(SummarizedExperiment)
setwd('D:\train\single_gene')
library(dplyr)
library(survival)
library(survminer)
rm(list=ls())
- 通过TCGAbiolinks下载TCGA生存相关信息(将其中的OS和OStime留作后用)
# ===============================================================
#download survival data
# ===============================================================
query <- GDCquery(project = "TCGA-KIRC",
data.category = "Clinical",
file.type = "xml")
GDCdownload(query)
clinical <- GDCprepare_clinic(query, clinical.info = "patient")
clinical_trait <- clinical %>%
dplyr::select(bcr_patient_barcode,gender,vital_status,
days_to_death,days_to_last_followup,race_list,
person_neoplasm_cancer_status,age_at_initial_pathologic_diagnosis,
laterality,neoplasm_histologic_grade,stage_event_pathologic_stage,
stage_event_tnm_categories ) %>%
distinct( bcr_patient_barcode, .keep_all = TRUE)
#整理死亡患者的临床信息
dead_patient <- clinical_trait %>%
dplyr::filter(vital_status == 'Dead') %>%
dplyr::select(-days_to_last_followup) %>%
reshape::rename(c(bcr_patient_barcode = 'Barcode',
gender = 'Gender',
vital_status = 'OS',
days_to_death='OS.Time',
race_list = 'Race',
person_neoplasm_cancer_status='cancer_status',
age_at_initial_pathologic_diagnosis = 'Age',
neoplasm_histologic_grade = 'Grade',
stage_event_pathologic_stage = 'Stage',
stage_event_tnm_categories = 'TNM' )) %>%
mutate(OS=ifelse(OS=='Dead',1,0))%>%
mutate(OS.Time=OS.Time/365)
#整理生存患者的临床信息
alive_patient <- clinical_trait %>%
dplyr::filter(vital_status == 'Alive') %>%
dplyr::select(-days_to_death) %>%
reshape::rename(c(bcr_patient_barcode = 'Barcode',
gender = 'Gender',
vital_status = 'OS',
days_to_last_followup='OS.Time',
race_list = 'Race',
person_neoplasm_cancer_status='cancer_status',
age_at_initial_pathologic_diagnosis = 'Age',
neoplasm_histologic_grade = 'Grade',
stage_event_pathologic_stage = 'Stage',
stage_event_tnm_categories = 'TNM' )) %>%
mutate(OS=ifelse(OS=='Dead',1,0))%>%
mutate(OS.Time=OS.Time/365)
#合并两类患者,得到肾透明细胞癌的临床信息
survival_data <- rbind(dead_patient,alive_patient)
write.csv(survival_data , file = 'KIRC_survival.csv')
survival_data <- subset(survival_data,select=c(Barcode,OS ,OS.Time))
- 读取表达量数据(前文所得)
# ===============================================================
#load expression data
# ===============================================================
data <- read.csv('KIRC_mRNA_exprSet.csv',header = T,row.names = 1)
marker <- 'ERBB2'
data1 <- data[which(rownames(data) %in% marker),]
data1 <- as.data.frame(t(data1))
row.names(data1) <- substr(row.names(data1),start = 1,stop = 12)
row.names(data1) <- chartr(old='.',new = '-',x=row.names(data1))
data1$Barcode <- row.names(data1)
data1 <- subset(data1,select= c(ERBB2,Barcode))
dt <- merge(data1,survival_data ,by='Barcode')
- 将患者根据ERBB2表达量分为高低两组(高于中位值和不高于中位值),通过KM法绘制生存曲线
# ===============================================================
#plot survival figure
# ===============================================================
dt_OS <- dt %>%
dplyr::select(OS,OS.Time,ERBB2)
dt_OS <- na.omit(dt_OS)
dt_OS['ERBB2'] <- ifelse(dt_OS['ERBB2'] > median(dt_OS[,'ERBB2']), 'high', 'low' )
fit <- survfit(Surv(OS.Time,OS)~ERBB2,data=dt_OS)
getwd()
pdf("ERBB2_os.pdf",height = 8,width = 8)
ggpar( ggsurvplot(
fit, data = dt_OS,
pval = TRUE,
conf.int = TRUE,
legend.title = "ERBB2",
xlim = c(0,5),
xlab = "Time in years",
break.time.by = 1,
pval.size = 8,
risk.table = "absolute",
risk.table.y.text.col = T,
risk.table.y.text = FALSE,
risk.table.fontsize = 5,
legend.labs = c("high", "Low"),
palette = c("#E41A1C","#377EB8")),
font.y = c(16, "bold"),
font.x = c(16, "bold"),
legend = "top",
font.legend = c(16, "bold"))
dev.off()
- 临床参数相关分析
# ===============================================================
#clinical parameters
# ===============================================================
survival_data <- rbind(dead_patient,alive_patient)
data <- merge(data1,survival_data,by='Barcode')
library(stringr)
data$Te2 <- str_extract(data$TNM, "T\d")
data$Te1 <- str_extract(data$TNM, "T\d[a-z] ")
data$Te1 <- ifelse(is.na(data$Te1),data$Te2,data$Te1)
data$N <- str_extract(data$TNM, "N\d")
data$N <- str_extract(data$N, "\d")
data$M <- str_extract(data$TNM, "M\d")
data$M <- str_extract(data$M, "\d")
data$Age <- ifelse(data$Age > 60, 'old','yong')
data$stage1 <- str_trim(str_extract(data$Stage, "\s[H-Z] "),
side = c("both", "left", "right"))
data$stage1
data$stage2 <- str_extract(data$Stage, "\s[A-Z] ")
data$stage2
data$expression <- data$ERBB2
#Age
data1 <- na.omit(data)
t.test(expression~Age,data = data1)
table(data$Age)
mean(na.omit(data[data$Age=='old','expression']))
sd(na.omit(data[data$Age=='old','expression']))
mean(na.omit(data[data$Age=='yong','expression']))
sd(na.omit(data[data$Age=='yong','expression']))
#Gender
t.test(expression~Gender,data = data)
mean(na.omit(data[data$Gender=='FEMALE','expression']))
sd(na.omit(data[data$Gender =='FEMALE','expression']))
mean(na.omit(data[data$Gender=='MALE','expression']))
sd(na.omit(data[data$Gender =='MALE','expression']))
#Tumor location
data1 <- subset(data,data$Location == 'Central Lung' | data$Location=='Peripheral Lung')
table(data1$Location)
t.test(expression~Location,data = data1)
mean(na.omit(data[data$Location=='Central Lung','expression']))
sd(na.omit(data[data$Location =='Central Lung','expression']))
mean(na.omit(data[data$Location=='Peripheral Lung','expression']))
sd(na.omit(data[data$Location =='Peripheral Lung','expression']))
#stage
table(data$stage1)
table(data$stage1)
data$stage1[data$stage1 == "I"] <- 'Stage I-II'
data$stage1[data$stage1 == "II"] <- 'Stage I-II'
data$stage1[data$stage1 == "III"] <- 'Stage III-IV'
data$stage1[data$stage1 == "IV"] <- 'Stage III-IV'
table(data$stage1)
data1 <- subset(data,data$stage1=='Stage I-II' | data$stage1 =='Stage III-IV')
table(data$stage1)
t.test(expression~stage1,data = data1)
mean( na.omit(data[data$stage1=='Stage I-II','expression']))
sd( na.omit(data[data$stage1 =='Stage I-II','expression']))
mean( na.omit(data[data$stage1=='Stage III-IV','expression']))
sd( na.omit(data[data$stage1 =='Stage III-IV','expression']))
#T
table(data$Te1)
table(data$Te2)
data$Te2[data$Te2 == "T1"] <- 'T1-T2'
data$Te2[data$Te2 == "T2"] <- 'T1-T2'
data$Te2[data$Te2 == "T3"] <- 'T3-T4'
data$Te2[data$Te2 == "T4"] <- 'T3-T4'
table(data$Te2)
t.test(expression~Te2,data = data)
mean(na.omit(data[data$Te2=='T1-T2','expression']))
sd( na.omit(data[data$Te2 =='T1-T2','expression']))
mean(na.omit(data[data$Te2=='T3-T4','expression']))
sd( na.omit(data[data$Te2 =='T3-T4','expression']))
#N
table(data$N)
data$N[data$N == "0"] <- 'NO'
data$N[data$N == "1"] <- 'Yes'
data$N[data$N == "2"] <- 'Yes'
data$N[data$N == "3"] <- 'Yes'
table(data$N)
t.test(expression~N,data = data)
mean(na.omit(data[data$N=='NO','expression']))
sd( na.omit(data[data$N =='NO','expression']))
mean(na.omit(data[data$N=='Yes','expression']))
sd( na.omit(data[data$N =='Yes','expression']))
#M
table(data$M)
data$M[data$M== "0"] <- 'NO'
data$M[data$M == "1"] <- 'Yes'
table(data$M)
t.test(expression~M,data = data)
mean(na.omit(data[data$M =='NO','expression']))
sd(na.omit(data[data$M =='NO','expression']))
mean(na.omit(data[data$M =='Yes','expression']))
sd(na.omit(data[data$M =='Yes','expression']))
基于上述的计算过程,我们得到这张表格所需的所有信息: