R建立一个临床预测分类

2021-01-14 11:29:31 浏览数 (1)

文章目录

  • 网页服务
  • 确定研究目标
  • 数据可视化
  • 预处理
    • 数据值化
    • 缺失值处理
  • 无量纲化
  • 特征选择lasso
  • 模型
    • 方法1:生存模型
    • 方法2:logistics
      • 结果

网页服务

  • 网页服务地址

确定研究目标

  • 研究目标Y:手术后是否复
  • 特征因素X:40个因素特征
  • 模型方法:属于分类,生存模型。
  • 编程语言:R语言

数据可视化

  • 1.散点图:
  • 实现方法
代码语言:javascript复制
# 散点图绘制
ggplot(input_data)   
  # 绘制散点图: 横坐标x为LA, 纵坐标y为LV, 点的颜色通过Age列区分,alpha透明度,size点大小,shape形状(实心正方形),stroke点边框的宽度
  geom_point(aes(x = LA, y = LV, colour = Age), alpha=0.7, size=1.0, shape=15, stroke=1)  
  # 添加拟合线
  geom_smooth(aes(x = LA, y = LV), method = 'glm')  
  # 添加水平线
  geom_hline(yintercept = 0, size = 1, linetype = "dotted",color = "black")  
  # 添加垂直线
  geom_vline(xintercept = 3, size = 1, linetype = "dotted",color = "black")  
  # 添加坐标轴与图像标题
  labs(title = "point plot",x="Carat",y="Price") 
  # 调整坐标轴的显示范围
  coord_cartesian(xlim = c(2,7),ylim = c(2,7) )  
  # 更换主题
  theme_linedraw()
  • 2.直方图
代码语言:javascript复制
# 2. 画柱状图
hist_outlier <- function (x) {
  x_na <- na.omit(x)
  hist(x_na, main=" ",xlab=deparse(substitute(x)))
  title(paste("Histogram of ",deparse(substitute(x)),sep = ""))
  abline(v=c(mean(x_na) 3*sd(x_na),mean(x_na)-3*sd(x_na)),lty=2,col='red')
  abline(v=mean(x_na),lty=1,col="red")
} 
hist_outlier(log2(input_data$Ap))
    1. 瓦片图相关系数
代码语言:javascript复制
library(corrplot)
#计算数据集的相关系数矩阵并可视化
input_data <- na.omit(input_data)
clomns = c("Age","history","Laa","PAI","AF","Preanticoagulance","NOACDischarged","time","recurrence")
input_data = input_data[clomns]
mycor = round(cor(input_data),2)
corrplot(mycor, tl.col = "black")
  • 生存曲线
代码语言:javascript复制
fit1 <- survfit(survobj~Gender,data=data)
plot(fit1, xlab="Survival Time in months",
     ylab="% Surviving", yscale=100,col=c("red","blue"),
     main="Survival Distributions by Gender")
legend("topright", title="Gender",c("Male", "Female"),
       fill=c("red", "blue")) # 绘图

预处理

数据值化

缺失值处理

  • 方法为:多重插补
代码语言:javascript复制
rm(list = ls())
library(VIM)
library(naniar)
library(ggplot2)
library(mice)

# read data
data_exercise <- read.csv('./data/init_data.csv')
data <- data_exercise
summary(data)

varlist <- c("ID", "B1", "B2", "B3", "B4", 
             "C1", "C2", "C3", "C4", "C5", "C6_trans", 
             "Time_death", "Status_death", "haz_os")

clomns <- colnames(data)
# create a new dataset for imputation
data.impu = data[clomns]


## categorical variable as factor
data.impu$LA <- factor(data.impu$LA)
data.impu$LV <- factor(data.impu$LV)
data.impu$EF <- factor(data.impu$EF)
data.impu$BMI <- factor(data.impu$BMI)
data.impu$BNP <- factor(data.impu$BNP)
data.impu$Ccr <- factor(data.impu$Ccr)
data.impu$UA <- factor(data.impu$UA)
data.impu$Laa <- factor(data.impu$Laa)
data.impu$Va <- factor(data.impu$Va)
data.impu$LaaVa <- factor(data.impu$LaaVa)


# see all the default settings for imputation
impu_default <- mice(data.impu, maxit = 0)
summary(impu_default)

# see the predictor structure
pred <- quickpred(data.impu, exclude = c("NO","recurrence","DM","Ablation","Preantiplatelet","DM","HP","AF","Gender","Lmpv"))
pred

# imputation method
meth <- impu_default$meth
meth    # imputation method can be changed manually

# imputation
# single imputation (m=1)
imputation <- mice(data.impu, maxit = 25, m = 1, seed = 1234, pred = pred, meth = meth, print = TRUE)
data_single <- mice::complete(imputation, 1)
nrow(data_single)
summary(data_single)
write.csv(data_single,file="./data/b.csv",quote=F,row.names = F)

无量纲化

  • 即特征的规格不一样,不能够放在一起比较。
  • 比如血压值,与身高、体重 量纲不同,不能直接比较输入模型。
  • 方法:标准化

特征选择lasso

  • 方法:lasso
代码语言:javascript复制
rm(list = ls())

# install.packages('glmnet')
# install.packages('MASS')
# install.packages('survival')
# install.packages('rms')
library(glmnet)
library(MASS)
library(survival)
library(rms)

# 0. 读取数据
init_data <- read.csv('./data/normal.csv')
noraml_data <- init_data
## 0.1空缺值处理
data <- na.omit(noraml_data)
summary(data)
clomns <- colnames(data)
class(clomns)

xclomns <- c("Age","history","LA","LV","EF","Hight","Weight","BMI","BNP_log","BNP","Ccr_log","Ccr","UA","Laa","Va","LaaVa","Ap_log","Ap","Trans","Lupv","Llpv","Rupv","Rlpv","Rmpv","Lmpv","Trunk",
             "PAI","IAB","Ptfv1","Gender","AF","CHA2DS2VASc","HasBled","NYHA","Stoke","TAI","CAD","PCI","CABG","HP","DM","HCM","Lipid","PreAAD","Preanticoagulance","Preantiplatelet","Smoke","Drink","PM","Ablation",
             "AADDischarged","NOACDischarged","PreAmiodarone","time","Age_cls")
Yclomns <- c("recurrence")

# 1.logistic 回归  逐步回归
## 创建公式
Formula <- formula(paste(paste(Yclomns,"~", collapse=" "), 
                         paste(xclomns, collapse="   ")))
formula

## 1.2给数据味值
model.full <- glm(Formula, data=data,family=binomial)
summary(model.full)
## 1.3逐步选择变量
model.step <- stepAIC(model.full, direction="both")
summary(model.step)


# 2.lasso 
# 2.1将数据转化为矩阵
tmp.y <- data$recurrence
tmp.y
tmp.x <- model.matrix(~.,data[xclomns])
tmp.x

##2.2模型味数据
model.lasso <-  glmnet(tmp.x, tmp.y, family="binomial", nlambda=50, alpha=1, standardize=TRUE)

plot(model.lasso,xvar="lambda",label=TRUE)

##2.3通过交叉验证找到最佳模型
cv.model <- cv.glmnet(tmp.x, tmp.y, family="binomial", nlambda=50, alpha=1, standardize=TRUE)
plot(cv.model)
cv.model$lambda.min
coef(cv.model, s=cv.model$lambda.min) 

## 2.4增加lambda以进一步收缩
cv.model$lambda.1se
coef(cv.model, s=cv.model$lambda.1se) 

## 2.5如果最终的变量数为5个
model.lasso  #选择变量的依据
coef(model.lasso , s=model.lasso$lambda[8]) 
coef(model.lasso , s=model.lasso$lambda[9]) 
coef(model.lasso , s=0.024970) 
  • 特征选择结果:“Age”,“history”,“Laa”,“PAI”,“AF”,“time”

模型

方法1:生存模型

代码语言:javascript复制
rm(list = ls())
library(car)
library(ggplot2)
library(glmnet)
library(MASS)
library(survival)
library(rms)
library(survminer)
library(ggridges)
library(pROC)
library(plotROC)
library(riskRegression)
library(glmnet)
library(MASS)
library(survival)
library(rms)

# 0. 读取数据
init_data <- read.csv('./data/normal.csv')
noraml_data <- init_data
## 0.1空缺值处理
data <- na.omit(noraml_data)
summary(data)
clomns <- colnames(data)
class(clomns)

# 1. 数据集拆分
set.seed(1234)
prop_train <- 0.8    # proportion of training data
train <- sample(nrow(data), nrow(data) * prop_train)
data.train <- data[train, ]
data.test <- data[-train, ]
nrow(data.train)
nrow(data.test)

# 不同性别的生存率差异
survobj <- with(data,Surv(time,recurrence))
fit0 <- survfit(survobj~1, data=data)
summary(fit0)
plot(fit0, xlab="Survival Time in months",  
     ylab="% Surviving", yscale=100,   
     main="Survival Distribution (Overall)") # 绘图横坐标代表天数,纵坐标代表生存率;实线是估计的生存率,虚线代表的是估计生存率的95%置信区间。

# 比较不同性别的生存差异
fit1 <- survfit(survobj~Gender,data=data)
plot(fit1, xlab="Survival Time in months",
     ylab="% Surviving", yscale=100,col=c("red","blue"),
     main="Survival Distributions by Gender")
legend("topright", title="Gender",c("Male", "Female"),
       fill=c("red", "blue")) # 绘图

# 比较是否抽烟生存差异
fit1 <- survfit(survobj~Smoke,data=data)
plot(fit1, xlab="Survival Time in months",
     ylab="% Surviving", yscale=100,col=c("red","blue"),
     main="Survival Distributions by Smoke")
legend("topright", title="Gender",c("Yes", "No"),
       fill=c("red", "blue")) # 绘图

# 2. cox建立模型

xclomns <- c("Age","history","Laa","PAI","AF","Preanticoagulance","NOACDischarged")
Yclomns = "Surv(time,recurrence)"

# 2.1. 编辑公式
Formula <- formula(paste(paste(Yclomns,"~", collapse=" "),
                         paste(xclomns, collapse="   ")))
formula

# fit a model with all selected varaibles
model.train <- coxph(Formula, data=data.train)
summary(model.train)
ggsurvplot(survfit(model.train), data = data.train, palette = "#2E9FDF", 
           ggtheme = theme_minimal(), legend = "none")

# 2.2预测概率值
## 2.2 训练集
data.train$lp <- predict(model.train, data = data.train, type="lp")
data.train$recurrence_pred <- predict(model.train, data = data.train, type="survival")
Hazard <- basehaz(model.train,centered=F)



S0_12 <- min(exp(-Hazard[Hazard[,2] <= 12,])[,1])

# 5 years (60 months)
S0_60 <- min(exp(-Hazard[Hazard[,2] <= 60,])[,1])

方法2:logistics

代码语言:javascript复制
 3. logistic
# xclomns <- c("Age","history","Laa","PAI","AF","Preanticoagulance","NOACDischarged","time")
xclomns <- c("Age","history","Laa","PAI","AF","time")
Yclomns <- c("recurrence")
# 3.1 建立公式
Formula <- formula(paste(paste(Yclomns,"~", collapse=" "), 
                         paste(xclomns, collapse="   ")))
formula

# 3.2 模型味数据
model.train <- glm(Formula, data=data.train,family=binomial)
model.train <- stepAIC(model.train, direction="both")
summary(model.train)

# data.train$lp_prediction <-  predict(model.train, data.train, type="link")
data.train$p_prediction <- predict(model.train, data.train, type="response")
data.train$recurrence_prediction<-ifelse(data.train$p_prediction>=0.5,1,0)

# 4.测试集预测
# data.test$lp_prediction <- predict(model.train, data.test, type="link")
data.test$p_prediction <- predict(model.train, data.test, type="response")
data.test$recurrence_prediction<-ifelse(data.test$p_prediction>=0.5,1,0)

## 混淆矩阵
f<-table(data.test$recurrence,data.test$recurrence_prediction)
f
## 准确率
recurrence_Accuracy = (165 13)/(165 13 5 22)
recurrence_Accuracy


# 5. ROC
## 5.1 训练集
roc.train <- roc(data.train$recurrence, data.train$p_prediction)
plot(roc.train)

## 5.2 测试集
roc.test <- roc(data.test$recurrence, data.test$p_prediction)
plot(roc.test)


# 6. 校准率
# 6.1 训练集校准
val.prob(p = fitted(model.train), y = data.train$recurrence,logistic.cal=F)
val.prob(p = fitted(model.train), y = data.train$recurrence,logistic.cal=F,statloc=F)

## 6.2 测试集校准
val.prob(p = data.test$p_prediction, y = data.test$recurrence,logistic.cal=F)
  • roc

结果

0 人点赞