文章目录
- 网页服务
- 确定研究目标
- 数据可视化
- 预处理
- 无量纲化
- 特征选择lasso
- 模型
网页服务
确定研究目标
- 研究目标Y:手术后是否复
- 特征因素X:40个因素特征
- 模型方法:属于分类,生存模型。
- 编程语言:R语言
数据可视化
- 1.散点图:
- 实现方法
代码语言:javascript
复制# 散点图绘制
ggplot(input_data)
# 绘制散点图: 横坐标x为LA, 纵坐标y为LV, 点的颜色通过Age列区分,alpha透明度,size点大小,shape形状(实心正方形),stroke点边框的宽度
geom_point(aes(x = LA, y = LV, colour = Age), alpha=0.7, size=1.0, shape=15, stroke=1)
# 添加拟合线
geom_smooth(aes(x = LA, y = LV), method = 'glm')
# 添加水平线
geom_hline(yintercept = 0, size = 1, linetype = "dotted",color = "black")
# 添加垂直线
geom_vline(xintercept = 3, size = 1, linetype = "dotted",color = "black")
# 添加坐标轴与图像标题
labs(title = "point plot",x="Carat",y="Price")
# 调整坐标轴的显示范围
coord_cartesian(xlim = c(2,7),ylim = c(2,7) )
# 更换主题
theme_linedraw()
- 2.直方图
代码语言:javascript
复制# 2. 画柱状图
hist_outlier <- function (x) {
x_na <- na.omit(x)
hist(x_na, main=" ",xlab=deparse(substitute(x)))
title(paste("Histogram of ",deparse(substitute(x)),sep = ""))
abline(v=c(mean(x_na) 3*sd(x_na),mean(x_na)-3*sd(x_na)),lty=2,col='red')
abline(v=mean(x_na),lty=1,col="red")
}
hist_outlier(log2(input_data$Ap))
代码语言:javascript
复制library(corrplot)
#计算数据集的相关系数矩阵并可视化
input_data <- na.omit(input_data)
clomns = c("Age","history","Laa","PAI","AF","Preanticoagulance","NOACDischarged","time","recurrence")
input_data = input_data[clomns]
mycor = round(cor(input_data),2)
corrplot(mycor, tl.col = "black")
代码语言:javascript
复制fit1 <- survfit(survobj~Gender,data=data)
plot(fit1, xlab="Survival Time in months",
ylab="% Surviving", yscale=100,col=c("red","blue"),
main="Survival Distributions by Gender")
legend("topright", title="Gender",c("Male", "Female"),
fill=c("red", "blue")) # 绘图
预处理
数据值化
缺失值处理
代码语言:javascript
复制rm(list = ls())
library(VIM)
library(naniar)
library(ggplot2)
library(mice)
# read data
data_exercise <- read.csv('./data/init_data.csv')
data <- data_exercise
summary(data)
varlist <- c("ID", "B1", "B2", "B3", "B4",
"C1", "C2", "C3", "C4", "C5", "C6_trans",
"Time_death", "Status_death", "haz_os")
clomns <- colnames(data)
# create a new dataset for imputation
data.impu = data[clomns]
## categorical variable as factor
data.impu$LA <- factor(data.impu$LA)
data.impu$LV <- factor(data.impu$LV)
data.impu$EF <- factor(data.impu$EF)
data.impu$BMI <- factor(data.impu$BMI)
data.impu$BNP <- factor(data.impu$BNP)
data.impu$Ccr <- factor(data.impu$Ccr)
data.impu$UA <- factor(data.impu$UA)
data.impu$Laa <- factor(data.impu$Laa)
data.impu$Va <- factor(data.impu$Va)
data.impu$LaaVa <- factor(data.impu$LaaVa)
# see all the default settings for imputation
impu_default <- mice(data.impu, maxit = 0)
summary(impu_default)
# see the predictor structure
pred <- quickpred(data.impu, exclude = c("NO","recurrence","DM","Ablation","Preantiplatelet","DM","HP","AF","Gender","Lmpv"))
pred
# imputation method
meth <- impu_default$meth
meth # imputation method can be changed manually
# imputation
# single imputation (m=1)
imputation <- mice(data.impu, maxit = 25, m = 1, seed = 1234, pred = pred, meth = meth, print = TRUE)
data_single <- mice::complete(imputation, 1)
nrow(data_single)
summary(data_single)
write.csv(data_single,file="./data/b.csv",quote=F,row.names = F)
无量纲化
- 即特征的规格不一样,不能够放在一起比较。
- 比如血压值,与身高、体重 量纲不同,不能直接比较输入模型。
- 方法:标准化
特征选择lasso
代码语言:javascript
复制rm(list = ls())
# install.packages('glmnet')
# install.packages('MASS')
# install.packages('survival')
# install.packages('rms')
library(glmnet)
library(MASS)
library(survival)
library(rms)
# 0. 读取数据
init_data <- read.csv('./data/normal.csv')
noraml_data <- init_data
## 0.1空缺值处理
data <- na.omit(noraml_data)
summary(data)
clomns <- colnames(data)
class(clomns)
xclomns <- c("Age","history","LA","LV","EF","Hight","Weight","BMI","BNP_log","BNP","Ccr_log","Ccr","UA","Laa","Va","LaaVa","Ap_log","Ap","Trans","Lupv","Llpv","Rupv","Rlpv","Rmpv","Lmpv","Trunk",
"PAI","IAB","Ptfv1","Gender","AF","CHA2DS2VASc","HasBled","NYHA","Stoke","TAI","CAD","PCI","CABG","HP","DM","HCM","Lipid","PreAAD","Preanticoagulance","Preantiplatelet","Smoke","Drink","PM","Ablation",
"AADDischarged","NOACDischarged","PreAmiodarone","time","Age_cls")
Yclomns <- c("recurrence")
# 1.logistic 回归 逐步回归
## 创建公式
Formula <- formula(paste(paste(Yclomns,"~", collapse=" "),
paste(xclomns, collapse=" ")))
formula
## 1.2给数据味值
model.full <- glm(Formula, data=data,family=binomial)
summary(model.full)
## 1.3逐步选择变量
model.step <- stepAIC(model.full, direction="both")
summary(model.step)
# 2.lasso
# 2.1将数据转化为矩阵
tmp.y <- data$recurrence
tmp.y
tmp.x <- model.matrix(~.,data[xclomns])
tmp.x
##2.2模型味数据
model.lasso <- glmnet(tmp.x, tmp.y, family="binomial", nlambda=50, alpha=1, standardize=TRUE)
plot(model.lasso,xvar="lambda",label=TRUE)
##2.3通过交叉验证找到最佳模型
cv.model <- cv.glmnet(tmp.x, tmp.y, family="binomial", nlambda=50, alpha=1, standardize=TRUE)
plot(cv.model)
cv.model$lambda.min
coef(cv.model, s=cv.model$lambda.min)
## 2.4增加lambda以进一步收缩
cv.model$lambda.1se
coef(cv.model, s=cv.model$lambda.1se)
## 2.5如果最终的变量数为5个
model.lasso #选择变量的依据
coef(model.lasso , s=model.lasso$lambda[8])
coef(model.lasso , s=model.lasso$lambda[9])
coef(model.lasso , s=0.024970)
- 特征选择结果:“Age”,“history”,“Laa”,“PAI”,“AF”,“time”
模型
方法1:生存模型
代码语言:javascript
复制rm(list = ls())
library(car)
library(ggplot2)
library(glmnet)
library(MASS)
library(survival)
library(rms)
library(survminer)
library(ggridges)
library(pROC)
library(plotROC)
library(riskRegression)
library(glmnet)
library(MASS)
library(survival)
library(rms)
# 0. 读取数据
init_data <- read.csv('./data/normal.csv')
noraml_data <- init_data
## 0.1空缺值处理
data <- na.omit(noraml_data)
summary(data)
clomns <- colnames(data)
class(clomns)
# 1. 数据集拆分
set.seed(1234)
prop_train <- 0.8 # proportion of training data
train <- sample(nrow(data), nrow(data) * prop_train)
data.train <- data[train, ]
data.test <- data[-train, ]
nrow(data.train)
nrow(data.test)
# 不同性别的生存率差异
survobj <- with(data,Surv(time,recurrence))
fit0 <- survfit(survobj~1, data=data)
summary(fit0)
plot(fit0, xlab="Survival Time in months",
ylab="% Surviving", yscale=100,
main="Survival Distribution (Overall)") # 绘图横坐标代表天数,纵坐标代表生存率;实线是估计的生存率,虚线代表的是估计生存率的95%置信区间。
# 比较不同性别的生存差异
fit1 <- survfit(survobj~Gender,data=data)
plot(fit1, xlab="Survival Time in months",
ylab="% Surviving", yscale=100,col=c("red","blue"),
main="Survival Distributions by Gender")
legend("topright", title="Gender",c("Male", "Female"),
fill=c("red", "blue")) # 绘图
# 比较是否抽烟生存差异
fit1 <- survfit(survobj~Smoke,data=data)
plot(fit1, xlab="Survival Time in months",
ylab="% Surviving", yscale=100,col=c("red","blue"),
main="Survival Distributions by Smoke")
legend("topright", title="Gender",c("Yes", "No"),
fill=c("red", "blue")) # 绘图
# 2. cox建立模型
xclomns <- c("Age","history","Laa","PAI","AF","Preanticoagulance","NOACDischarged")
Yclomns = "Surv(time,recurrence)"
# 2.1. 编辑公式
Formula <- formula(paste(paste(Yclomns,"~", collapse=" "),
paste(xclomns, collapse=" ")))
formula
# fit a model with all selected varaibles
model.train <- coxph(Formula, data=data.train)
summary(model.train)
ggsurvplot(survfit(model.train), data = data.train, palette = "#2E9FDF",
ggtheme = theme_minimal(), legend = "none")
# 2.2预测概率值
## 2.2 训练集
data.train$lp <- predict(model.train, data = data.train, type="lp")
data.train$recurrence_pred <- predict(model.train, data = data.train, type="survival")
Hazard <- basehaz(model.train,centered=F)
S0_12 <- min(exp(-Hazard[Hazard[,2] <= 12,])[,1])
# 5 years (60 months)
S0_60 <- min(exp(-Hazard[Hazard[,2] <= 60,])[,1])
方法2:logistics
代码语言:javascript
复制 3. logistic
# xclomns <- c("Age","history","Laa","PAI","AF","Preanticoagulance","NOACDischarged","time")
xclomns <- c("Age","history","Laa","PAI","AF","time")
Yclomns <- c("recurrence")
# 3.1 建立公式
Formula <- formula(paste(paste(Yclomns,"~", collapse=" "),
paste(xclomns, collapse=" ")))
formula
# 3.2 模型味数据
model.train <- glm(Formula, data=data.train,family=binomial)
model.train <- stepAIC(model.train, direction="both")
summary(model.train)
# data.train$lp_prediction <- predict(model.train, data.train, type="link")
data.train$p_prediction <- predict(model.train, data.train, type="response")
data.train$recurrence_prediction<-ifelse(data.train$p_prediction>=0.5,1,0)
# 4.测试集预测
# data.test$lp_prediction <- predict(model.train, data.test, type="link")
data.test$p_prediction <- predict(model.train, data.test, type="response")
data.test$recurrence_prediction<-ifelse(data.test$p_prediction>=0.5,1,0)
## 混淆矩阵
f<-table(data.test$recurrence,data.test$recurrence_prediction)
f
## 准确率
recurrence_Accuracy = (165 13)/(165 13 5 22)
recurrence_Accuracy
# 5. ROC
## 5.1 训练集
roc.train <- roc(data.train$recurrence, data.train$p_prediction)
plot(roc.train)
## 5.2 测试集
roc.test <- roc(data.test$recurrence, data.test$p_prediction)
plot(roc.test)
# 6. 校准率
# 6.1 训练集校准
val.prob(p = fitted(model.train), y = data.train$recurrence,logistic.cal=F)
val.prob(p = fitted(model.train), y = data.train$recurrence,logistic.cal=F,statloc=F)
## 6.2 测试集校准
val.prob(p = data.test$p_prediction, y = data.test$recurrence,logistic.cal=F)
- roc
结果