Following the previous post, "R Simulation: Bias-Variance Trade-off", this post uses simulation to analyze the relationships among an algorithm's generalization error, bias, variance, and noise. It is based on a case study from Chapter 7 of The Elements of Statistical Learning.
Using squared error on simulated data, the previous post demonstrated the bias-variance trade-off on the test set: as model complexity (the number of variables) increases, the training error keeps decreasing and eventually leads to overfitting, while the test error first decreases and then increases.
Simulation setup
This post explains the behavior of these error curves by decomposing the generalization error. We run the following simulation experiment:
Sample 1: the training and test sets each have 20 predictors and 80 observations; the predictors are i.i.d. uniform on [0, 1], and the response is defined as:
Y = ifelse(X1>1/2,1,0)
Sample 2: the training and test sets each have 20 predictors and 80 observations; the predictors are i.i.d. uniform on [0, 1], and the response is defined as:
Y = ifelse(X1 + X2 + ... + X10 > 5, 1, 0)
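As a minimal sketch of these two data-generating processes (the full simulation code appears in the Code section below; variable names here are illustrative):

# Generate one draw of each sample
set.seed(1)
n <- 80; p <- 20
X <- matrix(runif(n * p), n, p)                # predictors ~ U[0, 1]
y1 <- ifelse(X[, 1] > 0.5, 1, 0)               # sample 1: depends on X1 only
y2 <- ifelse(rowSums(X[, 1:10]) > 5, 1, 0)     # sample 2: sum of X1..X10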
We model the data with two model classes and two error measures, four combinations in all, and analyze the resulting errors. The models are kNN and the best subset linear model.
kNN predicts the Y value of a point as the average of the Y values of its k nearest training samples; it is applied to sample 1 and can be fit in R with the function knnreg from the caret package.
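A minimal knnreg sketch on toy data (the choice k = 5 is arbitrary):

# caret::knnreg averages the Y values of the k nearest neighbours
library(caret)
set.seed(1)
X <- as.data.frame(matrix(runif(80 * 20), 80))
y <- ifelse(X[, 1] > 0.5, 1, 0)
fit <- knnreg(X, y, k = 5)
head(predict(fit, newdata = X))   # fitted values lie in [0, 1]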
The best subset linear model searches the predictor subsets of each size, selects the best combination, and fits a linear model for prediction; it is applied to sample 2 and can be fit in R with the function regsubsets from the leaps package.
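A minimal regsubsets sketch, with toy data generated as in sample 2:

# leaps::regsubsets keeps the best model of each size;
# coef(fit, p) extracts the coefficients of the best p-variable model
library(leaps)
set.seed(1)
d <- as.data.frame(matrix(runif(80 * 21), 80))
d$V21 <- ifelse(rowSums(d[, 1:10]) > 5, 1, 0)
fit <- regsubsets(V21 ~ ., d, nbest = 1, nvmax = 20)
coef(fit, 5)   # best 5-variable model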
We use two error measures: squared error and 0-1 loss. Under squared error the task can be viewed as regression; under 0-1 loss, as classification.
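Under squared-error loss, the expected prediction error at a point $x_0$ admits the classical decomposition (ESL, Chapter 7):

$$\mathrm{Err}(x_0) = \sigma_\varepsilon^2 + \left[\mathrm{E}\hat f(x_0) - f(x_0)\right]^2 + \mathrm{E}\left[\hat f(x_0) - \mathrm{E}\hat f(x_0)\right]^2,$$

that is, irreducible noise plus squared bias plus variance; the simulation below estimates each term separately.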
Results
Each method is simulated 100 times; for each setting we compute the bias, variance, and prediction error, and plot the results:
In the plots, the red line is the prediction error, the blue line the variance, and the green line the squared bias; compare these with the corresponding figure in the book.
Analysis:
- Numerically, the 0-1 loss and squared-error settings behave differently: under squared error the relation prediction error = irreducible error + squared bias + variance holds, whereas under 0-1 loss this additive decomposition breaks down, because an estimate that stays on the correct side of the decision boundary incurs no loss however biased it is (see the sketch after this list);
- The variance varies monotonically with model complexity (growing as k decreases in kNN and as the subset size increases in the linear model), while the bias does not change monotonically with the number of variables in the model.
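A toy sketch of why 0-1 loss need not equal squared bias plus variance (the estimator here is hypothetical, drawn as illustrative random numbers):

# At a point with true class 1, a biased but one-sided estimator
# can still classify correctly almost every time
set.seed(1)
fx    <- 1                                   # true value f(x0)
fhat  <- rnorm(1e4, mean = 0.7, sd = 0.05)   # hypothetical estimates of f(x0)
bias2 <- (mean(fhat) - fx)^2                 # about 0.09
vari  <- var(fhat)                           # about 0.0025
err01 <- mean(ifelse(fhat >= 0.5, 1, 0) != fx)  # essentially 0
c(bias2 = bias2, variance = vari, zero_one_error = err01)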
Code
kNN model
# bias-variance trade-off: regression
# knn
library(caret)
# getError: estimate bias, variance, and prediction error by simulation
# k:         k in kNN (or the subset size in best subset)
# num:       number of simulation runs
# modeltype: 'reg' (squared error) or 'classification' (0-1 loss)
# seeds:     random seed
# n_test:    test-set size
# returns a data.frame with the variance, bias, and error estimates
getError <- function(k,num,modeltype,seeds,n_test){
set.seed(seeds)
testset <- as.data.frame(matrix(runif(n_test*21,0,1),n_test))
Allfx_hat <- matrix(0,n_test,num)
Ally <- matrix(0,n_test,num)
Allfx <- matrix(0,n_test,num)
# run the simulation num times
for (i in 1:num){
trainset <- as.data.frame(matrix(runif(80*21,0,1),80))
fx_train <- ifelse(trainset[,1]>0.5,1,0)
trainset[,21] <- fx_train
fx_test <- ifelse(testset[,1]>0.5,1,0)
testset[,21] <- fx_test
# knn model
knnmodel <- knnreg(trainset[,1:20],trainset[,21],k = k)
probs <- predict(knnmodel, newdata = testset[,1:20])
Allfx_hat[,i] <- probs
Ally[,i] <- testset[,21]
Allfx[,i] <- fx_test
}
# compute variance, bias, etc.
# no noise is added to Y in this simulation, so the irreducible error is zero
irreducible <- mean(apply( Allfx - Ally ,1,var))
SquareBais <- mean(apply((Allfx_hat - Allfx)^2,1,mean))
Variance <- mean(apply(Allfx_hat,1,var))
# regression vs. classification
if (modeltype == 'reg'){
PredictError <- irreducible + SquareBais + Variance
}else{
PredictError <- mean(ifelse(Allfx_hat>=0.5,1,0)!=Allfx)
}
result <- data.frame(k,irreducible,SquareBais,Variance,PredictError)
return(result)
}
# ---------------- plot squared error, kNN ----------------------------
# modeltype: 'reg' uses the squared-error decomposition
# num: number of simulation runs; seeds: random seed; n_test: test-set size
n_test <- 100
modeltype <- 'reg'
num <- 100
seeds <- 1
result <- getError(2,num,modeltype,seeds,n_test)
result <- rbind(result,getError(5,num,modeltype,seeds,n_test))
result <- rbind(result,getError(7,num,modeltype,seeds,n_test))
for (i in seq(10,50,10)){
result <- rbind(result,getError(i,num,modeltype,seeds,n_test))
}
png(file = "k-NN - Regression_large_testset.png")
plot(-result$k,result$PredictError,type = 'o',col = 'red',
xlim = c(-50,0),ylim = c(0,0.4),xlab = '', ylab ='', lwd = 2)
par(new = T)
plot(-result$k,result$SquareBais,type = 'o',col = 'green',
xlim = c(-50,0),ylim = c(0,0.4),xlab = '', ylab ='', lwd = 2)
par(new = T)
plot(-result$k,result$Variance,type = 'o',col = 'blue',
xlim = c(-50,0),ylim = c(0,0.4),xlab = 'Number of Neighbors k', ylab ='', lwd = 2,
main = 'k-NN - Regression')
dev.off()
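As a design note, the same overlay can be drawn without repeated par(new = T) by calling plot() once and adding the remaining curves with lines(), e.g.:

plot(-result$k, result$PredictError, type = 'o', col = 'red',
     ylim = c(0, 0.4), xlab = 'Number of Neighbors k', ylab = '', lwd = 2,
     main = 'k-NN - Regression')
lines(-result$k, result$SquareBais, type = 'o', col = 'green', lwd = 2)
lines(-result$k, result$Variance, type = 'o', col = 'blue', lwd = 2)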
# ---------------------- plot 0-1 loss knn -------------------------
modeltype <- 'classification'
num <- 100
n_test <- 100
seeds <- 1
result <- getError(2,num,modeltype,seeds,n_test)
result <- rbind(result,getError(5,num,modeltype,seeds,n_test))
result <- rbind(result,getError(7,num,modeltype,seeds,n_test))
for (i in seq(10,50,10)){
result <- rbind(result,getError(i,num,modeltype,seeds,n_test))
}
png(file = "k-NN - Classification_large_testset.png")
plot(-result$k,result$PredictError,type = 'o',col = 'red',
xlim = c(-50,0),ylim = c(0,0.4),xlab = '', ylab ='', lwd = 2)
par(new = T)
plot(-result$k,result$SquareBais,type = 'o',col = 'green',
xlim = c(-50,0),ylim = c(0,0.4),xlab = '', ylab ='', lwd = 2)
par(new = T)
plot(-result$k,result$Variance,type = 'o',col = 'blue',
xlim = c(-50,0),ylim = c(0,0.4),xlab = 'Number of Neighbors k', ylab ='', lwd = 2,
main = 'k-NN - Classification')
dev.off()
Best subset model
library(leaps)
lm.BestSubSet <- function(trainset, k){
  # best subset of each size; keep the best k-variable model
  lm.sub <- regsubsets(V21 ~ ., trainset, nbest = 1, nvmax = 20)
  coef_lm <- coef(lm.sub, k)
  # build the formula V21 ~ Vi + Vj + ... from the selected variables
  x <- paste(names(coef_lm)[2:length(coef_lm)], collapse = ' + ')
  formulas <- as.formula(paste('V21 ~', x))
  return(formulas)
}
getError <- function(k,num,modeltype,seeds,n_test){
set.seed(seeds)
testset <- as.data.frame(matrix(runif(n_test*21,0,1),n_test))
Allfx_hat <- matrix(0,n_test,num)
Ally <- matrix(0,n_test,num)
Allfx <- matrix(0,n_test,num)
# run the simulation num times
for (i in 1:num){
trainset <- as.data.frame(matrix(runif(80*21,0,1),80))
fx_train <- ifelse(trainset[,1] + trainset[,2] + trainset[,3] + trainset[,4] + trainset[,5] +
trainset[,6] + trainset[,7] + trainset[,8] + trainset[,9] + trainset[,10] > 5, 1, 0)
trainset[,21] <- fx_train
fx_test <- ifelse(testset[,1] + testset[,2] + testset[,3] + testset[,4] + testset[,5] +
testset[,6] + testset[,7] + testset[,8] + testset[,9] + testset[,10] > 5, 1, 0)
testset[,21] <- fx_test
# best subset
lm.sub <- lm(formula = lm.BestSubSet(trainset,k),trainset)
probs <- predict(lm.sub,testset[,1:20], type = 'response')
Allfx_hat[,i] <- probs
Ally[,i] <- testset[,21]
Allfx[,i] <- fx_test
}
# compute variance, bias, etc.
# no noise is added to Y in this simulation, so the irreducible error is zero
irreducible <- mean(apply( Allfx - Ally ,1,var))
SquareBais <- mean(apply((Allfx_hat - Allfx)^2,1,mean))
Variance <- mean(apply(Allfx_hat,1,var))
# regression vs. classification
if (modeltype == 'reg'){
PredictError <- irreducible + SquareBais + Variance
}else{
PredictError <- mean(ifelse(Allfx_hat>=0.5,1,0)!=Allfx)
}
result <- data.frame(k,irreducible,SquareBais,Variance,PredictError)
return(result)
}
# ---------------- plot squared error, Best Subset Regression ----------------------------
modeltype <- 'reg'
num <- 100
n_test <- 1000
seeds <- 4
all_p <- seq(2,20,3)
result <- getError(1,num,modeltype,seeds,n_test)
for (i in all_p){
result <- rbind(result,getError(i,num,modeltype,seeds,n_test))
}
png(file = "Linear Model - Regression_large_testset.png")
plot(result$k,result$PredictError,type = 'o',col = 'red',
xlim = c(0,20),ylim = c(0,0.4),xlab = '', ylab ='', lwd = 2)
par(new = T)
plot(result$k,result$SquareBais,type = 'o',col = 'green',
xlim = c(0,20),ylim = c(0,0.4),xlab = '', ylab ='', lwd = 2)
par(new = T)
plot(result$k,result$Variance,type = 'o',col = 'blue',
xlim = c(0,20),ylim = c(0,0.4),xlab = 'Subset Size p', ylab ='', lwd = 2,
main = 'Linear Model - Regression')
dev.off()
# ---------------------- plot 0-1 loss Best Subset Classification -------------------------
modeltype <- 'classification'
num <- 100
n_test <- 1000
seeds <- 4
all_p <- seq(2,20,3)
result <- getError(1,num,modeltype,seeds,n_test)
for (i in all_p){
result <- rbind(result,getError(i,num,modeltype,seeds,n_test))
}
png(file = "Linear Model - Classification_large_testset.png")
plot(result$k,result$PredictError,type = 'o',col = 'red',
xlim = c(0,20),ylim = c(0,0.4),xlab = '', ylab ='', lwd = 2)
par(new = T)
plot(result$k,result$SquareBais,type = 'o',col = 'green',
xlim = c(0,20),ylim = c(0,0.4),xlab = '', ylab ='', lwd = 2)
par(new = T)
plot(result$k,result$Variance,type = 'o',col = 'blue',
xlim = c(0,20),ylim = c(0,0.4),xlab = 'Subset Size p', ylab ='', lwd = 2,
main = 'Linear Model - Classification')
dev.off()
References
1. Hastie T., Tibshirani R., Friedman J. The Elements of Statistical Learning: Data Mining, Inference, and Prediction. 2nd ed. Springer, 2009.