Following the previous post, "R Simulation: Bias-Variance Trade-off", this post uses simulation to analyze the relationships among an algorithm's generalization error, bias, variance, and noise. It is based on a case study from Chapter 7 of The Elements of Statistical Learning.
Using squared error on simulated data, the previous post demonstrated the bias-variance trade-off on the test set: as model complexity (the number of variables) increases, the training error keeps decreasing and eventually leads to overfitting, while the test error first decreases and then increases.
Simulation setup
This post explains the behavior of these error curves by decomposing the generalization error. We run the following simulation experiment:
Sample 1: the training and test sets each have 20 predictors and 80 observations; the predictors are i.i.d. uniform on [0, 1], and the response is defined as:
Y = ifelse(X1>1/2,1,0)
Sample 2: the training and test sets each have 20 predictors and 80 observations; the predictors are i.i.d. uniform on [0, 1], and the response is defined as:
Y = ifelse(X1 + X2 + ... + X10 > 5, 1, 0)
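As a minimal sketch of these two data-generating processes (the full simulation code appears in the Code section below; variable names here are illustrative):

# Generate one draw of each sample
set.seed(1)
n <- 80; p <- 20
X <- matrix(runif(n * p), n, p)                # predictors ~ U[0, 1]
y1 <- ifelse(X[, 1] > 0.5, 1, 0)               # sample 1: depends on X1 only
y2 <- ifelse(rowSums(X[, 1:10]) > 5, 1, 0)     # sample 2: sum of X1..X10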
We model the data with two model classes and two error measures, four combinations in all, and analyze the resulting errors. The models are kNN and the best subset linear model.
kNN predicts the Y value of a point as the average of the Y values of its k nearest training samples; it is applied to sample 1 and can be fit in R with the function knnreg from the caret package.
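A minimal knnreg sketch on toy data (the choice k = 5 is arbitrary):

# caret::knnreg averages the Y values of the k nearest neighbours
library(caret)
set.seed(1)
X <- as.data.frame(matrix(runif(80 * 20), 80))
y <- ifelse(X[, 1] > 0.5, 1, 0)
fit <- knnreg(X, y, k = 5)
head(predict(fit, newdata = X))   # fitted values lie in [0, 1]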
The best subset linear model searches the predictor subsets of each size, selects the best combination, and fits a linear model for prediction; it is applied to sample 2 and can be fit in R with the function regsubsets from the leaps package.
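A minimal regsubsets sketch, with toy data generated as in sample 2:

# leaps::regsubsets keeps the best model of each size;
# coef(fit, p) extracts the coefficients of the best p-variable model
library(leaps)
set.seed(1)
d <- as.data.frame(matrix(runif(80 * 21), 80))
d$V21 <- ifelse(rowSums(d[, 1:10]) > 5, 1, 0)
fit <- regsubsets(V21 ~ ., d, nbest = 1, nvmax = 20)
coef(fit, 5)   # best 5-variable model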
We use two error measures: squared error and 0-1 loss. Under squared error the task can be viewed as regression; under 0-1 loss, as classification.
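Under squared-error loss, the expected prediction error at a point $x_0$ admits the classical decomposition (ESL, Chapter 7):

$$\mathrm{Err}(x_0) = \sigma_\varepsilon^2 + \left[\mathrm{E}\hat f(x_0) - f(x_0)\right]^2 + \mathrm{E}\left[\hat f(x_0) - \mathrm{E}\hat f(x_0)\right]^2,$$

that is, irreducible noise plus squared bias plus variance; the simulation below estimates each term separately.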
Results
Each method is simulated 100 times; for each setting we compute the bias, variance, and prediction error, and plot the results:
In the plots, the red line is the prediction error, the blue line the variance, and the green line the squared bias; compare these with the corresponding figure in the book.
Analysis:
- Numerically, the 0-1 loss and squared-error settings behave differently: under squared error the relation prediction error = irreducible error + squared bias + variance holds, whereas under 0-1 loss this additive decomposition breaks down, because an estimate that stays on the correct side of the decision boundary incurs no loss however biased it is (see the sketch after this list);
- The variance varies monotonically with model complexity (growing as k decreases in kNN and as the subset size increases in the linear model), while the bias does not change monotonically with the number of variables in the model.
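A toy sketch of why 0-1 loss need not equal squared bias plus variance (the estimator here is hypothetical, drawn as illustrative random numbers):

# At a point with true class 1, a biased but one-sided estimator
# can still classify correctly almost every time
set.seed(1)
fx    <- 1                                   # true value f(x0)
fhat  <- rnorm(1e4, mean = 0.7, sd = 0.05)   # hypothetical estimates of f(x0)
bias2 <- (mean(fhat) - fx)^2                 # about 0.09
vari  <- var(fhat)                           # about 0.0025
err01 <- mean(ifelse(fhat >= 0.5, 1, 0) != fx)  # essentially 0
c(bias2 = bias2, variance = vari, zero_one_error = err01)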
Code
kNN model
# bias-variance trade-off: regression
# knn
library(caret)
# getError: estimate bias, variance, and prediction error by simulation
# k:         k in kNN (or the subset size in best subset)
# num:       number of simulation runs
# modeltype: 'reg' (squared error) or 'classification' (0-1 loss)
# seeds:     random seed
# n_test:    test-set size
# returns a data.frame with the variance, bias, and error estimates
getError <- function(k,num,modeltype,seeds,n_test){
set.seed(seeds)
testset <- as.data.frame(matrix(runif(n_test*21,0,1),n_test))
Allfx_hat <- matrix(0,n_test,num)
Ally <- matrix(0,n_test,num)
Allfx <- matrix(0,n_test,num)
# run the simulation num times
for (i in 1:num){
trainset <- as.data.frame(matrix(runif(80*21,0,1),80))
fx_train <- ifelse(trainset[,1]>0.5,1,0)
trainset[,21] <- fx_train
fx_test <- ifelse(testset[,1]>0.5,1,0)
testset[,21] <- fx_test
# knn model
knnmodel <- knnreg(trainset[,1:20],trainset[,21],k = k)
probs <- predict(knnmodel, newdata = testset[,1:20])
Allfx_hat[,i] <- probs
Ally[,i] <- testset[,21]
Allfx[,i] <- fx_test
}
# compute variance, bias, etc.
# no noise is added to Y in this simulation, so the irreducible error is zero
irreducible <- mean(apply( Allfx - Ally ,1,var))
SquareBais <- mean(apply((Allfx_hat - Allfx)^2,1,mean))
Variance <- mean(apply(Allfx_hat,1,var))
# regression vs. classification
if (modeltype == 'reg'){
PredictError <- irreducible + SquareBais + Variance
}else{
PredictError <- mean(ifelse(Allfx_hat>=0.5,1,0)!=Allfx)
}
result <- data.frame(k,irreducible,SquareBais,Variance,PredictError)
return(result)
}
# ---------------- plot squared error, kNN ----------------------------
# modeltype: 'reg' uses the squared-error decomposition
# num: number of simulation runs; seeds: random seed; n_test: test-set size
n_test <- 100
modeltype <- 'reg'
num <- 100
seeds <- 1
result <- getError(2,num,modeltype,seeds,n_test)
result <- rbind(result,getError(5,num,modeltype,seeds,n_test))
result <- rbind(result,getError(7,num,modeltype,seeds,n_test))
for (i in seq(10,50,10)){
result <- rbind(result,getError(i,num,modeltype,seeds,n_test))
}
png(file = "k-NN - Regression_large_testset.png")
plot(-result$k,result$PredictError,type = 'o',col = 'red',
xlim = c(-50,0),ylim = c(0,0.4),xlab = '', ylab ='', lwd = 2)
par(new = T)
plot(-result$k,result$SquareBais,type = 'o',col = 'green',
xlim = c(-50,0),ylim = c(0,0.4),xlab = '', ylab ='', lwd = 2)
par(new = T)
plot(-result$k,result$Variance,type = 'o',col = 'blue',
xlim = c(-50,0),ylim = c(0,0.4),xlab = 'Number of Neighbors k', ylab ='', lwd = 2,
main = 'k-NN - Regression')
dev.off()
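As a design note, the same overlay can be drawn without repeated par(new = T) by calling plot() once and adding the remaining curves with lines(), e.g.:

plot(-result$k, result$PredictError, type = 'o', col = 'red',
     ylim = c(0, 0.4), xlab = 'Number of Neighbors k', ylab = '', lwd = 2,
     main = 'k-NN - Regression')
lines(-result$k, result$SquareBais, type = 'o', col = 'green', lwd = 2)
lines(-result$k, result$Variance, type = 'o', col = 'blue', lwd = 2)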
# ---------------------- plot 0-1 loss knn -------------------------
modeltype <- 'classification'
num <- 100
n_test <- 100
seeds <- 1
result <- getError(2,num,modeltype,seeds,n_test)
result <- rbind(result,getError(5,num,modeltype,seeds,n_test))
result <- rbind(result,getError(7,num,modeltype,seeds,n_test))
for (i in seq(10,50,10)){
result <- rbind(result,getError(i,num,modeltype,seeds,n_test))
}
png(file = "k-NN - Classification_large_testset.png")
plot(-result$k,result$PredictError,type = 'o',col = 'red',
xlim = c(-50,0),ylim = c(0,0.4),xlab = '', ylab ='', lwd = 2)
par(new = T)
plot(-result$k,result$SquareBais,type = 'o',col = 'green',
xlim = c(-50,0),ylim = c(0,0.4),xlab = '', ylab ='', lwd = 2)
par(new = T)
plot(-result$k,result$Variance,type = 'o',col = 'blue',
xlim = c(-50,0),ylim = c(0,0.4),xlab = 'Number of Neighbors k', ylab ='', lwd = 2,
main = 'k-NN - Classification')
dev.off()
Best subset model
library(leaps)
lm.BestSubSet <- function(trainset, k){
  # best subset of each size; keep the best k-variable model
  lm.sub <- regsubsets(V21 ~ ., trainset, nbest = 1, nvmax = 20)
  coef_lm <- coef(lm.sub, k)
  # build the formula V21 ~ Vi + Vj + ... from the selected variables
  x <- paste(names(coef_lm)[2:length(coef_lm)], collapse = ' + ')
  formulas <- as.formula(paste('V21 ~', x))
  return(formulas)
}
getError <- function(k,num,modeltype,seeds,n_test){
set.seed(seeds)
testset <- as.data.frame(matrix(runif(n_test*21,0,1),n_test))
Allfx_hat <- matrix(0,n_test,num)
Ally <- matrix(0,n_test,num)
Allfx <- matrix(0,n_test,num)
# run the simulation num times
for (i in 1:num){
trainset <- as.data.frame(matrix(runif(80*21,0,1),80))
fx_train <- ifelse(trainset[,1] + trainset[,2] + trainset[,3] + trainset[,4] + trainset[,5] +
trainset[,6] + trainset[,7] + trainset[,8] + trainset[,9] + trainset[,10] > 5, 1, 0)
trainset[,21] <- fx_train
fx_test <- ifelse(testset[,1] + testset[,2] + testset[,3] + testset[,4] + testset[,5] +
testset[,6] + testset[,7] + testset[,8] + testset[,9] + testset[,10] > 5, 1, 0)
testset[,21] <- fx_test
# best subset
lm.sub <- lm(formula = lm.BestSubSet(trainset,k),trainset)
probs <- predict(lm.sub,testset[,1:20], type = 'response')
Allfx_hat[,i] <- probs
Ally[,i] <- testset[,21]
Allfx[,i] <- fx_test
}
# compute variance, bias, etc.
# no noise is added to Y in this simulation, so the irreducible error is zero
irreducible <- mean(apply( Allfx - Ally ,1,var))
SquareBais <- mean(apply((Allfx_hat - Allfx)^2,1,mean))
Variance <- mean(apply(Allfx_hat,1,var))
# regression vs. classification
if (modeltype == 'reg'){
PredictError <- irreducible + SquareBais + Variance
}else{
PredictError <- mean(ifelse(Allfx_hat>=0.5,1,0)!=Allfx)
}
result <- data.frame(k,irreducible,SquareBais,Variance,PredictError)
return(result)
}
# ---------------- plot squared error, Best Subset Regression ----------------------------
modeltype <- 'reg'
num <- 100
n_test <- 1000
seeds <- 4
all_p <- seq(2,20,3)
result <- getError(1,num,modeltype,seeds,n_test)
for (i in all_p){
result <- rbind(result,getError(i,num,modeltype,seeds,n_test))
}
png(file = "Linear Model - Regression_large_testset.png")
plot(result$k,result$PredictError,type = 'o',col = 'red',
xlim = c(0,20),ylim = c(0,0.4),xlab = '', ylab ='', lwd = 2)
par(new = T)
plot(result$k,result$SquareBais,type = 'o',col = 'green',
xlim = c(0,20),ylim = c(0,0.4),xlab = '', ylab ='', lwd = 2)
par(new = T)
plot(result$k,result$Variance,type = 'o',col = 'blue',
xlim = c(0,20),ylim = c(0,0.4),xlab = 'Subset Size p', ylab ='', lwd = 2,
main = 'Linear Model - Regression')
dev.off()
# ---------------------- plot 0-1 loss Best Subset Classification -------------------------
modeltype <- 'classification'
num <- 100
n_test <- 1000
seeds <- 4
all_p <- seq(2,20,3)
result <- getError(1,num,modeltype,seeds,n_test)
for (i in all_p){
result <- rbind(result,getError(i,num,modeltype,seeds,n_test))
}
png(file = "Linear Model - Classification_large_testset.png")
plot(result$k,result$PredictError,type = 'o',col = 'red',
xlim = c(0,20),ylim = c(0,0.4),xlab = '', ylab ='', lwd = 2)
par(new = T)
plot(result$k,result$SquareBais,type = 'o',col = 'green',
xlim = c(0,20),ylim = c(0,0.4),xlab = '', ylab ='', lwd = 2)
par(new = T)
plot(result$k,result$Variance,type = 'o',col = 'blue',
xlim = c(0,20),ylim = c(0,0.4),xlab = 'Subset Size p', ylab ='', lwd = 2,
main = 'Linear Model - Classification')
dev.off()
References
1. Hastie T., Tibshirani R., Friedman J. The Elements of Statistical Learning: Data Mining, Inference, and Prediction. 2nd ed. Springer, 2009.