我们采用波士顿房价预测数据集进行回归任务分析。数据集分为训练集和测试集,训练集可用于训练回归模型,测试集需要进行预测。
要求:
1.做linear regression,或使用现成的线性回归函数,方法尝试使用Gradient Descent,SGD 以及 ADAM。
2.比较不同learning rate的结果。例如损失函数曲线图
3.比较有无加上regularization的结果。
4.比较有无使用 feature scaling 的结果。
Try:
1、机器学习(LinearRegression) 2、深度学习(待开始)
Code as follows
- 数据未处理
- 数据标准化处理
- 数据标准化处理 + 特征提取
- 特征可视化
"""
Author:cold
Date:2021-04-01
Version:1.0
Info:baseline
"""
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd
from sklearn.preprocessing import StandardScaler
# Baseline: fit ordinary least squares on the raw (unscaled) features, report
# the MSE on a random 70/30 split, then write predictions for the test file.

# Load the training data (the original note reads "455", presumably the row count).
dataset = read_csv('train_dataset.csv').values
# Columns 0..12 are used as features and column 13 as the regression target.
X = dataset[:, 0:13]
Y = dataset[:, 13]
# Hold out 30% of the rows for evaluation (no random_state, so the split
# and therefore the printed errors vary from run to run).
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
# Create the linear regression model.
lr = LinearRegression()
# Fit it on the training split.
lr.fit(x_train, y_train)
# Predict on both splits.
y_test_pred = lr.predict(x_test)
y_train_pred = lr.predict(x_train)
# Compute the evaluation metric (mean squared error) for each split.
error_test = mean_squared_error(y_test, y_test_pred)
error_train = mean_squared_error(y_train, y_train_pred)
print("训练集误差为:{},测试集误差为:{}".format(error_train,error_test))
# Predict the separate test file.
testset = read_csv('test_dataset.csv').values
# Column 0 is skipped here (presumably an ID column -- confirm against the file);
# columns 1..13 are used as the features.
x_pred = testset[:, 1:14]
y_pred = lr.predict(x_pred)
# Row identifiers are 1-based: id_1, id_2, ...
ID = ["id_" + str(i + 1) for i in range(len(y_pred))]
res = pd.DataFrame()
res['ID'] = ID
res['value'] = y_pred
res.to_csv('res.csv', index=False)
print("res.csv 已生成")
"""
Author:cold
Date:2021-04-04
Version:2.0
Info:baselineStd
"""
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd
from sklearn.preprocessing import StandardScaler
# Baseline with feature scaling: standardize the features, fit ordinary least
# squares, report the MSE on a random 70/30 split, then write predictions for
# the test file.

# Load the training data (the original note reads "455", presumably the row count).
dataset = read_csv('train_dataset.csv').values
# Columns 0..12 are used as features and column 13 as the regression target.
X = dataset[:, 0:13]
Y = dataset[:, 13]
# Standardize the features, then split into train / evaluation parts.
# NOTE(review): the scaler is fitted on all rows before the split, so the
# held-out rows contribute to the scaling statistics.
stand = StandardScaler()
X_std = stand.fit_transform(X)
x_train, x_test, y_train, y_test = train_test_split(X_std, Y, test_size=0.3)
# Create the linear regression model.
lr = LinearRegression()
# Fit it on the training split.
lr.fit(x_train, y_train)
# Predict on both splits.
y_test_pred = lr.predict(x_test)
y_train_pred = lr.predict(x_train)
# Compute the evaluation metric (mean squared error) for each split.
error_test = mean_squared_error(y_test, y_test_pred)
error_train = mean_squared_error(y_train, y_train_pred)
print("训练集误差为:{},测试集误差为:{}".format(error_train,error_test))
# Predict the separate test file.
testset = read_csv('test_dataset.csv').values
# Column 0 is skipped here (presumably an ID column -- confirm against the file).
# The model was trained on standardized features, so the test features must go
# through the SAME fitted scaler; feeding raw values would give meaningless
# predictions.
x_pred = stand.transform(testset[:, 1:14])
y_pred = lr.predict(x_pred)
# Row identifiers are 1-based: id_1, id_2, ...
ID = ["id_" + str(i + 1) for i in range(len(y_pred))]
res = pd.DataFrame()
res['ID'] = ID
res['value'] = y_pred
res.to_csv('res.csv', index=False)
print("res.csv 已生成")
"""
Author:cold
Date:2021-04-04
Version:3.0
Info:baselineSelFeatures
"""
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest,f_regression
from matplotlib import pyplot as plt
# Result dict: {the k best features, and their index mask}
def ToBeStdAndSel(X, Y, k):
    """Standardize X and keep the k columns ranked best by f_regression.

    Args:
        X: feature matrix handed to ``StandardScaler.fit_transform``.
        Y: target values used by ``SelectKBest`` to score the features.
        k: number of features to keep.

    Returns:
        A dict with two entries:
        ``'best_index'`` -- the selector's ``get_support()`` mask over the
        columns of X, and ``'X_best'`` -- the standardized data restricted
        to the selected columns (equivalent to ``X_std[:, best_index]``).
    """
    stand = StandardScaler()
    X_std = stand.fit_transform(X)
    best = SelectKBest(f_regression, k=k)
    # Order matters: the selector has to be fitted before get_support()
    # can report which columns were kept.
    X_best = best.fit_transform(X_std, Y)
    best_index = best.get_support()
    BEST = {}
    BEST['best_index'] = best_index
    BEST['X_best'] = X_best
    return BEST
# Standardization
def ToBeStd(X):
    """Return X standardized by a freshly fitted ``StandardScaler``.

    A new scaler is created and fitted on every call, so the statistics
    come from X itself and nothing is carried over between calls.
    """
    stand = StandardScaler()
    X_std = stand.fit_transform(X)
    return X_std
# Save to csv
def ToSaveCsv(y_pred):
    """Write the predictions to ``res.csv`` in the current directory.

    The file has two columns: ``ID`` holding 1-based labels
    (``id_1``, ``id_2``, ...) and ``value`` holding the entries of
    ``y_pred`` in order. A confirmation message is printed afterwards.

    Args:
        y_pred: sequence of predicted values; only its length and its
            elements are used.
    """
    ID = ["id_{}".format(i) for i in range(1, len(y_pred) + 1)]
    res = pd.DataFrame()
    res['ID'] = ID
    res['value'] = y_pred
    res.to_csv('res.csv', index=False)
    print("res.csv 已生成")
# Prediction
def TryToPredict(testset):
    """Predict targets for the rows of ``testset`` with the module-level model.

    Relies on two module-level names being set before the call:
    ``best_index`` (column mask of the selected features) and ``lr``
    (the fitted regression model).

    Args:
        testset: 2-D array; columns 1..13 are used as features and
            column 0 is skipped (presumably an ID column -- confirm).

    Returns:
        Whatever ``lr.predict`` returns for the selected, standardized
        feature columns.
    """
    x_pred = testset[:, 1:14]
    # NOTE(review): ToBeStd fits a new scaler on the test rows, so their
    # scaling statistics differ from the ones the model was trained with.
    # Reusing the training scaler would be more consistent -- confirm intent.
    x_pred_best = ToBeStd(x_pred)[:, best_index]
    y_pred = lr.predict(x_pred_best)
    return y_pred
# Load the training data (the original note reads "455", presumably the row count).
dataset = read_csv('train_dataset.csv').values

# Build the modelling data: X --> standardized X --> 6 best columns.
X, Y = dataset[:, :13], dataset[:, 13]
BEST = ToBeStdAndSel(X, Y, 6)
best_index = BEST['best_index']
X_best = BEST['X_best']

# Random 70/30 split of the selected features.
x_train, x_test, y_train, y_test = train_test_split(
    X_best, Y, test_size=0.3
)

# Create the linear regression model and fit it on the training split.
lr = LinearRegression()
lr.fit(x_train, y_train)

# Predictions and mean squared error for both splits.
y_train_pred = lr.predict(x_train)
y_test_pred = lr.predict(x_test)
error_train = mean_squared_error(y_train, y_train_pred)
error_test = mean_squared_error(y_test, y_test_pred)
print("训练集误差为:{},测试集误差为:{}".format(error_train,error_test))

# Overlay predicted (red) and true (blue) targets of the held-out split.
plt.plot(y_test_pred, 'r-', label='predict_value')
plt.plot(y_test, 'b-', label='true_value')
plt.legend()
plt.show()

# Predict the separate test file and save the result.
testset = read_csv('test_dataset.csv').values
y_pred = TryToPredict(testset)
ToSaveCsv(y_pred)
"""
Author:cold
Date:2021-04-04
Version:1.0
Info: Features show
"""
from pandas import read_csv
import matplotlib.pyplot as plt
import math
# Scatter every feature column against the target, one subplot per feature.

# Load the training data once (the original note reads "455", presumably the
# row count); the same frame supplies both the values and the column names.
frame = read_csv('train_dataset.csv')
dataset = frame.values
X = dataset[:, 0:13]
Y = dataset[:, 13]
# (Feature engineering) collect the column names for the subplot titles.
features = list(frame.keys())
# One name is left out of the count -- the target column, which is not plotted.
nums = len(features) - 1
columns = 3
rows = math.ceil(nums / columns)
plt.figure(figsize=(12, 10))
for i in range(nums):
    # Subplot indices are 1-based.
    plt.subplot(rows, columns, i + 1)
    # 'b+' draws blue plus markers with no connecting line.
    plt.plot(X[:, i], Y, 'b+')
    plt.title(features[i])
# Extra vertical spacing so the titles do not overlap the plots above them.
plt.subplots_adjust(hspace=1.5)
plt.show()
Next:
- 数据特征标准化
- 特征提取
- 异常值判断(箱线图)+ 数据清洗(较之前mean_square_error更低的原因)
- 更换回归模型
- 模型融合
- 尝试深度学习模型