本文用的环境是Jupyter notebook,方便大家查看变量所存数据以及便于执行,文末可回复关键字“比赛”获取代码文件。
代码语言:javascript复制# 导入相应的科学计算以及可视化工具包
from matplotlib import pyplot as plt
import numpy as np
import scipy as sp
import sklearn as sk
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from sklearn.metrics import f1_score
%pylab inline
# Paths to the training and test sets
train_data_path = r'../data/train/train.txt'
test_data_path = r'../data/test/apply_new.txt'
# Column names shared by both files
common_features = ['gender', 'age', 'tagid', 'time', 'province', 'city', 'model', 'make']
# FIX: the '+' list-concatenation operators were lost during extraction
trained_features = ['pid', 'label'] + common_features
tested_features = ['pid'] + common_features
# Load both sets into memory with pandas (the files carry no header row)
train_data = pd.read_csv(train_data_path, sep=',', header=None, names=trained_features)
test_data = pd.read_csv(test_data_path, sep=',', header=None, names=tested_features)
# Peek at the first few training rows
train_data.head(5)
# Inspect the training-set schema: age and gender carry a large share of
# missing values. Both matter a lot for click-through prediction, so a later
# step plans to build a model to impute them, hoping to lift model quality.
train_data.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 300000 entries, 0 to 299999
# Data columns (total 10 columns):
#  #   Column    Non-Null Count   Dtype
# ---  ------    --------------   -----
#  0   pid       300000 non-null  int64
#  1   label     300000 non-null  int64
#  2   gender    57638 non-null   float64
#  3   age       261369 non-null  float64
#  4   tagid     300000 non-null  object
#  5   time      300000 non-null  object
#  6   province  300000 non-null  object
#  7   city      300000 non-null  object
#  8   model     300000 non-null  object
#  9   make      300000 non-null  object
# dtypes: float64(2), int64(2), object(6)
# memory usage: 22.9 MB
# Summary statistics for the training set: the label split is exactly 1:1
train_data.describe()
#        pid          | label         | gender       | age
# count  3.000000e+05   300000.000000   57638.000000   261369.000000
# mean   1.150000e+06   0.500000        0.404507       3.318760
# std    8.660268e+04   0.500001        0.490801       1.468274
# min    1.000001e+06   0.000000        0.000000       1.000000
# 25%    1.075001e+06   0.000000        0.000000       2.000000
# 50%    1.150000e+06   0.500000        0.000000       3.000000
# 75%    1.225000e+06   1.000000        1.000000       4.000000
# max    1.300000e+06   1.000000        1.000000       6.000000
# Visualise the age distribution of the training set
dict_age = dict(train_data['age'].value_counts())
# (optional) sort the dict entries by key:
# sorted(dict_age.items(), key=lambda x: x[0], reverse=False)
plt.title('Age distribution in train set.')
plt.bar(*zip(*dict_age.items()), color='green')
plt.show()
# NOTE(review): the original comment claimed this cell plots the TRAIN-set
# gender distribution, but the code plots the TEST-set AGE distribution
# (duplicating a later cell). Behavior kept as-is — confirm the intended plot.
dict_age = dict(test_data['age'].value_counts())
plt.title('Age distribution in test set.')
plt.bar(*zip(*dict_age.items()), color='green')
plt.show()
# Likewise, peek at the first few test rows
test_data.head(5)
# Inspect the test-set schema
test_data.info()
# Summary statistics for the test set
test_data.describe()
# Visualise the age distribution of the test set
dict_age = dict(test_data['age'].value_counts())
plt.title('Age distribution in test set.')
plt.bar(*zip(*dict_age.items()), color='green')
plt.show()
# Visualise the gender distribution of the test set
dict_gender = dict(test_data['gender'].value_counts())
plt.title('Gender distribution in test set.')
plt.bar(*zip(*dict_gender.items()))
plt.show()
# Concatenate train and test features so preprocessing is applied uniformly
data = pd.concat([train_data, test_data], axis=0)
# Fill nulls in every column except the label, then cast everything to string
for col in [x for x in data.columns if x not in ['label']]:
    data[col] = data[col].fillna(-1)
    data[col] = data[col].astype('str')
# tagid and time hold stringified lists; parse them back into Python lists
# for the word2vec stage. SECURITY: the original used eval(), which executes
# arbitrary code embedded in the data file — ast.literal_eval parses Python
# literals only and is the safe equivalent here.
import ast
data['tagid'] = data['tagid'].apply(ast.literal_eval)
data['time'] = data['time'].apply(ast.literal_eval)
def make_rm_model(x):
    """Remove the device brand (``model``) from the ``make`` string.

    ``x`` is a (model, make) pair (a row of the two columns); e.g.
    model='华为', make='华为荣耀' yields '荣耀'. Returns the cleaned make.
    """
    brand = str(x[0]).strip()
    make = str(x[1]).strip()
    # `in` is the idiomatic spelling of b.__contains__(a)
    if brand in make:
        make = make.replace(brand, '')
    return make
# Apply the brand-dedup helper to every row's (model, make) pair
data['make'] = data[['model', 'make']].apply(make_rm_model, axis=1)
# Label-encode the categorical columns, collecting the generated column
# names so the training/prediction steps know which features to feed in.
used_features = []
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
for column in ['gender', 'age', 'province', 'city', 'model', 'make']:
    encoded_name = '{}_category'.format(column)
    data[encoded_name] = le.fit_transform(data[column])
    used_features.append(encoded_name)
# Train word2vec on the tagid sequences so that tags that co-occur in the
# same field end up with similar vectors, while tags that rarely share a
# sequence end up far apart in the low-dimensional embedding space.
from gensim.models import Word2Vec
import warnings
warnings.filterwarnings('ignore')
# Embedding dimensionality
emb_size = 16
# Convert the tagid column (pd.Series of lists) into a plain list of sequences
sentences = data['tagid'].values.tolist()
# Train word2vec (gensim>=4 API: vector_size; CBOW with hierarchical softmax)
model = Word2Vec(sentences, vector_size=emb_size, window=5, min_count=5, sg=0, hs=1, seed=42)
emb_matrix = []
# Map every sequence to the mean of its tags' embeddings; tags dropped by
# min_count are skipped, and a sequence with no known tags gets a zero vector.
for seq in sentences:
    vec = [model.wv.get_vector(w) for w in seq if w in model.wv]
    if vec:
        emb_matrix.append(np.mean(vec, axis=0))
    else:
        emb_matrix.append(np.zeros(emb_size))
emb_matrix = np.array(emb_matrix)
# Expose each embedding dimension as its own feature column
for i in range(emb_size):
    data['{}_emb_{}'.format('tagid', i)] = emb_matrix[:, i]
    used_features.append('{}_emb_{}'.format('tagid', i))
del model, sentences
# Split the combined frame back into the original train/test partitions
train = data[:train_data.shape[0]]
test = data[train_data.shape[0]:]
# Stratified 5-fold cross-validation split generator
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Buffers for out-of-fold train predictions and the averaged test predictions
train_res = np.zeros(shape=(train.shape[0]))
test_res = np.zeros(shape=(test.shape[0]))
# LightGBM training hyper-parameters
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': -1,
    'num_leaves': 31,
    'learning_rate': 0.1,
    'verbose': 0,
    'random_state': 42,
    'n_jobs': -1,
}
# Keep the feature column names so per-fold importances can be analysed later
imp_Df = pd.DataFrame()
imp_Df['feature'] = used_features
# Cross-validated LightGBM training
for index, (train_index, valid_index) in enumerate(skf.split(train, train['label'])):
    # Note: only the columns listed in used_features are fed to the model
    X_train, X_valid = train.iloc[train_index][used_features].values, train.iloc[valid_index][used_features].values
    y_train, y_valid = train.iloc[train_index]['label'], train.iloc[valid_index]['label']
    print(index)
    dtrain = lgb.Dataset(X_train, label=y_train)
    dval = lgb.Dataset(X_valid, label=y_valid)
    lgb_model = lgb.train(
        params,
        dtrain,
        num_boost_round=10000,
        valid_sets=[dval],
        early_stopping_rounds=50,
        verbose_eval=50,
    )
    X_valid_pred = lgb_model.predict(X_valid, num_iteration=lgb_model.best_iteration)
    # FIX: the '+' in the column-name concatenation was lost during extraction
    imp_Df['cv' + str(index)] = lgb_model.feature_importance()
    # Store this fold's validation predictions; after 5 folds every training
    # sample carries exactly one out-of-fold prediction.
    train_res[valid_index] = X_valid_pred
    # FIX: the '+' was lost — accumulate each fold's test prediction divided
    # by n_splits so test_res ends up as the 5-fold average.
    test_res = test_res + lgb_model.predict(test[used_features].values,
                                            num_iteration=lgb_model.best_iteration) / skf.n_splits
# Offline evaluation: threshold out-of-fold predictions at the median rank
# (bottom half -> 0, top half -> 1, matching the known 1:1 label split).
train['predict'] = train_res
train['rank'] = train['predict'].rank()
train['p'] = 1
train.loc[train['rank'] <= train.shape[0] * 0.5, 'p'] = 0
best_f1_train = f1_score(train['label'].values, train['p'].values)
print(best_f1_train)
# Offline train-set F1: 0.7115933333333333
# Build the submission frame from the test predictions.
# FIX: .copy() avoids pandas SettingWithCopyWarning when adding columns below.
submit = test[['pid']].copy()
submit['rank'] = test_res
submit.columns = ['user_id', 'rank']
submit['rank'] = submit['rank'].rank()
submit['category_id'] = 1
submit.loc[submit['rank'] <= int(submit.shape[0] * 0.5), 'category_id'] = 0
# Save the predictions as a CSV in the required format
submit[['user_id', 'category_id']].to_csv('f1_{}.csv'.format(str(best_f1_train).split('.')[1]), index=False)
# Online test-set F1 of this version: 0.60364
# Future versions: feature crosses and feature selection, a representation
# model better suited to sequences, and a model to impute the important
# age and gender features.
回复关键字“比赛”可获取ipython notebook源文件。