Foreword
This baseline was contributed by 小幸运 from Xidian University (西安电子科技大学); we share it here in the hope of helping more newcomers get started.
Main Text
The model is LightGBM, trained on 18 features in total: for each of the three fields prefix / title / tag, a click sum, a count, and a CTR (9 features), plus the same three statistics for each of the three pairwise field combinations (another 9).
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
# Load the training, validation, and test sets released by the competition
train_data = pd.read_table('./data/oppo_round1_train_20180929.txt',
                           names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, encoding='utf-8').astype(str)
val_data = pd.read_table('./data/oppo_round1_vali_20180929.txt',
                         names=['prefix', 'query_prediction', 'title', 'tag', 'label'], header=None, encoding='utf-8').astype(str)
test_data = pd.read_table('./data/oppo_round1_test_A_20180929.txt',
                          names=['prefix', 'query_prediction', 'title', 'tag'], header=None, encoding='utf-8').astype(str)

# Drop a known dirty row whose label is the string '音乐', then merge train and validation data
train_data = train_data[train_data['label'] != '音乐']
test_data['label'] = -1
train_data = pd.concat([train_data, val_data])
train_data['label'] = train_data['label'].apply(lambda x: int(x))
test_data['label'] = test_data['label'].apply(lambda x: int(x))
# Single-field statistics: click sum, count, and CTR for prefix / title / tag
# (the dict-renaming style of agg below requires pandas < 1.0)
items = ['prefix', 'title', 'tag']
for item in items:
    temp = train_data.groupby(item, as_index=False)['label'].agg({item + '_click': 'sum', item + '_count': 'count'})
    temp[item + '_ctr'] = temp[item + '_click'] / temp[item + '_count']
    train_data = pd.merge(train_data, temp, on=item, how='left')
    test_data = pd.merge(test_data, temp, on=item, how='left')

# Pairwise cross statistics for the three fields; the +3 in the denominator is a simple smoothing term
for i in range(len(items)):
    for j in range(i + 1, len(items)):
        item_g = [items[i], items[j]]
        temp = train_data.groupby(item_g, as_index=False)['label'].agg(
            {'_'.join(item_g) + '_click': 'sum', '_'.join(item_g) + '_count': 'count'})
        temp['_'.join(item_g) + '_ctr'] = temp['_'.join(item_g) + '_click'] / (temp['_'.join(item_g) + '_count'] + 3)
        train_data = pd.merge(train_data, temp, on=item_g, how='left')
        test_data = pd.merge(test_data, temp, on=item_g, how='left')
# Keep only the numeric statistical features; drop the raw text columns
train_data_ = train_data.drop(['prefix', 'query_prediction', 'title', 'tag'], axis=1)
test_data_ = test_data.drop(['prefix', 'query_prediction', 'title', 'tag'], axis=1)

print('train beginning')
X = np.array(train_data_.drop(['label'], axis=1))
y = np.array(train_data_['label'])
X_test_ = np.array(test_data_.drop(['label'], axis=1))
print('================================')
print(X.shape)
print(y.shape)
print('================================')
xx_logloss = []
xx_submit = []
N = 5
skf = StratifiedKFold(n_splits=N, random_state=42, shuffle=True)

# LightGBM parameters: binary classification with logloss as the validation metric
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 32,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1
}
# 5-fold cross-validation: train on each fold, report the held-out F1, and predict the test set
for k, (train_in, test_in) in enumerate(skf.split(X, y)):
    print('train _K_ fold', k)
    X_train, X_test, y_train, y_test = X[train_in], X[test_in], y[train_in], y[test_in]
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=5000,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=50,  # lightgbm 2.x/3.x API; 4.x moved this to callbacks
                    verbose_eval=50,
                    )
    # Threshold the predicted probabilities at 0.5 to compute the fold F1 score
    print(f1_score(y_test, np.where(gbm.predict(X_test, num_iteration=gbm.best_iteration) > 0.5, 1, 0)))
    xx_logloss.append(gbm.best_score['valid_0']['binary_logloss'])
    xx_submit.append(gbm.predict(X_test_, num_iteration=gbm.best_iteration))
print('train_logloss:', np.mean(xx_logloss))

# Average the predictions from the N folds, round to 0/1, and write the submission file
s = 0
for i in xx_submit:
    s = s + i
test_data_['label'] = list(s / N)
test_data_['label'] = test_data_['label'].apply(lambda x: round(x))
print('test_logloss:', np.mean(test_data_.label))  # actually the mean of the rounded predictions (positive rate), not a logloss
test_data_['label'].to_csv('./submit/result.csv', index=False)
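As an optional sanity check, not part of the original baseline, one can look at which of the 18 statistical features the last fold's booster relies on. This is a minimal sketch that assumes `gbm` and `train_data_` from the code above are still in scope; it uses LightGBM's built-in Booster.feature_importance.

# Minimal sketch (assumes `gbm` and `train_data_` from the loop above are still in scope)
feature_names = [c for c in train_data_.columns if c != 'label']  # same column order used to build X
importances = gbm.feature_importance(importance_type='gain')
for name, imp in sorted(zip(feature_names, importances), key=lambda t: -t[1]):
    print(name, imp)

If the CTR features dominate by gain, that matches the intent of the feature construction; if not, it is a hint to revisit the smoothing or the pairwise crosses.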
Closing Remarks
This Zhihu column aims to share more practical machine learning material and competition methods. Submissions are welcome!
ML理论&实践