赛事地址
WSDM Cup on Cross-Market Recommendation Competition
赛题背景
电子商务公司通常跨市场运营;例如亚马逊已将业务和销售扩展到全球18个市场(即国家/地区)。跨市场推荐研究的是如何利用相似的高资源市场的数据,向目标市场的用户推荐相关产品,例如利用美国市场的数据改进目标市场的推荐。
然而,关键的挑战在于数据:用户与产品的交互数据(点击、购买、评论)带有各个市场自身的某些偏差。因此,在源市场上训练的算法在不同的目标市场上不一定有效。
赛题目标
在本次WSDM杯挑战赛中,我们提供不同市场的用户购买和评分数据,目标是通过利用来自类似辅助市场的数据来改进这些目标市场中的个人推荐系统。
数据描述
评估指标
使用NDCG@10进行评估:对每个用户,按预测分数对候选项目排序,只取排名前10的项目参与评估。
Topline代码
加载数据
import gc
import math
import os
from collections import defaultdict
from itertools import combinations

import numpy as np
import pandas as pd
from tqdm import tqdm


def _load_market(market_dir):
    """Load the five competition files of one target market.

    Args:
        market_dir: Directory holding ``train.tsv``, ``train_5core.tsv``,
            ``valid_qrel.tsv``, ``valid_run.tsv`` and ``test_run.tsv``.

    Returns:
        Tuple ``(train, train_5core, valid_qrel, valid_run, test_run)`` of
        DataFrames. ``valid_run`` and ``test_run`` are read without a header
        row and get the columns ``['userId', 'itemIds']``.
    """
    def read_tsv(name, **kwargs):
        # The files are tab-separated: the delimiter is '\t', not the letter 't'.
        return pd.read_csv(market_dir + '/' + name, sep='\t', **kwargs)

    train = read_tsv('train.tsv')
    train_5core = read_tsv('train_5core.tsv')
    valid_qrel = read_tsv('valid_qrel.tsv')  # validation set: positive samples
    valid_run = read_tsv('valid_run.tsv', header=None)  # validation samples
    valid_run.columns = ['userId', 'itemIds']
    test_run = read_tsv('test_run.tsv', header=None)  # test samples
    test_run.columns = ['userId', 'itemIds']
    return train, train_5core, valid_qrel, valid_run, test_run


(train_t1, train_5core_t1, valid_qrel_t1,
 valid_run_t1, test_run_t1) = _load_market('DATA/t1')
(train_t2, train_5core_t2, valid_qrel_t2,
 valid_run_t2, test_run_t2) = _load_market('DATA/t2')
item cf
def item_cf(df, user_col, item_col):
    """Build an item-to-item similarity table from user/item interactions.

    For every user, each ordered pair of distinct items in that user's
    interaction list receives a co-occurrence weight of
    ``1 / log(1 + n)``, where ``n`` is the length of the user's list, so
    very active users contribute less per pair. The accumulated weight of a
    pair ``(i, j)`` is then divided by ``sqrt(cnt[i] * cnt[j])``, where
    ``cnt`` counts how many times each item appears across all lists.

    Args:
        df: Interaction DataFrame.
        user_col: Name of the user-id column in ``df``.
        item_col: Name of the item-id column in ``df``.

    Returns:
        Tuple ``(sim_item_corr, user_item_dict)``: a nested dict
        ``{item: {related_item: similarity}}`` and a dict
        ``{user: [items...]}`` in the row order of ``df``.
    """
    user_item_ = df.groupby(user_col)[item_col].agg(list).reset_index()
    user_item_dict = dict(zip(user_item_[user_col], user_item_[item_col]))

    sim_item = {}
    item_cnt = defaultdict(int)
    for items in tqdm(user_item_dict.values()):
        # Same weight for every pair of this user; hoisted out of the loops.
        weight = 1 / math.log(1 + len(items))
        for item in items:
            item_cnt[item] += 1
            related = sim_item.setdefault(item, {})
            for relate_item in items:
                if item == relate_item:
                    continue
                related[relate_item] = related.get(relate_item, 0) + weight

    # Build a fresh nested dict: a shallow ``sim_item.copy()`` would share the
    # inner dicts and silently rewrite the raw co-occurrence weights.
    sim_item_corr = {}
    for i, related_items in tqdm(sim_item.items()):
        sim_item_corr[i] = {
            j: cij / math.sqrt(item_cnt[i] * item_cnt[j])
            for j, cij in related_items.items()
        }
    return sim_item_corr, user_item_dict
def recommend(sim_item_corr, user_item_dict, user_id):
    """Rank unseen items for a user by summed similarity to their history.

    Args:
        sim_item_corr: Nested dict ``{item: {related_item: similarity}}``.
        user_item_dict: Dict ``{user: [items the user interacted with]}``.
        user_id: User to recommend for.

    Returns:
        List of ``(item, score)`` tuples sorted by descending score. Items the
        user already interacted with are excluded. Users missing from
        ``user_item_dict`` get an empty list.
    """
    interacted_items = user_item_dict.get(user_id, [])
    seen = set(interacted_items)  # O(1) membership instead of scanning a list
    rank = {}
    for i in interacted_items:
        neighbours = sim_item_corr.get(i)
        if not neighbours:
            # Item has no similarity entry; nothing to contribute.
            continue
        # Sorted so that insertion order (and therefore the order of
        # equal-score items in the stable final sort) stays deterministic.
        for j, wij in sorted(neighbours.items(), key=lambda d: d[1], reverse=True):
            if j not in seen:
                # Accumulate: a candidate related to several history items
                # must score higher than one related to a single item.
                rank[j] = rank.get(j, 0) + wij
    return sorted(rank.items(), key=lambda d: d[1], reverse=True)
def match_func(items1, items2, top_n=100):
    """Order the candidates in ``items2`` by the ranking given in ``items1``.

    Items of ``items1`` that also occur in ``items2`` come first, in
    ``items1`` order. If that yields fewer than ``top_n`` items, the
    remaining items of ``items2`` are appended in their own order, skipping
    any already present.

    Args:
        items1: Ranked items (e.g. predictions or a popularity list).
        items2: Candidate items; only these can appear in the result.
        top_n: Maximum length of the returned list (default 100).

    Returns:
        List of at most ``top_n`` items drawn from ``items2``.
    """
    candidates = set(items2)  # O(1) membership instead of scanning a list
    res = [it for it in items1 if it in candidates]
    if len(res) < top_n:
        seen = set(res)
        for it in items2:
            if it in seen:
                continue
            seen.add(it)
            res.append(it)
            if len(res) >= top_n:
                break
    return res[:top_n]
召回函数
def recall_func(train, valid_run, min_score=0.001):
    """Recall items with item-CF and re-rank each user's candidate list.

    Args:
        train: Interaction DataFrame with ``userId`` and ``itemId`` columns.
        valid_run: DataFrame with ``userId`` and ``itemIds`` columns, where
            ``itemIds`` is a comma-separated string of candidate items.
        min_score: Recalled items must score strictly above this value
            (default 0.001).

    Returns:
        DataFrame with one row per user that has at least one recalled item,
        sorted by ``userId``, with columns ``userId``, ``pred_itemIds``
        (recalled items, best first), ``itemIds`` (candidates as a list) and
        ``result_itemIds`` (candidates ordered via ``match_func``). The frame
        is empty, but has these columns, when nothing is recalled.
    """
    # Build the similarity table.
    item_sim_list, user_item = item_cf(train, 'userId', 'itemId')

    # Recall per user, keeping only sufficiently similar items.
    recalled = []
    for user_id in tqdm(valid_run['userId'].unique()):
        ranked = recommend(item_sim_list, user_item, user_id)
        pred = [item for item, score in ranked if score > min_score]
        if pred:
            recalled.append((user_id, pred))

    # Explicit column names keep this working when nothing was recalled
    # (assigning names to a zero-column frame raises a length mismatch).
    recom_df = pd.DataFrame(recalled, columns=['userId', 'pred_itemIds'])
    recom_df = recom_df.sort_values('userId').reset_index(drop=True)

    # Attach each user's candidate list from the run file.
    recom_df = recom_df.merge(valid_run, on='userId', how='left')
    recom_df['itemIds'] = recom_df['itemIds'].apply(lambda x: x.split(','))
    # A list comprehension is safe on an empty frame, unlike apply(axis=1).
    recom_df['result_itemIds'] = [
        match_func(pred, candidates)
        for pred, candidates in zip(recom_df['pred_itemIds'], recom_df['itemIds'])
    ]
    return recom_df
热度填充
def _fill_with_hot(run, recom_df, hot_items):
    """Attach recalled lists to ``run`` and fall back to popularity order."""
    # Keep only the final list: the recall frame also carries an 'itemIds'
    # column, which would collide with run's own and be suffixed by merge.
    recom = recom_df[['userId', 'result_itemIds']].drop_duplicates(subset='userId')
    run = run.merge(recom, on='userId', how='left')
    run['itemIds'] = run['itemIds'].apply(lambda x: x.split(','))
    # Candidates of each user ordered by global popularity.
    run['hot_itemIds'] = run['itemIds'].apply(
        lambda candidates: ','.join(match_func(hot_items, candidates)))
    # NOTE(review): the fallback is a comma-joined string while recalled rows
    # hold lists, as in the original code -- confirm what downstream expects.
    missing = run['result_itemIds'].isnull()
    run.loc[missing, 'result_itemIds'] = run.loc[missing, 'hot_itemIds']
    return run


def hot_fill(train, valid_run, test_run, valid_recom=None, test_recom=None):
    """Fill users without item-CF recall using popularity-ordered candidates.

    Args:
        train: Interaction DataFrame with an ``itemId`` column; item
            popularity is its value count.
        valid_run: Validation run DataFrame (``userId``, comma-separated
            ``itemIds``).
        test_run: Test run DataFrame with the same layout.
        valid_recom: Recall result for the validation users with ``userId``
            and ``result_itemIds`` columns. Defaults to the module-level
            ``valid_recom_df``.
        test_recom: Recall result for the test users. Defaults to the
            module-level ``test_recom_df``.

    Returns:
        Tuple ``(valid_run, test_run)`` of new DataFrames with ``itemIds``
        split into lists plus ``result_itemIds`` and ``hot_itemIds`` columns.
        The input frames are not modified.
    """
    if valid_recom is None:
        valid_recom = valid_recom_df
    if test_recom is None:
        test_recom = test_recom_df

    # Items sorted by descending interaction count. Reading the index avoids
    # the reset_index() column name, which differs between pandas versions.
    hot_items = train['itemId'].value_counts().index.tolist()

    # Validation data.
    valid_run = _fill_with_hot(valid_run, valid_recom, hot_items)
    # Test data.
    test_run = _fill_with_hot(test_run, test_recom, hot_items)
    return valid_run, test_run
nDCG指标
def getDCG(scores):
    """Return the discounted cumulative gain of a ranked relevance vector.

    Computes ``sum((2 ** rel_k - 1) / log2(k + 2))`` over 0-based
    positions ``k``.

    Args:
        scores: 1-D sequence or array of relevance values, best rank first.

    Returns:
        The DCG as ``np.float32`` (0.0 for an empty input).
    """
    scores = np.asarray(scores, dtype=np.float32)
    # Position k is discounted by log2(k + 2), so the top rank divides by 1.
    discounts = np.log2(np.arange(scores.shape[0], dtype=np.float32) + 2)
    return np.sum(np.divide(np.power(2, scores) - 1, discounts),
                  dtype=np.float32)


def getNDCG(rank_list, pos_items):
    """Return the NDCG of ``rank_list`` against binary relevance labels.

    Args:
        rank_list: Ranked items, best first (already cut to the desired k).
        pos_items: Relevant items; each has relevance 1, all others 0.

    Returns:
        NDCG as a float in [0, 1]; 0.0 when no relevant item is ranked.
    """
    positives = set(pos_items)
    rank_scores = np.asarray(
        [1.0 if it in positives else 0.0 for it in rank_list],
        dtype=np.float32)
    dcg = getDCG(rank_scores)
    if dcg == 0.0:
        return 0.0
    # Ideal ranking puts every relevant item that fits in the list on top.
    # With a single positive this is exactly 1.
    n_ideal = min(len(positives), len(rank_list))
    idcg = getDCG(np.ones(n_ideal, dtype=np.float32))
    return float(dcg / idcg)
召回打分并离线评估
## Only the t1 recall scoring and offline evaluation are shown; t2 is analogous.
print('valid_recom_df......')
valid_recom_df = recall_func(train_t1, valid_run_t1)
print('test_recom_df......')
test_recom_df = recall_func(train_t1, test_run_t1)
valid_qrel = valid_qrel_t1
# Attach the ground-truth positive item of each validation user.
valid_recom_df = valid_recom_df.merge(valid_qrel, on='userId', how='left')
# Compute the NDCG score: accumulate NDCG@10 over the recalled users, then
# average over all validation rows, so users without recall count as 0.
NDCG = 0
for result_items, positive_item in zip(valid_recom_df['result_itemIds'],
                                       valid_recom_df['itemId']):
    NDCG += getNDCG(result_items[:10], [positive_item])
NDCG = NDCG / len(valid_run_t1)
print('NDCG : ', NDCG)