2022 WSDM-Xmrec:跨境电商推荐挑战赛题方案

2022-08-31 15:17:29 浏览数 (2)

赛事地址

WSDM Cup on Cross-Market Recommendation Competition

赛题背景

电子商务公司通常跨市场运营;例如,亚马逊已将业务和销售扩展到全球 18 个市场(即国家/地区)。跨市场推荐研究的问题是:如何利用相似的高资源市场的数据,向目标市场的用户推荐相关产品,例如利用美国市场的数据改进目标市场的推荐。

然而,关键的挑战在于数据:用户与产品的交互数据(点击、购买、评论)往往带有各个市场特有的偏差。因此,在源市场上训练的算法在不同的目标市场上不一定有效。

赛题目标

在本次WSDM杯挑战赛中,我们提供不同市场的用户购买和评分数据,目标是通过利用来自类似辅助市场的数据来改进这些目标市场中的个人推荐系统。

数据描述

评估指标

使用 NDCG@10 进行评估:对每个用户的候选项目按预测分数排序,仅取排名前 10 的项目参与评估。

Topline代码

加载数据

代码语言:python
import pandas as pd
import os
import gc
import math
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from itertools import combinations

# Market t1. All files are tab-separated; the *_run files are read with
# header=None, so their two columns are named explicitly afterwards.
train_t1 = pd.read_csv('DATA/t1/train.tsv', sep='\t')
train_5core_t1 = pd.read_csv('DATA/t1/train_5core.tsv', sep='\t')
valid_qrel_t1 = pd.read_csv('DATA/t1/valid_qrel.tsv', sep='\t')  # validation set: positive samples
valid_run_t1 = pd.read_csv('DATA/t1/valid_run.tsv', sep='\t', header=None)  # validation samples
valid_run_t1.columns = ['userId','itemIds']
test_run_t1 = pd.read_csv('DATA/t1/test_run.tsv', sep='\t', header=None)  # test samples
test_run_t1.columns = ['userId','itemIds']

# Market t2, same layout as t1.
train_t2 = pd.read_csv('DATA/t2/train.tsv', sep='\t')
train_5core_t2 = pd.read_csv('DATA/t2/train_5core.tsv', sep='\t')
valid_qrel_t2 = pd.read_csv('DATA/t2/valid_qrel.tsv', sep='\t')  # validation set: positive samples
valid_run_t2 = pd.read_csv('DATA/t2/valid_run.tsv', sep='\t', header=None)  # validation samples
valid_run_t2.columns = ['userId','itemIds']
test_run_t2 = pd.read_csv('DATA/t2/test_run.tsv', sep='\t', header=None)  # test samples
test_run_t2.columns = ['userId','itemIds']

item cf

代码语言:python
def item_cf(df, user_col, item_col):
    """Build an item-to-item co-occurrence similarity table.

    Args:
        df: interaction DataFrame containing ``user_col`` and ``item_col``.
        user_col: name of the column holding user identifiers.
        item_col: name of the column holding item identifiers.

    Returns:
        A tuple ``(sim_item_corr, user_item_dict)`` where
        ``sim_item_corr[i][j]`` is the normalised similarity between items
        ``i`` and ``j`` and ``user_item_dict[user]`` is the list of items the
        user interacted with.
    """
    user_item_ = df.groupby(user_col)[item_col].agg(list).reset_index()
    user_item_dict = dict(zip(user_item_[user_col], user_item_[item_col]))

    sim_item = {}
    item_cnt = defaultdict(int)
    for items in tqdm(user_item_dict.values()):
        # Each co-occurrence is down-weighted by the length of the user's
        # item list, so very active users contribute less per pair.
        weight = 1 / math.log(1 + len(items))
        for item in items:
            item_cnt[item] += 1
            related = sim_item.setdefault(item, {})
            for relate_item in items:
                if item == relate_item:
                    continue
                related[relate_item] = related.get(relate_item, 0) + weight

    # Normalise by the geometric mean of both items' occurrence counts.
    # Fresh dicts are built so the raw co-occurrence table is not aliased.
    sim_item_corr = {
        i: {
            j: cij / math.sqrt(item_cnt[i] * item_cnt[j])
            for j, cij in related_items.items()
        }
        for i, related_items in tqdm(sim_item.items())
    }

    return sim_item_corr, user_item_dict

def recommend(sim_item_corr, user_item_dict, user_id):
    """Score unseen items for one user by summing item-item similarities.

    Args:
        sim_item_corr: mapping ``item -> {related_item: similarity}``.
        user_item_dict: mapping ``user -> list of interacted items``.
        user_id: the user to recommend for.

    Returns:
        A list of ``(item, score)`` tuples sorted by descending score. Items
        the user already interacted with are excluded. The list is empty when
        the user is unknown or none of their items has similarity data.
    """
    interacted_items = user_item_dict.get(user_id, [])
    seen = set(interacted_items)  # O(1) membership checks

    rank = {}
    for i in interacted_items:
        neighbours = sim_item_corr.get(i, {})
        # Visit neighbours in descending similarity so the insertion order of
        # `rank` (and therefore tie-breaking in the final stable sort) is
        # deterministic.
        for j, wij in sorted(neighbours.items(), key=lambda d: d[1], reverse=True):
            if j not in seen:
                rank[j] = rank.get(j, 0) + wij

    return sorted(rank.items(), key=lambda d: d[1], reverse=True)

def match_func(items1, items2, top_k=100):
    """Re-order the candidates in ``items2`` using the ranking in ``items1``.

    Items of ``items1`` that also occur in ``items2`` come first, in
    ``items1`` order. If fewer than ``top_k`` items were matched, the
    remaining items of ``items2`` are appended in their own order.

    Args:
        items1: ranked items (e.g. predictions); order is preserved.
        items2: candidate items that are allowed in the result.
        top_k: maximum length of the returned list (default 100).

    Returns:
        A list of at most ``top_k`` items, all drawn from ``items2``.
    """
    candidates = set(items2)
    res = [it for it in items1 if it in candidates]
    if len(res) < top_k:
        # Track what is already present with a set instead of scanning `res`.
        seen = set(res)
        for it in items2:
            if it not in seen:
                seen.add(it)
                res.append(it)
    return res[:top_k]

召回函数

代码语言:python
def recall_func(train, valid_run, min_score=0.001):
    """Recall items for every user in ``valid_run`` with item-based CF.

    Args:
        train: interaction DataFrame with ``userId`` and ``itemId`` columns.
        valid_run: DataFrame with ``userId`` and ``itemIds`` columns, where
            ``itemIds`` is a comma-separated string of candidate items.
        min_score: recalled items must score strictly above this value
            (default 0.001).

    Returns:
        A DataFrame with one row per user that received at least one recalled
        item, with columns ``userId``, ``pred_itemIds`` (recalled items, best
        first), ``itemIds`` (candidates as a list) and ``result_itemIds``
        (candidates re-ordered by ``match_func``).
    """
    # Build the similarity matrix.
    item_sim_list, user_item = item_cf(train, 'userId', 'itemId')

    # Run the recall.
    recom_item = []
    for user_id in tqdm(valid_run['userId'].unique()):
        for item_id, score in recommend(item_sim_list, user_item, user_id):
            if score > min_score:
                recom_item.append([user_id, item_id, score])

    # Passing `columns=` keeps the frame well-formed when nothing was recalled.
    recom_item_df = pd.DataFrame(recom_item, columns=['userId', 'itemId', 'score'])

    # Aggregate the recalled items of each user into a list.
    recom_df = (
        recom_item_df.groupby(['userId'])['itemId']
        .agg(list)
        .reset_index()
        .rename(columns={'itemId': 'pred_itemIds'})
    )

    # Attach the candidate items of each user.
    recom_df = recom_df.merge(valid_run, on='userId', how='left')
    recom_df['itemIds'] = recom_df['itemIds'].apply(lambda x: x.split(','))

    # A list comprehension (rather than DataFrame.apply) also works on an
    # empty frame.
    recom_df['result_itemIds'] = [
        match_func(pred, cands)
        for pred, cands in zip(recom_df['pred_itemIds'], recom_df['itemIds'])
    ]

    return recom_df

热度填充

代码语言:python
def _fill_with_hot(run, recom, hot_items):
    """Merge recall results into ``run`` and fill missing rows by popularity."""
    # The recall frame may carry its own `itemIds` column; drop it so the
    # merge does not produce suffixed `itemIds_x` / `itemIds_y` columns.
    recom = recom.drop(columns=['itemIds'], errors='ignore')
    run = run.merge(recom, on='userId', how='left')

    run['itemIds'] = run['itemIds'].apply(lambda x: x.split(','))
    run['hot_itemIds'] = [
        ','.join(match_func(hot_items, cands)) for cands in run['itemIds']
    ]

    # NOTE(review): filled rows receive a comma-joined string while recalled
    # rows hold whatever `recom` stored; confirm what downstream code expects.
    missing = run['result_itemIds'].isnull()
    run.loc[missing, 'result_itemIds'] = run.loc[missing, 'hot_itemIds']
    return run


def hot_fill(train, valid_run, test_run, valid_recom=None, test_recom=None):
    """Fill users without recall results using popularity-ordered candidates.

    Args:
        train: interaction DataFrame with an ``itemId`` column; item
            popularity is its value count.
        valid_run: validation DataFrame with ``userId`` and comma-separated
            ``itemIds`` columns.
        test_run: test DataFrame with the same layout as ``valid_run``.
        valid_recom: recall results for the validation users, with ``userId``
            and ``result_itemIds`` columns. Defaults to the module-level
            ``valid_recom_df``.
        test_recom: recall results for the test users. Defaults to the
            module-level ``test_recom_df``.

    Returns:
        A tuple ``(valid_run, test_run)`` of merged DataFrames in which every
        row has a non-null ``result_itemIds``.
    """
    if valid_recom is None:
        valid_recom = valid_recom_df
    if test_recom is None:
        test_recom = test_recom_df

    # Items ordered by descending popularity. `.index` works regardless of how
    # `reset_index()` would name the columns.
    hot_items = train['itemId'].value_counts().index.tolist()

    # Validation data.
    valid_run = _fill_with_hot(valid_run, valid_recom, hot_items)

    # Test data.
    test_run = _fill_with_hot(test_run, test_recom, hot_items)

    return valid_run, test_run

nDCG指标

代码语言:python
def getDCG(scores):
    """Return the discounted cumulative gain of ``scores``.

    Args:
        scores: 1-D NumPy array of relevance values in ranked order.

    Returns:
        ``sum((2**score - 1) / log2(position + 2))`` over zero-based
        positions, as a float32 scalar.
    """
    discounts = np.log2(np.arange(scores.shape[0], dtype=np.float32) + 2)
    gains = np.power(2, scores) - 1
    return np.sum(np.divide(gains, discounts), dtype=np.float32)

def getNDCG(rank_list, pos_items):
    """Return the NDCG of ``rank_list`` against binary-relevant ``pos_items``.

    Args:
        rank_list: ranked sequence of items (already truncated to the cutoff).
        pos_items: sequence of relevant items; each has relevance 1.

    Returns:
        ``DCG / IDCG``, or ``0.0`` when no relevant item appears in
        ``rank_list``.
    """
    positives = set(pos_items)
    rank_scores = np.asarray(
        [1.0 if it in positives else 0.0 for it in rank_list],
        dtype=np.float32,
    )

    dcg = getDCG(rank_scores)
    if dcg == 0.0:
        return 0.0

    # Ideal ranking: every relevant item that fits is placed at the top. With
    # a single relevant item this equals 1, the previously hard-coded value.
    ideal_hits = min(len(positives), len(rank_list))
    idcg = getDCG(np.ones(ideal_hits, dtype=np.float32))

    return dcg / idcg

召回打分并离线评估

代码语言:python
## Only the recall/scoring and offline evaluation for t1 are shown; t2 is analogous.
print('valid_recom_df......')
valid_recom_df = recall_func(train_t1, valid_run_t1)
print('test_recom_df......')
test_recom_df = recall_func(train_t1, test_run_t1)

valid_qrel = valid_qrel_t1

# Attach the ground-truth item of each validation user.
valid_recom_df = valid_recom_df.merge(valid_qrel, on='userId', how='left')

# Sum NDCG@10 over the users that have recall results, then average over all
# validation users (users without recall results contribute 0).
NDCG = 0
for items in valid_recom_df[['result_itemIds','itemId']].values:
    l1 = items[0][:10]
    l2 = [items[1]]
    NDCG += getNDCG(l1, l2)
NDCG = NDCG/len(valid_run_t1)
print('NDCG : ', NDCG)

0 人点赞