作为推荐模型训练的重要组成部分,推荐系统的负采样对模型的训练效果有着重要的影响,也是重要研究分支。实际的推荐系统场景,大部分数据是隐式反馈信息。对于模型训练,一般假设用户交互过的产品都是正例,通过抽样,选择用户没有交互过的部分产品作为负例。根据一定的策略从用户的非交互产品集中选择负样本的过程称为负样本采样(Negative Sampling)。
1. 基于numpy的随机采样
def get_neg_sample(self, word_index, array):
    """Draw ``self.neg`` negative samples from the sampling table ``array``.

    Parameters:
        word_index: index of the positive (target) word; never returned.
        array: negative-sampling table (e.g. a word2vec unigram table) to
            draw candidates from.

    Returns:
        A list of ``self.neg`` entries drawn uniformly from ``array``,
        none equal to ``word_index``.
    """
    neg_sample = []
    # NOTE(review): the original indexed with np.random.randint(10**8),
    # which silently assumes the table has exactly 10**8 entries; use the
    # actual table length so any table size works.
    table_size = len(array)
    while len(neg_sample) < self.neg:
        candidate = array[np.random.randint(table_size)]
        # Skip the positive word itself and redraw.
        if candidate == word_index:
            continue
        neg_sample.append(candidate)
    return neg_sample
2. 基于scipy的稠密矩阵快速实现
import random
import time

import numpy as np
import pandas as pd
import scipy
import scipy.sparse
def neg_sampling(ratings_df, n_neg=1, neg_val=0, pos_val=1, percent_print=5):
    """Add sampled negative interactions to an implicit-feedback dataset.

    For every user (row of the dense rating matrix), samples
    ``n_neg`` negatives per positive from the user's non-interacted items.
    When a user has too few zero entries, the shortfall is carried over to
    the following users via ``extra_samples``.

    Parameters:
        ratings_df: pandas DataFrame with columns userId|movieId|rating.
            NOTE: mutated in place (ids are recoded to category codes).
        n_neg: number of negatives to draw per positive.
        neg_val: interact value written for sampled negatives.
        pos_val: interact value written for observed positives.
        percent_print: progress-print interval in percent of users.

    Returns:
        pandas DataFrame userId|movieId|interact containing all positives
        followed by the sampled negatives.
    """
    ratings_df.userId = ratings_df.userId.astype('category').cat.codes.values
    ratings_df.movieId = ratings_df.movieId.astype('category').cat.codes.values
    sparse_mat = scipy.sparse.coo_matrix(
        (ratings_df.rating, (ratings_df.userId, ratings_df.movieId)))
    dense_mat = np.asarray(sparse_mat.todense())
    print(dense_mat.shape)

    # Positives: every observed (user, item) pair gets interact=pos_val.
    # .copy() avoids SettingWithCopy on the column assignment below; a
    # scalar assignment replaces the original row-wise apply(lambda: 1).
    nsamples = ratings_df[['userId', 'movieId']].copy()
    nsamples['interact'] = pos_val

    length = dense_mat.shape[0]
    # Guard against printpc == 0 (ZeroDivisionError) for small datasets.
    printpc = max(int(length * percent_print / 100), 1)

    nTempData = []
    start_time = time.time()
    extra_samples = 0  # negatives owed from users with too few zero entries
    for i, row in enumerate(dense_mat):
        if i % printpc == 0:
            stop_time = time.time()
            print("processed ... {0:0.2f}% ...{1:0.2f}secs".format(
                float(i) * 100 / length, stop_time - start_time))
            start_time = stop_time
        n_non_0 = len(np.nonzero(row)[0])
        zero_indices = np.where(row == 0)[0]
        # Original text lost the '+' operators here: the intended count is
        # n_non_0 * n_neg + extra_samples.
        n_needed = n_non_0 * n_neg + extra_samples
        if n_needed > len(zero_indices):
            # Not enough candidates: take them all and carry the deficit.
            print(i, "non 0:", n_non_0, ": len ", len(zero_indices))
            neg_indices = zero_indices.tolist()
            extra_samples = n_needed - len(zero_indices)
        else:
            neg_indices = random.sample(zero_indices.tolist(), n_needed)
            extra_samples = 0
        nTempData.extend([(uu, ii, rr) for (uu, ii, rr) in zip(
            np.repeat(i, len(neg_indices)),
            neg_indices,
            np.repeat(neg_val, len(neg_indices)))])
    # DataFrame.append was removed in pandas 2.0; use pd.concat instead.
    nsamples = pd.concat(
        [nsamples, pd.DataFrame(nTempData, columns=["userId", "movieId", "interact"])],
        ignore_index=True)
    return nsamples
3. pyspark的方法
1) window random方法
# Per-user negative sampling with a randomly ordered window:
# keep at most 5 negatives and 1 positive per userid.
# (Original text had extraction junk fused onto the first import line.)
from pyspark.sql import Window
from pyspark.sql.functions import col
import pyspark.sql.functions as F

# Segregate into positive and negative records.
df_0 = df.filter(df.label == 0)
df_1 = df.filter(df.label == 1)
# Window grouping records of the same userid in random order.
window_random = Window.partitionBy(col('userid')).orderBy(F.rand())
# Negative dataframe: rank within the window and keep rank <= 5.
# NOTE(review): F.rank() can keep more rows on ties; with F.rand() ties are
# practically impossible, but row_number() would be strictly bounded.
data_0 = df_0.withColumn('rank', F.rank().over(window_random)).filter(F.col('rank') <= 5).drop('rank')
# Positive dataframe: keep rank <= 1 (one positive per user).
data_1 = df_1.withColumn('rank', F.rank().over(window_random)).filter(F.col('rank') <= 1).drop('rank')
# Finally union both results.
final_result = data_1.union(data_0)
2) sampleby
# Stratified sampling by label: keep 10% of label '1' rows and 50% of the
# rest. Uses F.lit / F.col because `lit` was never imported anywhere in the
# surrounding snippets; the `F` alias comes from the earlier
# `import pyspark.sql.functions as F`.
frac = (df.select("label")
          .distinct()
          .withColumn("frac", F.when(F.col('label') == '1', F.lit(0.1)).otherwise(F.lit(0.5)))
          .rdd.collectAsMap())
print(frac)
sampled_df = df.sampleBy("label", frac, seed=3)