异动分析(三)利用Python模拟业务数据
上期提到【数据是利用 Python 生成的】,有很多同学留言想了解具体的生成过程,所以这一期就插空讲一下如何利用 Python 模拟日常业务数据。
模拟思路
日常业务数据都会服从一定的概率分布,对于稳定的业务场景,时间序列数据基本服从均匀分布。转化类数据一般服从二项分布,因此可以先生成每日明细数据,在此基础上进行转化判断。
为了更贴近业务数据,需要对不同细分维度赋以一个权重,以期能产生具有差异的转化率数据。废话不多说,直接展示代码吧~
生成案例一数据
import pandas as pd
from faker import Faker
from faker.providers import BaseProvider, internet
from random import randint, random
import numpy as np
import itertools
from numpy.random import choice
# Custom random weight, used to qualitatively control conversion-rate differences.
def rand_weight(x):
    """Return a random weight shifted by a fixed offset chosen by level ``x``.

    A uniform draw from ``random()`` (in [0, 1)) is added to an entry of a
    10-element offset table: level 0 adds 0.25 and each following level adds
    0.05 less, down to -0.20 at level 9. Lower levels therefore give higher
    weights on average.

    Args:
        x: Index into the offset table (0-9; negative values follow normal
            list indexing).

    Returns:
        float: ``random() + offset[x]``.

    Raises:
        IndexError: If ``x`` is outside the 10-entry offset table.
    """
    wi = random()
    lift = [i * 0.05 for i in range(5, -5, -1)]
    # The "+" had been lost from this expression ("wi lift[x]"), which is a
    # syntax error; addition matches the documented behaviour that levels
    # closer to 0 are more likely to produce a high weight.
    return wi + lift[x]
rand_weight 用于生成带等级偏移的随机权重,这里给出了 10 个等级(x 取 0~9):x 越趋近于 0,等级越高,生成高权重的概率越大。可尝试执行
np.mean([rand_weight(0) for x in range(1000)])
和 np.mean([rand_weight(9) for x in range(1000)])
进行理解:前者的均值约为 0.75,后者约为 0.3。
# Dimension values: channel, client platform and app version.
ch = ['A', 'B', 'C', 'D']
platform = ['Android', 'iOS']
app_version = ['11', '10', '其他']
# Date range: one "YYYY-MM-DD" string per day, 2018-03-01 through 2018-03-30.
dt = [
    day.strftime("%Y-%m-%d")
    for day in pd.date_range(start='2018-03-01', end='2018-03-30')
]
# Custom weight levels. Each value is a level passed to rand_weight, where a
# level closer to 0 yields a higher weight on average.

# Level per channel. The case-1 generation loop assigns channel C a weight of
# 0 directly, so the 'C' entry is not looked up there.
w1 = {
    'A': 0,
    'B': 1,
    'C': 9,
    'D': 2,
}

# Level per app version.
w2 = {
    '11': 0,
    '10': 1,
    '其他': 3,
}
# Daily search volume: one value per date in dt, drawn uniformly from
# [500000, 510000) and truncated to an integer.
s = [int(x) for x in np.random.uniform(500000, 510000, len(dt))]

cols = ['ch', 'platform', 'app_version', 'searchs', 'hits', 'hit_rate',
        'search_questions', 'hit_questions', 'hit_videos', 'dt']

# Per-day summaries are collected in a list and concatenated once after the
# loop, instead of growing df_finall with pd.concat on every iteration.
daily_frames = []
for n, d in zip(s, dt):
    # The two periods differ only in the channel mix and the hit-rate range,
    # so only those parameters are chosen here; the rest is shared.
    if d <= '2018-03-25':
        ch_probs = [0.7, 0.24, 0.05, 0.01]
        hit_rate_range = (0.65, 0.7)
    else:
        # After 2018-03-25 channel C's share is raised to 0.2 and the overall
        # hit rate is drawn from a lower range.
        ch_probs = [0.63, 0.16, 0.2, 0.01]
        hit_rate_range = (0.55, 0.58)

    # Draw the dimension values for each of the day's n searches.
    s1 = choice(ch, size=n, p=ch_probs)
    s2 = choice(platform, size=n, p=[0.7, 0.3])
    s3 = choice(app_version, size=n, p=[0.85, 0.1, 0.05])

    # Draw the day's overall hit rate, then one 0/1 hit outcome per search.
    hit_rate = np.random.uniform(*hit_rate_range)
    h = np.random.binomial(1, hit_rate, n)

    # Build the day's detail records.
    df = pd.DataFrame({'ch': s1, 'platform': s2, 'app_version': s3})
    # Channel C is pinned to weight 0; every other row gets the sum of two
    # rand_weight draws (the "+" between them had been lost from the text).
    df['w'] = df.apply(
        lambda x: 0 if x['ch'] == 'C' else
        rand_weight(w1[x['ch']]) + rand_weight(w2[x['app_version']]),
        axis=1)
    df.sort_values(by="w", ascending=False, inplace=True)
    # abs(np.sort(-h)) lists all the 1s first; assigning the array to the
    # weight-sorted frame is positional, so the highest-weight rows get the hits.
    df['if_hit'] = abs(np.sort(-h))

    # Aggregate per dimension combination: row count, hit count, hit rate.
    df_temp = df.groupby(['ch', 'platform', 'app_version'])['if_hit'].agg(
        ['count', 'sum', 'mean']).reset_index()
    df_temp.columns = ['ch', 'platform', 'app_version', 'searchs', 'hits', 'hit_rate']

    # Derived filler metrics that are fixed fractions of the real ones.
    df_temp['search_questions'] = df_temp['searchs'] / 5
    df_temp['hit_questions'] = df_temp['hits'] / 5
    df_temp['hit_videos'] = df_temp['hits'] / 20

    # Tag the summary with its date.
    df_temp['dt'] = d
    daily_frames.append(df_temp)

# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns when there were no dates to simulate.
df_finall = pd.concat(daily_frames) if daily_frames else pd.DataFrame(columns=cols)

df_finall.to_excel('case1.xlsx', index=False)
生成案例二数据
收入及转化数据
import pandas as pd
from faker import Faker
from faker.providers import BaseProvider, internet
from random import randint, random
import numpy as np
import itertools
from numpy.random import choice
# Custom random weight, used to qualitatively control conversion-rate differences.
def rand_weight(x):
    """Return a random weight shifted by a fixed offset chosen by level ``x``.

    A uniform draw from ``random()`` (in [0, 1)) is added to an entry of a
    10-element offset table: level 0 adds 0.25 and each following level adds
    0.05 less, down to -0.20 at level 9. Lower levels therefore give higher
    weights on average.

    Args:
        x: Index into the offset table (0-9; negative values follow normal
            list indexing).

    Returns:
        float: ``random() + offset[x]``.

    Raises:
        IndexError: If ``x`` is outside the 10-entry offset table.
    """
    wi = random()
    lift = [i * 0.05 for i in range(5, -5, -1)]
    # The "+" had been lost from this expression ("wi lift[x]"), which is a
    # syntax error; addition is what makes lower levels give higher weights.
    return wi + lift[x]
# Dimension values: strategy, client platform and app version.
strategy = ['S1', 'S2', 'S3', 'Other']
platform = ['Android', 'iOS']
app_version = ['11', '10', '其他']
# Date range: one "YYYY-MM-DD" string per day, 2019-08-01 through 2019-08-30.
dt = pd.date_range('2019-08-01', '2019-08-30').strftime("%Y-%m-%d").to_list()
# Custom weight levels. Each value is a level passed to rand_weight, where a
# level closer to 0 yields a higher weight on average.

# Level per strategy.
w1 = {
    'S1': 2,
    'S2': 1,
    'S3': 0,
    'Other': 3,
}

# Level per app version.
w2 = {
    '11': 0,
    '10': 1,
    '其他': 3,
}
# Daily hit volume, one value per date in dt: all but the last four days are
# drawn uniformly from [350000, 400000), the last four from [30000, 320000).
# The "+" joining the two lists had been lost; without it h would hold only
# len(dt) - 4 values and the second list would be discarded.
h = ([int(x) for x in np.random.uniform(350000, 400000, len(dt) - 4)]
     + [int(x) for x in np.random.uniform(30000, 320000, 4)])

cols = ['strategy', 'platform', 'app_version', 'hits', 'shows', 'clicks',
        'plays', 'pays', 'pay_mount', 'dt']


def _assign_funnel_stage(df, weight_col, flag_col, outcomes, gate_col=None):
    """Weight every row, sort by weight, and hand out one stage's 0/1 outcomes.

    Args:
        df: Day-level detail frame with 'strategy' and 'app_version' columns;
            modified in place (two columns added, rows re-sorted).
        weight_col: Name of the column that receives the random weights.
        flag_col: Name of the column that receives the 0/1 outcomes.
        outcomes: Array of 0/1 draws with one entry per row of ``df``.
        gate_col: Optional 0/1 column of the previous stage; rows where it is
            0 get weight 0 so they sort behind every row that passed.
    """
    def stage_weight(row):
        if gate_col is not None and row[gate_col] == 0:
            return 0
        return rand_weight(w1[row['strategy']]) + rand_weight(w2[row['app_version']])

    df[weight_col] = df.apply(stage_weight, axis=1)
    df.sort_values(by=weight_col, ascending=False, inplace=True)
    # abs(np.sort(-outcomes)) lists all the 1s first; the assignment is
    # positional, so the highest-weight rows receive them.
    df[flag_col] = abs(np.sort(-outcomes))


# Per-day summaries are collected in a list and concatenated once after the
# loop, instead of growing df_finall with pd.concat on every iteration.
daily_frames = []
for n, d in zip(h, dt):
    # Draw the dimension values for each of the day's n hits.
    h1 = choice(strategy, size=n, p=[0.7, 0.2, 0.05, 0.05])
    h2 = choice(platform, size=n, p=[0.7, 0.3])
    h3 = choice(app_version, size=n, p=[0.85, 0.1, 0.05])

    # Draw the day's rate for each funnel stage, then one 0/1 outcome per hit.
    show_rate = np.random.uniform(0.6, 0.65)
    click_rate = np.random.uniform(0.3, 0.35)
    play_rate = np.random.uniform(0.2, 0.25)
    pay_rate = np.random.uniform(0.05, 0.07)
    show = np.random.binomial(1, show_rate, n)
    click = np.random.binomial(1, click_rate, n)
    play = np.random.binomial(1, play_rate, n)
    pay = np.random.binomial(1, pay_rate, n)

    # Build the day's detail records and walk them through the funnel; every
    # stage after the first is gated on the stage before it.
    df = pd.DataFrame({'strategy': h1, 'platform': h2, 'app_version': h3})
    _assign_funnel_stage(df, 'w_show', 'if_show', show)
    _assign_funnel_stage(df, 'w_click', 'if_click', click, gate_col='if_show')
    _assign_funnel_stage(df, 'w_play', 'if_play', play, gate_col='if_click')
    _assign_funnel_stage(df, 'w_pay', 'if_pay', pay, gate_col='if_play')

    # Payment amount: a fixed 10 per paying row.
    df['pay_mount'] = df['if_pay'] * 10

    # Aggregate per dimension combination; the count of if_show is the number
    # of hits, the sums are the per-stage totals.
    df_temp = df.groupby(['strategy', 'platform', 'app_version']).agg({
        'if_show': ['count', 'sum'],
        'if_click': 'sum',
        'if_play': 'sum',
        'if_pay': 'sum',
        'pay_mount': 'sum',
    }).reset_index()
    df_temp.columns = ['strategy', 'platform', 'app_version', 'hits', 'shows',
                       'clicks', 'plays', 'pays', 'pay_mount']

    # Tag the summary with its date.
    df_temp['dt'] = d
    daily_frames.append(df_temp)

# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns when there were no dates to simulate.
df_finall = pd.concat(daily_frames) if daily_frames else pd.DataFrame(columns=cols)

df_finall.to_excel('case2-1.xlsx', index=False)
生产数据
# Dimension values: strategy and teacher type.
strategy = ['S1', 'S2', 'S3', 'Other']
teacher = ['New', 'Old']
# Date range: one "YYYY-MM-DD" string per day, 2019-08-01 through 2019-08-30.
dt = pd.date_range('2019-08-01', '2019-08-30').strftime("%Y-%m-%d").to_list()
# Custom weight level per teacher type. Each value is a level passed to
# rand_weight, where a level closer to 0 yields a higher weight on average.
w4 = {
    'New': 2,
    'Old': 0,
}
# Daily production volume: one value per date in dt, drawn uniformly from
# [1000, 1200) and truncated to an integer.
m = [int(x) for x in np.random.uniform(1000, 1200, len(dt))]

cols = ['strategy', 'teacher', 'makes', 'onlines', 'cycles', 'dt']


def _assign_production_stage(df, weight_col, flag_col, outcomes, gate_col=None):
    """Weight every row, sort by weight, and hand out one stage's 0/1 outcomes.

    Args:
        df: Day-level detail frame with 'strategy' and 'teacher' columns;
            modified in place (two columns added, rows re-sorted).
        weight_col: Name of the column that receives the random weights.
        flag_col: Name of the column that receives the 0/1 outcomes.
        outcomes: Array of 0/1 draws with one entry per row of ``df``.
        gate_col: Optional 0/1 column of the previous stage; rows where it is
            0 get weight 0 so they sort behind every row that passed.
    """
    def stage_weight(row):
        if gate_col is not None and row[gate_col] == 0:
            return 0
        # NOTE(review): w1 must be the strategy-keyed table (S1/S2/S3/Other)
        # defined for the revenue/conversion data -- confirm it is in scope.
        return rand_weight(w1[row['strategy']]) + rand_weight(w4[row['teacher']])

    df[weight_col] = df.apply(stage_weight, axis=1)
    df.sort_values(by=weight_col, ascending=False, inplace=True)
    # abs(np.sort(-outcomes)) lists all the 1s first; the assignment is
    # positional, so the highest-weight rows receive them.
    df[flag_col] = abs(np.sort(-outcomes))


# Per-day summaries are collected in a list and concatenated once after the
# loop, instead of growing df_finall with pd.concat on every iteration.
daily_frames = []
for n, d in zip(m, dt):
    # The two periods differ only in the sync-rate range: from 2019-08-27 on,
    # the sync rate drops sharply.
    if d < '2019-08-27':
        cycle_rate_range = (0.84, 0.89)
    else:
        cycle_rate_range = (0.25, 0.27)

    # Draw the dimension values for each of the day's n produced items.
    m1 = choice(strategy, size=n, p=[0.7, 0.2, 0.05, 0.05])
    m2 = choice(teacher, size=n, p=[0.2, 0.8])

    # Draw the day's online and sync rates, then one 0/1 outcome per item.
    online_rate = np.random.uniform(0.9, 0.95)
    cycle_rate = np.random.uniform(*cycle_rate_range)
    online = np.random.binomial(1, online_rate, n)
    cycle = np.random.binomial(1, cycle_rate, n)

    # Build the day's detail records; the sync stage is gated on going online.
    df = pd.DataFrame({'strategy': m1, 'teacher': m2})
    _assign_production_stage(df, 'w_online', 'if_online', online)
    _assign_production_stage(df, 'w_cycle', 'if_cycle', cycle, gate_col='if_online')

    # Aggregate per dimension combination; the count of if_online is the
    # number of items made, the sums are the per-stage totals.
    df_temp = df.groupby(['strategy', 'teacher']).agg({
        'if_online': ['count', 'sum'],
        'if_cycle': 'sum',
    }).reset_index()
    df_temp.columns = ['strategy', 'teacher', 'makes', 'onlines', 'cycles']

    # Tag the summary with its date.
    df_temp['dt'] = d
    daily_frames.append(df_temp)

# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns when there were no dates to simulate.
df_finall = pd.concat(daily_frames) if daily_frames else pd.DataFrame(columns=cols)

df_finall.to_excel('case2-2.xlsx', index=False)
总结
通过概率分布能生成整体的数据框架,在概率随机的基础上增加自定义权重,则能生成具有维度差异的转化数据~