数仓用户行为漏斗分析如何SQL实现(第三节)

2022-03-09 09:53:39 浏览数 (1)

目录

  • 需求一:用户活跃主题
  • 需求二:用户新增主题
  • 需求三:用户留存主题
  • 需求四:沉默用户数
  • 需求五:本周回流用户数
  • 需求六:流失用户数
  • 需求七:最近连续3周活跃用户数
  • 需求八:最近七天内连续三天活跃用户数
  • 需求九:GMV(Gross Merchandise Volume)一段时间内的成交总额
  • 需求十:转化率=新增用户/日活用户
  • 需求十一:用户行为漏斗分析
  • 需求十二:品牌复购率
  • 需求十三:ADS层品牌复购率报表分析
  • 需求十四:求每个等级的用户对应的复购率前十的商品排行

需求九:GMV一段时间内的成交总额

GMV拍下订单金额;包括付款和未付款;

建表ads_gmv_sum_day语句:

代码语言:javascript复制
drop table if exists ads_gmv_sum_day;
create table ads_gmv_sum_day(
`dt` string comment '统计日期',
`gmv_count` bigint comment '当日GMV订单个数',
`gmv_amount` decimal(16, 2) comment '当日GMV订单总额',
`gmv_payment` decimal(16, 2) comment '当日支付金额'
) comment 'GMV'
row format delimited fields terminated by 't'
location '/warehouse/gmall/ads/ads_gmv_sum_day';

导入数据:from用户行为宽表dws_user_action

sum(order_count) gmv_count 、 sum(order_amount) gmv_amount 、sum(payment_amount) payment_amount 过滤日期,以dt分组;

代码语言:javascript复制
insert into table ads_gmv_sum_day 
select '2019-02-10' dt, sum(order_count) gmv_count, sum(order_amount) gmv_amount, sum(payment_amount) gmv_payment
from dws_user_action where dt='2019-02-10' group by dt;

编写脚本:

代码语言:javascript复制
#/bin/bash
APP=gmall
hive=/opt/module/hive/bin/hive
if [ -n "$1" ]; then
    do_date=$1
else
    do_date=`date -d "-1 day"  %F`
fi    
sql="
insert into table "$APP".ads_gmv_sum_day 
select '$do_date' dt, sum(order_count) gmv_count, sum(order_amount) gmv_amount, sum(payment_amount) gmv_payment
from "$APP".dws_user_action where dt='$do_date' group by dt;
"
$hive -e "$sql";

需求十:转化率=新增用户/日活用户

https://img2018.cnblogs.com/blog/1247221/201905/1247221-20190516210029788-275690435.png

代码语言:javascript复制
ads_user_convert_day
  dt
  uv_m_count   当日活跃设备
  new_m_count  当日新增设备
  new_m_ratio  新增占日活比率

ads_uv_count      用户活跃数(在行为数仓中;) day_count dt
ads_new_mid_count 用户新增表(行为数仓中) new_mid_count create_date

建表ads_user_convert_day

代码语言:javascript复制
drop table if exists ads_user_convert_day;
create table ads_user_convert_day(
`dt` string comment '统计日期',
`uv_m_count` bigint comment '当日活跃设备',
`new_m_count` bigint comment '当日新增设备',
`new_m_radio` decimal(10, 2) comment '当日新增占日活比率'
)comment '转化率'
row format delimited fields terminated by 't'
location '/warehouse/gmall/ads/ads_user_convert_day/';

数据导入 cast(sum( uc.nmc)/sum( uc.dc)*100 as decimal(10,2)) new_m_ratio ;使用union all

代码语言:javascript复制
insert into table ads_user_convert_day select '2019-02-10', sum(uc.dc) sum_dc, sum(uc.nmc) sum_nmc, 
cast(sum(uc.nmc)/sum(uc.dc) * 100 as decimal(10, 2)) new_m_radio
from(select day_count dc, 0 nmc from ads_uv_count where dt='2019-02-10'
union all select 0 dc, new_mid_count from ads_new_mid_count where create_date='2019-02-10'
)uc;

访问到下单转化率| 下单到支付转化率

代码语言:javascript复制
ads_user_action_convert_day
dt
total_visitor_m_count                 总访问人数
order_u_count                        下单人数
visitor2order_convert_ratio         访问到下单转化率
payment_u_count                     支付人数
order2payment_convert_ratio            下单到支付转化率

dws_user_action (宽表中)
    user_id
    order_count
    order_amount
    payment_count
    payment_amount 
    comment_count
ads_uv_count 用户活跃数(行为数仓中)
    dt
    day_count 
    wk_count
    mn_count
    is_weekend
    is_monthend

建表

代码语言:javascript复制
drop table if exists ads_user_action_convert_day;
create table ads_user_action_convert_day(
`dt` string comment '统计日期',
`total_visitor_m_count` bigint comment '总访问人数',
`order_u_count` bigint comment '下单人数',
`visitor2order_convert_radio` decimal(10, 2) comment '访问到下单转化率',
`payment_u_count` bigint comment '支付人数',
`order2payment_convert_radio` decimal(10, 2) comment '下单到支付的转化率'
)COMMENT '用户行为漏斗分析'
row format delimited  fields terminated by 't' 
location '/warehouse/gmall/ads/ads_user_convert_day/'
;

插入数据

代码语言:javascript复制
insert into table ads_user_action_convert_day
select '2019-02-10', uv.day_count, ua.order_count, 
cast(ua.order_count/uv.day_count * 100 as decimal(10, 2)) visitor2order_convert_radio,
ua.payment_count,
cast(ua.payment_count/ua.order_count * 100 as decimal(10, 2)) order2payment_convert_radio
from(
select sum(if(order_count>0, 1, 0)) order_count,
sum(if(payment_count>0, 1, 0)) payment_count
from dws_user_action where dt='2019-02-10'
)ua, ads_uv_count  uv where uv.dt='2019-02-10';

需求十一:品牌复购率

需求:以月为单位统计,购买2次以上商品的用户,用户购买商品明细表 dws_sale_detail_daycount:(宽表)建表dws_sale_detail_daycount

代码语言:javascript复制
drop table if exists dws_sale_detail_daycount;

create external table dws_sale_detail_daycount(
user_id   string  comment '用户 id',
sku_id    string comment '商品 Id',
user_gender  string comment '用户性别',
user_age string  comment '用户年龄',
user_level string comment '用户等级',
order_price decimal(10,2) comment '商品价格',
sku_name string   comment '商品名称',
sku_tm_id string   comment '品牌id',
sku_category3_id string comment '商品三级品类id',
sku_category2_id string comment '商品二级品类id',
sku_category1_id string comment '商品一级品类id',
sku_category3_name string comment '商品三级品类名称',
sku_category2_name string comment '商品二级品类名称',
sku_category1_name string comment '商品一级品类名称',
spu_id  string comment '商品 spu',
sku_num  int comment '购买个数',
order_count string comment '当日下单单数',
order_amount string comment '当日下单金额'
) comment  '用户购买商品明细表'
partitioned by(`dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_sale_detail_daycount'
tblproperties("parquet.compression"="snappy");

数据导入

ods_order_detail订单详情表、dwd_user_info用户表、dwd_sku_info商品表

代码语言:javascript复制
with tmp_detail as(
select user_id, sku_id, sum(sku_num) sku_num, count(*) order_count, sum(od.order_price*sku_num) order_amount
from ods_order_detail od where od.dt='2019-02-10' and user_id is not null group by user_id, sku_id
)
insert overwrite table dws_sale_detail_daycount partition(dt='2019-02-10')
select
tmp_detail.user_id,
tmp_detail.sku_id,
u.gender,
months_between('2019-02-10', u.birthday)/12 age,
u.user_level,
price,
sku_name,
tm_id,
category3_id ,  
category2_id ,  
category1_id ,  
category3_name ,  
category2_name ,  
category1_name ,  
spu_id,
tmp_detail.sku_num,
tmp_detail.order_count,
tmp_detail.order_amount 
from tmp_detail 
left join dwd_user_info u on u.id=tmp_detail.user_id and u.dt='2019-02-10'
left join dwd_sku_info s on s.id=tmp_detail.sku_id and s.dt='2019-02-10';

ADS层 品牌复购率报表分析 建表ads_sale_tm_category1_stat_mn

buycount 购买人数、buy_twice_last两次以上购买人数、

buy_twice_last_ratio '单次复购率'、

buy_3times_last '三次以上购买人数',

buy_3times_last_ratio 多次复购率'

代码语言:javascript复制
drop table ads_sale_tm_category1_stat_mn;
create  table ads_sale_tm_category1_stat_mn
(   
    tm_id string comment '品牌id ' ,
    category1_id string comment '1级品类id ',
    category1_name string comment '1级品类名称 ',
    buycount   bigint comment  '购买人数',
    buy_twice_last bigint  comment '两次以上购买人数',
    buy_twice_last_ratio decimal(10,2)  comment  '单次复购率', 
    buy_3times_last   bigint comment   '三次以上购买人数',
    buy_3times_last_ratio decimal(10,2)  comment  '多次复购率' ,
    stat_mn string comment '统计月份',
    stat_date string comment '统计日期' 
)   COMMENT '复购率统计'
row format delimited  fields terminated by 't' 
location '/warehouse/gmall/ads/ads_sale_tm_category1_stat_mn/'
;

插入数据

代码语言:javascript复制
insert into table ads_sale_tm_category1_stat_mn
select mn.sku_tm_id,
mn.sku_category1_id,
mn.sku_category1_name,
sum(if(mn.order_count >= 1, 1, 0)) buycount,
sum(if(mn.order_count >= 2, 1, 0)) buyTwiceLast,
sum(if(mn.order_count >= 2, 1, 0)) / sum(if(mn.order_count >= 1, 1, 0)) buyTwiceLastRatio,
sum(if(mn.order_count >= 3, 1, 0)) buy3timeLast,
sum(if(mn.order_count >= 3, 1, 0)) / sum(if(mn.order_count >= 1, 1, 0)) buy3timeLastRadio,
date_format ('2019-02-10' ,'yyyy-MM') stat_mn,
'2019-02-10' stat_date
from (
select sd.sku_tm_id, sd.sku_category1_id, sd.sku_category1_name, user_id, sum(order_count) order_count
from dws_sale_detail_daycount sd where date_format(dt, 'yyyy-MM') <= date_format('2019-02-10', 'yyyy-MM')
group by sd.sku_tm_id, sd.sku_category1_id, user_id, sd.sku_category1_name
) mn
group by mn.sku_tm_id, mn.sku_category1_id, mn.sku_category1_name
;

数据导入脚本

1)在/home/kris/bin目录下创建脚本ads_sale.sh

[kris@hadoop101 bin]$ vim ads_sale.sh

代码语言:javascript复制
#!/bin/bash

# 定义变量方便修改
APP=gmall
hive=/opt/module/hive/bin/hive

# 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
if [ -n "$1" ] ;then
    do_date=$1
else 
    do_date=`date  -d "-1 day"   %F`  
fi 

sql="

set hive.exec.dynamic.partition.mode=nonstrict;

insert into table "$APP".ads_sale_tm_category1_stat_mn
select   
    mn.sku_tm_id,
    mn.sku_category1_id,
    mn.sku_category1_name,
    sum(if(mn.order_count>=1,1,0)) buycount,
    sum(if(mn.order_count>=2,1,0)) buyTwiceLast,
    sum(if(mn.order_count>=2,1,0))/sum( if(mn.order_count>=1,1,0)) buyTwiceLastRatio,
    sum(if(mn.order_count>=3,1,0))  buy3timeLast  ,
    sum(if(mn.order_count>=3,1,0))/sum( if(mn.order_count>=1,1,0)) buy3timeLastRatio ,
    date_format('$do_date' ,'yyyy-MM') stat_mn,
    '$do_date' stat_date
from 
(     
    select od.sku_tm_id, 
        od.sku_category1_id,
        od.sku_category1_name,  
        user_id , 
        sum(order_count) order_count
    from  "$APP".dws_sale_detail_daycount  od 
    where date_format(dt,'yyyy-MM')<=date_format('$do_date' ,'yyyy-MM')
    group by od.sku_tm_id, od.sku_category1_id, user_id, od.sku_category1_name
) mn
group by mn.sku_tm_id, mn.sku_category1_id, mn.sku_category1_name;

"
$hive -e "$sql"

增加脚本执行权限
[kris@hadoop101 bin]$ chmod 777 ads_sale.sh
执行脚本导入数据
[kris@hadoop101 bin]$ ads_sale.sh 2019-02-11
查看导入数据
hive (gmall)>select * from ads_sale_tm_category1_stat_mn limit 2;

品牌复购率结果输出到MySQL

1)在MySQL中创建ads_sale_tm_category1_stat_mn表

代码语言:javascript复制
create  table ads_sale_tm_category1_stat_mn
(   
    tm_id varchar(200) comment '品牌id ' ,
    category1_id varchar(200) comment '1级品类id ',
    category1_name varchar(200) comment '1级品类名称 ',
    buycount   varchar(200) comment  '购买人数',
    buy_twice_last varchar(200) comment '两次以上购买人数',
    buy_twice_last_ratio varchar(200) comment  '单次复购率', 
    buy_3times_last   varchar(200) comment   '三次以上购买人数',
    buy_3times_last_ratio varchar(200)  comment  '多次复购率' ,
    stat_mn varchar(200) comment '统计月份',
    stat_date varchar(200) comment '统计日期' 
)

2)编写Sqoop导出脚本

在/home/kris/bin目录下创建脚本sqoop_export.sh

[kris@hadoop101 bin]$ vim sqoop_export.sh

代码语言:javascript复制
#!/bin/bash

db_name=gmall

export_data() {
/opt/module/sqoop/bin/sqoop export 
--connect "jdbc:mysql://hadoop101:3306/${db_name}?useUnicode=true&characterEncoding=utf-8"  
--username root 
--password 123456 
--table $1 
--num-mappers 1 
--export-dir /warehouse/$db_name/ads/$1 
--input-fields-terminated-by "t"  
--update-key "tm_id,category1_id,stat_mn,stat_date" 
--update-mode allowinsert 
--input-null-string '\N'    
--input-null-non-string '\N'  
}

case $1 in
  "ads_sale_tm_category1_stat_mn")
     export_data "ads_sale_tm_category1_stat_mn"
;;
   "all")
     export_data "ads_sale_tm_category1_stat_mn"
;;
esac

3)执行Sqoop导出脚本

[kris@hadoop101 bin]$ chmod 777 sqoop_export.sh

[kris@hadoop101 bin]$ sqoop_export.sh all

4)在MySQL中查看结果

SELECT * FROM ads_sale_tm_category1_stat_mn;

需求十二:求每个等级的用户对应的复购率前十的商品排行

1)每个等级,每种商品,买一次的用户数,买两次的用户数=》得出复购率

2)利用开窗函数,取每个等级的前十

3)形成脚本

用户购买明细宽表 dws_sale_detail_daycount

① t1--按user_leval, sku_id, user_id统计下单次数

代码语言:javascript复制
select 
    user_level, 
    sku_id, 
    user_id, 
    sum(order_count) order_count_sum
from dws_sale_detail_daycount
where date_format(dt, 'yyyy-MM') = date_format('2019-02-13', 'yyyy-MM')
group by user_level, sku_id, user_id limit 10;

② t2 --求出每个等级,每种商品,买一次的用户数,买两次的用户数 得出复购率

代码语言:javascript复制
select 
    t1.user_level,
    t1.sku_id,
    sum(if(t1.order_count_sum > 0, 1, 0)) buyOneCount,
    sum(if(t1.order_count_sum > 1, 1, 0)) buyTwiceCount,
    sum(if(t1.order_count_sum > 1, 1, 0)) / sum(if(t1.order_count_sum > 0, 1, 0)) * 100 buyTwiceCountRatio,
    '2019-02-13' stat_date
from(
select 
    user_level, 
    sku_id, 
    user_id, 
    sum(order_count) order_count_sum
from dws_sale_detail_daycount
where date_format(dt, 'yyyy-MM') = date_format('2019-02-13', 'yyyy-MM')
group by user_level, sku_id, user_id
) t1
group by t1.user_level, t1.sku_id;

③ t3 --按用户等级分区,复购率排序

代码语言:javascript复制
select
    t2.user_level,
    t2.sku_id,
    t2.buyOneCount,
    t2.buyTwiceCount,
    t2.buyTwiceCountRatio,
    t2.stat_date
from(
select 
    t1.user_level,
    t1.sku_id,
    sum(if(t1.order_count_sum > 0, 1, 0)) buyOneCount,
    sum(if(t1.order_count_sum > 1, 1, 0)) buyTwiceCount,
    sum(if(t1.order_count_sum > 1, 1, 0)) / sum(if(t1.order_count_sum > 0, 1, 0)) * 100 buyTwiceCountRatio,
    '2019-02-13' stat_date
from(
select 
    user_level, 
    sku_id, 
    user_id, 
    sum(order_count) order_count_sum
from dws_sale_detail_daycount
where date_format(dt, 'yyyy-MM') = date_format('2019-02-13', 'yyyy-MM')
group by user_level, sku_id, user_id
) t1
group by t1.user_level, t1.sku_id
)t2

④ -分区排序 rank()

代码语言:javascript复制
select
    t2.user_level,
    t2.sku_id,
    t2.buyOneCount,
    t2.buyTwiceCount,
    t2.buyTwiceCountRatio,
rank() over(partition by t2.sku_id order by t2.buyTwiceCount) rankNo
from(
select 
    t1.user_level,
    t1.sku_id,
    sum(if(t1.order_count_sum > 0, 1, 0)) buyOneCount,
    sum(if(t1.order_count_sum > 1, 1, 0)) buyTwiceCount,
    sum(if(t1.order_count_sum > 1, 1, 0)) / sum(if(t1.order_count_sum > 0, 1, 0)) * 100 buyTwiceCountRatio,
    '2019-02-13' stat_date
from(
select 
    user_level, 
    sku_id, 
    user_id, 
    sum(order_count) order_count_sum
from dws_sale_detail_daycount
where date_format(dt, 'yyyy-MM') = date_format('2019-02-13', 'yyyy-MM')
group by user_level, sku_id, user_id
) t1
group by t1.user_level, t1.sku_id
)t2

⑤ 作为子查询取前10

代码语言:javascript复制
select t3.user_level, t3.sku_id, t3.buyOneCount, t3.buyTwiceCount, t3.buyTwiceCountRatio, t3.rankNo
from(
select
    t2.user_level,
    t2.sku_id,
    t2.buyOneCount,
    t2.buyTwiceCount,
    t2.buyTwiceCountRatio,
rank() over(partition by t2.sku_id order by t2.buyTwiceCount) rankNo
from(
select 
    t1.user_level,
    t1.sku_id,
    sum(if(t1.order_count_sum > 0, 1, 0)) buyOneCount,
    sum(if(t1.order_count_sum > 1, 1, 0)) buyTwiceCount,
    sum(if(t1.order_count_sum > 1, 1, 0)) / sum(if(t1.order_count_sum > 0, 1, 0)) * 100 buyTwiceCountRatio,
    '2019-02-13' stat_date
from(
select 
    user_level, 
    sku_id, 
    user_id, 
    sum(order_count) order_count_sum
from dws_sale_detail_daycount
where date_format(dt, 'yyyy-MM') = date_format('2019-02-13', 'yyyy-MM')
group by user_level, sku_id, user_id
) t1
group by t1.user_level, t1.sku_id
)t2
) t3 where rankNo <= 10;

0 人点赞