数仓用户行为漏斗分析数如何SQL实现(第一节)

2022-03-09 10:25:23 浏览数 (1)

「目录」

  • 需求一:用户活跃主题
  • 需求二:用户新增主题
  • 需求三:用户留存主题
  • 需求四:沉默用户数
  • 需求五:本周回流用户数
  • 需求六:流失用户数
  • 需求七:最近连续3周活跃用户数
  • 需求八:最近七天内连续三天活跃用户数
  • 需求九:GMV(Gross Merchandise Volume)一段时间内的成交总额
  • 需求十:转化率=新增用户/日活用户
  • 需求十一:用户行为漏斗分析
  • 需求十二:品牌复购率
  • 需求十三:ADS层品牌复购率报表分析
  • 需求十四:求每个等级的用户对应的复购率前十的商品排行

需求一:用户活跃主题

DWS层--(用户行为宽表层) 目标:统计当日、当周、当月活动的每个设备明细

1 每日活跃设备明细 dwd_start_log--->dws_uv_detail_day

--把相同的字段collect_set到一个数组, 按mid_id分组(便于后边统计)

collect_set将某字段的值进行去重汇总,产生array类型字段。如: concat_ws('|', collect_set(user_id)) user_id,

建分区表dws_uv_detail_day:partitioned by ('dt' string)

代码语言:javascript复制
drop table if exists dws_uv_detail_day;
create table dws_uv_detail_day( 
    `mid_id` string COMMENT '设备唯一标识',
    `user_id` string COMMENT '用户标识', 
    `version_code` string COMMENT '程序版本号', 
    `version_name` string COMMENT '程序版本名', 
`lang` string COMMENT '系统语言', 
`source` string COMMENT '渠道号', 
`os` string COMMENT '安卓系统版本', 
`area` string COMMENT '区域', 
`model` string COMMENT '手机型号', 
`brand` string COMMENT '手机品牌', 
`sdk_version` string COMMENT 'sdkVersion', 
`gmail` string COMMENT 'gmail', 
`height_width` string COMMENT '屏幕宽高',
`app_time` string COMMENT '客户端日志产生时的时间',
`network` string COMMENT '网络模式',
`lng` string COMMENT '经度',
`lat` string COMMENT '纬度'
) COMMENT '活跃用户按天明细'
PARTITIONED BY ( `dt` string)
stored as  parquet
location '/warehouse/gmall/dws/dws_uv_detail_day/'
;

数据导入

按周分区;过滤出一周内的数据;按设备id分组;===>count(*)得到最终结果;

partition(dt='2019-02-10') from dwd_start_log where dt='2019-02-10' group by mid_id ( mid_id设备唯一标示 )

以用户单日访问为key进行聚合,如果某个用户在一天中使用了两种操作系统、两个系统版本、多个地区,登录不同账号,只取其中之一

代码语言:javascript复制
hive (gmall)>
set hive.exec.dynamic.partition.mode=nonstrict;

insert overwrite table dws_uv_detail_day  partition(dt='2019-02-10')
select  
    mid_id,
    concat_ws('|', collect_set(user_id)) user_id,
    concat_ws('|', collect_set(version_code)) version_code,
    concat_ws('|', collect_set(version_name)) version_name,
    concat_ws('|', collect_set(lang))lang,
    concat_ws('|', collect_set(source)) source,
    concat_ws('|', collect_set(os)) os,
    concat_ws('|', collect_set(area)) area, 
    concat_ws('|', collect_set(model)) model,
    concat_ws('|', collect_set(brand)) brand,
    concat_ws('|', collect_set(sdk_version)) sdk_version,
    concat_ws('|', collect_set(gmail)) gmail,
    concat_ws('|', collect_set(height_width)) height_width,
    concat_ws('|', collect_set(app_time)) app_time,
    concat_ws('|', collect_set(network)) network,
    concat_ws('|', collect_set(lng)) lng,
    concat_ws('|', collect_set(lat)) lat
from dwd_start_log
where dt='2019-02-10'  
group by mid_id;

查询导入结果;

代码语言:javascript复制
hive (gmall)> select * from dws_uv_detail_day limit 1;

###最后count(*)即是每日活跃设备的个数;
hive (gmall)> select count(*) from dws_uv_detail_day;

2 每周(dws_uv_detail_wk)活跃设备明细 partition(wk_dt)

周一到周日concat(date_add(next_day('2019-02-10', 'MO'), -7), '_', date_add(next_day('2019-02-10', 'MO'), -1))即 2019-02-04_2019-02-10

创建分区表:partitioned by('wk_dt' string)

代码语言:javascript复制
hive (gmall)>
drop table if exists dws_uv_detail_wk;

create table dws_uv_detail_wk( 
    `mid_id` string COMMENT '设备唯一标识',
    `user_id` string COMMENT '用户标识', 
    `version_code` string COMMENT '程序版本号', 
    `version_name` string COMMENT '程序版本名', 
`lang` string COMMENT '系统语言', 
`source` string COMMENT '渠道号', 
`os` string COMMENT '安卓系统版本', 
`area` string COMMENT '区域', 
`model` string COMMENT '手机型号', 
`brand` string COMMENT '手机品牌', 
`sdk_version` string COMMENT 'sdkVersion', 
`gmail` string COMMENT 'gmail', 
`height_width` string COMMENT '屏幕宽高',
`app_time` string COMMENT '客户端日志产生时的时间',
`network` string COMMENT '网络模式',
`lng` string COMMENT '经度',
`lat` string COMMENT '纬度',
    `monday_date` string COMMENT '周一日期',
    `sunday_date` string COMMENT  '周日日期' 
) COMMENT '活跃用户按周明细'
PARTITIONED BY (`wk_dt` string)
stored as  parquet
location '/warehouse/gmall/dws/dws_uv_detail_wk/'
;

导入数据:以周为分区;过滤出一个月内的数据,按设备id分组;

周一:date_add(next_day('2019-05-16','MO'),-7);

周日:date_add(next_day('2019-05-16','MO'),-1);

周一---周日:concat(date_add(next_day('2019-05-16', 'MO'), -7), "_", date_add(next_day('2019-05-16', 'MO'), -1));

代码语言:javascript复制
insert overwrite table dws_uv_detail_wk partition(wk_dt)
select mid_id,
concat_ws('|', collect_set(user_id)) user_id,
concat_ws('|', collect_set(version_code)) version_code,
concat_ws('|', collect_set(version_name)) version_name,
concat_ws('|', collect_set(lang)) lang,
concat_ws('|', collect_set(source)) source,
concat_ws('|', collect_set(os)) os,
concat_ws('|', collect_set(area)) area, 
concat_ws('|', collect_set(model)) model,
concat_ws('|', collect_set(brand)) brand,
concat_ws('|', collect_set(sdk_version)) sdk_version,
concat_ws('|', collect_set(gmail)) gmail,
concat_ws('|', collect_set(height_width)) height_width,
concat_ws('|', collect_set(app_time)) app_time,
concat_ws('|', collect_set(network)) network,
concat_ws('|', collect_set(lng)) lng,
concat_ws('|', collect_set(lat)) lat,
date_add(next_day('2019-02-10', 'MO'), -7),
date_add(next_day('2019-02-10', 'MO'), -1),
concat(date_add(next_day('2019-02-10', 'MO'), -7), '_', date_add(next_day('2019-02-10', 'MO'), -1))
from dws_uv_detail_day
where dt >= date_add(next_day('2019-02-10', 'MO'), -7) and dt <= date_add(next_day('2019-02-10', 'MO'), -1)
group by mid_id;

查询导入结果

代码语言:javascript复制
hive (gmall)> select * from dws_uv_detail_wk limit 1;
hive (gmall)> select count(*) from dws_uv_detail_wk;

3 每月活跃设备明细 dws_uv_detail_mn partition(mn) - 把每日的数据插入进去

DWS层创建分区表 partitioned by(mn string)

代码语言:javascript复制
hive (gmall)>
drop table if exists dws_uv_detail_mn;

create  external table dws_uv_detail_mn( 
    `mid_id` string COMMENT '设备唯一标识',
    `user_id` string COMMENT '用户标识', 
    `version_code` string COMMENT '程序版本号', 
    `version_name` string COMMENT '程序版本名', 
`lang` string COMMENT '系统语言', 
`source` string COMMENT '渠道号', 
`os` string COMMENT '安卓系统版本', 
`area` string COMMENT '区域', 
`model` string COMMENT '手机型号', 
`brand` string COMMENT '手机品牌', 
`sdk_version` string COMMENT 'sdkVersion', 
`gmail` string COMMENT 'gmail', 
`height_width` string COMMENT '屏幕宽高',
`app_time` string COMMENT '客户端日志产生时的时间',
`network` string COMMENT '网络模式',
`lng` string COMMENT '经度',
`lat` string COMMENT '纬度'
) COMMENT '活跃用户按月明细'
PARTITIONED BY (`mn` string)
stored as  parquet
location '/warehouse/gmall/dws/dws_uv_detail_mn/'
;

数据导入 按月分区;过滤出一个月内的数据,按照设备id分组;

data_format('2019-03-10', 'yyyy-MM') ---> 2019-03

where date_format('dt', 'yyyy-MM') = date_format('2019-02-10', 'yyyy-MM') group by mid_id;

代码语言:javascript复制
hive (gmall)>
set hive.exec.dynamic.partition.mode=nonstrict;

insert  overwrite table dws_uv_detail_mn  partition(mn)
select  
    mid_id,
    concat_ws('|', collect_set(user_id)) user_id,
    concat_ws('|', collect_set(version_code)) version_code,
    concat_ws('|', collect_set(version_name)) version_name,
    concat_ws('|', collect_set(lang)) lang,
    concat_ws('|', collect_set(source)) source,
    concat_ws('|', collect_set(os)) os,
    concat_ws('|', collect_set(area)) area, 
    concat_ws('|', collect_set(model)) model,
    concat_ws('|', collect_set(brand)) brand,
    concat_ws('|', collect_set(sdk_version)) sdk_version,
    concat_ws('|', collect_set(gmail)) gmail,
    concat_ws('|', collect_set(height_width)) height_width,
    concat_ws('|', collect_set(app_time)) app_time,
    concat_ws('|', collect_set(network)) network,
    concat_ws('|', collect_set(lng)) lng,
    concat_ws('|', collect_set(lat)) lat,
    date_format('2019-02-10','yyyy-MM')
from dws_uv_detail_day
where date_format(dt,'yyyy-MM') = date_format('2019-02-10','yyyy-MM')   
group by mid_id;

查询导入结果

代码语言:javascript复制
hive (gmall)> select * from dws_uv_detail_mn limit 1;
hive (gmall)> select count(*) from dws_uv_detail_mn ;

DWS层加载数据脚本

在hadoop101的/home/kris/bin目录下创建脚本

[kris@hadoop101 bin]$ vim dws.sh

代码语言:javascript复制
#!/bin/bash

# 定义变量方便修改
APP=gmall
hive=/opt/module/hive/bin/hive

# 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
if [ -n "$1" ] ;then
    do_date=$1
else 
    do_date=`date -d "-1 day"  %F`  
fi 


sql="
  set hive.exec.dynamic.partition.mode=nonstrict;

  insert overwrite table "$APP".dws_uv_detail_day partition(dt='$do_date')
  select  
    mid_id,
    concat_ws('|', collect_set(user_id)) user_id,
    concat_ws('|', collect_set(version_code)) version_code,
    concat_ws('|', collect_set(version_name)) version_name,
    concat_ws('|', collect_set(lang)) lang,
    concat_ws('|', collect_set(source)) source,
    concat_ws('|', collect_set(os)) os,
    concat_ws('|', collect_set(area)) area, 
    concat_ws('|', collect_set(model)) model,
    concat_ws('|', collect_set(brand)) brand,
    concat_ws('|', collect_set(sdk_version)) sdk_version,
    concat_ws('|', collect_set(gmail)) gmail,
    concat_ws('|', collect_set(height_width)) height_width,
    concat_ws('|', collect_set(app_time)) app_time,
    concat_ws('|', collect_set(network)) network,
    concat_ws('|', collect_set(lng)) lng,
    concat_ws('|', collect_set(lat)) lat
  from "$APP".dwd_start_log
  where dt='$do_date'  
  group by mid_id;


  insert  overwrite table "$APP".dws_uv_detail_wk partition(wk_dt)
  select  
    mid_id,
    concat_ws('|', collect_set(user_id)) user_id,
    concat_ws('|', collect_set(version_code)) version_code,
    concat_ws('|', collect_set(version_name)) version_name,
    concat_ws('|', collect_set(lang)) lang,
    concat_ws('|', collect_set(source)) source,
    concat_ws('|', collect_set(os)) os,
    concat_ws('|', collect_set(area)) area, 
    concat_ws('|', collect_set(model)) model,
    concat_ws('|', collect_set(brand)) brand,
    concat_ws('|', collect_set(sdk_version)) sdk_version,
    concat_ws('|', collect_set(gmail)) gmail,
    concat_ws('|', collect_set(height_width)) height_width,
    concat_ws('|', collect_set(app_time)) app_time,
    concat_ws('|', collect_set(network)) network,
    concat_ws('|', collect_set(lng)) lng,
    concat_ws('|', collect_set(lat)) lat,
    date_add(next_day('$do_date','MO'),-7),
    date_add(next_day('$do_date','SU'),-7),
    concat(date_add( next_day('$do_date','MO'),-7), '_' , date_add(next_day('$do_date','MO'),-1) 
  )
  from "$APP".dws_uv_detail_day 
  where dt>=date_add(next_day('$do_date','MO'),-7) and dt<=date_add(next_day('$do_date','MO'),-1) 
  group by mid_id; 


  insert overwrite table "$APP".dws_uv_detail_mn partition(mn)
  select  
    mid_id,
    concat_ws('|', collect_set(user_id)) user_id,
    concat_ws('|', collect_set(version_code)) version_code,
    concat_ws('|', collect_set(version_name)) version_name,
    concat_ws('|', collect_set(lang))lang,
    concat_ws('|', collect_set(source)) source,
    concat_ws('|', collect_set(os)) os,
    concat_ws('|', collect_set(area)) area, 
    concat_ws('|', collect_set(model)) model,
    concat_ws('|', collect_set(brand)) brand,
    concat_ws('|', collect_set(sdk_version)) sdk_version,
    concat_ws('|', collect_set(gmail)) gmail,
    concat_ws('|', collect_set(height_width)) height_width,
    concat_ws('|', collect_set(app_time)) app_time,
    concat_ws('|', collect_set(network)) network,
    concat_ws('|', collect_set(lng)) lng,
    concat_ws('|', collect_set(lat)) lat,
    date_format('$do_date','yyyy-MM')
  from "$APP".dws_uv_detail_day
  where date_format(dt,'yyyy-MM') = date_format('$do_date','yyyy-MM')   
  group by mid_id;
"

$hive -e "$sql"

增加脚本执行权限 chmod 777 dws.sh

脚本使用[kris@hadoop101 module]$ dws.sh 2019-02-11

查询结果

代码语言:javascript复制
hive (gmall)> select count(*) from dws_uv_detail_day;
hive (gmall)> select count(*) from dws_uv_detail_wk;
hive (gmall)> select count(*) from dws_uv_detail_mn ;

脚本执行时间;企业开发中一般在每日凌晨30分~1点

ADS层 目标:当日、当周、当月活跃设备数 使用 day_count表 join wk_count join mn_count , 把3张表连接一起

建表ads_uv_count表:

字段有day_count、wk_count、mn_count is_weekend if(date_add(next_day('2019-02-10', 'MO'), -1) = '2019-02-10', 'Y', 'N') is_monthend if(last_day('2019-02-10') = '2019-02-10', 'Y', 'N')

代码语言:javascript复制
drop table if exists ads_uv_count;
create external table ads_uv_count(
`dt` string comment '统计日期',
`day_count` bigint comment '当日用户量',
`wk_count` bigint comment '当周用户量',
`mn_count` bigint comment '当月用户量',
`is_weekend` string comment 'Y,N是否是周末,用于得到本周最终结果',
`is_monthend` string comment 'Y,N是否是月末,用于得到本月最终结果'
) comment '每日活跃用户数量'
stored as parquet
location '/warehouse/gmall/ads/ads_uv_count/';

导入数据:

代码语言:javascript复制
hive (gmall)>
insert  overwrite table ads_uv_count 
select  
  '2019-02-10' dt,
   daycount.ct,
   wkcount.ct,
   mncount.ct,
   if(date_add(next_day('2019-02-10','MO'),-1)='2019-02-10','Y','N') ,
   if(last_day('2019-02-10')='2019-02-10','Y','N') 
from 
(
   select  
      '2019-02-10' dt,
       count(*) ct
   from dws_uv_detail_day
   where dt='2019-02-10'  
)daycount   join 
( 
   select  
     '2019-02-10' dt,
     count (*) ct
   from dws_uv_detail_wk
   where wk_dt=concat(date_add(next_day('2019-02-10','MO'),-7),'_' ,date_add(next_day('2019-02-10','MO'),-1) )
)  wkcount  on daycount.dt=wkcount.dt
join 
( 
   select  
     '2019-02-10' dt,
     count (*) ct
   from dws_uv_detail_mn
   where mn=date_format('2019-02-10','yyyy-MM')  
)mncount on daycount.dt=mncount.dt
;

查询导入结果

hive (gmall)> select * from ads_uv_count ;

ADS层加载数据脚本

1)在hadoop101的/home/kris/bin目录下创建脚本

[kris@hadoop101 bin]$ vim ads.sh

代码语言:javascript复制
#!/bin/bash

# 定义变量方便修改
APP=gmall
hive=/opt/module/hive/bin/hive

# 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
if [ -n "$1" ] ;then
    do_date=$1
else 
    do_date=`date -d "-1 day"  %F`  
fi 

sql="
  set hive.exec.dynamic.partition.mode=nonstrict;

insert into table "$APP".ads_uv_count 
select  
  '$do_date' dt,
   daycount.ct,
   wkcount.ct,
   mncount.ct,
   if(date_add(next_day('$do_date','MO'),-1)='$do_date','Y','N') ,
   if(last_day('$do_date')='$do_date','Y','N') 
from 
(
   select  
      '$do_date' dt,
       count(*) ct
   from "$APP".dws_uv_detail_day
   where dt='$do_date'  
)daycount   join 
( 
   select  
     '$do_date' dt,
     count (*) ct
   from "$APP".dws_uv_detail_wk
   where wk_dt=concat(date_add(next_day('$do_date','MO'),-7),'_' ,date_add(next_day('$do_date','MO'),-1) )
)  wkcount  on daycount.dt=wkcount.dt
join 
( 
   select  
     '$do_date' dt,
     count (*) ct
   from "$APP".dws_uv_detail_mn
   where mn=date_format('$do_date','yyyy-MM')  
)mncount on daycount.dt=mncount.dt;
"

$hive -e "$sql"

增加脚本执行权限 chmod 777 ads.sh

脚本使用 ads.sh 2019-02-11

查询导入结果 hive (gmall)> select * from ads_uv_count ;

需求二:用户新增主题

首次联网使用应用的用户。如果一个用户首次打开某APP,那这个用户定义为新增用户;卸载再安装的设备,不会被算作一次新增。新增用户包括日新增用户、周新增用户、月新增用户。

每日新增(老用户不算,之前没登陆过,今天是第一次登陆)设备--没有分区 -->以往的新增库里边没有他,但他今天活跃了即新增加的用户;

1 DWS层(每日新增设备明细表) 创建每日新增设备明细表:dws_new_mid_day

代码语言:javascript复制
hive (gmall)>
drop table if exists  dws_new_mid_day;
create  table  dws_new_mid_day
(
    `mid_id` string COMMENT '设备唯一标识',
    `user_id` string COMMENT '用户标识', 
    `version_code` string COMMENT '程序版本号', 
    `version_name` string COMMENT '程序版本名', 
`lang` string COMMENT '系统语言', 
`source` string COMMENT '渠道号', 
`os` string COMMENT '安卓系统版本', 
`area` string COMMENT '区域', 
`model` string COMMENT '手机型号', 
`brand` string COMMENT '手机品牌', 
`sdk_version` string COMMENT 'sdkVersion', 
`gmail` string COMMENT 'gmail', 
`height_width` string COMMENT '屏幕宽高',
`app_time` string COMMENT '客户端日志产生时的时间',
`network` string COMMENT '网络模式',
`lng` string COMMENT '经度',
`lat` string COMMENT '纬度',
    `create_date`  string  comment '创建时间' 
)  COMMENT '每日新增设备信息'
stored as  parquet
location '/warehouse/gmall/dws/dws_new_mid_day/';

dws_uv_detail_day(每日活跃设备明细) left join dws_new_mid_day nm(以往的新增用户表, 新建字段create_time2019-02-10) nm.mid_id is null;

导入数据

用每日活跃用户表 left join 每日新增设备表,关联的条件是mid_id相等。如果是每日新增的设备,则在每日新增设备表中为null。

from dws_uv_detail_day ud left join dws_new_mid_day nm on ud.mid_id=nm.mid_id

where ud.dt='2019-02-10' and nm.mid_id is null;

代码语言:javascript复制
hive (gmall)>
insert into table dws_new_mid_day  
select  
    ud.mid_id,
    ud.user_id , 
    ud.version_code , 
    ud.version_name , 
    ud.lang , 
    ud.source, 
    ud.os, 
    ud.area, 
    ud.model, 
    ud.brand, 
    ud.sdk_version, 
    ud.gmail, 
    ud.height_width,
    ud.app_time,
    ud.network,
    ud.lng,
    ud.lat,
    '2019-02-10'
from dws_uv_detail_day ud left join dws_new_mid_day nm on ud.mid_id=nm.mid_id
where ud.dt='2019-02-10' and nm.mid_id is null;

查询导入数据

hive (gmall)> select count(*) from dws_new_mid_day ;

2 ADS层(每日新增设备表) 创建每日新增设备表ads_new_mid_count

代码语言:javascript复制
hive (gmall)>
drop table if exists  `ads_new_mid_count`;
create  table  `ads_new_mid_count`
(
    `create_date`     string  comment '创建时间' ,
    `new_mid_count`   BIGINT comment '新增设备数量' 
)  COMMENT '每日新增设备信息数量'
row format delimited  fields terminated by 't' 
location '/warehouse/gmall/ads/ads_new_mid_count/';

导入数据 count(*) dws_new_mid_day表即可

加了create_date就必须group by create_time,否则报错:not in GROUP BY key 'create_date'

代码语言:javascript复制
hive (gmall)>
insert into table ads_new_mid_count 
select create_date , count(*)  from dws_new_mid_day
where create_date='2019-02-10'
group by create_date ;

查询导入数据

hive (gmall)> select * from ads_new_mid_count;

扩展每月新增:

代码语言:javascript复制
--每月新增
drop table if exists dws_new_mid_mn;
create table dws_new_mid_mn(
    `mid_id` string COMMENT '设备唯一标识',
    `user_id` string COMMENT '用户标识', 
    `version_code` string COMMENT '程序版本号', 
    `version_name` string COMMENT '程序版本名', 
    `lang` string COMMENT '系统语言', 
    `source` string COMMENT '渠道号', 
    `os` string COMMENT '安卓系统版本', 
    `area` string COMMENT '区域', 
    `model` string COMMENT '手机型号', 
    `brand` string COMMENT '手机品牌', 
    `sdk_version` string COMMENT 'sdkVersion', 
    `gmail` string COMMENT 'gmail', 
    `height_width` string COMMENT '屏幕宽高',
    `app_time` string COMMENT '客户端日志产生时的时间',
    `network` string COMMENT '网络模式',
    `lng` string COMMENT '经度',
    `lat` string COMMENT '纬度'
)comment "每月新增明细"
partitioned by(mn string)
stored as parquet
location "/warehouse/gmall/dws/dws_new_mid_mn";

insert overwrite table dws_new_mid_mn partition(mn)
select
    um.mid_id,
    um.user_id , 
    um.version_code , 
    um.version_name , 
    um.lang , 
    um.source, 
    um.os, 
    um.area, 
    um.model, 
    um.brand, 
    um.sdk_version, 
    um.gmail, 
    um.height_width,
    um.app_time,
    um.network,
    um.lng,
    um.lat,
    date_format('2019-02-10', 'yyyy-MM')
from dws_uv_detail_mn um left join dws_new_mid_mn nm on um.mid_id = nm.mid_id
where um.mn =date_format('2019-02-10', 'yyyy-MM') and nm.mid_id = null; ----为什么加上它就是空的??查不到数据了呢
--##注意这里不能写出date_format(um.mn, 'yyyy-MM') =date_format('2019-02-10', 'yyyy-MM') 
    |

先更新到需求2,后续需求我会继续更新。。。。。。敬请期待!!!!!

0 人点赞