set hive.exec.mode.local.auto=true;
开启hive的local模式
一、新的需求建表
1.1 建表语句:
代码语言:javascript复制create table tmp.test(
deviceid string,
dt string
)row format delimited
fields terminated by ','
1.2 插入语句:
代码语言:javascript复制INSERT INTO tmp.test VALUES
('deviceid1','2022-11-01'),
('deviceid1','2022-11-01'),
('deviceid1','2022-11-02'),
('deviceid1','2022-11-02'),
('deviceid1','2022-11-03'),
('deviceid1','2022-11-05'),
('deviceid1','2022-11-05'),
('deviceid1','2022-11-05'),
('deviceid1','2022-11-06'),
('deviceid1','2022-11-07'),
('deviceid1','2022-11-07'),
('deviceid1','2022-11-08'),
('deviceid1','2022-11-09'),
('deviceid1','2022-11-10'),
('deviceid2','2022-11-01'),
('deviceid2','2022-11-01'),
('deviceid2','2022-11-02'),
('deviceid2','2022-11-02'),
('deviceid2','2022-11-03'),
('deviceid2','2022-11-05'),
('deviceid2','2022-11-05'),
('deviceid2','2022-11-05'),
('deviceid2','2022-11-06'),
('deviceid2','2022-11-07'),
('deviceid2','2022-11-07'),
('deviceid2','2022-11-08'),
('deviceid2','2022-11-09'),
('deviceid2','2022-11-10');
SELECT * FROM tmp.test
二、需求一
2.1 最近一个月内,有过连续活跃2天的用户数
代码语言:javascript复制SELECT deviceid,count(*)
from (
SELECT deviceid, dt_sub,count(*) count_n
from (
SELECT deviceid , dt
,date_sub(dt,ROW_NUMBER() over(PARTITION by deviceid ORDER by dt)) as dt_sub
,ROW_NUMBER() over(PARTITION by deviceid ORDER by dt) as rn
from (
select deviceid,dt
from tmp.test
where dt >= date_sub('2022-11-15',30)
group by deviceid,dt
)t1
)t2
group by deviceid,dt_sub
having count(*) >= 2
)t3
group by deviceid
2.2 实现思路解析
2.2.1 求出连续登录的天数
本小节利用row_number()
开窗计算出每个设备deviceid
的行号,再利用日期dt
减去行号,求出相同的dt_sub
即为连续登录的日期。
SELECT deviceid , dt
,date_sub(dt,ROW_NUMBER() over(PARTITION by deviceid ORDER by dt)) as dt_sub
,ROW_NUMBER() over(PARTITION by deviceid ORDER by dt) as rn
from (
select deviceid,dt
from tmp.test
where dt >= date_sub('2022-11-15',30)
group by deviceid,dt
)t1
Step 1:子查询从表 tmp.test
中选择 dt
大于等于 '2022-11-15'
减去 30天的数据,并按 deviceid
和 dt
进行分组。返回的结果将包含 deviceid
和 dt
列的值,这里是为了筛选出近一个月30天的数据。
Step 2:在外层查询中,使用窗口函数ROW_NUMBER()
将每个 deviceid
分组内的数据按照 dt
进行排序,并为每行分配一个行号,即 rn
列。Step 2查询结果如下:
deviceid dt dt_sub rn
deviceid1 2022-11-01 2022-10-31 1
deviceid1 2022-11-02 2022-10-31 2
deviceid1 2022-11-03 2022-10-31 3
deviceid1 2022-11-05 2022-11-01 4
deviceid1 2022-11-06 2022-11-01 5
deviceid1 2022-11-07 2022-11-01 6
deviceid1 2022-11-08 2022-11-01 7
deviceid1 2022-11-09 2022-11-01 8
deviceid1 2022-11-10 2022-11-01 9
deviceid2 2022-11-01 2022-10-31 1
deviceid2 2022-11-02 2022-10-31 2
deviceid2 2022-11-03 2022-10-31 3
deviceid2 2022-11-05 2022-11-01 4
deviceid2 2022-11-06 2022-11-01 5
deviceid2 2022-11-07 2022-11-01 6
deviceid2 2022-11-08 2022-11-01 7
deviceid2 2022-11-09 2022-11-01 8
deviceid2 2022-11-10 2022-11-01 9
2.2.2 汇总求和
本小节求出连续登录大于2天的数量,并进行汇总计算。
代码语言:javascript复制SELECT deviceid, dt_sub,count(*) count_n
from (
SELECT deviceid , dt
,date_sub(dt,ROW_NUMBER() over(PARTITION by deviceid ORDER by dt)) as dt_sub
,ROW_NUMBER() over(PARTITION by deviceid ORDER by dt) as rn
from (
select deviceid,dt
from tmp.test
where dt >= date_sub('2022-11-15',30)
group by deviceid,dt
)t1
)t2
group by deviceid,dt_sub
having count(*) >= 2
Step 3:对结果进行分组,按照 deviceid
和 dt_sub
进行分组,并计算了每个分组中的行数,即 count_n
列。
deviceid dt_sub count_n
deviceid1 2022-10-31 3
deviceid1 2022-11-01 6
deviceid2 2022-10-31 3
deviceid2 2022-11-01 6
Step 4:对满足条件的分组再次进行了 GROUP BY deviceid
,统计每个 deviceid
对应的分组数,即为需求用户数。
SELECT deviceid,count(*)
from (
SELECT deviceid, dt_sub,count(*) count_n
from (
SELECT deviceid , dt
,date_sub(dt,ROW_NUMBER() over(PARTITION by deviceid ORDER by dt)) as dt_sub
,ROW_NUMBER() over(PARTITION by deviceid ORDER by dt) as rn
from (
select deviceid,dt
from tmp.test
where dt >= date_sub('2022-11-15',30)
group by deviceid,dt
)t1
)t2
group by deviceid,dt_sub
having count(*) >= 2
)t3
group by deviceid
代码语言:javascript复制deviceid count(*)
deviceid1 2
deviceid2 2
三、需求二
2.1 统计最近一个月内,每个用户的活跃总天数
2.1.1 Step 1
代码语言:javascript复制select deviceid,dt
from tmp.test
where dt >= date_sub('2022-11-15',30)
这行语句可以理解为统计最近 11 月 15 日前的 30 天内的用户数据。它使用了 date_sub()
函数来计算日期,并使用 >=
运算符筛选出符合条件的数据,即日期大于等于最近的 11 月 15 日前的 30 天。这样就可以获取最近一个月内的用户数据。
deviceid dt
deviceid1 2022-11-01
deviceid1 2022-11-01
deviceid1 2022-11-02
deviceid1 2022-11-02
deviceid1 2022-11-03
deviceid1 2022-11-05
deviceid1 2022-11-05
deviceid1 2022-11-05
deviceid1 2022-11-06
deviceid1 2022-11-07
deviceid1 2022-11-07
deviceid1 2022-11-08
deviceid1 2022-11-09
deviceid1 2022-11-10
deviceid2 2022-11-01
deviceid2 2022-11-01
deviceid2 2022-11-02
deviceid2 2022-11-02
deviceid2 2022-11-03
deviceid2 2022-11-05
deviceid2 2022-11-05
deviceid2 2022-11-05
deviceid2 2022-11-06
deviceid2 2022-11-07
deviceid2 2022-11-07
deviceid2 2022-11-08
deviceid2 2022-11-09
deviceid2 2022-11-10
2.1.2 Step 2
代码语言:javascript复制SELECT deviceid, COUNT(DISTINCT dt) AS active_days
from (
select deviceid,dt
from tmp.test
where dt >= date_sub('2022-11-15',30)
)t1
GROUP BY deviceid;
按照 deviceid
进行分组,并使用 COUNT(DISTINCT dt)
函数计算每个设备的唯一日期数,即活跃天数。结果将返回每个设备和其对应的活跃天数。
deviceid active_days
deviceid1 9
deviceid2 9
四、需求三
4.1 最近一个月内,每个用户的最大连续登陆天数
代码语言:javascript复制select deviceid,max(count_n) max_days
from (
SELECT deviceid, dt_sub,count(*) count_n
from (
SELECT deviceid , dt
,date_sub(dt,ROW_NUMBER() over(PARTITION by deviceid ORDER by dt)) as dt_sub
,ROW_NUMBER() over(PARTITION by deviceid ORDER by dt) as rn
from (
select deviceid,dt
from tmp.test
where dt >= date_sub('2022-11-15',30)
group by deviceid,dt
)t1
)t2
group by deviceid,dt_sub
)t3
group by deviceid
这个需求比较简单,在需求一的基础上,增加对deviceid
的分组和对count_n
的求最大值即可
deviceid max_days
deviceid1 6
deviceid2 6
五、需求四
5.1 最近一个月内,连续活跃[1-3]天的人数,[4-6]天的人数,[7 天的人数
代码语言:javascript复制SELECT x1,count(x1)
from (
SELECT deviceid,dt_sub,active_days,
case when active_days >= 1 and active_days <= 3 then '[1-3]'
when active_days >= 4 and active_days <= 6 then '[4-6]'
when active_days >= 7 then '[7 ]' end x1
from (
SELECT deviceid,dt_sub,count(*) as active_days
FROM (
SELECT deviceid, dt,
ROW_NUMBER() OVER (PARTITION BY deviceid ORDER BY dt) as rn,
date_sub(dt, ROW_NUMBER() OVER (PARTITION BY deviceid ORDER BY dt)) AS dt_sub
FROM (
SELECT deviceid, dt
FROM tmp.test
WHERE dt >= date_sub('2022-11-15', 30)
GROUP BY deviceid, dt
) t1
) t2
group by deviceid,dt_sub
)t3
)t4
group by x1
Step 1
仍然是先计算出连续活跃的天数,在内层查询:
代码语言:javascript复制deviceid dt_sub active_days
deviceid1 2022-10-31 3
deviceid1 2022-11-01 6
deviceid2 2022-10-31 3
deviceid2 2022-11-01 6
Step 2
使用case when 进行如下结果样例操作,可以进行行转列:
代码语言:javascript复制/**
* 区间 人数
* [1-3] 2
* [4-6] 3
* [7 4
*/
/**
* deviceid1 [1-3]
deviceid1 [4-6]
deviceid2 [1-3]
deviceid2 [4-6]
*/
六、需求五
6.1 最近30天内,沉默天数超过3天的有多少人,超过5天有多少人
6.1.1 更换表
更换满足本需求案例的新的数据和表。
代码语言:javascript复制CREATE TABLE tmp.testdt (
deviceid VARCHAR(255),
dt DATE
);
INSERT INTO tmp.testdt (deviceid, dt) VALUES
('deviceid1', '2023-06-01'),
('deviceid1', '2023-06-02'),
('deviceid1', '2023-06-03'),
('deviceid1', '2023-06-29'),
('deviceid1', '2023-06-30');
6.2 需求实现
代码语言:javascript复制select count(DISTINCT if(dt_diff > 3,deviceid,null)) as `超过3天`,
count(DISTINCT if(dt_diff > 5,deviceid,null)) as `超过5天`
from (
select deviceid ,dt,pre_date,datediff(date_sub(dt,1),pre_date) as dt_diff
from (
SELECT deviceid,dt, LAG(dt,1,dt) OVER (partition by deviceid ORDER BY dt) AS pre_date
from (
select deviceid,dt
from (
select deviceid,dt
from tmp.testdt where dt >= date_sub('2023-06-30',30)
group by deviceid,dt
)t1
)t2
)t3
)t4
6.3 代码技术点
代码语言:javascript复制SELECT deviceid,dt, LAG(dt,1,dt) OVER (partition by deviceid ORDER BY dt) AS pre_date
from (
select deviceid,dt
from (
select deviceid,dt
from tmp.testdt where dt >= date_sub('2023-06-30',30)
group by deviceid,dt
)t1
)t2
Step 1:
以上查询用lag开窗,求出了给定date类型列中每个日期的前一个出现的日期,仅在本列中求出。查询结果如下:
代码语言:javascript复制deviceid dt pre_date
deviceid1 2023-06-01 2023-06-01
deviceid1 2023-06-02 2023-06-01
deviceid1 2023-06-03 2023-06-02
deviceid1 2023-06-29 2023-06-03
deviceid1 2023-06-30 2023-06-29
Step 2:
使用datediff
计算出dt
和pre_date
的间隔日期,即为沉默的天数。
deviceid dt pre_date dt_diff
deviceid1 2023-06-01 2023-06-01 -1
deviceid1 2023-06-02 2023-06-01 0
deviceid1 2023-06-03 2023-06-02 0
deviceid1 2023-06-29 2023-06-03 25
deviceid1 2023-06-30 2023-06-29 0
最后通过count(DISTINCT if(dt_diff > 3,deviceid,null)) as
超过3天``给定查找的格式字段,即可求出。
超过3天 超过5天
1 1