Hive 数据检查的常见方式
主键上的数据是否有重复
代码语言:sql
-- Primary-key duplicate check: list every phone_segment value that occurs
-- more than once in dw.dim_phone_segment_info, together with its row count.
-- An empty result means phone_segment is unique in the table.
-- The aggregate is repeated in HAVING instead of referencing the select alias,
-- so the filter does not depend on alias resolution in the HAVING clause.
select
    phone_segment,
    count(1) as cnt
from dw.dim_phone_segment_info
group by phone_segment
having count(1) > 1;
对比去重前后的数据量
代码语言:sql
-- Compare the row count before and after de-duplication on phone_num.
-- total_cnt counts every row, distinct_cnt counts distinct non-NULL phone_num
-- values. count(distinct ...) skips NULLs, so a gap between the two numbers
-- can come from NULL phone_num rows as well as from real duplicates.
select
    count(1) as total_cnt,
    count(distinct phone_num) as distinct_cnt
from dw.dim_phone_profile;
取每个分组中的唯一一条
代码语言:sql
-- Rebuild temp.zhjq_tmp_cc_phone with exactly one row per user_num, taken
-- from temp.icsoc_call_detail_bill_bill201807.
--   * Rows whose user_num is NULL or blank after trimming are excluded.
--   * Within each user_num the row with the greatest time_start is kept
--     (rn = 1). row_number() picks one row arbitrarily among rows tied on
--     time_start, so add a tie-breaker column to the ordering if a
--     reproducible choice is required.
--   * Province and city fall back to the literal 'unknow' when the source
--     value is NULL or blank. Non-blank values are stored as-is (untrimmed).
--     NOTE(review): 'unknow' looks like a misspelling of 'unknown', but it is
--     kept unchanged because existing consumers may match on it - confirm
--     before renaming.
-- "if exists" keeps the script re-runnable when the table is not there yet.
drop table if exists temp.zhjq_tmp_cc_phone;
create table temp.zhjq_tmp_cc_phone as
select
    ranked.user_num,
    ranked.user_province,
    ranked.user_city
from (
    select
        a.user_num,
        case
            when length(trim(a.user_area_province)) > 0 then a.user_area_province
            else 'unknow'
        end as user_province,
        case
            when length(trim(a.user_area_city)) > 0 then a.user_area_city
            else 'unknow'
        end as user_city,
        row_number() over (
            partition by a.user_num
            order by a.time_start desc
        ) as rn
    from temp.icsoc_call_detail_bill_bill201807 a
    where length(trim(a.user_num)) > 0
) ranked
where ranked.rn = 1;