在数据治理过程中,除了按照业务质量要求编制数据质量规则脚本,通过数据质量引擎对数据开展校核生成问题清单以外,还要对问题数据进行下发、整改、以行政方式为主进行督办,这时候对问题数据的追踪和多版本管理就是主要解决的技术问题了。本文就是实现通过脚本实现问题清单跟踪。
问题数据的跟踪还是比较麻烦的,有存量的,有新增的,有治理过的,还有治理错的,几种情况掺杂在一起,逻辑上不容易理解。
思考了几天,想到了几个指标,当日存量问题数据,历史存量问题数据,当日新增问题数据,历史治理问题数据,当天治理问题数据,再按照时间维度、数据主体方等多个维度进行统计。
创建一张数据表,并构造了一些问题数据,如下:
代码语言:javascript复制CREATE TABLE detailtable(statdate VARCHAR2(10),id INTEGER);
TRUNCATE TABLE detailtable;
INSERT INTO detailtable VALUES('2021-11-01', 1 );
INSERT INTO detailtable VALUES('2021-11-01', 2 );
INSERT INTO detailtable VALUES('2021-11-01', 3 );
INSERT INTO detailtable VALUES('2021-11-01', 4 );
INSERT INTO detailtable VALUES('2021-11-01', 5 );
INSERT INTO detailtable VALUES('2021-11-01', 6 );
INSERT INTO detailtable VALUES('2021-11-01', 7 );
INSERT INTO detailtable VALUES('2021-11-01', 8 );
INSERT INTO detailtable VALUES('2021-11-01', 9 );
INSERT INTO detailtable VALUES('2021-11-01', 10 );
INSERT INTO detailtable VALUES('2021-11-02', 1 );
INSERT INTO detailtable VALUES('2021-11-02', 2 );
INSERT INTO detailtable VALUES('2021-11-02', 3 );
INSERT INTO detailtable VALUES('2021-11-02', 4 );
INSERT INTO detailtable VALUES('2021-11-02', 7 );
INSERT INTO detailtable VALUES('2021-11-02', 8 );
INSERT INTO detailtable VALUES('2021-11-02', 9 );
INSERT INTO detailtable VALUES('2021-11-02', 10 );
INSERT INTO detailtable VALUES('2021-11-02', 11 );
INSERT INTO detailtable VALUES('2021-11-03', 1 );
INSERT INTO detailtable VALUES('2021-11-03', 2 );
INSERT INTO detailtable VALUES('2021-11-03', 3 );
INSERT INTO detailtable VALUES('2021-11-03', 7 );
INSERT INTO detailtable VALUES('2021-11-03', 8 );
INSERT INTO detailtable VALUES('2021-11-03', 9 );
INSERT INTO detailtable VALUES('2021-11-03', 10 );
INSERT INTO detailtable VALUES('2021-11-03', 12 );
INSERT INTO detailtable VALUES('2021-11-04', 1 );
INSERT INTO detailtable VALUES('2021-11-04', 2 );
INSERT INTO detailtable VALUES('2021-11-04', 3 );
INSERT INTO detailtable VALUES('2021-11-04', 7 );
INSERT INTO detailtable VALUES('2021-11-04', 8 );
INSERT INTO detailtable VALUES('2021-11-04', 9 );
INSERT INTO detailtable VALUES('2021-11-04', 10 );
INSERT INTO detailtable VALUES('2021-11-04', 12 );
INSERT INTO detailtable VALUES('2021-11-04', 13 );
INSERT INTO detailtable VALUES('2021-11-05', 1 );
INSERT INTO detailtable VALUES('2021-11-05', 2 );
INSERT INTO detailtable VALUES('2021-11-05', 8 );
INSERT INTO detailtable VALUES('2021-11-05', 9 );
INSERT INTO detailtable VALUES('2021-11-05', 10 );
INSERT INTO detailtable VALUES('2021-11-05', 12 );
INSERT INTO detailtable VALUES('2021-11-05', 13 );
INSERT INTO detailtable VALUES('2021-11-06', 1 );
INSERT INTO detailtable VALUES('2021-11-06', 2 );
INSERT INTO detailtable VALUES('2021-11-06', 8 );
INSERT INTO detailtable VALUES('2021-11-06', 9 );
INSERT INTO detailtable VALUES('2021-11-06', 10 );
INSERT INTO detailtable VALUES('2021-11-06', 12 );
INSERT INTO detailtable VALUES('2021-11-06', 14 );
INSERT INTO detailtable VALUES('2021-11-07', 1 );
INSERT INTO detailtable VALUES('2021-11-07', 2 );
INSERT INTO detailtable VALUES('2021-11-07', 8 );
INSERT INTO detailtable VALUES('2021-11-07', 10 );
INSERT INTO detailtable VALUES('2021-11-07', 12 );
INSERT INTO detailtable VALUES('2021-11-07', 14 );
INSERT INTO detailtable VALUES('2021-11-08', 1 );
INSERT INTO detailtable VALUES('2021-11-08', 2 );
INSERT INTO detailtable VALUES('2021-11-08', 8 );
INSERT INTO detailtable VALUES('2021-11-08', 12 );
INSERT INTO detailtable VALUES('2021-11-08', 13 );
INSERT INTO detailtable VALUES('2021-11-08', 14 );
INSERT INTO detailtable VALUES('2021-11-09', 1 );
INSERT INTO detailtable VALUES('2021-11-09', 2 );
INSERT INTO detailtable VALUES('2021-11-09', 8 );
INSERT INTO detailtable VALUES('2021-11-09', 12 );
INSERT INTO detailtable VALUES('2021-11-09', 13 );
INSERT INTO detailtable VALUES('2021-11-09', 14 );
INSERT INTO detailtable VALUES('2021-11-10', 1 );
INSERT INTO detailtable VALUES('2021-11-10', 2 );
INSERT INTO detailtable VALUES('2021-11-10', 8 );
INSERT INTO detailtable VALUES('2021-11-10', 12 );
INSERT INTO detailtable VALUES('2021-11-10', 14 );
COMMIT;
按照之前的逻辑,实现各指标的统计,代码如下:
第一天的指标统计
代码语言:javascript复制--基准日2021-11-01
select count(*) from detailtable t
where t.statdate='2021-11-01';
--当日存量问题数据10
--历史存量问题数据10
--当日新增问题数据10
--历史治理问题数据0
--当天治理问题数据0
后续各天的数据统计,如下:
代码语言:javascript复制--第二天2021-11-02
--当日存量问题数据
select count(*) as "当日存量问题数据"
from detailtable t --当天
left outer join detailtable l on t.id=l.id and l.statdate='2021-11-01'--前一天
where t.statdate='2021-11-02';
--历史存量问题数据
select count(distinct id) as "历史存量问题数据"
from detailtable t
where t.statdate<='2021-11-02' ;
--当日新增问题数据
select count(*) as "当日新增问题数据"
from
(
select id from detailtable t where t.statdate='2021-11-02'
minus
select id from detailtable t where t.statdate='2021-11-01'
) t;
--当日治理问题数据
select count(*) as "当日治理问题数据"
from
(
select id from detailtable t where t.statdate='2021-11-01'
minus
select id from detailtable t where t.statdate='2021-11-02'
) t;
--历史治理问题数据
select count(*) as "历史治理问题数据"
from
(
select id from detailtable t where t.statdate<='2021-11-02'
minus
select id from detailtable t where t.statdate='2021-11-02'
) t;
--当日存量问题数据10
--历史存量问题数据11
--当日新增问题数据1
--当天治理问题数据2
--历史治理问题数据2
后续各天的数据统计,如下:
代码语言:javascript复制--第三天2021-11-03
--当日存量问题数据
select count(*) as "当日存量问题数据"
from detailtable t --当天
left outer join detailtable l on t.id=l.id and l.statdate='2021-11-02'--前一天
where t.statdate='2021-11-03';
--历史存量问题数据
select count(distinct id) as "历史存量问题数据"
from detailtable t
where t.statdate<='2021-11-03' ;
--当日新增问题数据
select count(*) as "当日新增问题数据"
from
(
select id from detailtable t where t.statdate='2021-11-03'
minus
select id from detailtable t where t.statdate='2021-11-02'
) t;
--当日治理问题数据
select count(*) as "当日治理问题数据"
from
(
select id from detailtable t where t.statdate='2021-11-02'
minus
select id from detailtable t where t.statdate='2021-11-03'
) t;
--历史治理问题数据
select count(*) as "历史治理问题数据"
from
(
select id from detailtable t where t.statdate<='2021-11-03'
minus
select id from detailtable t where t.statdate='2021-11-03'
) t;
--当日存量问题数据8
--历史存量问题数据12
--当日新增问题数据1
--当天治理问题数据2
--历史治理问题数据4
一个简单的问题多版本跟踪和闭环管理就完成了,后续再深入思考一下