(五)进阶技术 9. 杂项维度 本篇讨论杂项维度。简单地说,杂项维度就是一种包含的数据具有很少可能值的维度。例如销售订单,它可能有很多离散数据(yes-no这种类型的值),如
- verification_ind(如果订单已经被审核,值为yes)
- credit_check_flag(表示此订单的客户信用状态是否已经检查)
- new_customer_ind(如果这是新客户的首个订单,值为yes)
- web_order_flag(表示此订单是否是在线下的订单)
这类数据常被用于增强销售分析,应该用称为杂项维度的特殊维度类型存储。 新增销售订单属性杂项维度 给现有的数据仓库新增一个销售订单杂项维度,需要新增一个名为sales_order_attribute_dim的维度表。图(五)- 9-1显示了增加杂项维度表后的数据仓库模式(这里只显示了和销售订单属性相关的表)。
图(五)- 9-1
新的维度表包括四个yes-no列:verification_ind、credit_check_flag、new_customer_ind和web_order_flag。每个列可以有两个可能值中的一个(Y 或 N),因此sales_order_attribute_dim表最多有16(2^4)行。可以预装载这个维度,并且只需装载一次。 注意 如果知道某种组合是不可能出现的,就不需要装载这种组合。执行清单(五)- 9-1里的脚本修改数据库模式。这个脚本做了四项工作:建立sales_order_attribute_dim表,向表中预装载全部16种可能的组合,给销售订单事实表添加杂项维度代理键,给源数据库里的sales_order表增加对应的四个属性列。
代码语言:javascript复制USE dw;
-- 建立杂项维度表
CREATE TABLE sales_order_attribute_dim (
sales_order_attribute_sk INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
verification_ind CHAR(1),
credit_check_flag CHAR(1),
new_customer_ind CHAR(1),
web_order_flag CHAR(1),
version int,
effective_date DATE,
expiry_date DATE
);
-- 生成杂项维度数据
INSERT INTO sales_order_attribute_dim VALUES
(NULL, 'Y', 'N', 'N', 'N', 1,'1900-00-00', '2200-01-01')
, (NULL, 'Y', 'Y', 'N', 'N', 1,'1900-00-00', '2200-01-01')
, (NULL, 'Y', 'Y', 'Y', 'N', 1,'1900-00-00', '2200-01-01')
, (NULL, 'Y', 'Y', 'Y', 'Y', 1,'1900-00-00', '2200-01-01')
, (NULL, 'Y', 'N', 'Y', 'N', 1,'1900-00-00', '2200-01-01')
, (NULL, 'Y', 'N', 'Y', 'Y', 1,'1900-00-00', '2200-01-01')
, (NULL, 'Y', 'N', 'N', 'Y', 1,'1900-00-00', '2200-01-01')
, (NULL, 'Y', 'Y', 'N', 'Y', 1,'1900-00-00', '2200-01-01')
, (NULL, 'N', 'N', 'N', 'N', 1,'1900-00-00', '2200-01-01')
, (NULL, 'N', 'Y', 'N', 'N', 1,'1900-00-00', '2200-01-01')
, (NULL, 'N', 'Y', 'Y', 'N', 1,'1900-00-00', '2200-01-01')
, (NULL, 'N', 'Y', 'Y', 'Y', 1,'1900-00-00', '2200-01-01')
, (NULL, 'N', 'N', 'Y', 'N', 1,'1900-00-00', '2200-01-01')
, (NULL, 'N', 'N', 'Y', 'Y', 1,'1900-00-00', '2200-01-01')
, (NULL, 'N', 'N', 'N', 'Y', 1,'1900-00-00', '2200-01-01')
, (NULL, 'N', 'Y', 'N', 'Y', 1,'1900-00-00', '2200-01-01');
COMMIT;
-- 建立杂项维度外键
ALTER TABLE sales_order_fact ADD sales_order_attribute_sk INT AFTER product_sk;
ALTER TABLE sales_order_fact ADD FOREIGN KEY (sales_order_attribute_sk)
REFERENCES sales_order_attribute_dim(sales_order_attribute_sk ) ON DELETE CASCADE ON UPDATE CASCADE ;
-- 给源库的销售订单表增加对应的属性
USE source;
ALTER TABLE sales_order
ADD verification_ind CHAR (1) AFTER product_code
, ADD credit_check_flag CHAR (1) AFTER verification_ind
, ADD new_customer_ind CHAR (1) AFTER credit_check_flag
, ADD web_order_flag CHAR (1) AFTER new_customer_ind ;
清单(五)- 9-1
修改定期装载脚本 由于有了一个新的维度,必须修改定期装载脚本。清单(五)- 9-2显示修改后的脚本。
代码语言:javascript复制USE dw;
-- 设置SCD的截止时间和生效时间
SET @pre_date = SUBDATE(CURRENT_DATE,1) ;
-- 设置CDC的上限时间
UPDATE cdc_time SET current_load = CURRENT_DATE ;
-- 装载客户维度
TRUNCATE TABLE customer_stg;
INSERT INTO customer_stg
SELECT
customer_number
, customer_name
, customer_street_address
, customer_zip_code
, customer_city
, customer_state
, shipping_address
, shipping_zip_code
, shipping_city
, shipping_state
FROM source.customer ;
/* 在所有地址列上 SCD2 */
/* 置过期 */
UPDATE customer_dim a,
customer_stg b
SET
expiry_date = @pre_date
WHERE
a.customer_number = b.customer_number
AND (a.customer_street_address <> b.customer_street_address
OR a.customer_city <> b.customer_city
OR a.customer_zip_code <> b.customer_zip_code
OR a.customer_state <> b.customer_state
OR a.shipping_address <> b.shipping_address
OR a.shipping_city <> b.shipping_city
OR a.shipping_zip_code <> b.shipping_zip_code
OR a.shipping_state <> b.shipping_state
OR a.shipping_address IS NULL
OR a.shipping_city IS NULL
OR a.shipping_zip_code IS NULL
OR a.shipping_state IS NULL)
AND expiry_date = '2200-01-01';
/* 加新行 */
INSERT INTO customer_dim
SELECT
NULL
, b.customer_number
, b.customer_name
, b.customer_street_address
, b.customer_zip_code
, b.customer_city
, b.customer_state
, b.shipping_address
, b.shipping_zip_code
, b.shipping_city
, b.shipping_state
, a.version 1
, @pre_date
, '2200-01-01'
FROM
customer_dim a
, customer_stg b
WHERE
a.customer_number = b.customer_number
AND ( a.customer_street_address <> b.customer_street_address
OR a.customer_city <> b.customer_city
OR a.customer_zip_code <> b.customer_zip_code
OR a.customer_state <> b.customer_state
OR a.shipping_address <> b.shipping_address
OR a.shipping_city <> b.shipping_city
OR a.shipping_zip_code <> b.shipping_zip_code
OR a.shipping_state <> b.shipping_state
OR a.shipping_address IS NULL
OR a.shipping_city IS NULL
OR a.shipping_zip_code IS NULL
OR a.shipping_state IS NULL)
AND EXISTS(
SELECT *
FROM customer_dim x
WHERE
b.customer_number=x.customer_number
AND a.expiry_date = @pre_date )
AND NOT EXISTS (
SELECT *
FROM customer_dim y
WHERE
b.customer_number = y.customer_number
AND y.expiry_date = '2200-01-01') ;
/* 在 customer_name 列上 SCD1 */
UPDATE customer_dim a, customer_stg b
SET a.customer_name = b.customer_name
WHERE a.customer_number = b.customer_number
AND a.customer_name <> b.customer_name ;
/* 新增的客户 */
INSERT INTO customer_dim
SELECT
NULL
, customer_number
, customer_name
, customer_street_address
, customer_zip_code
, customer_city
, customer_state
, shipping_address
, shipping_zip_code
, shipping_city
, shipping_state
, 1
, @pre_date
,'2200-01-01'
FROM customer_stg
WHERE customer_number NOT IN(
SELECT y.customer_number
FROM customer_dim x, customer_stg y
WHERE x.customer_number = y.customer_number) ;
/* 重建PA客户维度 */
TRUNCATE pa_customer_dim;
INSERT INTO pa_customer_dim
SELECT
customer_sk
, customer_number
, customer_name
, customer_street_address
, customer_zip_code
, customer_city
, customer_state
, shipping_address
, shipping_zip_code
, shipping_city
, shipping_state
, version
, effective_date
, expiry_date
FROM customer_dim
WHERE customer_state = 'PA' ;
/* 装载产品维度 */
TRUNCATE TABLE product_stg ;
INSERT INTO product_stg
SELECT
product_code
, product_name
, product_category
FROM source.product ;
/* 在 product_name 和 product_category 列上 SCD2 */
/* 置过期 */
UPDATE
product_dim a
, product_stg b
SET
expiry_date = @pre_date
WHERE
a.product_code = b.product_code
AND ( a.product_name <> b.product_name
OR a.product_category <> b.product_category)
AND expiry_date = '2200-01-01';
/* 加新行 */
INSERT INTO product_dim
SELECT
NULL
, b.product_code
, b.product_name
, b.product_category
, a.version 1
, @pre_date
,'2200-01-01'
FROM
product_dim a
, product_stg b
WHERE
a.product_code = b.product_code
AND ( a.product_name <> b.product_name
OR a.product_category <> b.product_category)
AND EXISTS(
SELECT *
FROM product_dim x
WHERE b.product_code = x.product_code
AND a.expiry_date = @pre_date)
AND NOT EXISTS (
SELECT *
FROM product_dim y
WHERE b.product_code = y.product_code
AND y.expiry_date = '2200-01-01') ;
/* 新增的产品 */
INSERT INTO product_dim
SELECT
NULL
, product_code
, product_name
, product_category
, 1
, @pre_date
, '2200-01-01'
FROM product_stg
WHERE product_code NOT IN(
SELECT y.product_code
FROM product_dim x, product_stg y
WHERE x.product_code = y.product_code) ;
-- 装载事实表,新增前一天的订单
INSERT INTO sales_order_fact
SELECT
customer_sk
, product_sk
, g.sales_order_attribute_sk
, e.order_date_sk
, NULL
, NULL
, NULL
, NULL
, a.order_number
, f.request_delivery_date_sk
, order_amount
, quantity
, NULL
, NULL
, NULL
, NULL
FROM
source.sales_order a
, customer_dim c
, product_dim d
, order_date_dim e
, request_delivery_date_dim f
, sales_order_attribute_dim g
, cdc_time h
WHERE
a.order_status = 'N'
AND a.customer_number = c.customer_number
AND a.status_date >= c.effective_date
AND a.status_date < c.expiry_date
AND a.product_code = d.product_code
AND a.status_date >= d.effective_date
AND a.status_date < d.expiry_date
AND a.status_date = e.order_date
AND a.request_delivery_date = f.request_delivery_date
AND a.verification_ind = g.verification_ind
AND a.credit_check_flag = g.credit_check_flag
AND a.new_customer_ind = g.new_customer_ind
AND a.web_order_flag = g.web_order_flag
AND a.entry_date >= h.last_load AND a.entry_date < h.current_load ;
/* UPDATING the new sales order to Allocated status */
UPDATE sales_order_fact a,
source.sales_order b,
allocate_date_dim c,
cdc_time h
SET
a.allocate_date_sk = c.allocate_date_sk,
a.allocate_quantity = b.quantity
WHERE
order_status = 'A'
AND b.entry_date >= h.last_load AND b.entry_date < h.current_load
AND b.order_number = a.order_number
AND c.allocate_date = b.status_date ;
/* UPDATING the allocated order to Packed status */
UPDATE sales_order_fact a,
source.sales_order b,
packing_date_dim d,
cdc_time h
SET
a.packing_date_sk = d.packing_date_sk,
a.packing_quantity = b.quantity
WHERE
order_status = 'P'
AND b.entry_date >= h.last_load AND b.entry_date < h.current_load
AND b.order_number = a.order_number
AND d.packing_date = b.status_date ;
/* UPDATING the packed order to Shipped status */
UPDATE sales_order_fact a,
source.sales_order b,
ship_date_dim e,
cdc_time h
SET
a.ship_date_sk = e.ship_date_sk,
a.ship_quantity = b.quantity
WHERE
order_status = 'S'
AND b.entry_date >= h.last_load AND b.entry_date < h.current_load
AND b.order_number = a.order_number
AND e.ship_date = b.status_date ;
/* UPDATING the shipped order to Received status */
UPDATE sales_order_fact a,
source.sales_order b,
receive_date_dim f,
cdc_time h
SET
a.receive_date_sk = f.receive_date_sk,
a.receive_quantity = b.quantity
WHERE
order_status = 'R'
AND b.entry_date >= h.last_load AND b.entry_date < h.current_load
AND b.order_number = a.order_number
AND f.receive_date = b.status_date ;
-- 更新时间戳表的last_load字段
UPDATE cdc_time SET last_load = current_load ;
COMMIT ;
清单(五)- 9-2
图(五)- 9-2到图(五)- 9-5显示了对Kettle定时装载的修改。
图(五)- 9-2
图(五)- 9-3
图(五)- 9-4
图(五)- 9-5
测试修改后的定期装载 现在使用清单(五)- 9-3里的脚本添加八个销售订单。
代码语言:javascript复制USE source;
INSERT INTO customer
(
customer_number
, customer_name
, customer_street_address
, customer_zip_code
, customer_city
, customer_state
, shipping_address
, shipping_zip_code
, shipping_city
, shipping_state)
VALUES
(10, 'Bigger Customers', '7777 Ridge Rd.', '44102',
'Cleveland', 'OH', '7777 Ridge Rd.', '44102', 'Cleveland',
'OH')
, (11, 'Smaller Stores', '8888 Jennings Fwy.', '44102',
'Cleveland', 'OH', '8888 Jennings Fwy.', '44102',
'Cleveland', 'OH')
, (12, 'Small-Medium Retailers', '9999 Memphis Ave.', '44102',
'Cleveland', 'OH', '9999 Memphis Ave.', '44102', 'Cleveland',
'OH') ,
(13, 'PA Customer', '1111 Louise Dr.', '17050',
'Mechanicsburg', 'PA', '1111 Louise Dr.', '17050',
'Mechanicsburg', 'PA')
, (14, 'OH Customer', '6666 Ridge Rd.', '44102',
'Cleveland', 'OH', '6666 Ridge Rd.', '44102',
'Cleveland', 'OH') ;
INSERT INTO sales_order VALUES
(54, 1, 1, 'Y', 'Y', 'N', 'Y', '2015-03-16', 'N', '2015-03-20',
'2015-03-16', 7500, 75)
, (55, 2, 2, 'N', 'N', 'N', 'N', '2015-03-16', 'N', '2015-03-20',
'2015-03-16', 1000, 10)
, (56, 3, 3, 'Y', 'Y', 'N', 'N', '2015-03-16', 'N', '2015-03-20',
'2015-03-16', 7500, 75)
, (57, 4, 4, 'Y', 'N', 'N', 'N', '2015-03-16', 'N', '2015-03-20',
'2015-03-16', 1000, 10)
, (58, 11, 1, 'N', 'Y', 'Y', 'Y', '2015-03-16', 'N', '2015-03-20',
'2015-03-16', 7500, 75)
, (59, 12, 2, 'N', 'Y', 'Y', 'N', '2015-03-16', 'N', '2015-03-20',
'2015-03-16', 1000, 10)
, (60, 13, 3, 'Y', 'Y', 'Y', 'N', '2015-03-16', 'N', '2015-03-20',
'2015-03-16', 7500, 75)
, (61, 14, 4, 'Y', 'N', 'Y', 'N', '2015-03-16', 'N', '2015-03-20',
'2015-03-16', 1000, 10)
;
COMMIT;
清单(五)- 9-3
现在把系统日期设置为2015年3月17日,然后再执行清单(五)- 9-2里的脚本或对应的Kettle作业。 可以使用清单(五)- 9-4里的分析性查询确认装载正确。该查询分析出检查了信用状态的新用户有多少销售订单。查询结果如图(五)- 9-6所示。
代码语言:javascript复制USE dw;
SELECT
CONCAT(ROUND(checked / (checked not_checked) * 100),' % ')
FROM
(SELECT
COUNT(*) checked
FROM
sales_order_fact a, sales_order_attribute_dim b
WHERE
new_customer_ind = 'Y'
and credit_check_flag = 'Y'
AND a.sales_order_attribute_sk = b.sales_order_attribute_sk) x,
(SELECT
COUNT(*) not_checked
FROM
sales_order_fact a, sales_order_attribute_dim b
WHERE
new_customer_ind = 'Y'
and credit_check_flag = 'N'
AND a.sales_order_attribute_sk = b.sales_order_attribute_sk) y;
清单(五)- 9-4
图(五)- 9-6