(五)进阶技术 1. 增加列 数据仓库最常碰到的扩展是给一个已经存在的维度表和事实表添加列。本篇先讨论如果需要增加列,模式会发生怎样的变化。然后进一步说明如何在客户维度和销售订单事实表上添加列,并在新列上应用SCD2。假设需要在客户维度中增加送货地址属性,并在销售订单事实表中增加数量度量值。
修改数据库模式
图(五)- 1-1 显示了修改后的模式,在它的customer_dim表和sales_order_fact表上增加了新列。customer_dim表增加的新列是shipping_address、shipping_zip_code、shipping_city和shipping_state。sales_order_fact表增加的新列是order_quantity。使用清单(五)-1-1里的SQL脚本修改数据库模式。
图(五)- 1-1
代码语言:javascript复制USE dw;
ALTER TABLE customer_dim
ADD shipping_address VARCHAR (50) AFTER customer_state
, ADD shipping_zip_code INT (5) AFTER shipping_address
, ADD shipping_city VARCHAR (30) AFTER shipping_zip_code
, ADD shipping_state VARCHAR (2) AFTER shipping_city ;
ALTER TABLE customer_stg
ADD shipping_address VARCHAR (50) AFTER customer_state
, ADD shipping_zip_code INT (5) AFTER shipping_address
, ADD shipping_city VARCHAR (30) AFTER shipping_zip_code
, ADD shipping_state VARCHAR (2) AFTER shipping_city ;
ALTER TABLE sales_order_fact
ADD order_quantity INT AFTER order_amount ;
USE source;
ALTER TABLE customer
ADD shipping_address VARCHAR (50) AFTER customer_state
, ADD shipping_zip_code INT (5) AFTER shipping_address
, ADD shipping_city VARCHAR (30) AFTER shipping_zip_code
, ADD shipping_state VARCHAR (2) AFTER shipping_city ;
ALTER TABLE sales_order
ADD order_quantity INT AFTER order_amount ;
清单(五)-1-1
修改定期装载脚本 修改数据库模式后,还要修改已经使用的定期装载脚本。清单(五)-1-2显示了修改后的定期装载SQL脚本。
代码语言:javascript复制USE dw;
-- 设置SCD的截止时间和生效时间
SET @pre_date = SUBDATE(CURRENT_DATE,1) ;
-- 设置CDC的上限时间
UPDATE cdc_time SET current_load = CURRENT_DATE ;
-- 装载客户维度
TRUNCATE TABLE customer_stg;
INSERT INTO customer_stg
SELECT
customer_number
, customer_name
, customer_street_address
, customer_zip_code
, customer_city
, customer_state
, shipping_address
, shipping_zip_code
, shipping_city
, shipping_state
FROM source.customer ;
/* 在所有地址列上 SCD2 */
/* 置过期 */
UPDATE customer_dim a,
customer_stg b
SET
expiry_date = @pre_date
WHERE
a.customer_number = b.customer_number
AND (a.customer_street_address <> b.customer_street_address
OR a.customer_city <> b.customer_city
OR a.customer_zip_code <> b.customer_zip_code
OR a.customer_state <> b.customer_state
OR a.shipping_address <> b.shipping_address
OR a.shipping_city <> b.shipping_city
OR a.shipping_zip_code <> b.shipping_zip_code
OR a.shipping_state <> b.shipping_state
OR a.shipping_address IS NULL
OR a.shipping_city IS NULL
OR a.shipping_zip_code IS NULL
OR a.shipping_state IS NULL)
AND expiry_date = '2200-01-01';
/* 加新行 */
INSERT INTO customer_dim
SELECT
NULL
, b.customer_number
, b.customer_name
, b.customer_street_address
, b.customer_zip_code
, b.customer_city
, b.customer_state
, b.shipping_address
, b.shipping_zip_code
, b.shipping_city
, b.shipping_state
, a.version 1
, @pre_date
, '2200-01-01'
FROM
customer_dim a
, customer_stg b
WHERE
a.customer_number = b.customer_number
AND ( a.customer_street_address <> b.customer_street_address
OR a.customer_city <> b.customer_city
OR a.customer_zip_code <> b.customer_zip_code
OR a.customer_state <> b.customer_state
OR a.shipping_address <> b.shipping_address
OR a.shipping_city <> b.shipping_city
OR a.shipping_zip_code <> b.shipping_zip_code
OR a.shipping_state <> b.shipping_state
OR a.shipping_address IS NULL
OR a.shipping_city IS NULL
OR a.shipping_zip_code IS NULL
OR a.shipping_state IS NULL)
AND EXISTS(
SELECT *
FROM customer_dim x
WHERE
b.customer_number=x.customer_number
AND a.expiry_date = @pre_date )
AND NOT EXISTS (
SELECT *
FROM customer_dim y
WHERE
b.customer_number = y.customer_number
AND y.expiry_date = '2200-01-01') ;
/* 在 customer_name 列上 SCD1 */
UPDATE customer_dim a, customer_stg b
SET a.customer_name = b.customer_name
WHERE a.customer_number = b.customer_number
AND a.customer_name <> b.customer_name ;
/* 新增的客户 */
INSERT INTO customer_dim
SELECT
NULL
, customer_number
, customer_name
, customer_street_address
, customer_zip_code
, customer_city
, customer_state
, shipping_address
, shipping_zip_code
, shipping_city
, shipping_state
, 1
, @pre_date
,'2200-01-01'
FROM customer_stg
WHERE customer_number NOT IN(
SELECT y.customer_number
FROM customer_dim x, customer_stg y
WHERE x.customer_number = y.customer_number) ;
/* 装载产品维度 */
TRUNCATE TABLE product_stg ;
INSERT INTO product_stg
SELECT
product_code
, product_name
, product_category
FROM source.product ;
/* 在 product_name 和 product_category 列上 SCD2 */
/* 置过期 */
UPDATE
product_dim a
, product_stg b
SET
expiry_date = @pre_date
WHERE
a.product_code = b.product_code
AND ( a.product_name <> b.product_name
OR a.product_category <> b.product_category)
AND expiry_date = '2200-01-01';
/* 加新行 */
INSERT INTO product_dim
SELECT
NULL
, b.product_code
, b.product_name
, b.product_category
, a.version 1
, @pre_date
,'2200-01-01'
FROM
product_dim a
, product_stg b
WHERE
a.product_code = b.product_code
AND ( a.product_name <> b.product_name
OR a.product_category <> b.product_category)
AND EXISTS(
SELECT *
FROM product_dim x
WHERE b.product_code = x.product_code
AND a.expiry_date = @pre_date)
AND NOT EXISTS (
SELECT *
FROM product_dim y
WHERE b.product_code = y.product_code
AND y.expiry_date = '2200-01-01') ;
/* 新增的产品 */
INSERT INTO product_dim
SELECT
NULL
, product_code
, product_name
, product_category
, 1
, @pre_date
, '2200-01-01'
FROM product_stg
WHERE product_code NOT IN(
SELECT y.product_code
FROM product_dim x, product_stg y
WHERE x.product_code = y.product_code) ;
-- 装载订单维度,新增前一天的订单号
INSERT INTO order_dim (
order_number
, effective_date
, expiry_date)
SELECT
order_number
, order_date
, '2200-01-01'
FROM source.sales_order, cdc_time
WHERE entry_date >= last_load AND entry_date < current_load ;
-- 装载事实表,新增前一天的订单
INSERT INTO sales_order_fact
SELECT
order_sk
, customer_sk
, product_sk
, date_sk
, order_amount
, order_quantity
FROM
source.sales_order a
, order_dim b
, customer_dim c
, product_dim d
, date_dim e
, cdc_time f
WHERE
a.order_number = b.order_number
AND a.customer_number = c.customer_number
AND a.order_date >= c.effective_date
AND a.order_date < c.expiry_date
AND a.product_code = d.product_code
AND a.order_date >= d.effective_date
AND a.order_date < d.expiry_date
AND a.order_date = e.date
AND a.entry_date >= f.last_load AND a.entry_date < f.current_load ;
-- 更新时间戳表的last_load字段
UPDATE cdc_time SET last_load = current_load ;
COMMIT ;
清单(五)-1-2
假设源数据里的客户送货地址可以为空,当客户在数据源里的送货地址变为有值时,就要更新这些数据仓库中已经存在的客户。如果还想维护送货地址的历史数据,就要在送货地址列上应用SCD2。还假设数量在销售订单源数据中是有效的,并且数据仓库中已经存在的销售订单不做更新。 测试步骤:
- 执行清单(五)- 1-3里的SQL脚本准备准备客户和销售订单测试数据。
- 设置系统日期为2015年3月3日。
- 执行清单(五)-1-2里的SQL脚本或Kettle步骤进行定期装载。
说明:Kettle的修改相对于SQL来说更容易,只需要对上一篇的三个步骤进行修改,这三个步骤分别是“装载过渡表”、“装载客户维度”、“装载事实表”,把新增加的列补充上即可。如图(五)- 1-2到(五)- 1-7显示了变化的步骤。
代码语言:javascript复制USE source;
/***
客户数据的改变如下:
更新已有八个客户的送货地址
新增客户9
***/
UPDATE customer SET
shipping_address = customer_street_address
, shipping_zip_code = customer_zip_code
, shipping_city = customer_city
, shipping_state = customer_state ;
INSERT INTO customer
(customer_name
, customer_street_address
, customer_zip_code
, customer_city
, customer_state
, shipping_address
, shipping_zip_code
, shipping_city
, shipping_state)
VALUES
('Online Distributors'
, '2323 Louise Dr.'
, 17055
, 'Pittsburgh'
, 'PA'
, '2323 Louise Dr.'
, 17055
, 'Pittsburgh'
, 'PA') ;
INSERT INTO sales_order VALUES
(38, 1, 1, '2015-03-02', '2015-03-02', 1000, 10)
, (39, 2, 2, '2015-03-02', '2015-03-02', 2000, 20)
, (40, 3, 3, '2015-03-02', '2015-03-02', 4000, 40)
, (41, 4, 4, '2015-03-02', '2015-03-02', 6000, 60)
, (42, 5, 1, '2015-03-02', '2015-03-02', 2500, 25)
, (43, 6, 2, '2015-03-02', '2015-03-02', 5000, 50)
, (44, 7, 3, '2015-03-02', '2015-03-02', 7500, 75)
, (45, 8, 4, '2015-03-02', '2015-03-02', 1000, 10)
, (46, 9, 1, '2015-03-02', '2015-03-02', 1000, 10) ;
COMMIT ;
清单(五)- 1-3
图(五)- 1-2
图(五)- 1-3
图(五)- 1-4
图(五)- 1-5
图(五)- 1-6
图(五)- 1-7
验证结果应该如下所示: mysql> select -> customer_number no, -> customer_name name, -> shipping_city, -> shipping_zip_code zip, -> shipping_state st, -> version ver, -> effective_date eff, -> expiry_date exp -> from -> customer_dim; ------ ------------------------ --------------- ------- ------ ----- ------------ ------------ | no | name | shipping_city | zip | st | ver | eff | exp | ------ ------------------------ --------------- ------- ------ ----- ------------ ------------ | 1 | Really Large Customers | NULL | NULL | NULL | 1 | 2013-03-01 | 2015-03-02 | | 2 | Small Stores | NULL | NULL | NULL | 1 | 2013-03-01 | 2015-03-02 | | 3 | Medium Retailers | NULL | NULL | NULL | 1 | 2013-03-01 | 2015-03-02 | | 4 | Good Companies | NULL | NULL | NULL | 1 | 2013-03-01 | 2015-03-02 | | 5 | Wonderful Shops | NULL | NULL | NULL | 1 | 2013-03-01 | 2015-03-02 | | 6 | Loyal Clients | NULL | NULL | NULL | 1 | 2013-03-01 | 2015-03-01 | | 7 | Distinguished Agencies | NULL | NULL | NULL | 1 | 2013-03-01 | 2015-03-02 | | 6 | Loyal Clients | NULL | NULL | NULL | 2 | 2015-03-01 | 2015-03-02 | | 8 | Subsidiaries | NULL | NULL | NULL | 1 | 2015-03-01 | 2015-03-02 | | 1 | Really Large Customers | Mechanicsburg | 17050 | PA | 2 | 2015-03-02 | 2200-01-01 | | 2 | Small Stores | Pittsburgh | 17055 | PA | 2 | 2015-03-02 | 2200-01-01 | | 3 | Medium Retailers | Pittsburgh | 17055 | PA | 2 | 2015-03-02 | 2200-01-01 | | 4 | Good Companies | Mechanicsburg | 17050 | PA | 2 | 2015-03-02 | 2200-01-01 | | 5 | Wonderful Shops | Mechanicsburg | 17050 | PA | 2 | 2015-03-02 | 2200-01-01 | | 6 | Loyal Clients | Pittsburgh | 17055 | PA | 3 | 2015-03-02 | 2200-01-01 | | 7 | Distinguished Agencies | Mechanicsburg | 17050 | PA | 2 | 2015-03-02 | 2200-01-01 | | 8 | Subsidiaries | Pittsburgh | 17055 | PA | 2 | 2015-03-02 | 2200-01-01 | | 9 | Online Distributors | Pittsburgh | 17055 | PA | 1 | 2015-03-02 | 2200-01-01 | ------ ------------------------ --------------- ------- ------ ----- ------------ ------------ 18 rows in set (0.00 sec) 已存在客户的新记录有了送货地址。老的(过期)记录没有。9号客户是新加的,具有运输地址。 mysql> select -> order_sk o_sk, -> customer_sk c_sk, -> product_sk p_sk, -> order_date_sk od_sk, -> order_amount amt, -> order_quantity qty -> from -> sales_order_fact; ------ ------ ------ ------- --------- ------ | o_sk | c_sk | p_sk | od_sk | amt | qty | ------ ------ ------ ------- --------- ------ | 1 | 3 | 3 | 4809 | 4000.00 | NULL | | 2 | 4 | 1 | 4854 | 4000.00 | NULL | | 3 | 5 | 2 | 4889 | 6000.00 | NULL | | 4 | 6 | 3 | 4960 | 6000.00 | NULL | | 5 | 7 | 1 | 4993 | 8000.00 | NULL | | 6 | 1 | 2 | 5063 | 8000.00 | NULL | | 7 | 2 | 3 | 5119 | 1000.00 | NULL | | 8 | 3 | 1 | 5155 | 1000.00 | NULL | | 9 | 4 | 2 | 5188 | 2000.00 | NULL | | 10 | 5 | 3 | 5224 | 2500.00 | NULL | | 11 | 6 | 1 | 5264 | 3000.00 | NULL | | 12 | 7 | 2 | 5266 | 3500.00 | NULL | | 13 | 1 | 3 | 5310 | 4000.00 | NULL | | 14 | 2 | 1 | 5356 | 4500.00 | NULL | | 15 | 3 | 2 | 5362 | 1000.00 | NULL | | 16 | 4 | 3 | 5392 | 1000.00 | NULL | | 17 | 5 | 1 | 5489 | 4000.00 | NULL | | 18 | 6 | 2 | 5530 | 4000.00 | NULL | | 19 | 7 | 3 | 5538 | 4000.00 | NULL | | 20 | 1 | 1 | 5539 | 1000.00 | NULL | | 21 | 2 | 2 | 5539 | 2000.00 | NULL | | 22 | 3 | 4 | 5539 | 3000.00 | NULL | | 23 | 4 | 5 | 5539 | 4000.00 | NULL | | 24 | 5 | 2 | 5539 | 1000.00 | NULL | | 25 | 8 | 2 | 5539 | 3000.00 | NULL | | 26 | 7 | 4 | 5539 | 5000.00 | NULL | | 27 | 9 | 5 | 5539 | 7000.00 | NULL | | 28 | 1 | 1 | 5539 | 1000.00 | NULL | | 29 | 2 | 2 | 5539 | 2000.00 | NULL | | 30 | 3 | 4 | 5539 | 4000.00 | NULL | | 31 | 4 | 5 | 5539 | 6000.00 | NULL | | 32 | 5 | 1 | 5539 | 2500.00 | NULL | | 33 | 8 | 2 | 5539 | 5000.00 | NULL | | 34 | 7 | 4 | 5539 | 7500.00 | NULL | | 35 | 9 | 5 | 5539 | 1000.00 | NULL | | 36 | 10 | 1 | 5540 | 1000.00 | 10 | | 37 | 11 | 2 | 5540 | 2000.00 | 20 | | 38 | 12 | 4 | 5540 | 4000.00 | 40 | | 39 | 13 | 5 | 5540 | 6000.00 | 60 | | 40 | 14 | 1 | 5540 | 2500.00 | 25 | | 41 | 15 | 2 | 5540 | 5000.00 | 50 | | 42 | 16 | 4 | 5540 | 7500.00 | 75 | | 43 | 17 | 5 | 5540 | 1000.00 | 10 | | 44 | 18 | 1 | 5540 | 1000.00 | 10 | ------ ------ ------ ------- --------- ------ 44 rows in set (0.00 sec) 只有九个订单有数量,老的销售数据没有。