Impala tpc-h sql 优化
因为impala 现在优化器还差点劲,只能手动改改SQL 提升下性能
下期发 impala-kudu 性能优化一个数量级(测试集 TPC-H 1TB)
q1_pricing_summary_report.sql
代码语言:sql复制EXPLAIN SELECT
L_RETURNFLAG,
L_LINESTATUS,
SUM(L_QUANTITY),
SUM(L_EXTENDEDPRICE),
SUM(L_EXTENDEDPRICE * (1 - L_DISCOUNT)),
SUM((L_EXTENDEDPRICE * (1 - L_DISCOUNT)) * (1 L_TAX)),
AVG(L_QUANTITY),
AVG(L_EXTENDEDPRICE),
AVG(L_DISCOUNT),
CAST(COUNT(1) as int)
FROM lineitem_jasong
WHERE L_SHIPDATE <= '1998-09-02'
GROUP BY
L_RETURNFLAG,
L_LINESTATUS
ORDER BY
L_RETURNFLAG ASC,
L_LINESTATUS ASC
LIMIT 100;
q2_minimum_cost_supplier.sql
代码语言:sql复制EXPLAIN
SELECT
t1.s_acctbal,
t1.s_name,
t1.n_name,
t1.p_partkey,
t1.p_mfgr,
t1.s_address,
t1.s_phone,
t1.s_comment
FROM
(
SELECT
s.s_acctbal,
s.s_name,s
n.n_name,
p.p_partkey,
ps.ps_supplycost,
p.p_mfgr,
s.s_address,
s.s_phone,
s.s_comment
FROM nation AS n
INNER JOIN [SHUFFLE] region AS r ON (n.n_regionkey = r.r_regionkey) AND (r.r_name = 'EUROPE')
INNER JOIN [SHUFFLE] supplier AS s ON s.s_nationkey = n.n_nationkey
INNER JOIN [SHUFFLE] partsupp AS ps ON s.s_suppkey = ps.ps_suppkey
INNER JOIN [SHUFFLE] part AS p ON (p.p_partkey = ps.ps_partkey) AND (p.p_size = 15) AND (p.p_type LIKE '%BRASS')
) AS t1
INNER JOIN [SHUFFLE]
(
SELECT
p_partkey,
min(ps_supplycost) AS ps_min_supplycost
FROM
(
SELECT
s.s_acctbal,
s.s_name,
n.n_name,
p.p_partkey,
ps.ps_supplycost,
p.p_mfgr,
s.s_address,
s.s_phone,
s.s_comment
FROM nation AS n
INNER JOIN [SHUFFLE] region AS r ON (n.n_regionkey = r.r_regionkey) AND (r.r_name = 'EUROPE')
INNER JOIN [SHUFFLE] supplier AS s ON s.s_nationkey = n.n_nationkey
INNER JOIN [SHUFFLE] partsupp AS ps ON s.s_suppkey = ps.ps_suppkey
INNER JOIN [SHUFFLE] part AS p ON (p.p_partkey = ps.ps_partkey) AND (p.p_size = 15) AND (p.p_type LIKE '%BRASS')
) AS t3
GROUP BY p_partkey
) AS t2 ON (t1.p_partkey = t2.p_partkey) AND (t1.ps_supplycost = t2.ps_min_supplycost)
ORDER BY
s_acctbal DESC,
n_name ASC,
s_name ASC,
p_partkey ASC
LIMIT 100;
q3_shipping_priority.sql
代码语言:sql复制EXPLAIN
SELECT
l_orderkey,
sum(l_extendedprice * (1 - l_discount)) AS revenue,
o_orderdate,
o_shippriority
FROM customer AS c
INNER JOIN [SHUFFLE] orders AS o ON (c.c_mktsegment = 'BUILDING') AND (c.c_custkey = o.o_custkey)
INNER JOIN [SHUFFLE] lineitem AS l ON l.l_orderkey = o.o_orderkey
WHERE (o_orderdate < '1995-03-15') AND (l_shipdate > '1995-03-15')
GROUP BY
l_orderkey,
o_orderdate,
o_shippriority
ORDER BY
revenue DESC,
o_orderdate ASC
LIMIT 10;
q4_order_priority.sql
代码语言:sql复制EXPLAIN
SELECT
o_orderpriority,
CAST(count(1) as int) AS order_count
FROM orders AS o
INNER JOIN [SHUFFLE]
(
SELECT DISTINCT l_orderkey AS o_orderkey
FROM lineitem
WHERE l_commitdate < l_receiptdate
) AS t ON (o.o_orderkey = t.o_orderkey) AND (o.o_orderdate >= '1993-07-01') AND (o.o_orderdate < '1993-10-01')
GROUP BY o_orderpriority
ORDER BY o_orderpriority ASC
LIMIT 100;
q5_local_supplier_volume.sql
代码语言:sql复制EXPLAIN
SELECT
n_name,
sum(l_extendedprice * (1 - l_discount)) AS revenue
FROM customer AS c
INNER JOIN [SHUFFLE]
(
SELECT
n_name,
l_extendedprice,
l_discount,
s_nationkey,
o_custkey
FROM orders AS o
INNER JOIN [SHUFFLE]
(
SELECT
n_name,
l_extendedprice,
l_discount,
l_orderkey,
s_nationkey
FROM lineitem AS l
INNER JOIN [SHUFFLE]
(
SELECT
n_name,
s_suppkey,
s_nationkey
FROM supplier AS s
INNER JOIN [SHUFFLE]
(
SELECT
n_name,
n_nationkey
FROM nation AS n
INNER JOIN [SHUFFLE] region AS r ON (n.n_regionkey = r.r_regionkey) AND (r.r_name = 'ASIA')
) AS n1 ON s.s_nationkey = n1.n_nationkey
) AS s1 ON l.l_suppkey = s1.s_suppkey
) AS l1 ON (l1.l_orderkey = o.o_orderkey) AND (o.o_orderdate >= '1994-01-01') AND (o.o_orderdate < '1995-01-01')
) AS o1 ON (c.c_nationkey = o1.s_nationkey) AND (c.c_custkey = o1.o_custkey)
GROUP BY n_name
ORDER BY revenue DESC
LIMIT 100;
q6_forecast_revenue_change.sql
代码语言:sql复制EXPLAIN
SELECT sum(l_extendedprice * l_discount) AS revenue
FROM lineitem
WHERE
l_shipdate >= '1994-01-01'
AND l_shipdate < '1995-01-01'
AND l_discount >= 0.05 AND l_discount <= 0.07
AND l_quantity < 24;
q7_volume_shipping.sql
代码语言:sql复制EXPLAIN
SELECT
supp_nation,
cust_nation,
l_year,
sum(volume) AS revenue
FROM
(
SELECT
supp_nation,
cust_nation,
year(l_shipdate) AS l_year,
l_extendedprice * (1 - l_discount) AS volume
FROM
(
SELECT
n1.n_name AS supp_nation,
n2.n_name AS cust_nation,
n1.n_nationkey AS s_nationkey,
n2.n_nationkey AS c_nationkey
FROM nation AS n1
INNER JOIN [SHUFFLE] nation AS n2 ON (n1.n_regionkey = n2.n_regionkey) AND (n1.n_name = 'FRANCE') AND (n2.n_name = 'GERMANY')
UNION ALL
SELECT
n1.n_name AS supp_nation,
n2.n_name AS cust_nation,
n1.n_nationkey AS s_nationkey,
n2.n_nationkey AS c_nationkey
FROM nation AS n1
INNER JOIN [SHUFFLE] nation AS n2 ON (n1.n_regionkey = n2.n_regionkey) AND (n2.n_name = 'FRANCE') AND (n1.n_name = 'GERMANY')
) AS t
INNER JOIN [SHUFFLE]
(
SELECT
l_shipdate,
l_extendedprice,
l_discount,
c_nationkey,
s_nationkey
FROM supplier AS s
INNER JOIN [SHUFFLE]
(
SELECT
l_shipdate,
l_extendedprice,
l_discount,
l_suppkey,
c_nationkey
FROM customer AS c
INNER JOIN [SHUFFLE]
(
SELECT
l_shipdate,
l_extendedprice,
l_discount,
l_suppkey,
o_custkey
FROM orders AS o
INNER JOIN [SHUFFLE] lineitem AS l ON (o.o_orderkey = l.l_orderkey) AND (l.l_shipdate >= '1995-01-01') AND (l.l_shipdate <= '1996-12-31')
) AS l1 ON c.c_custkey = l1.o_custkey
) AS l2 ON s.s_suppkey = l2.l_suppkey
) AS l3 ON (l3.c_nationkey = t.c_nationkey) AND (l3.s_nationkey = t.s_nationkey)
) AS shipping
GROUP BY
supp_nation,
cust_nation,
l_year
ORDER BY
supp_nation ASC,
cust_nation ASC,
l_year ASC
LIMIT 100;
q8_national_market_share.sql
代码语言:sql复制EXPLAIN
SELECT
CAST(o_year as string),
sum(case when nation = 'BRAZIL' then volume else 0.0 end) / sum(volume) as mkt_share
FROM
(
SELECT
year(o_orderdate) AS o_year,
l_extendedprice * (1 - l_discount) AS volume,
n2.n_name AS nation
FROM nation AS n2
INNER JOIN [SHUFFLE]
(
SELECT
o_orderdate,
l_discount,
l_extendedprice,
s_nationkey
FROM supplier AS s
INNER JOIN [SHUFFLE]
(
SELECT
o_orderdate,
l_discount,
l_extendedprice,
l_suppkey
FROM part AS p
INNER JOIN [SHUFFLE]
(
SELECT
o_orderdate,
l_partkey,
l_discount,
l_extendedprice,
l_suppkey
FROM lineitem AS l
INNER JOIN [SHUFFLE]
(
SELECT
o_orderdate,
o_orderkey
FROM orders AS o
INNER JOIN [SHUFFLE]
(
SELECT c.c_custkey
FROM customer AS c
INNER JOIN [SHUFFLE]
(
SELECT n1.n_nationkey
FROM nation AS n1
INNER JOIN [SHUFFLE] region AS r ON (n1.n_regionkey = r.r_regionkey) AND (r.r_name = 'AMERICA')
) AS n11 ON c.c_nationkey = n11.n_nationkey
) AS c1 ON c1.c_custkey = o.o_custkey
) AS o1 ON (l.l_orderkey = o1.o_orderkey) AND (o1.o_orderdate >= '1995-01-01') AND (o1.o_orderdate < '1996-12-31')
) AS l1 ON (p.p_partkey = l1.l_partkey) AND (p.p_type = 'ECONOMY ANODIZED STEEL')
) AS p1 ON s.s_suppkey = p1.l_suppkey
) AS s1 ON s1.s_nationkey = n2.n_nationkey
) AS all_nation
GROUP BY o_year
ORDER BY o_year ASC
LIMIT 100;
q9_product_type_profit.sql
代码语言:sql复制EXPLAIN
SELECT
nation,
CAST(o_year as string),
sum(amount) AS sum_profit
FROM
(
SELECT
n_name AS nation,
year(o_orderdate) AS o_year,
(l_extendedprice * (1 - l_discount)) - (ps_supplycost * l_quantity) AS amount
FROM orders AS o
INNER JOIN [SHUFFLE]
(
SELECT
l_extendedprice,
l_discount,
l_quantity,
l_orderkey,
n_name,
ps_supplycost
FROM part AS p
INNER JOIN [SHUFFLE]
(
SELECT
l_extendedprice,
l_discount,
l_quantity,
l_partkey,
l_orderkey,
n_name,
ps_supplycost
FROM partsupp AS ps
INNER JOIN [SHUFFLE]
(
SELECT
l_suppkey,
l_extendedprice,
l_discount,
l_quantity,
l_partkey,
l_orderkey,
n_name
FROM
(
SELECT
s_suppkey,
n_name
FROM nation AS n
INNER JOIN [SHUFFLE] supplier AS s ON n.n_nationkey = s.s_nationkey
) AS s1
INNER JOIN [SHUFFLE] lineitem AS l ON s1.s_suppkey = l.l_suppkey
) AS l1 ON (ps.ps_suppkey = l1.l_suppkey) AND (ps.ps_partkey = l1.l_partkey)
) AS l2 ON (p.p_name LIKE '%green%') AND (p.p_partkey = l2.l_partkey)
) AS l3 ON o.o_orderkey = l3.l_orderkey
) AS profit
GROUP BY
nation,
o_year
ORDER BY
nation ASC,
o_year DESC
LIMIT 100;
q10_returned_item.sql
代码语言:sql复制EXPLAIN
SELECT
c_custkey,
c_name,
sum(l_extendedprice * (1 - l_discount)) AS revenue,
CAST(CAST(c_acctbal * 1000 as int) / 1000 as string)
n_name,
c_address,
c_phone,
c_comment
FROM customer AS c
INNER JOIN [SHUFFLE] orders AS o ON (c.c_custkey = o.o_custkey) AND (o.o_orderdate >= '1993-10-01') AND (o.o_orderdate < '1994-01-01')
INNER JOIN [SHUFFLE] nation AS n ON c.c_nationkey = n.n_nationkey
INNER JOIN [SHUFFLE] lineitem AS l ON (l.l_orderkey = o.o_orderkey) AND (l.l_returnflag = 'R')
GROUP BY
c_custkey,
c_name,
CAST(c_acctbal * 1000 as int),
c_phone,
n_name,
c_address,
c_comment
ORDER BY revenue DESC
LIMIT 20;
q12_shipping.sql
代码语言:sql复制EXPLAIN
SELECT
l_shipmode,
sum(case when (o_orderpriority = '1-URGENT') OR (o_orderpriority = '2-HIGH') then 1 else 0 end ) AS high_line_count,
sum(case when (o_orderpriority != '1-URGENT') AND (o_orderpriority != '2-HIGH') then 1 else 0 end ) AS low_line_count
FROM orders AS o
INNER JOIN [SHUFFLE] lineitem AS l ON (o.o_orderkey = l.l_orderkey) AND (l.l_commitdate < l.l_receiptdate) AND (l.l_shipdate < l.l_commitdate) AND (l.l_receiptdate >= '1994-01-01') AND (l.l_receiptdate < '1995-01-01')
WHERE (l.l_shipmode = 'MAIL') OR (l.l_shipmode = 'SHIP')
GROUP BY l_shipmode
ORDER BY l_shipmode ASC
LIMIT 100;
q13_customer_distribution.sql
代码语言:sql复制EXPLAIN
SELECT
CAST(c_count as int),
CAST(count(1) as int) as custdist
FROM
(
SELECT
c_custkey,
count(o_orderkey) AS c_count
FROM customer AS c
LEFT JOIN orders AS o ON (c.c_custkey = o.o_custkey) AND (NOT (o.o_comment LIKE '%special%requests%'))
GROUP BY c_custkey
) AS c_orders
GROUP BY c_count
ORDER BY
custdist DESC,
c_count DESC
LIMIT 100;
q14_promotion_effect.sql
代码语言:sql复制EXPLAIN
SELECT
100.00 * sum(case
when p_type like 'PROMO%'
then l_extendedprice*(1-l_discount)
else 0.0
end
) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue
FROM part AS p
INNER JOIN [SHUFFLE] lineitem AS l ON (l.l_partkey = p.p_partkey)
AND (l.l_shipdate >= '1995-09-01')
AND (l.l_shipdate < '1995-10-01');
q15_top_supplier.sql
代码语言:sql复制EXPLAIN
SELECT
s_suppkey,
s_name,
s_address,
s_phone,
total_revenue
FROM supplier AS s
INNER JOIN [SHUFFLE]
(
SELECT
l_suppkey AS supplier_no,
sum(l_extendedprice * (1 - l_discount)) AS total_revenue
FROM lineitem
WHERE (l_shipdate >= '1996-01-01') AND (l_shipdate < '1996-04-01')
GROUP BY l_suppkey
) AS r ON s.s_suppkey = r.supplier_no
INNER JOIN [SHUFFLE]
(
SELECT max(total_revenue) AS max_revenue
FROM
(
SELECT
l_suppkey AS supplier_no,
sum(l_extendedprice * (1 - l_discount)) AS total_revenue
FROM lineitem
WHERE (l_shipdate >= '1996-01-01') AND (l_shipdate < '1996-04-01')
GROUP BY l_suppkey
) AS r
) AS m ON r.total_revenue = m.max_revenue
ORDER BY s_suppkey ASC
LIMIT 100;
q16_parts_supplier_relationship.sql
代码语言:sql复制EXPLAIN
SELECT
p_brand,
p_type,
p_size,
cast(count(distinct ps_suppkey) as int) as supplier_cnt
FROM
(
SELECT *
FROM
(
SELECT
p_brand,
p_type,
p_size,
ps_suppkey
FROM partsupp AS ps
INNER JOIN [SHUFFLE] part AS p ON (p.p_partkey = ps.ps_partkey) AND (p.p_brand != 'Brand#45') AND (NOT (p.p_type LIKE 'MEDIUM POLISHED%'))
INNER JOIN [SHUFFLE]
(
SELECT s_suppkey
FROM supplier
WHERE NOT (s_comment LIKE '%Customer%Complaints%')
) AS s ON ps.ps_suppkey = s.s_suppkey
) AS q16_tmp
WHERE (p_size = 49) OR (p_size = 14) OR (p_size = 23) OR (p_size = 45) OR (p_size = 19) OR (p_size = 3) OR (p_size = 36) OR (p_size = 9)
) AS q16_all
GROUP BY
p_brand,
p_type,
p_size
ORDER BY
supplier_cnt DESC,
p_brand ASC,
p_type ASC,
p_size ASC
LIMIT 100;
q17_small_quantity_order_revenue.sql
代码语言:sql复制EXPLAIN
SELECT sum(l_extendedprice) / 7. AS avg_yearly
FROM
(
SELECT
l_quantity,
l_extendedprice,
t_avg_quantity
FROM
(
SELECT
l_partkey AS t_partkey,
0.2 * avg(l_quantity) AS t_avg_quantity
FROM lineitem
GROUP BY l_partkey
) AS t
INNER JOIN [SHUFFLE]
(
SELECT
l_quantity,
l_partkey,
l_extendedprice
FROM part AS p
INNER JOIN [SHUFFLE] lineitem AS l ON (p.p_partkey = l.l_partkey) AND (p.p_brand = 'Brand#23') AND (p.p_container = 'MED BOX')
) AS l1 ON l1.l_partkey = t.t_partkey
) AS a
WHERE l_quantity < t_avg_quantity;
q18_large_volume_customer.sql
代码语言:sql复制EXPLAIN
SELECT
c_name,
c_custkey,
o_orderkey,
o_orderdate,
CAST(o_totalprice * 1000 as int) / 1000,
sum(l_quantity)
FROM customer AS c
INNER JOIN [SHUFFLE] orders AS o ON c.c_custkey = o.o_custkey
INNER JOIN [SHUFFLE]
(
SELECT
l_orderkey,
sum(l_quantity) AS t_sum_quantity
FROM lineitem
GROUP BY l_orderkey
) AS t ON (o.o_orderkey = t.l_orderkey) AND (t.t_sum_quantity > 300)
INNER JOIN [SHUFFLE] lineitem AS l ON o.o_orderkey = l.l_orderkey
GROUP BY
c_name,
c_custkey,
o_orderkey,
o_orderdate,
CAST(o_totalprice * 1000 as int)
ORDER BY
CAST(o_totalprice * 1000 as int) DESC,
o_orderdate ASC
LIMIT 100;
q19_discounted_revenue.sql
代码语言:sql复制explain select
sum(l_extendedprice * (1 - l_discount) ) as revenue
from
lineitem l join part p
on
p.p_partkey = l.l_partkey
where
(
p_brand = 'Brand#12'
and p_container REGEXP 'SM CASE||SM BOX||SM PACK||SM PKG'
and l_quantity >= 1 and l_quantity <= 11
and p_size >= 1 and p_size <= 5
and l_shipmode REGEXP 'AIR||AIR REG'
and l_shipinstruct = 'DELIVER IN PERSON'
)
or
(
p_brand = 'Brand#23'
and p_container REGEXP 'MED BAG||MED BOX||MED PKG||MED PACK'
and l_quantity >= 10 and l_quantity <= 20
and p_size >= 1 and p_size <= 10
and l_shipmode REGEXP 'AIR||AIR REG'
and l_shipinstruct = 'DELIVER IN PERSON'
)
or
(
p_brand = 'Brand#34'
and p_container REGEXP 'LG CASE||LG BOX||LG PACK||LG PKG'
and l_quantity >= 20 and l_quantity <= 30
and p_size >= 1 and p_size <= 15
and l_shipmode REGEXP 'AIR||AIR REG'
and l_shipinstruct = 'DELIVER IN PERSON'
);
q20_potential_part_promotion.sql
代码语言:sql复制EXPLAIN
SELECT
s_name,
s_address
FROM supplier AS s
INNER JOIN [SHUFFLE] nation AS n ON (s.s_nationkey = n.n_nationkey) AND (n.n_name = 'CANADA')
INNER JOIN [SHUFFLE]
(
SELECT ps_suppkey
FROM
(
SELECT
ps_suppkey,
ps_availqty,
sum_quantity
FROM partsupp AS ps
INNER JOIN [SHUFFLE]
(
SELECT DISTINCT p_partkey
FROM part
WHERE p_name LIKE 'forest%'
) AS t1 ON ps.ps_partkey = t1.p_partkey
INNER JOIN [SHUFFLE]
(
SELECT
l_partkey,
l_suppkey,
0.5 * sum(l_quantity) AS sum_quantity
FROM lineitem
WHERE (l_shipdate >= '1994-01-01') AND (l_shipdate < '1995-01-01')
GROUP BY
l_partkey,
l_suppkey
) AS t2 ON (ps.ps_partkey = t2.l_partkey) AND (ps.ps_suppkey = t2.l_suppkey)
) AS q20_tmp3
WHERE ps_availqty > sum_quantity
GROUP BY ps_suppkey
) AS t4 ON s.s_suppkey = t4.ps_suppkey
ORDER BY s_name ASC
LIMIT 100;
q21_suppliers_who_kept_orders_waiting.sql
代码语言:sql复制EXPLAIN
SELECT
s_name,
cast(count(1) as int) as numwait
FROM
(
SELECT s_name
FROM
(
SELECT
s_name,
t2.l_orderkey,
l_suppkey,
count_suppkey,
max_suppkey
FROM
(
SELECT
l_orderkey,
cast(count(distinct l_suppkey) as int) count_suppkey,
max(l_suppkey) AS max_suppkey
FROM lineitem
WHERE l_receiptdate > l_commitdate
GROUP BY l_orderkey
) AS t2
RIGHT JOIN
(
SELECT
s_name,
l_orderkey,
l_suppkey
FROM
(
SELECT
s_name,
t1.l_orderkey,
l_suppkey,
count_suppkey,
max_suppkey
FROM
(
SELECT
l_orderkey,
cast(count(distinct l_suppkey) as int) count_suppkey,
max(l_suppkey) AS max_suppkey
FROM lineitem
GROUP BY l_orderkey
) AS t1
INNER JOIN [SHUFFLE]
(
SELECT
s_name,
l_orderkey,
l_suppkey
FROM orders AS o
INNER JOIN [SHUFFLE]
(
SELECT
s_name,
l_orderkey,
l_suppkey
FROM nation AS n
INNER JOIN [SHUFFLE] supplier AS s ON (s.s_nationkey = n.n_nationkey) AND (n.n_name = 'SAUDI ARABIA')
INNER JOIN [SHUFFLE] lineitem AS l ON s.s_suppkey = l.l_suppkey
WHERE l.l_receiptdate > l.l_commitdate
) AS l1 ON (o.o_orderkey = l1.l_orderkey) AND (o.o_orderstatus = 'F')
) AS l2 ON l2.l_orderkey = t1.l_orderkey
) AS a
WHERE (count_suppkey > 1) OR ((count_suppkey = 1) AND (l_suppkey != max_suppkey))
) AS l3 ON l3.l_orderkey = t2.l_orderkey
) AS b
WHERE (count_suppkey is null) OR ((count_suppkey = 1) AND (l_suppkey = max_suppkey))
) AS c
GROUP BY s_name
ORDER BY
numwait DESC,
s_name ASC
LIMIT 100;