常用spark优化参数
强制使用spark engine
代码语言:javascript
-- Route the query to the Spark engine. `tqs.query.engine.type` is a
-- platform-specific key, not a stock Apache Spark property -- confirm the
-- accepted values against your platform's documentation.
set tqs.query.engine.type = sparkCli;
-- Priority of the YARN application. NOTE(review): whether a larger number
-- means higher priority depends on the cluster scheduler -- confirm.
set spark.yarn.priority = 4;
双写HDFS开启:
代码语言:javascript
-- Enable writing shuffle data to HDFS as well (the "double write" named in the
-- section heading). `spark.shuffle.hdfs.enable` is not a stock Apache Spark
-- property -- presumably a platform extension; verify it exists in your build.
set spark.shuffle.hdfs.enable=true;
-- Retry a failed shuffle fetch at most once, with no wait between attempts.
-- NOTE(review): presumably kept low so a failed fetch falls back to the HDFS
-- copy quickly instead of retrying the executor -- confirm.
set spark.shuffle.io.maxRetries=1;
set spark.shuffle.io.retryWait=0s;
-- Default timeout for all network interactions.
set spark.network.timeout=120s;
## 开启 shuffle 双写 HDFS 可以避免 fetch failed;一般只对运行 20 分钟以上的大任务开启
调整全局任务并行度
代码语言:javascript
-- Number of partitions used when shuffling data for SQL joins and aggregations.
set spark.sql.shuffle.partitions=400;
-- Default number of partitions for RDD shuffles. The key was previously
-- misspelled "paralleism"; SET accepts unknown keys, so the typo had no effect.
set spark.default.parallelism=400;
-- CPU cores per executor. The key was previously misspelled "executore".
set spark.executor.cores=4;
动态资源申请
代码语言:javascript
-- Turn on dynamic executor allocation. The switch is
-- `spark.dynamicAllocation.enabled`; the bare `spark.dynamicAllocation` key
-- used before is not a Spark property.
set spark.dynamicAllocation.enabled = true;
-- Lower and upper bounds on the number of executors.
set spark.dynamicAllocation.minExecutors = 30;
set spark.dynamicAllocation.maxExecutors = 200;
-- Executors requested at start-up. The key was previously misspelled
-- "initExectors"; the Spark property is `initialExecutors`.
set spark.dynamicAllocation.initialExecutors = 30;
## 动态资源申请:保证任务尽快启动,资源不使用时及时归还
memory
代码语言:javascript
-- Heap memory per executor. The key was previously misspelled "exector".
set spark.executor.memory=10g;
-- Additional non-heap memory per executor; the container request is
-- spark.executor.memory + spark.executor.memoryOverhead.
set spark.executor.memoryOverhead=10g;
-- Memory for the driver process.
set spark.driver.memory=3g;
## memory:executor 总内存 = spark.executor.memory + spark.executor.memoryOverhead
join
代码语言:javascript
-- Collect detailed shuffle statistics for join decisions.
-- NOTE(review): spelled `statistics` as in the adaptive-execution docs; the
-- original used `statistic` -- confirm the exact key for your Spark build.
set spark.shuffle.statistics.verbose=true;
-- Do not prefer sort-merge join, so the planner may pick a shuffled hash join.
-- The key was previously misspelled "perferSortMergejoin".
set spark.sql.join.preferSortMergeJoin=false;
-- Broadcast tables up to 134217728 bytes (128 MiB).
-- NOTE(review): the original remark read "if not set equal to
-- autoBroadcastJoinThreshold, it is overridden", which looks like it belongs
-- to a companion key (possibly spark.sql.adaptiveBroadcastJoinThreshold) --
-- confirm which setting was meant.
set spark.sql.autoBroadcastJoinThreshold=134217728;
AE:skewed
代码语言:javascript
-- Adaptive-execution (AE) handling of skewed joins. These keys come from the
-- adaptive-execution extension rather than stock Apache Spark 3 AQE, and
-- assume spark.sql.adaptive.enabled is already true -- TODO confirm.
-- The switch was previously written `skewedJoin.enable` (missing the "d").
set spark.sql.adaptive.skewedJoin.enabled=true;
-- Upper bound on how many pieces one skewed partition is split into.
-- The key was previously written with a lowercase "p" in "Partition".
set spark.sql.adaptive.skewedPartitionMaxSplits=3;
-- Skew thresholds: a relative factor of 3, a size of 52428800 bytes (50 MiB)
-- and a row count of 5,000,000. NOTE(review): how the factor combines with
-- the absolute thresholds is defined by the AE implementation -- see its docs.
set spark.sql.adaptive.skewedPartitionFactor=3;
set spark.sql.adaptive.skewedPartitionSizeThreshold=52428800;
set spark.sql.adaptive.skewedPartitionRowCountThreshold=5000000;
AE:partition
代码语言:javascript
-- Bounds on the number of post-shuffle partitions under adaptive execution.
set spark.sql.adaptive.maxNumPostShufflePartitions=1000;
set spark.sql.adaptive.minNumPostShufflePartitions=10;
-- Target input size per post-shuffle partition.
-- NOTE(review): this setting is a byte size, so a bare 60 would mean 60 bytes;
-- the author may have intended 60 MB (e.g. 62914560) -- confirm before use.
set spark.sql.adaptive.shuffle.targetPostShuffleInputSize=60;
## 解决partition太多,reducer生成太多文件的问题,自动进行文件合并;
input
代码语言:javascript
-- Read Hive metastore Parquet tables with Spark's built-in Parquet support.
set spark.sql.hive.convertMetastoreParquet=true;
-- NOTE(review): `spark.sql.parquet.adaptiveFileSplit` is not a stock Apache
-- Spark property -- presumably a platform extension; verify before relying on it.
set spark.sql.parquet.adaptiveFileSplit=true;
-- Maximum bytes packed into one partition when reading files:
-- 314572800 bytes (300 MiB).
set spark.sql.files.maxPartitionBytes=314572800;
-- Estimated cost of opening a file, in bytes: 16777216 (16 MiB).
-- The key was previously misspelled "openCostinBytes" (lowercase "i").
set spark.sql.files.openCostInBytes=16777216;
Output
代码语言:javascript
-- Merge output files after the write. `spark.merge.files.*` are not stock
-- Apache Spark properties -- presumably platform extensions; verify they exist.
set spark.merge.files.enabled=true;
-- NOTE(review): presumably the target or maximum number of output files
-- after merging -- confirm the exact meaning with the platform docs.
set spark.merge.files.number=512;