Forked from bernhardschaefer/spark-submit-streaming-yarn.sh
Created October 20, 2017 05:23
spark-submit template for running Spark Streaming on YARN (referenced in https://www.inovex.de/blog/247-spark-streaming-on-yarn-in-production/)
#!/bin/bash

# Minimum TODOs on a per job basis:
# 1. define name, application jar path, main class, queue and log4j-yarn.properties path
# 2. remove properties not applicable to your Spark version (Spark 1.x vs. Spark 2.x)
# 3. tweak num_executors, executor_memory (+ overhead), and backpressure settings

# the two most important settings:
num_executors=6
executor_memory=3g

# 3-5 cores per executor is a good default balancing HDFS client throughput vs. JVM overhead
# see http://blog.cloudera.com/blog/2015/03/how-to-tune-your-apache-spark-jobs-part-2/
executor_cores=3
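# Illustrative container sizing with the values above (assumes the overhead settings further down):
# each executor asks YARN for executor_memory + spark.yarn.executor.memoryOverhead = 3g + 1g = ~4g,
# so 6 executors occupy ~24g of the queue, plus ~2.5g for the driver (2g + 512m overhead).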
# backpressure
receiver_max_rate=100
receiver_initial_rate=30
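# Illustrative rate math (assuming a Direct Kafka stream with 10 partitions and a 5s batch interval):
# receiver_max_rate=100 records/s per partition, applied via spark.streaming.kafka.maxRatePerPartition
# below, caps each batch at 100 * 10 * 5 = 5000 records.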
spark-submit --master yarn --deploy-mode cluster \
--name <my-job-name> \
--class <main-class> \
--driver-memory 2g \
--num-executors ${num_executors} --executor-cores ${executor_cores} --executor-memory ${executor_memory} \
--queue <realtime_queue> \
--files <hdfs:///path/to/log4j-yarn.properties> \
--conf spark.driver.extraJavaOptions=-Dlog4j.configuration=log4j-yarn.properties \
--conf spark.executor.extraJavaOptions=-Dlog4j.configuration=log4j-yarn.properties \
--conf spark.serializer=org.apache.spark.serializer.KryoSerializer `# Kryo Serializer is much faster than the default Java Serializer` \
--conf spark.locality.wait=10 `# Increase job parallelism by reducing Spark Delay Scheduling (potentially big performance impact (!)) (Default: 3s)` \
--conf spark.task.maxFailures=8 `# Increase max task failures before failing job (Default: 4)` \
--conf spark.ui.killEnabled=false `# Prevent killing of stages and corresponding jobs from the Spark UI` \
--conf spark.logConf=true `# Log Spark Configuration in driver log for troubleshooting` \
`# SPARK STREAMING CONFIGURATION` \
--conf spark.streaming.blockInterval=200 `# [Optional] Tweak to balance data processing parallelism vs. task scheduling overhead (Default: 200ms)` \
--conf spark.streaming.receiver.writeAheadLog.enable=true `# Prevent data loss on driver recovery` \
--conf spark.streaming.backpressure.enabled=true \
--conf spark.streaming.backpressure.pid.minRate=10 `# [Optional] Reduce min rate of PID-based backpressure implementation (Default: 100)` \
--conf spark.streaming.receiver.maxRate=${receiver_max_rate} `# [Spark 1.x]: Workaround for missing initial rate (Default: not set)` \
--conf spark.streaming.kafka.maxRatePerPartition=${receiver_max_rate} `# [Spark 1.x]: Corresponding max rate setting for Direct Kafka Streaming (Default: not set)` \
--conf spark.streaming.backpressure.initialRate=${receiver_initial_rate} `# [Spark 2.x]: Initial rate before backpressure kicks in (Default: not set)` \
`# YARN CONFIGURATION` \
--conf spark.yarn.driver.memoryOverhead=512 `# [Optional] Set if --driver-memory < 5GB` \
--conf spark.yarn.executor.memoryOverhead=1024 `# [Optional] Set if --executor-memory < 10GB` \
--conf spark.yarn.maxAppAttempts=4 `# Increase max application master attempts (needs to be <= yarn.resourcemanager.am.max-attempts in YARN, which defaults to 2) (Default: yarn.resourcemanager.am.max-attempts)` \
--conf spark.yarn.am.attemptFailuresValidityInterval=1h `# Attempt counter considers only the last hour (Default: (none))` \
--conf spark.yarn.max.executor.failures=$((8 * ${num_executors})) `# Increase max executor failures (Default: max(numExecutors * 2, 3))` \
--conf spark.yarn.executor.failuresValidityInterval=1h `# Executor failure counter considers only the last hour` \
</path/to/spark-application.jar>
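The log4j-yarn.properties referenced in --files is activated on driver and executors via the -Dlog4j.configuration options above. A minimal sketch of such a file in log4j 1.x properties syntax (logger names and levels are illustrative; the HDFS upload path must match whatever you pass to --files):

cat > log4j-yarn.properties <<'EOF'
# send everything at INFO to the container's stderr, which YARN log aggregation collects
log4j.rootCategory=INFO, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
# quiet down chatty frameworks (illustrative)
log4j.logger.org.apache.spark=WARN
log4j.logger.org.apache.kafka=WARN
EOF
hdfs dfs -put -f log4j-yarn.properties <hdfs:///path/to/>   # adjust to the path used in --files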