CDH使用独立的Spark
1. 下载安装spark
2. 配置环境变量
# vim ~/.bashrc
export HADOOP_HOME=/opt/cloudera/parcels/CDH/lib/hadoop
export HADOOP_CONF_DIR=/etc/hadoop/conf
export YARN_CONF_DIR=/etc/hadoop/conf
export SPARK_CONF_DIR=/bigdata/spark/conf
export SPARK_HOME=/bigdata/spark
export PATH=$SPARK_HOME/bin:$PATH
export SPARK_HISTORY_OPTS="-Dspark.history.retainedApplications=3 -Dspark.history.fs.logDirectory=hdfs://bigdata1/spark/logs"
# source ~/.bashrc
3. mysql 包
# yum install mysql-connector-java
# cp /usr/share/java/mysql-connector-java.jar /bigdata/spark/jars/
4. jar包上传
hdfs dfs -mkdir -p /spark/jars
hdfs dfs -mkdir -p /spark/logs
cd /bigdata/spark/jars
hdfs dfs -put ./*.jar /spark/jars
5. 配置spark
# vim /bigdata/spark/conf/spark-defaults.conf
spark.master yarn
spark.eventLog.enabled true
spark.eventLog.dir hdfs://bigdata1/spark/logs
# 注意内存大小（spark-defaults.conf 不支持行内注释，注释必须单独成行）
spark.driver.memory 2g
spark.executor.memory 2g
spark.shuffle.service.enabled true
spark.shuffle.service.port 7337
spark.dynamicAllocation.enabled true
spark.dynamicAllocation.minExecutors 1
spark.dynamicAllocation.maxExecutors 6
spark.dynamicAllocation.schedulerBacklogTimeout 1s
spark.dynamicAllocation.sustainedSchedulerBacklogTimeout 5s
spark.submit.deployMode client
spark.yarn.jars hdfs://bigdata1/spark/jars/*
spark.serializer org.apache.spark.serializer.KryoSerializer