# Master
ssh-keygen -t rsa
cat /home/kali/.ssh/id_rsa.pub
# Worker
mkdir ~/.ssh
chmod 700 ~/.ssh
touch ~/.ssh/authorized_keys
chmod 600 ~/.ssh/authorized_keys
vi ~/.ssh/authorized_keys

安装 OpenJDK 11
环境Ubuntu 16
# Add the repository
sudo add-apt-repository ppa:openjdk-r/ppa
# Update package list
sudo apt-get update
# Install openjdk-11-jdk
sudo apt install openjdk-11-jdk

安装Spark
sudo mkdir /opt/spark
cd /opt/spark
wget --no-check-certificate https://dlcdn.apache.org/spark/spark-3.1.3/spark-3.1.3-bin-hadoop2.7.tgz
sudo tar -xzvf spark-3.1.3-bin-hadoop2.7.tgz
sudo ln -s ./spark-3.1.3-bin-hadoop2.7 spark_latest
$ sudo vim ~/.profile
export SPARK_HOME=/opt/spark/spark_latest
export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
$ source ~/.profile

配置hosts
vim /etc/hosts
192.168.174.128 host1
192.168.174.129 host2

配置Spark
Master
$ vim spark-defaults.conf
# node1是设备名
spark.master spark://node1:7077
$SPARK_HOME/sbin/start-master.sh
Worker
vim conf/spark-env.sh
# 根据内存大小配置
SPARK_DRIVER_MEMORY=1000m
SPARK_EXECUTOR_MEMORY=512m
SPARK_WORKER_MEMORY=512m
$SPARK_HOME/sbin/start-worker.sh spark://node0:7077

运行测试
MASTER=spark://node0:7077 $SPARK_HOME/bin/run-example org.apache.spark.examples.SparkPi
输出:
Pi is roughly 3.1404157020785104

运行 spark-shell
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{StringType, IntegerType, StructField, StructType}
import org.apache.spark.sql.Row
import org.apache.spark.sql.DataFrame
// Sample records, one (name, age) pair per person.
val seq: Seq[(String, Int)] = Seq(("Bob", 14), ("Alice", 18))
// Turn the local collection into an RDD.
// `sc` is not defined in this snippet; presumably it is the SparkContext that spark-shell provides.
val rdd: RDD[(String, Int)] = sc.parallelize(seq)

// Schema for the DataFrame: a string column "name" and an integer column "age".
val schema: StructType = StructType(Seq(
  StructField("name", StringType),
  StructField("age", IntegerType)
))

// createDataFrame is called below with two arguments: an RDD[Row] and the schema.
// Convert each (name, age) tuple into a Row so the RDD has the required element type.
val rowRDD: RDD[Row] = rdd.map { case (name, age) => Row(name, age) }

// Build the DataFrame and print its contents.
// `spark` is likewise not defined here; presumably the SparkSession provided by spark-shell.
val dataFrame: DataFrame = spark.createDataFrame(rowRDD, schema)
dataFrame.show()
参考
https://github.com/scala/scala
http://jdk.java.net/archive/



