栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 前沿技术 > 大数据 > 大数据系统

从0搭建hadoop集群

从0搭建hadoop集群

一 基本环境配置

1. vi /etc/hosts(所有节点)
10.8.20.11 azurepana01
10.8.20.12 azurepana02
10.8.20.13 azurepana03
10.8.20.14 azurepana04

设置ssh免密登陆(所有节点)
ssh-keygen -t rsa

拷贝公钥(所有节点):
ssh-copy-id GSPAgent@azurepana01
ssh-copy-id GSPAgent@azurepana02
ssh-copy-id GSPAgent@azurepana03
ssh-copy-id GSPAgent@azurepana04

关闭selinux(所有节点)
vi /etc/selinux/config
SELINUX=disabled

关闭防火墙(所有节点)
systemctl stop firewalld
systemctl disable firewalld
systemctl status firewalld

修改时区(所有节点)
sudo cp -f /usr/share/zoneinfo/Asia/Shanghai     /etc/localtime
sudo clock -w

二 .安装jdk

卸载openJDK(所有节点)
rpm -qa | grep jdk
...
yum -y remove ...

上传jdk-8u202-linux-x64.rpm到/opt/cdh/rpm/目录

安装jdk
yum localinstall jdk*.rpm -y

配置JAVA_HOME 修改环境变量
vi /etc/profile
# 末尾追加以下内容
export JAVA_HOME=/usr/java/default
export PATH=$JAVA_HOME/bin:$PATH
export CLASSPATH=.:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar
# 使修改生效
source /etc/profile

三.在hive主节点安装mysql(元数据库)

1. 上传包
mysql-5.7.26-1.el7.x86_64.rpm-bundle.tar
mysql-connector-java.jar到/opt/cdh/rpm/目录

2.在所有节点安装mysql驱动程序,执行以下命令:
sudo mkdir -p /usr/share/java
sudo cp mysql-connector-java.jar /usr/share/java/mysql-connector-java.jar

3.卸载干净mariadb(主节点)
# 查看mariadb服务
 rpm -qa | grep mariadb
# 卸载mariadb服务
rpm -e --nodeps mariadb-libs-*

4. 安装mysql服务(只在元数据库节点)
# 创建mysql目录,并上传tar包到该目录
mkdir /opt/cdh/mysql
# 解压
tar -xvf mysql-5.7.26-1.el7.x86_64.rpm-bundle.tar -C /opt/cdh/mysql
按以下顺序安装相关rpm:
安装cdh时需要
[root@cdhmaster01 mysql]#  rpm -ivh mysql-community-common-5.7.26-1.el7.x86_64.rpm
警告:mysql-community-common-5.7.26-1.el7.x86_64.rpm: 头V3 DSA/SHA1 Signature, 密钥 ID 5072e1f5: NOKEY
准备中...                          ################################# [100%]
正在升级/安装...
   1:mysql-community-common-5.7.26-1.e################################# [100%]
[root@cdhmaster01 mysql]#  rpm -ivh mysql-community-libs-5.7.26-1.el7.x86_64.rpm
警告:mysql-community-libs-5.7.26-1.el7.x86_64.rpm: 头V3 DSA/SHA1 Signature, 密钥 ID 5072e1f5: NOKEY
准备中...                          ################################# [100%]
正在升级/安装...
   1:mysql-community-libs-5.7.26-1.el7################################# [100%]
[root@cdhmaster01 mysql]# rpm -ivh mysql-community-libs-compat-5.7.26-1.el7.x86_64.rpm
警告:mysql-community-libs-compat-5.7.26-1.el7.x86_64.rpm: 头V3 DSA/SHA1 Signature, 密钥 ID 5072e1f5: NOKEY
准备中...                          ################################# [100%]
正在升级/安装...
   1:mysql-community-libs-compat-5.7.2################################# [100%]
[root@cdhmaster01 mysql]# rpm -ivh mysql-community-client-5.7.26-1.el7.x86_64.rpm
警告:mysql-community-client-5.7.26-1.el7.x86_64.rpm: 头V3 DSA/SHA1 Signature, 密钥 ID 5072e1f5: NOKEY
准备中...                          ################################# [100%]
正在升级/安装...
   1:mysql-community-client-5.7.26-1.e################################# [100%]
[root@cdhmaster01 mysql]#  rpm -ivh mysql-community-server-5.7.26-1.el7.x86_64.rpm
警告:mysql-community-server-5.7.26-1.el7.x86_64.rpm: 头V3 DSA/SHA1 Signature, 密钥 ID 5072e1f5: NOKEY
准备中...                          ################################# [100%]
正在升级/安装...
   1:mysql-community-server-5.7.26-1.e################################# [100%]
[root@cdhmaster01 mysql]#

#启动mysql
 sudo service mysqld start
#查看默认生成的密码
sudo grep 'temporary password' /var/log/mysqld.log
#登陆mysql
mysql -uroot -p

#第一次重设密码
ALTER USER 'root'@'localhost' IDENTIFIED BY 'MyNewPasswd4!';

#修改host值
mysql>use mysql;
mysql> update user set host = '%' where user ='root';
mysql>flush privileges;
mysql> select host,user from user where user='root';

#修改为简单密码
#只验证密码长度
set global validate_password_policy=0;
mysql>flush privileges;
#查看密码长度
select @@validate_password_length;
修改默认密码的长度(这里修改为6)
set global validate_password_length=6;
mysql>flush privileges;
#重设密码为passwd
ALTER USER 'root'@'%' IDENTIFIED BY 'passwd';
mysql>flush privileges;
exit;
#设置开机自启动
chkconfig mysqld on

#重新用新密码登陆
mysql -uroot -ppasswd

mysql>  Grant all privileges on *.* to 'root'@'%' identified by 'passwd' with grant option;
Query OK, 0 rows affected, 1 warning (0.00 sec)

mysql> flush privileges;
Query OK, 0 rows affected (0.00 sec)

四.安装zookeeper

1.安装zookeeper(所有节点)
mkdir /opt/cluster
sudo chown GSPAgent:GSPAgent /opt/cluster
..上传包
sudo mkdir /opt/cluster/hadoop
mkdir /opt/cluster/zookeeper
mkdir /opt/cluster/zookeeper/data
vi /opt/cluster/zookeeper/data/myid
1 (1 or 2 or 3)

tar -xzvf apache-zookeeper-* -C ./zookeeper/

vi ./zoo.cfg
# The number of milliseconds of each tick
tickTime=2000
# The number of ticks that the initial
# synchronization phase can take
initLimit=5
# The number of ticks that can pass between
# sending a request and getting an acknowledgement
syncLimit=2
# the directory where the snapshot is stored.
# do not use /tmp for storage, /tmp here is just
# example sakes.
#dataDir=/tmp/zookeeper
dataDir=/opt/cluster/zookeeper/data
# the port at which the clients will connect
clientPort=2181
# the maximum number of client connections.
# increase this if you need to handle more clients
#maxClientCnxns=60
#
# Be sure to read the maintenance section of the
# administrator guide before turning on autopurge.
#
# http://zookeeper.apache.org/doc/current/zookeeperAdmin.html#sc_maintenance
#
# The number of snapshots to retain in dataDir
#autopurge.snapRetainCount=3
# Purge task interval in hours
# Set to "0" to disable auto purge feature
#autopurge.purgeInterval=1

## Metrics Providers
#
# https://prometheus.io Metrics Exporter
#metricsProvider.className=org.apache.zookeeper.metrics.prometheus.PrometheusMetricsProvider
#metricsProvider.httpPort=7000
#metricsProvider.exportJvmInfo=true

server.1=azurepana02:2888:3888
server.2=azurepana03:2888:3888
server.3=azurepana04:2888:3888

cp ./zoo.cfg ./zookeeper/apache-zookeeper-3.6.3-bin/conf/zoo.cfg

# 所有节点
zkServer.sh start
zkServer.sh status
zkServer.sh stop

五. 安装hadoop

1.上传安装包到其中一个节点
upload hadoop-3.2.2.tar.gz to /opt/cluster
tar -zxvf hadoop-3.2.2.tar.gz -C /opt/cluster/hadoop

2.配置 core-site.xml,hdfs-site.xml,yarn-site.xml,mapred-site.xml,workers 文件

vi /opt/cluster/hadoop/hadoop-3.2.2/etc/hadoop/core-site.xml








    
        hadoop.tmp.dir
        /opt/cluster/hadoop/tmp
    
    
        fs.defaultFS
        hdfs://mycluster
        (HA 集群的 fs.defaultFS 应指向 hdfs-site.xml 中定义的 nameservice,而不是单个 NameNode 地址)
    
    
        
        ha.zookeeper.quorum
        azurepana02:2181,azurepana03:2181,azurepana04:2181
    

  
    hadoop.proxyuser.GSPAgent.hosts
    *
  
  
    hadoop.proxyuser.GSPAgent.groups
    *
  



vi /opt/cluster/hadoop/hadoop-3.2.2/etc/hadoop/hdfs-site.xml









    
        dfs.namenode.name.dir
        /opt/cluster/hadoop/dfs/name
     
     
         dfs.datanode.data.dir
         /opt/cluster/hadoop/dfs/data
     
     
        dfs.replication
        2
    

 
    dfs.webhdfs.enabled
    true
  

    
        
        dfs.nameservices
        mycluster
    
    
        
        dfs.permissions.enabled
        false
    
    
        
        dfs.ha.namenodes.mycluster
        nn1,nn2
    
    
        
        dfs.namenode.rpc-address.mycluster.nn1
        azurepana01:9820
    
    
        
        dfs.namenode.rpc-address.mycluster.nn2
        azurepana02:9820
    
    
        
        dfs.namenode.http-address.mycluster.nn1
        azurepana01:9870
    
    
        
        dfs.namenode.http-address.mycluster.nn2
        azurepana02:9870
    

    
        
        dfs.namenode.shared.edits.dir
        qjournal://azurepana02:8485;azurepana03:8485;azurepana04:8485/mycluster
    

    
        
        dfs.client.failover.proxy.provider.mycluster
        org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider
    

    
        
        dfs.ha.fencing.methods
        sshfence
    

    
        dfs.ha.fencing.ssh.private-key-files
        /home/GSPAgent/.ssh/id_rsa
    

    
        
        dfs.journalnode.edits.dir
        /opt/cluster/hadoop/journal/data
    

    
        
        dfs.ha.automatic-failover.enabled
        true
    



dfs.journalnode.edits.dir
/home/GSPAgent/data/journaldata/jn



dfs.ha.fencing.methods
shell(/bin/true)




dfs.ha.fencing.ssh.connect-timeout
10000


dfs.namenode.handler.count
100




vi /opt/cluster/hadoop/hadoop-3.2.2/etc/hadoop/yarn-site.xml








    
        yarn.nodemanager.aux-services
        mapreduce_shuffle
    
    
        yarn.nodemanager.env-whitelist
        JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME
    

    
        
        yarn.resourcemanager.ha.enabled
        true
    
    
        
        yarn.resourcemanager.cluster-id
        mycluster
    
    
        
        yarn.resourcemanager.ha.rm-ids
        rm1,rm2
    
    
        
        yarn.resourcemanager.hostname.rm1
        azurepana01
    
    
        
        yarn.resourcemanager.hostname.rm2
        azurepana02
    
    
        
        yarn.resourcemanager.webapp.address.rm1
        azurepana01:8088
    
    
        
        yarn.resourcemanager.webapp.address.rm2
        azurepana02:8088
    
    
        
      
        hadoop.zk.address
        azurepana02:2181,azurepana03:2181,azurepana04:2181
    

    
        
        yarn.nodemanager.resource.detect-hardware-capabilities
        true
    


     yarn.scheduler.capacity.maximum-am-resource-percent
     0.5
 



vi /opt/cluster/hadoop/hadoop-3.2.2/etc/hadoop/mapred-site.xml








    
        mapreduce.framework.name
        yarn
    



vi workers 
azurepana02
azurepana03
azurepana04



3.sudo vi /etc/profile

export JAVA_HOME=/usr/java/default

export PATH=$JAVA_HOME/bin:$PATH
export CLASSPATH=.:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar

export HADOOP_HOME=/opt/cluster/hadoop/hadoop-3.2.2
export PATH=$PATH:$HADOOP_HOME/bin
export PATH=$PATH:$HADOOP_HOME/sbin
export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop

export SPARK_HOME=/opt/cluster/spark-3.1.2-bin-hadoop3.2
export PATH=$PATH:$SPARK_HOME/bin

# By default, we want umask to get set. This sets it for login shell

export SCALA_HOME=/opt/cluster/scala-2.11.8
export PATH=$PATH:$SCALA_HOME/bin:$SCALA_HOME/sbin

export HIVE_HOME=/opt/cluster/apache-hive-3.1.2-bin
export PATH=$PATH:$HIVE_HOME/bin

export FLUME_HOME=/opt/cluster/flume
export PATH=$PATH:$FLUME_HOME/bin

export KAFKA_HOME=/opt/cluster/kafka/kafka_2.11-1.1.0
export PATH=$PATH:$KAFKA_HOME/bin

export DATAX_HOME=/opt/cluster/datax
export PATH=$PATH:$DATAX_HOME/bin


4.source /etc/profile

5.从该节点分发已配置的文件到其他节点
tar -zcvf /opt/cluster/hadoop/hadoop-3.2.2.tar.gz -C /opt/cluster/hadoop hadoop-3.2.2
scp hadoop-3.2.2.tar.gz azurepana02:/opt/cluster/hadoop
tar -xzvf /opt/cluster/hadoop/hadoop-3.2.2.tar.gz -C /opt/cluster/hadoop/
(打包时用 -C 切换到父目录,归档内只保留 hadoop-3.2.2 相对路径,否则解压会多出一层 opt/cluster/hadoop 目录)

6.启动zookeeper,格式化hdfs
zkServer.sh start(所有节点)

hdfs zkfc -formatZK  
hdfs --daemon start journalnode
#hdfs --daemon stop journalnode

hdfs namenode -format
hdfs --daemon start namenode

# 另一个节点同步
hdfs namenode -bootstrapStandby
# 检查nn1状态(因为mysql安装在nn1所在节点,所以必须保证nn1 active)
hdfs haadmin -getServiceState nn1

# 强行切换name node 主节点(因为mysql安装在nn1所在节点,所以必须保证nn1 active,如果nn1不是active,需要在nn2所在节点执行以下)
/opt/cluster/hadoop/hadoop-3.2.2/sbin/hadoop-daemon.sh stop zkfc
/opt/cluster/hadoop/hadoop-3.2.2/sbin/hadoop-daemon.sh start zkfc


六.hive安装

upload apache-hive-3.1.2-bin.tar.gz to /opt/cluster
tar -zxvf apache-hive-3.1.2-bin.tar.gz -C ./
cd /opt/cluster/apache-hive-3.1.2-bin/conf
cp hive-env.sh.template  hive-env.sh
vi hive-env.sh
export HADOOP_HOME=/opt/cluster/hadoop/hadoop-3.2.2
export HIVE_CONF_DIR=/opt/cluster/apache-hive-3.1.2-bin/conf
#export SPARK_HOME=/opt/cluster/spark-3.1.2-bin-hadoop3.2
#export HIVE_AUX_JARS_PATH=$(find ${SPARK_HOME}/jars/ -name '*.jar' -and -not -name '*hadoop*' -printf '%p:' | head -c-1)
#export HIVE_AUX_JARS_PATH=${SPARK_HOME}/jars
export HIVE_AUX_JARS_PATH=/opt/cluster/apache-hive-3.1.2-bin/lib

vi /opt/cluster/apache-hive-3.1.2-bin/conf/hive-site.xml



    datanucleus.schema.autoCreateAll
    true
 


    hive.metastore.schema.verification
    false




javax.jdo.option.ConnectionURL
jdbc:mysql://azurepana02:3306/hive?createDatabaseIfNotExist=true&useSSL=false&serverTimezone=CST&characterEncoding=utf8




javax.jdo.option.ConnectionDriverName
com.mysql.jdbc.Driver




javax.jdo.option.ConnectionUserName
root


javax.jdo.option.ConnectionPassword
passwd



hive.metastore.warehouse.dir
/user/hive/warehouse



hive.exec.scratchdir
/opt/cluster/apache-hive-3.1.2-bin/tmp




hive.querylog.location
/opt/cluster/apache-hive-3.1.2-bin/log




hive.metastore.uris
thrift://azurepana02:9083




hive.server2.thrift.port
10000


hive.server2.thrift.bind.host
0.0.0.0


hive.server2.webui.host
0.0.0.0


 
    hive.server2.authentication
    NONE
  

  
    hive.server2.active.passive.ha.enable
    true
  


    hive.metastore.event.db.notification.api.auth
    false


 
     hive.metastore.local
     false



    hive.server2.thrift.client.user
    GSPAgent
    Username to use against thrift client
  
  
    hive.server2.thrift.client.password
    GSPAgent
    Password to use against thrift client
  


hive.server2.webui.port
10002



hive.server2.long.polling.timeout
5000



hive.server2.enable.doAs
true



datanucleus.autoCreateSchema
false



datanucleus.fixedDatastore
true



hive.execution.engine
mr



hive.cli.print.header
true


hive.resultset.use.unique.column.names
false



hive.metastore.client.socket.timeout
1800



hive.server.read.socket.timeout
1800



hive.server.write.socket.timeout
1800



hive.server.thrift.socket.timeout
1800



hive.client.thrift.socket.timeout
1800



hive.merge.mapfiles
true



hive.merge.mapredfiles
true



hive.stats.column.autogather
false



hive.exec.dynamic.partition
true



hive.exec.dynamic.partition.mode
nonstrict





cd /opt/cluster/apache-hive-3.1.2-bin/lib
mv guava-19.0.jar guava-19.0.jar.bak
cp /opt/cluster/hadoop/hadoop-3.2.2/share/hadoop/common/lib/guava-27.0-jre.jar /opt/cluster/apache-hive-3.1.2-bin/lib/
cp mysql-connector-java-5.1.49.jar /opt/cluster/apache-hive-3.1.2-bin/lib/
sudo vi /etc/profile
source /etc/profile
schematool -dbType mysql -initSchema
nohup /opt/cluster/apache-hive-3.1.2-bin/bin/hive --service metastore & (非必须)

# 发送到其他节点
tar -zcvf /opt/cluster/apache-hive-3.1.2-bin.tar.gz -C /opt/cluster apache-hive-3.1.2-bin
scp ./apache-hive-3.1.2-bin.tar.gz azurepana04:/opt/cluster/
tar -zxvf apache-hive-3.1.2-bin.tar.gz -C ./
sudo vi /etc/profile
source /etc/profile
# 测试
hive

hive问题解决

1.Operation category READ is not supported in state standby
# 强行切换name node 主节点
hdfs haadmin -getServiceState nn1
/opt/cluster/hadoop/hadoop-3.2.2/sbin/hadoop-daemon.sh stop zkfc
/opt/cluster/hadoop/hadoop-3.2.2/sbin/hadoop-daemon.sh start zkfc

yarn rmadmin -getServiceState rm1
yarn rmadmin -transitionToActive  --forcemanual  rm1


2.The specified datastore driver ("com.mysql.jdbc.Driver") was not found in the CLASSPATH
cp mysql-connector-java.jar /opt/cluster/apache-hive-3.1.2-bin/lib/

3.metaException(message:Version information not found in metastore.)

    hive.metastore.schema.verification
    false


4.hive> show databases;
FAILED: HiveException java.lang.RuntimeException: Unable to instantiate org.apache.hadoop.hive.ql.metadata.SessionHivemetaStoreClient
# 后台启动hive metastore
nohup /opt/cluster/apache-hive-3.1.2-bin/bin/hive --service metastore >> /opt/cluster/apache-hive-3.1.2-bin/log.log 2>&1 &
nohup hiveserver2 >> /opt/cluster/apache-hive-3.1.2-bin/hiveserver2.log &
hive --service beeline
hive --service beeline -u jdbc:hive2://azurepana02:10000/default -n GSPAgent
hive --service beeline -u jdbc:hive2://azurepana02:10000/mytest -n GSPAgent
!connect jdbc:hive2://azurepana02:10000/mytest
beeline -u jdbc:hive2://azurepana02:10000/default -n GSPAgent
set hive.execution.engine=spark

5.required table missing : "`DBS`" in Catalog "" Schema "". DataNucleus requires this table
在hive-site.xml中配置

    datanucleus.schema.autoCreateAll
    true
 

6.Error: Table 'CTLGS' already exists (state=42S01,code=1050)
去mysql数据库的hive下删掉该表

7.hive insert 在非overwrite插入数据,如果使用非overwrite模式并且表已经存在数据的时候可能遇到如下错误:
Loading data to table test.test1
FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.StatsTask

set hive.stats.column.autogather=false;

8.提交spark submit 时Exception in thread "main" org.apache.spark.sql.AnalysisException: Table or view not found: test.test1; line 1 pos 13;
'Project [*]
+- 'UnresolvedRelation [test, test1], [], false

cp /opt/cluster/apache-hive-3.1.2-bin/conf/hive-site.xml /opt/cluster/spark-3.1.2-bin-hadoop3.2/conf/hive-site.xml

七. spark安装

upload spark-3.1.2-bin-hadoop3.2.tgz to /opt/cluster
tar -zxvf spark-3.1.2-bin-hadoop3.2.tgz -C ./
sudo vi /etc/profile
source /etc/profile
cd $SPARK_HOME
cp conf/spark-env.sh.template conf/spark-env.sh

vi conf/spark-env.sh
export JAVA_HOME=/usr/java/default
export HADOOP_HOME=/opt/cluster/hadoop/hadoop-3.2.2
export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop
# 显式配置主机号和端口号
export SPARK_MASTER_HOST=azurepana02
export SPARK_MASTER_PORT=7077
# 主机内存和核
export SPARK_WORKER_MEMORY=4g
export SPARK_WORKER_CORES=2
export SPARK_DIST_CLASSPATH=$(/opt/cluster/hadoop/hadoop-3.2.2/bin/hadoop classpath)


cp conf/workers.template conf/workers
vi conf/workers
azurepana01
azurepana02
azurepana04


cp $HIVE_HOME/conf/hive-site.xml $SPARK_HOME/conf/hive-site.xml
# 发送到其他节点
tar -zcvf /opt/cluster/spark-3.1.2-bin-hadoop3.2.tar.gz -C /opt/cluster spark-3.1.2-bin-hadoop3.2
scp ./spark-3.1.2-bin-hadoop3.2.tar.gz azurepana04:/opt/cluster/

# 其他节点解压
tar -zxvf spark-3.1.2-bin-hadoop3.2.tar.gz -C ./
# 其他节点修改/etc/profile
sudo vi /etc/profile
source /etc/profile
# 测试
spark-shell
val file=sc.textFile("hdfs:/test/test.txt")
file.collect

自行编译spark(可选)

./dev/make-distribution.sh --name build --tgz -Phive-3.1.2 -Phive-thriftserver -Phadoop-3.2 -Phadoop-provided -Pyarn -Pscala-2.12 -Dhadoop.version=3.2.2 -DskipTests

./dev/make-distribution.sh --name build --tgz -Phive-3.1.2 -Phive-thriftserver -Phadoop-3.2 -Phadoop-provided -Pparquet-provided,-Porc-provided,-Pyarn -Pscala-2.12 -Dhadoop.version=3.2.2 -DskipTests


/opt/cluster/spark-3.1.2-bin-hadoop3.2

/opt/cluster/apache-hive-3.1.2-bin/log
/opt/cluster/apache-hive-3.1.2-bin/tmp

/opt/cluster/spark-3.1.2-bin-hadoop3.2/sbin/stop-all.sh
/opt/cluster/spark-3.1.2-bin-hadoop3.2/sbin/start-all.sh
tail -100f /opt/cluster/spark-3.1.2-bin-hadoop3.2/logs/spark-GSPAgent-org.apache.spark.deploy.master.Master-1-azurepana04.out
tail -100f /opt/cluster/spark-3.1.2-bin-hadoop3.2/logs/spark-GSPAgent-org.apache.spark.deploy.worker.Worker-1-azurepana04.out

cp $HADOOP_HOME/etc/hadoop/core-site.xml $SPARK_HOME/conf/
cp $HADOOP_HOME/etc/hadoop/hdfs-site.xml $SPARK_HOME/conf/
cp $HIVE_HOME/conf/hive-site.xml $SPARK_HOME/conf/

ls jars |grep hive*

/opt/cluster/spark-3.1.2-bin-without-hadoop/jars

hive on spark hive3.1.2集成spark2.4.7安装(可选)

mv spark-2.4.7-bin-without-hadoop spark-3.1.2-bin-hadoop3.2
cd $SPARK_HOME/conf/
cp spark-env.sh.template spark-env.sh
vi spark-env.sh
export JAVA_HOME=/usr/java/default
export HADOOP_HOME=/opt/cluster/hadoop/hadoop-3.2.2
export HADOOP_HDFS_HOME=${HADOOP_HOME}
export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop
export SPARK_HOME=/opt/cluster/spark-3.1.2-bin-hadoop3.2
export SPARK_DIST_CLASSPATH=$(hdfs classpath)
#export MASTER_WEBUI_PORT=8079
export SPARK_LOG_DIR=/home/GSPAgent/tmp/data/spark/logs
export SPARK_LIBRARY_PATH=${SPARK_HOME}/jars

cp spark-defaults.conf.template spark-defaults.conf
vi spark-defaults.conf
spark.executor.memory               1g
spark.driver.cores                  1
spark.driver.maxResultSize          0

vi $HIVE_HOME/conf/hive-site.xml

hive.execution.engine
spark

 
    spark.master
    yarn-cluster
  

 
    spark.serializer
    org.apache.spark.serializer.KryoSerializer
  

  
    spark.driver.cores
    1
  

 
    spark.executor.extraJavaOptions
    -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
  



cd $SPARK_HOME/jars
cp spark-core_*.jar spark-network-common_*.jar spark-unsafe_*.jar spark-yarn_*.jar $HIVE_HOME/lib/
ls -al $HIVE_HOME/lib/ | grep 'spark'
cd $SPARK_HOME/jars
ls -al | grep 'orc-core'
mv orc-core-*.jar orc-core-1.5.5-nohive.jar.bak
hive
select count(1) from test.test1;

八.flume安装

mv apache-flume-1.8.0-bin flume
sudo vi /etc/profile
cd ./flume/conf
cp flume-env.sh.template flume-env.sh
vi flume-env.sh
export JAVA_HOME=/usr/java/default
# 新增flume任务配置文件(一个同步任务一个配置文件)
/opt/cluster/flume/conf/table_1.conf
# 启动flume agent a1
flume-ng agent -c . -f /opt/cluster/flume/conf/table_1.conf -n a1 -Dflume.root.logger=INFO,console

flume-ng agent -c . -f /opt/cluster/flume/conf/spool.conf -n a1 -Dflume.root.logger=INFO,console
echo "spool test1" > /opt/cluster/flume/conf/logs/spool_text.log

九.kafka 安装测试

cd /opt/cluster
tar -zxvf kafka_2.11-1.1.0.tgz
mv kafka_2.11-1.1.0 kafka
sudo vi /etc/profile
source /etc/profile

vi server.properties
broker.id=1
delete.topic.enable=true
auto.create.topics.enable=true
default.replication.factor=2
listeners=PLAINTEXT://azurepana01:9092
log.dirs=/opt/cluster/kafka/logs
zookeeper.connect=azurepana02:2181,azurepana03:2181,azurepana04:2181
# Timeout in ms for connecting to zookeeper
zookeeper.connection.timeout.ms=6000


mkdir logs

vi kafka-start-all.sh

vi kafka-stop-all.sh

sh kafka-start-all.sh
kafka-topics.sh --create --zookeeper azurepana02:2181 --replication-factor 1 --partitions 1 --topic test_topic
kafka-topics.sh --describe --zookeeper azurepana02:2181 --topic test_topic
kafka-topics.sh --list --zookeeper azurepana02:2181  
kafka-console-producer.sh --broker-list azurepana02:9092 --topic test_topic
kafka-console-consumer.sh --zookeeper azurepana02:2181 --topic test_topic --from-beginning

10 各个组件一键启动脚本

[GSPAgent@azurepana01 start]$ cat kafka-start-all.sh
# Start a Kafka broker on each cluster node via a non-interactive
# ssh here-document session (commands are sent on stdin, then exit).
for host in azurepana01 azurepana02 azurepana03; do
  ssh "$host" << eeooff
kafka-server-start.sh  -daemon /opt/cluster/kafka/kafka_2.11-1.1.0/config/server.properties
exit
eeooff
done

[GSPAgent@azurepana01 start]$ cat kafka-stop-all.sh
# Stop the Kafka broker on each cluster node.
# NOTE(review): kafka-server-stop.sh takes no arguments -- it just signals the
# running broker process -- so the "-daemon .../server.properties" arguments
# in the original listing were ignored and have been dropped.
for host in azurepana01 azurepana02 azurepana03; do
  ssh "$host" << eeooff
kafka-server-stop.sh
exit
eeooff
done


[GSPAgent@azurepana01 start]$ cat start-hadoop.sh
# Start the whole Hadoop cluster (HDFS + YARN), then make sure NameNode nn1
# ends up ACTIVE -- MySQL (the Hive metastore DB) lives on the nn1 host, so
# this deployment requires nn1 to be the active NameNode.
#/opt/cluster/spark-3.1.2-bin-hadoop3.2/sbin/start-all.sh
/opt/cluster/hadoop/hadoop-3.2.2/sbin/start-all.sh

# $(...) instead of backticks; quote expansions so states with unexpected
# whitespace cannot word-split.
state=$(hdfs haadmin -getServiceState nn1)
echo "$state"
if [ "$state" = "standby" ]; then
        # Bounce nn1's ZKFC to trigger a new failover election.
        # NOTE(review): assumes the election then lands on nn1 -- confirm.
        /opt/cluster/hadoop/hadoop-3.2.2/sbin/hadoop-daemon.sh stop zkfc
        # plain integer seconds is portable ("15s" is a GNU extension)
        sleep 15
        /opt/cluster/hadoop/hadoop-3.2.2/sbin/hadoop-daemon.sh start zkfc
fi
state=$(hdfs haadmin -getServiceState nn1)
if [ "$state" = "active" ]; then
        echo "启动成功..."
        exit 0
else
        echo "切换nn1失败..."
        exit 1
fi

[GSPAgent@azurepana01 start]$ cat stop-hadoop.sh
# Stop the whole Hadoop cluster (HDFS + YARN) from this node.
# The first (commented) line would stop the standalone Spark cluster instead;
# uncomment it if Spark workers were started too.
#/opt/cluster/spark-3.1.2-bin-hadoop3.2/sbin/stop-all.sh
/opt/cluster/hadoop/hadoop-3.2.2/sbin/stop-all.sh


[GSPAgent@azurepana01 start]$ cat zookeeper-start-all.sh
# Start the ZooKeeper service on every quorum node (azurepana02..04)
# through a non-interactive ssh here-document session.
for host in azurepana02 azurepana03 azurepana04; do
  ssh "$host" << eeooff
zkServer.sh start
exit
eeooff
done

[GSPAgent@azurepana01 start]$ cat zookeeper-start-all.sh
# Start ZooKeeper on each quorum node.
# NOTE(review): this listing duplicates the zookeeper-start-all.sh shown
# above -- presumably a copy of the same script; verify against the source.
for host in azurepana02 azurepana03 azurepana04; do
  ssh "$host" << eeooff
zkServer.sh start
exit
eeooff
done

#ssh azurepana02

[GSPAgent@azurepana01 start]$ cat zookeeper-stop-all.sh
# Stop the ZooKeeper service on every quorum node (azurepana02..04).
for host in azurepana02 azurepana03 azurepana04; do
  ssh "$host" << eeooff
zkServer.sh stop
exit
eeooff
done

[GSPAgent@azurepana01 start]$ cat start-spark.sh
/opt/cluster/spark-3.1.2-bin-hadoop3.2/sbin/start-all.sh

[GSPAgent@azurepana01 start]$ cat stop-spark.sh
/opt/cluster/spark-3.1.2-bin-hadoop3.2/sbin/stop-all.sh

转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/280873.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号