后续需要编写HDFS HA集群的启动和关闭的Shell脚本,在Shell脚本中会涉及到 ssh nodeX 命令,将会出现提示fingerprint信息,比较烦人, 如何让ssh不提示fingerprint信息?
/etc/ssh/ssh_config(客户端配置文件) 区别于sshd_config(服务端配置文件)
[root@node1 ~]# vim /etc/ssh/ssh_config # StrictHostKeyChecking ask StrictHostKeyChecking no #将 修改后的文件拷贝到node2、node3、node4 [root@node1 ~]# vim /etc/ssh/ssh_config [root@node1 ~]# scp /etc/ssh/ssh_config node2:/etc/ssh/ ssh_config 100% 2301 894.3KB/s 00:00 [root@node1 ~]# scp /etc/ssh/ssh_config node3:/etc/ssh/ ssh_config 100% 2301 579.9KB/s 00:00 [root@node1 ~]# scp /etc/ssh/ssh_config node4:/etc/ssh/ ssh_config 100% 2301 298.3KB/s 00:00
3.HDFS配置
关闭hdfs集群后,删除四台节点上/var/itbaizhan/hadoop/full目录和/opt/hadoop-3.1.3/logs目录下的全部内容
rm -rf /var/itbaizhan/hadoop/full rm -rf /opt/hadoop-3.1.3/logs
以下一律在node1上操作,做完后scp到node2、node3、node4
- hadoop-env.sh配置JDK
[root@node1 hadoop]# cd /opt/hadoop-3.1.3/etc/hadoop [root@node1 hadoop]# vim hadoop-env.sh export JAVA_HOME=/usr/java/default
- 修改workers指定datanode的位置
[root@node1 hadoop]# vim workers node2 node3 node4
- 修改core-site.xml
fs.defaultFS hdfs://mycluster hadoop.tmp.dir /var/itbaizhan/hadoop/ha ha.zookeeper.quorum node2:2181,node3:2181,node4:2181 hadoop.http.staticuser.user root
- hdfs-site.xml
dfs.journalnode.edits.dir ${hadoop.tmp.dir}/dfs/journalnode/ dfs.nameservices mycluster dfs.ha.namenodes.mycluster nn1,nn2 dfs.namenode.rpc-address.mycluster.nn1 node1:9820 dfs.namenode.rpc-address.mycluster.nn2 node2:9820 dfs.namenode.http-address.mycluster.nn1 node1:9870 dfs.namenode.http-address.mycluster.nn2 node2:9870 dfs.namenode.shared.edits.dir qjournal://node1:8485;node2:8485;node3:8485/mycluster dfs.client.failover.proxy.provider.mycluster org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider dfs.ha.fencing.methods sshfence dfs.ha.fencing.ssh.private-key-files /root/.ssh/id_dsa dfs.ha.automatic-failover.enabled true
- 先同步配置文件到node2、node3、node4
#node1上执行: [root@node1 hadoop]# scp hadoop-env.sh core-site.xml hdfs-site.xml node2:`pwd` [root@node1 hadoop]# scp hadoop-env.sh core-site.xml hdfs-site.xml node3:`pwd` [root@node1 hadoop]# scp hadoop-env.sh core-site.xml hdfs-site.xml node4:`pwd`
3.首次启动HDFS HA集群
a) 启动zookeeper集群, node2、node3、node4分别执行:
zkServer.sh start
b) 在node1、node2、node3上启动三台journalnode
hdfs --daemon start journalnode
c) 选择node1,格式化HDFS
[root@node1 hadoop]# hdfs namenode -format #看到如下提示,表示格式化成功 2021-10-15 13:21:33,318 INFO common.Storage: Storage directory /var/itbaizhan/hadoop/ha/dfs/name has been successfully formatted.
/var/itbaizhan/hadoop/ha/dfs/name/current/目录下产生了fsimage文件
[root@node1 hadoop]# ll /var/itbaizhan/hadoop/ha/dfs/name/current/ 总用量 16 -rw-r--r-- 1 root root 391 10月 15 13:21 fsimage_0000000000000000000 -rw-r--r-- 1 root root 62 10月 15 13:21 fsimage_0000000000000000000.md5 -rw-r--r-- 1 root root 2 10月 15 13:21 seen_txid -rw-r--r-- 1 root root 218 10月 15 13:21 VERSION
格式化后,启动namenode进程
[root@node1 hadoop]# hdfs --daemon start namenode [root@node1 hadoop]# jps 7347 JournalNode 7689 NameNode 7737 Jps
d) 在另一台node2上同步元数据,然后在该节点上启动NameNode。
[root@node2 ~]# hdfs namenode -bootstrapStandby
#出现以下提示:
2021-10-15 13:26:36,101 INFO ha.BootstrapStandby: Found nn: nn1, ipc: node1/192.168.20.101:9820
=====================================================
about to bootstrap Standby ID nn2 from:
Nameservice ID: mycluster
Other Namenode ID: nn1
Other NN's HTTP address: http://node1:9870
Other NN's IPC address: node1/192.168.20.101:9820
Namespace ID: 1743499963
Block pool ID: BP-166908272-192.168.20.101-1634275293276
Cluster ID: CID-38fac5df-ed87-46c5-a4e0-f92ce7008c07
Layout version: -64
isUpgradeFinalized: true
=====================================================
#启动NameNode
[root@node2 ~]# hdfs --daemon start namenode
[root@node2 ~]# jps
7249 QuorumPeerMain
8019 Jps
7466 JournalNode
7980 NameNode # 看到NameNode进程表示NameNode正常启动了。
e) 初始化zookeeper上的内容 一定是在namenode节点(node1或node2)上
[root@node4 hadoop]# zkCli.sh [zk: localhost:2181(CONNECTED) 1] ls / [itbaizhan, registry, wzyy, zk001, zookeeper]
接下来在node1上执行
[root@node1 ~]# hdfs zkfc -formatZK 2021-10-15 13:30:20,048 INFO ha.ActiveStandbyElector: Successfully created /hadoop-ha/mycluster in ZK.
然后在node4上接着执行:
[zk: localhost:2181(CONNECTED) 1] ls / [zookeeper, hadoop-ha] [zk: localhost:2181(CONNECTED) 2] ls /hadoop-ha [mycluster] [zk: localhost:2181(CONNECTED) 3] ls /hadoop-ha/mycluster []
执行到此处,还没有启动3个DataNode和2个ZKFC进程。
f) 启动hadoop集群,在node1执行
[root@node1 ~]# start-dfs.sh #出现如下错误提示 ERROR: Attempting to operate on hdfs journalnode as root ERROR: but there is no HDFS_JOURNALNODE_USER defined. Aborting operation. Starting ZK Failover Controllers on NN hosts [node1 node2] ERROR: Attempting to operate on hdfs zkfc as root ERROR: but there is no HDFS_ZKFC_USER defined. Aborting operation. #解决办法:修改start-dfs.sh文件 [root@node1 ~]# vim /opt/hadoop-3.1.3/sbin/start-dfs.sh #添加 HDFS_JOURNALNODE_USER=root HDFS_ZKFC_USER=root #为了防止关闭时出现类似的错误提示,修改stop-dfs.sh [root@node1 ~]# vim /opt/hadoop-3.1.3/sbin/stop-dfs.sh #添加 HDFS_JOURNALNODE_USER=root HDFS_ZKFC_USER=root #再次启动 [root@node1 hadoop]# start-dfs.sh
在启动zkCli.sh的节点node4上观察:
[zk: localhost:2181(CONNECTED) 5] ls /hadoop-ha/mycluster [ActiveBreadCrumb, ActiveStandbyElectorLock] [zk: localhost:2181(CONNECTED) 6] get -s /hadoop-ha/mycluster/ActiveStandbyElectorLock myclusternn1node1 �L(�> cZxid = 0x600000008 ctime = Fri Oct 15 13:40:10 CST 2021 mZxid = 0x600000008 mtime = Fri Oct 15 13:40:10 CST 2021 pZxid = 0x600000008 cversion = 0 dataVersion = 0 aclVersion = 0 ephemeralOwner = 0x300006fd40a0002 dataLength = 29 numChildren = 0
node1占用着锁,它的状态是active的。浏览器访问:http://node1:9870
node2为standby,浏览器地址栏输入:http://node2:9870
将Active NameNode对应节点node1上NameNode进程kill掉:
[root@node1 hadoop]# jps 10337 Jps 7347 JournalNode 9701 DFSZKFailoverController 7689 NameNode [root@node1 hadoop]# kill -9 7689 #或者 [root@node1 hadoop]# hdfs --daemon stop namenode [root@node1 hadoop]# jps 7347 JournalNode 9701 DFSZKFailoverController 10381 Jps
node4上继续查看:
[zk: localhost:2181(CONNECTED) 12] get -s /hadoop-ha/mycluster/ActiveStandbyElectorLock myclusternn2node2 �L(�> cZxid = 0x60000006c ......
但是通过浏览器访问发现Active NameNode不能自动进行切换。这是因为缺少一个rpm包:psmisc。接下来在四台节点上安装psmisc包。
yum install -y psmisc
node1访问不了,node2 从Standby变为了Active。
node1上再次启动namenode:
[root@node1 hadoop]# hdfs --daemon start namenode
node1变为standby,变为备机。



