Content source: Lagou Education (拉勾教育) Big Data Training Camp
What problem does big data technology solve?
Big data technology mainly addresses the storage and computation of massive amounts of data.
Key components of Apache Hadoop
Hadoop = HDFS (distributed file system) + MapReduce (distributed computing framework) + YARN (resource coordination framework) + the Common module
Add the cluster hostnames to /etc/hosts on all three nodes:
192.168.0.230 mr00
192.168.0.231 mr01
192.168.0.232 mr02
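Before moving on, it is worth checking that every hostname resolves and responds; a minimal check, assuming all three machines are already up:

# Each host should answer a single ping by name
for host in mr00 mr01 mr02; do
  ping -c 1 $host > /dev/null && echo "$host reachable" || echo "$host NOT reachable"
done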
# Stop the firewall
systemctl stop firewalld
# Keep the firewall from starting on boot
systemctl disable firewalld
vim /etc/selinux/config
# Change SELINUX=enforcing to SELINUX=disabled
yum -y install vim* net-tools lrzsz rsync
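Note that SELINUX=disabled only takes effect after a reboot. A quick sanity check of the firewall state, plus turning SELinux enforcement off for the current session without rebooting:

# Firewall should report inactive/disabled after the commands above
systemctl is-active firewalld
systemctl is-enabled firewalld
# Disable SELinux enforcement immediately (the config file change applies on next boot)
setenforce 0
getenforce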
ssh-keygen -t rsa
# Append the public key to authorized_keys
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
# Set permissions; without this, passwordless login will not work
chmod 600 ~/.ssh/authorized_keys
# Append the public keys of the other nodes to the master's authorized_keys
ssh mr01 cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys
ssh mr02 cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys
# Check that the keys of the trusted hosts were added
# Distribute the merged authorized_keys file to the remaining nodes
scp /root/.ssh/authorized_keys root@mr01:/root/.ssh/authorized_keys
scp /root/.ssh/authorized_keys root@mr02:/root/.ssh/authorized_keys
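After the keys are distributed (this assumes ssh-keygen was also run on mr01 and mr02), passwordless login can be verified from mr00; each command should print the remote hostname without asking for a password:

# Verify passwordless SSH to every node, including the local one
for host in mr00 mr01 mr02; do
  ssh $host hostname
done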
tar -xzvf hadoop-2.9.2.tar.gz -C /opt/hadoop/
cd /opt/hadoop/
mv hadoop-2.9.2 hadoop2.9
export JAVA_HOME=/opt/java/jdk1.8
export JRE_HOME=/opt/java/jdk1.8/jre
export CLASSPATH=.:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar:$JRE_HOME/lib
export HADOOP_HOME=/opt/hadoop/hadoop2.9
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib:$HADOOP_COMMON_LIB_NATIVE_DIR"
export PATH=.:${JAVA_HOME}/bin:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin:$PATH
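These exports typically go into /etc/profile (the exact file is not shown above). After editing it on every node, reload the environment and confirm both toolchains are found:

# Reload the environment and check that the JDK and Hadoop are on PATH
source /etc/profile
java -version
hadoop version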
Cluster role planning:

Node | IP | Roles |
---|---|---|
mr00 | 192.168.0.230 | NN, DN, NM |
mr01 | 192.168.0.231 | DN, NM |
mr02 | 192.168.0.232 | 2NN, DN, RM, NM |

(NN = NameNode, 2NN = SecondaryNameNode, DN = DataNode, RM = ResourceManager, NM = NodeManager)
# In hadoop-env.sh, set JAVA_HOME explicitly:
export JAVA_HOME=${JAVA_HOME} # change to
export JAVA_HOME=/opt/java/jdk1.8
vim core-site.xml

<property>
    <name>hadoop.tmp.dir</name>
    <value>file:/root/hadoop/tmp</value>
</property>
<property>
    <name>fs.defaultFS</name>
    <value>hdfs://mr00:9000</value>
</property>
vim hdfs-site.xml

<property>
    <name>dfs.namenode.secondary.http-address</name>
    <value>mr02:50090</value>
</property>
<property>
    <name>dfs.replication</name>
    <value>3</value>
</property>
<property>
    <name>dfs.namenode.name.dir</name>
    <value>file:/root/hadoop/name</value>
</property>
<property>
    <name>dfs.datanode.data.dir</name>
    <value>file:/root/hadoop/data</value>
</property>
vim slaves

mr00
mr01
mr02
# Set JAVA_HOME explicitly in mapred-env.sh and yarn-env.sh as well:
export JAVA_HOME=${JAVA_HOME} # change to
export JAVA_HOME=/opt/java/jdk1.8
cp mapred-site.xml.template mapred-site.xml
vim mapred-site.xml
<property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
</property>
vim yarn-site.xml

<property>
    <name>yarn.resourcemanager.hostname</name>
    <value>mr02</value>
</property>
<property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
</property>
touch rsync-script
---+---rsync-script---+---
#!/bin/bash
# Simple distribution script: rsync the given file or directory
# to the same path on every node in the cluster.
paramnum=$#
if ((paramnum == 0)); then
    echo "Missing parameter"
    exit 1
fi
p1=$1
echo "$p1"
# Resolve the file name and its absolute parent directory
file_name=$(basename "$p1")
pdir=$(cd -P "$(dirname "$p1")"; pwd)
user=$(whoami)
echo "$user"
echo "file_name=$file_name"
echo "pdir=$pdir"
# Push to mr00, mr01 and mr02 (the local copy is simply synced in place)
for ((host = 0; host < 3; host++)); do
    echo "----------------mr0$host------------------------"
    rsync -rvl "$pdir/$file_name" "$user@mr0$host:$pdir"
done
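A minimal usage sketch for the script above, assuming it was saved as /root/rsync-script and that the target parent directory (e.g. /opt/hadoop) already exists on mr01 and mr02:

chmod +x /root/rsync-script
# Push the Hadoop installation and the profile changes to every node
/root/rsync-script /opt/hadoop/hadoop2.9
/root/rsync-script /etc/profile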
hadoop namenode -format
INFO common.Storage: Storage directory /root/hadoop/name has been successfully formatted.
After formatting, the NameNode creates its version information under the configured dfs.namenode.name.dir; fsimage is the file system image.
[root@mr00 opt]# ll /root/hadoop/name/current/
total 16
-rw-r--r-- 1 root root 323 Jul 1 23:50 fsimage_0000000000000000000
-rw-r--r-- 1 root root 62 Jul 1 23:50 fsimage_0000000000000000000.md5
-rw-r--r-- 1 root root 2 Jul 1 23:50 seen_txid
-rw-r--r-- 1 root root 218 Jul 1 23:50 VERSION
Start the NameNode (on mr00):
hadoop-daemon.sh start namenode
Start the DataNode:
hadoop-daemon.sh start datanode
HDFS web UI:
http://192.168.0.230:50070/
Start the ResourceManager (on mr02):
yarn-daemon.sh start resourcemanager
Start the NodeManager:
yarn-daemon.sh start nodemanager
YARN web UI:
http://192.168.0.232:8088/
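If the daemons came up, both web UIs should answer over HTTP. A quick check, assuming curl is installed (the -L follows the ResourceManager's redirect, so both should end in 200):

# Expect HTTP 200 from the NameNode and ResourceManager web UIs
curl -s -L -o /dev/null -w "%{http_code}\n" http://192.168.0.230:50070/
curl -s -L -o /dev/null -w "%{http_code}\n" http://192.168.0.232:8088/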
Start HDFS (run start-dfs.sh once, on mr00):
[root@mr00 sbin]# start-dfs.sh
Starting namenodes on [mr00]
mr00: starting namenode, logging to /opt/hadoop/hadoop2.9/logs/hadoop-root-namenode-mr00.out
mr01: starting datanode, logging to /opt/hadoop/hadoop2.9/logs/hadoop-root-datanode-mr01.out
mr02: starting datanode, logging to /opt/hadoop/hadoop2.9/logs/hadoop-root-datanode-mr02.out
mr00: starting datanode, logging to /opt/hadoop/hadoop2.9/logs/hadoop-root-datanode-mr00.out
Starting secondary namenodes [mr02]
mr02: starting secondarynamenode, logging to /opt/hadoop/hadoop2.9/logs/hadoop-root-secondarynamenode-mr02.out
Start YARN (run start-yarn.sh on mr02, where the ResourceManager lives):
[root@mr02 ~]# start-yarn.sh
starting yarn daemons
starting resourcemanager, logging to /opt/hadoop/hadoop2.9/logs/yarn-root-resourcemanager-mr02.out
mr01: starting nodemanager, logging to /opt/hadoop/hadoop2.9/logs/yarn-root-nodemanager-mr01.out
mr02: starting nodemanager, logging to /opt/hadoop/hadoop2.9/logs/yarn-root-nodemanager-mr02.out
mr00: starting nodemanager, logging to /opt/hadoop/hadoop2.9/logs/yarn-root-nodemanager-mr00.out
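Whichever way the cluster was started, jps on each node should match the role table above (NN on mr00, 2NN and RM on mr02, DN and NM everywhere). A quick loop using the JDK path configured earlier:

# List the running Java daemons on every node
for host in mr00 mr01 mr02; do
  echo "== $host =="
  ssh $host /opt/java/jdk1.8/bin/jps
done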
Create a directory in HDFS
[root@mr00 ~]# hdfs dfs -mkdir -p /root/test
[root@mr00 ~]# hdfs dfs -ls /
Found 1 items
drwxr-xr-x - root supergroup 0 2020-07-03 17:23 /root
[root@mr00 ~]#
Upload a file
[root@mr00 ~]# touch test.txt
[root@mr00 ~]# echo This is a testing! > test.txt
[root@mr00 ~]# hdfs dfs -put test.txt /root/test/
[root@mr00 ~]# hdfs dfs -ls /root/test
Found 1 items
-rw-r--r-- 3 root supergroup 19 2020-07-03 17:25 /root/test/test.txt
[root@mr00 ~]#
Download a file
[root@mr00 ~]# hdfs dfs -get /root/test/test.txt
[root@mr00 ~]# ll
total 12
-rw-------. 1 root root 1621 Jun 30 22:54 anaconda-ks.cfg
drwxr-xr-x 4 root root 30 Jul 1 23:56 hadoop
-rwxrwxrwx 1 root root 377 Jul 1 23:36 rsync-script
-rw-r--r-- 1 root root 19 Jul 3 17:26 test.txt
[root@mr00 ~]#
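To confirm the round trip, the copy stored in HDFS and the downloaded local file should be identical:

# Compare the HDFS copy with the local download; no diff output means they match
hdfs dfs -cat /root/test/test.txt
diff <(hdfs dfs -cat /root/test/test.txt) test.txt && echo "files match"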
MapReduce
## Create a file for WordCount and upload it to HDFS
[root@mr00 ~]# hdfs dfs -mkdir /wcinput
[root@mr00 ~]# touch wc.txt
[root@mr00 ~]# vim wc.txt
[root@mr00 ~]# hdfs dfs -put wc.txt /wcinput
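The contents of wc.txt are not shown above; any whitespace-separated words will do. A hypothetical input consistent with the counts printed further down could be:

hadoop hdfs yarn
lagou mapreduce yarn
lagou mapreduce doop
lagou mapreduce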
## Run the MapReduce job (the word count example shipped with Hadoop)
[root@mr00 ~]# hadoop jar /opt/hadoop/hadoop2.9/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.9.2.jar wordcount /wcinput /wcoutput
## Check the result
[root@mr00 ~]# hdfs dfs -cat /wcoutput/part-r-00000
doop 1
hadoop 1
hdfs 1
lagou 3
mapreduce 3
yarn 2
vim mapred-site.xml
Add the following to configure the JobHistory server:
<property>
    <name>mapreduce.jobhistory.address</name>
    <value>mr00:10020</value>
</property>
<property>
    <name>mapreduce.jobhistory.webapp.address</name>
    <value>mr00:19888</value>
</property>
# Distribute the updated mapred-site.xml to all nodes
rsync-script mapred-site.xml
Start the history server (on mr00):
mr-jobhistory-daemon.sh start historyserver
History server web UI:
http://192.168.0.230:19888/jobhistory
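The JobHistoryServer runs as its own JVM, so it can also be verified with jps on mr00, where it was started:

# The history server should appear as JobHistoryServer
jps | grep JobHistoryServer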
vim yarn-site.xml
Enable log aggregation and keep aggregated logs for 7 days (604800 seconds):

<property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
</property>
<property>
    <name>yarn.log-aggregation.retain-seconds</name>
    <value>604800</value>
</property>
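Because yarn-site.xml changed, it has to be pushed to every node before the restart below. A sketch using the distribution script from earlier (HADOOP_HOME expands locally to /opt/hadoop/hadoop2.9, which is the same on every node):

# Push the updated yarn-site.xml, then restart HDFS (from mr00) and YARN (from mr02)
/root/rsync-script $HADOOP_HOME/etc/hadoop/yarn-site.xml
stop-dfs.sh && start-dfs.sh
ssh mr02 $HADOOP_HOME/sbin/stop-yarn.sh
ssh mr02 $HADOOP_HOME/sbin/start-yarn.sh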
# Restart HDFS and YARN so the new settings take effect
# Delete the previous output directory in HDFS
[root@mr00 hadoop]# hdfs dfs -rm -R /wcoutput
Deleted /wcoutput
# Restart the history server (stop, then start) so it picks up the log-aggregation settings
mr-jobhistory-daemon.sh stop historyserver
mr-jobhistory-daemon.sh start historyserver
# Re-run the MapReduce job
hadoop jar /opt/hadoop/hadoop2.9/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.9.2.jar wordcount /wcinput /wcoutput
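With log aggregation enabled, the container logs of a finished job can now be fetched through the YARN CLI; the application id is printed in the job output and also visible in the ResourceManager UI:

# List finished applications, then pull the aggregated logs for one of them
yarn application -list -appStates FINISHED
yarn logs -applicationId <application_id>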
Content source: Lagou Education (拉勾教育) Big Data Training Camp