# Install Java 8
sudo apt-get install openjdk-8-jdk
# Verify the installation
java -version
# Download Hadoop 3.0.4
wget https://archive.apache.org/dist/hadoop/common/hadoop-3.0.4/hadoop-3.0.4.tar.gz
# Extract to the target directory
sudo tar -zxvf hadoop-3.0.4.tar.gz -C /usr/local/
sudo ln -s /usr/local/hadoop-3.0.4 /usr/local/hadoop
# Set environment variables (single quotes so the variables expand at login, not now)
echo 'export HADOOP_HOME=/usr/local/hadoop' >> ~/.bashrc
echo 'export PATH=$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$PATH' >> ~/.bashrc
source ~/.bashrc
# Enter the configuration directory
cd $HADOOP_HOME/etc/hadoop
# Edit core-site.xml
vi core-site.xml
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://localhost:9000</value>
  </property>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/usr/local/hadoop/tmp</value>
  </property>
</configuration>
vi hdfs-site.xml
<configuration>
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>/usr/local/hadoop/tmp/name</value>
  </property>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>/usr/local/hadoop/tmp/data</value>
  </property>
</configuration>
vi yarn-site.xml
<configuration>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
    <value>org.apache.hadoop.mapred.ShuffleHandler</value>
  </property>
  <property>
    <name>yarn.resourcemanager.hostname</name>
    <value>localhost</value>
  </property>
  <property>
    <name>yarn.nodemanager.env-whitelist</name>
    <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
  </property>
</configuration>
vi mapred-site.xml
<configuration>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
  <property>
    <name>mapreduce.application.classpath</name>
    <value>$HADOOP_HOME/share/hadoop/mapreduce/*,$HADOOP_HOME/share/hadoop/mapreduce/lib/*</value>
  </property>
</configuration>
# Format the NameNode
hdfs namenode -format
# Start HDFS (assumes passwordless SSH to localhost is configured)
start-dfs.sh
# Start YARN
start-yarn.sh
# Verify that the daemons are running
jps
Under normal conditions, jps should list the NameNode, DataNode, ResourceManager, and NodeManager processes.
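With the daemons up, client code reaches HDFS through the fs.defaultFS address configured earlier. The class below is a minimal sketch of such a programmatic check (HdfsSmokeTest is a made-up name; it assumes HADOOP_CONF_DIR points at the etc/hadoop directory above and that the Hadoop client libraries introduced in the next section are on the classpath):
package com.example;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical smoke test: confirm that client code resolves the default
// filesystem configured in core-site.xml (hdfs://localhost:9000).
public class HdfsSmokeTest {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();  // loads core-site.xml from the classpath
        FileSystem fs = FileSystem.get(conf);
        System.out.println("Default filesystem: " + fs.getUri());
        System.out.println("/ exists: " + fs.exists(new Path("/")));
    }
}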
The project uses Maven to manage dependencies; its structure is as follows:
mr-demo/
├── src/
│ ├── main/
│ │ ├── java/
│ │ │ └── com/
│ │ │ └── example/
│ │ │ ├── WordCountMapper.java
│ │ │ ├── WordCountReducer.java
│ │ │ └── WordCountDriver.java
│ │ └── resources/
│ └── test/
│ └── java/
└── pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.example</groupId>
  <artifactId>mr-demo</artifactId>
  <version>1.0-SNAPSHOT</version>
  <packaging>jar</packaging>
  <properties>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <hadoop.version>3.0.4</hadoop.version>
  </properties>
  <dependencies>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-core</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
      <version>${hadoop.version}</version>
      <exclusions>
        <exclusion>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-mapreduce-client-core</artifactId>
        </exclusion>
      </exclusions>
    </dependency>
  </dependencies>
  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.8.1</version>
        <configuration>
          <source>1.8</source>
          <target>1.8</target>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <version>3.2.4</version>
        <configuration>
          <!-- Attach the fat JAR with the "shaded" classifier so the plain JAR is kept as well -->
          <shadedArtifactAttached>true</shadedArtifactAttached>
          <shadedClassifierName>shaded</shadedClassifierName>
          <filters>
            <filter>
              <artifact>*:*</artifact>
              <excludes>
                <exclude>META-INF/*.SF</exclude>
                <exclude>META-INF/*.DSA</exclude>
                <exclude>META-INF/*.RSA</exclude>
              </excludes>
            </filter>
          </filters>
        </configuration>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
package com.example;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Split the input line into words on arbitrary whitespace
        String line = value.toString();
        String[] words = line.split("\\s+");
        // Emit each word with a count of 1
        for (String w : words) {
            if (w.isEmpty()) {
                continue;
            }
            word.set(w);
            context.write(word, one);
        }
    }
}
package com.example;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private IntWritable result = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        // Sum the occurrences of this word
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}
package com.example;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        // Check the arguments
        if (args.length != 2) {
            System.err.println("Usage: WordCountDriver <input path> <output path>");
            System.exit(2);
        }
        // Create the job configuration
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCountDriver.class);
        // Set the Mapper and Reducer classes; the reducer also serves as a combiner
        // because summing counts is associative and commutative
        job.setMapperClass(WordCountMapper.class);
        job.setCombinerClass(WordCountReducer.class);
        job.setReducerClass(WordCountReducer.class);
        // Set the output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Set the input and output paths
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Submit the job and wait for completion
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
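Before submitting to the cluster, it can help to smoke-test the same Mapper and Reducer with the local job runner against the local filesystem. The class below is a minimal sketch; WordCountLocalTest and the local paths are assumptions for illustration, not part of the original walkthrough:
package com.example;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical helper: run the word-count job in-process with the local job
// runner, reading and writing the local filesystem instead of HDFS/YARN.
public class WordCountLocalTest {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("mapreduce.framework.name", "local"); // local job runner instead of YARN
        conf.set("fs.defaultFS", "file:///");          // local filesystem instead of HDFS
        Job job = Job.getInstance(conf, "word count (local test)");
        job.setJarByClass(WordCountDriver.class);
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path("input.txt"));      // assumed local input file
        FileOutputFormat.setOutputPath(job, new Path("local-output")); // assumed output dir; must not exist
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}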
# Run from the project root
mvn clean package
After the build finishes, the target directory contains:
- mr-demo-1.0-SNAPSHOT.jar: the application JAR without dependencies
- mr-demo-1.0-SNAPSHOT-shaded.jar: the fat JAR with all dependencies bundled (recommended)
The core Hadoop 3.0.4 JARs live under:
$HADOOP_HOME/share/hadoop/common/
├── hadoop-common-3.0.4.jar
├── hadoop-nfs-3.0.4.jar
└── ...
$HADOOP_HOME/share/hadoop/mapreduce/
├── hadoop-mapreduce-client-core-3.0.4.jar
├── hadoop-mapreduce-client-common-3.0.4.jar
└── ...
$HADOOP_HOME/share/hadoop/hdfs/
├── hadoop-hdfs-3.0.4.jar
└── ...
# Create the input data files
echo "Hello Hadoop MapReduce" > input.txt
echo "Hadoop is a distributed system" > input2.txt
# Upload them to HDFS
hdfs dfs -mkdir -p /user/input
hdfs dfs -put input*.txt /user/input
# Submit the job using the fat JAR
hadoop jar target/mr-demo-1.0-SNAPSHOT-shaded.jar \
  com.example.WordCountDriver \
  /user/input \
  /user/output
# Monitor via the YARN Web UI (default port 8088)
xdg-open http://localhost:8088/cluster
# Or via the command line
yarn application -list
# View the output
hdfs dfs -cat /user/output/part-r-00000
# Error: Permission denied — relax HDFS permissions (acceptable for a local dev setup)
hdfs dfs -chmod -R 777 /user
# Error: Output directory hdfs://... already exists — remove the old output directory
hdfs dfs -rm -r /user/output
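Alternatively, the driver itself can clear a stale output directory before submission. The snippet below is a sketch of that convenience (not something the original WordCountDriver does); it would go in main() before FileOutputFormat.setOutputPath and needs import org.apache.hadoop.fs.FileSystem;:
// Development convenience: delete an existing output directory so resubmission does not fail
Path outputPath = new Path(args[1]);
FileSystem fs = FileSystem.get(conf);
if (fs.exists(outputPath)) {
    fs.delete(outputPath, true); // recursive delete
}
FileOutputFormat.setOutputPath(job, outputPath);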
# Add memory settings to yarn-site.xml
<property>
  <name>yarn.nodemanager.resource.memory-mb</name>
  <value>8192</value> <!-- 8 GB available to NodeManager containers -->
</property>
<property>
  <name>yarn.scheduler.minimum-allocation-mb</name>
  <value>1024</value>
</property>
<property>
  <name>yarn.scheduler.maximum-allocation-mb</name>
  <value>8192</value>
</property>
# Resolve dependency conflicts in the fat JAR: add an <exclusion> to the offending <dependency> in pom.xml
<exclusion>
  <groupId>com.example</groupId>
  <artifactId>conflict-library</artifactId>
</exclusion>
// Set resource parameters in the Driver
Configuration conf = new Configuration();
conf.set("mapreduce.map.memory.mb", "2048");    // memory per map task
conf.set("mapreduce.reduce.memory.mb", "4096"); // memory per reduce task
conf.set("mapreduce.map.cpu.vcores", "2");      // vcores per map task
// A custom Partitioner
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class WordCountPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // Route words to reducers by key length
        return key.toString().length() % numPartitions;
    }
}
// Register it in the Driver
job.setPartitionerClass(WordCountPartitioner.class);
job.setNumReduceTasks(4); // run 4 reduce tasks
Through the hands-on steps in this article, we developed and deployed a MapReduce program on Hadoop 3.0.4: setting up and configuring the environment, building the project with Maven, implementing the Mapper, Reducer, and Driver, packaging a fat JAR, and submitting and verifying the job.
MapReduce remains the foundational computation model of Hadoop. Although it is gradually being displaced by Spark and Flink for real-time workloads, mastering its principles and development workflow is still a solid way to understand the core ideas of distributed computing.
In real applications, you can go further by tuning task parameters, defining custom data types, and implementing more complex join logic according to your data volume and business requirements, making full use of the Hadoop cluster's computing capacity.
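As one example of the custom data types mentioned above, a value class can implement Writable so it can be serialized through the shuffle. The WordStats class below is purely illustrative (its name and fields are assumptions, not from this article):
package com.example;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

// Hypothetical composite value type: carries both a count and a total length,
// so a reducer could compute average word length alongside the word count.
public class WordStats implements Writable {
    private long count;
    private long totalLength;

    public WordStats() { } // no-arg constructor required by Hadoop serialization

    public WordStats(long count, long totalLength) {
        this.count = count;
        this.totalLength = totalLength;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(count);
        out.writeLong(totalLength);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        count = in.readLong();
        totalLength = in.readLong();
    }

    public long getCount() { return count; }
    public long getTotalLength() { return totalLength; }

    @Override
    public String toString() { // used by TextOutputFormat when emitted as a value
        return count + "\t" + totalLength;
    }
}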