Based on the Hadoop 3.3.4 source code, focusing on the key classes and method implementations.
// The client creates a file output stream
FileSystem fs = FileSystem.get(conf);
FSDataOutputStream out = fs.create(new Path("/user/data.log"));
Call chain:
// The NameNode handles the create request
public HdfsFileStatus create(String src, ...) throws IOException {
  // Check permissions and the target path
  FSPermissionChecker pc = getPermissionChecker();
  // Create the file node in the namespace
  stat = FSDirWriteFileOp.startFile(
      fsd, src, permissions, holder, clientMachine, flag, createParent);
  // Return the file status; no data block is allocated here
  return stat;
}
Key point: only metadata is created at this stage; no data block is allocated yet (lazy block allocation).
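The lazy allocation is visible from the client side as well. A minimal sketch (reusing the example path from above): right after create() the file exists in the namespace with length 0, and block locations only appear once data has been written and the file closed.

FileSystem fs = FileSystem.get(conf);
Path p = new Path("/user/data.log");
FSDataOutputStream out = fs.create(p);
// The file is visible in the namespace, but no block has been allocated yet
System.out.println(fs.getFileStatus(p).getLen());                         // prints 0
out.write(new byte[4096]);
out.close();
// After data is written and the file is closed, block locations exist
System.out.println(fs.getFileBlockLocations(fs.getFileStatus(p), 0, 4096).length);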
// Block allocation is triggered when the client writes data
public void write(byte b[], int off, int len) throws IOException {
  // Check whether the current packet is already full
  if (currentPacket == null ||
      currentPacket.getNumChunks() >= chunksPerPacket) {
    // Request a new data block and pipeline
    allocateBlockAndPipeline();
  }
  // Copy the data into the current packet
  currentPacket.write(b, off, len);
}
private void allocateBlockAndPipeline() throws IOException {
  // Ask the NameNode to allocate a new block
  LocatedBlock lb = nextBlockOutputStream(src);
  // Set up the write pipeline
  createBlockOutputStream(lb);
}
private LocatedBlock nextBlockOutputStream(String src) throws IOException {
  // RPC to the NameNode to obtain the location of a new block
  return namenode.addBlock(src, clientName, previous, excludeNodes);
}
public LocatedBlock addBlock(String src, String clientName, ...) {
  // Choose the target DataNodes for the new replicas
  DatanodeStorageInfo[] targets = blockManager.chooseTarget4Write(
      replication, clientNode, excludedNodes, blockSize);
  // Create the new block and persist the metadata
  block = new Block(blockId, blockSize, generationStamp);
  fsd.addBlock(src, block, targets);
  // Return a LocatedBlock that carries the DataNode locations
  return new LocatedBlock(block, targets);
}
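chooseTarget4Write ends up in the configured block placement policy; the default policy is rack-aware. A deliberately simplified sketch of that rule (a hypothetical helper, not the real BlockPlacementPolicyDefault; it ignores load, storage type, and randomization):

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;

// Illustrative only: 1st replica on the writer's node, 2nd on a node in a
// different rack, 3rd on another node in the same rack as the 2nd replica.
static List<String> chooseTargetsSketch(String writer, Map<String, String> nodeToRack,
                                        Set<String> excluded) {
  List<String> targets = new ArrayList<>();
  if (!excluded.contains(writer)) {
    targets.add(writer);                                        // replica 1: local node
  }
  String localRack = nodeToRack.get(writer);
  String secondRack = null;
  for (Map.Entry<String, String> e : nodeToRack.entrySet()) {   // replica 2: remote rack
    if (!excluded.contains(e.getKey()) && !e.getValue().equals(localRack)) {
      targets.add(e.getKey());
      secondRack = e.getValue();
      break;
    }
  }
  for (Map.Entry<String, String> e : nodeToRack.entrySet()) {   // replica 3: same rack as 2nd
    if (!excluded.contains(e.getKey()) && !targets.contains(e.getKey())
        && e.getValue().equals(secondRack)) {
      targets.add(e.getKey());
      break;
    }
  }
  return targets;
}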
// The DataStreamer thread sends the queued packets
protected void run() throws IOException {
  while (toProcess != null) {
    // Send the packet to the first node in the pipeline
    sendPacket(toProcess);
    // Move it onto the queue that waits for ACKs
    ackQueue.add(toProcess);
  }
}
private void sendPacket(Packet packet) throws IOException {
  // Write the packet to the pipeline's output stream
  out.write(packet.getBuf(), 0, packet.getLength());
}

// DataNode-side forwarding (DataXceiver.writeBlock()): each node in the pipeline
// mirrors the received packet to the next downstream DataNode
if (isDatanode(targets.length)) {
  // Forward the packet to the next hop in the pipeline
  next.send(out);
}
FileSystem fs = FileSystem.get(conf);
FSDataInputStream in = fs.open(new Path("/user/data.log"));
Call chain:
public LocatedBlocks getBlockLocations(String src, long offset, long length) {
  // Check that the file exists
  INode inode = fsd.getINode(src);
  // Collect the location information for every block of the file
  BlockInfo[] blocks = inode.getBlocks();
  List<LocatedBlock> locatedBlocks = new ArrayList<>();
  for (BlockInfo blk : blocks) {
    // Find the DataNodes that hold this block
    DatanodeStorageInfo[] locs = blockManager.getDatanodeManager()
        .getStorages(blk);
    // Sort the locations by network distance from the client
    blockManager.sortLocatedBlocks(src, locs);
    locatedBlocks.add(new LocatedBlock(blk, locs));
  }
  return new LocatedBlocks(locatedBlocks);
}
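From the client side, the same information is exposed through the public FileSystem API. A minimal usage sketch (path reused from the example above):

FileStatus st = fs.getFileStatus(new Path("/user/data.log"));
BlockLocation[] locs = fs.getFileBlockLocations(st, 0, st.getLen());
for (BlockLocation loc : locs) {
  // Print each block's offset, length and replica hosts
  System.out.println(loc.getOffset() + "+" + loc.getLength()
      + " -> " + String.join(",", loc.getHosts()));
}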
public int read(long position, byte[] buffer, int offset, int length) {
  // Locate the block that contains the requested position
  LocatedBlock targetBlock = getBlockAt(position);
  // Choose the best DataNode to read from
  DatanodeInfo chosenNode = chooseDataNode(targetBlock);
  // Create a BlockReader and read the data
  blockReader = new BlockReader(chosenNode, targetBlock);
  return blockReader.read(buffer, offset, length);
}
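This positioned-read path is what backs FSDataInputStream's pread API. A minimal usage sketch contrasting it with the streaming read (path and offset are example values):

FSDataInputStream in = fs.open(new Path("/user/data.log"));
byte[] buf = new byte[4096];
// Positioned read: does not move the stream position, safe for concurrent callers
int n = in.read(128L * 1024 * 1024, buf, 0, buf.length);
// Streaming read: seek moves the position, read then advances it
in.seek(0);
int m = in.read(buf, 0, buf.length);
System.out.println("pread=" + n + " stream read=" + m);
in.close();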
// Triggered when the client and the DataNode run on the same host
public FetchResponse getBlock(ExtendedBlock block, long offset) {
  // Check whether short-circuit reads are available
  if (isShortCircuitAvailable(block)) {
    // Read directly through a local file descriptor
    FileInputStream fis = new FileInputStream(
        volume.getBlockFile(block.getBlockPoolId(), block.getLocalBlock()));
    fis.getChannel().position(offset);
    return new FetchResponse(fis);
  }
  // Otherwise fall back to a network read
  return remoteFetch(block, offset);
}
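Short-circuit reads must be enabled on both the client and the DataNode. A minimal client-side configuration sketch (the domain-socket path is only an example value and must match the DataNode's dfs.domain.socket.path):

Configuration conf = new Configuration();
conf.setBoolean("dfs.client.read.shortcircuit", true);
conf.set("dfs.domain.socket.path", "/var/lib/hadoop-hdfs/dn_socket");  // example path
FileSystem fs = FileSystem.get(conf);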
public int read(byte[] buf, int off, int len) throws IOException {
  // Establish a connection to the DataNode if there is none yet
  if (conn == null) {
    conn = datanode.newConnection();
    out = conn.getOutputStream();
    in = conn.getInputStream();
  }
  // Send the read request
  new Sender(out).readBlock(block, accessToken, clientName, offset, len);
  // Receive the data stream
  return in.read(buf, off, len);
}
// The client-side ACK-processing thread
public void run() {
  while (running) {
    // Receive an ACK from the pipeline
    PipelineAck ack = readPipelineAck();
    // Handle the status codes
    if (ack.getSeqno() == Packet.HEART_BEAT_SEQNO) {
      // Heartbeat packet: nothing to remove from the queue
    } else if (ack.isSuccess()) {
      // On success, remove the packet from the ack queue
      ackQueue.remove(ack.getSeqno());
    } else {
      // On failure, rebuild the pipeline and resend
      closePipeline();
      setupPipeline();
      resendPackets();
    }
  }
}
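Pipeline recovery behaviour is tunable on the client. A minimal sketch of the relevant settings (values shown are the usual defaults, treat them as assumptions):

Configuration conf = new Configuration();
// Whether to add a replacement DataNode when a pipeline node fails
conf.setBoolean("dfs.client.block.write.replace-datanode-on-failure.enable", true);
// DEFAULT only replaces nodes for larger pipelines; ALWAYS / NEVER are the alternatives
conf.set("dfs.client.block.write.replace-datanode-on-failure.policy", "DEFAULT");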
private DatanodeInfo refetchLocations() throws IOException {
  // Retry automatically when a read fails
  for (int retry = 0; retry < maxRetry; retry++) {
    // Fetch fresh block locations
    LocatedBlock newBlock = namenode.getBlockLocations(src, offset);
    // Pick a new node
    DatanodeInfo newNode = chooseNewNode(newBlock);
    if (connectToDatanode(newNode)) {
      return newNode;
    }
  }
  throw new IOException("Failed to find valid datanode");
}
public int read(byte[] buf, int off, int len) throws IOException {
  // Read the data
  int nRead = dataIn.read(buf, off, len);
  // Read the matching checksum bytes
  int checksumLen = getChecksumLength(len);
  byte[] checksums = new byte[checksumLen];
  checksumIn.readFully(checksums);
  // Verify the checksums
  if (!validateChecksums(buf, off, nRead, checksums)) {
    // Report the corrupt replica and throw
    reportBadBlock();
    throw new ChecksumException("Corrupt block", offset);
  }
  return nRead;
}
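For reference, HDFS checksums are computed per chunk (CRC32C by default). A minimal sketch of that chunking, assuming the default dfs.bytes-per-checksum of 512:

import java.util.zip.CRC32C;

// Compute one CRC32C value per bytesPerChecksum-sized chunk of data
static int[] chunkChecksums(byte[] data, int bytesPerChecksum) {
  int chunks = (data.length + bytesPerChecksum - 1) / bytesPerChecksum;
  int[] sums = new int[chunks];
  CRC32C crc = new CRC32C();
  for (int i = 0; i < chunks; i++) {
    crc.reset();
    int len = Math.min(bytesPerChecksum, data.length - i * bytesPerChecksum);
    crc.update(data, i * bytesPerChecksum, len);
    sums[i] = (int) crc.getValue();
  }
  return sums;
}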
public ByteBuffer readZeroCopy(long offset, int length) throws IOException {
  // Memory-map the block file to avoid copying through user-space buffers
  FileChannel fc = file.getChannel();
  return fc.map(FileChannel.MapMode.READ_ONLY, offset, length);
}
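On the public API side, zero-copy reads are exposed through FSDataInputStream's ByteBuffer read. A minimal usage sketch (SKIP_CHECKSUMS and the 4 MB length are example choices, not requirements):

import java.nio.ByteBuffer;
import java.util.EnumSet;
import org.apache.hadoop.fs.ReadOption;
import org.apache.hadoop.io.ElasticByteBufferPool;

FSDataInputStream in = fs.open(new Path("/user/data.log"));
ElasticByteBufferPool pool = new ElasticByteBufferPool();
// May return an mmapped buffer when the block is local; null means end of stream
ByteBuffer buf = in.read(pool, 4 * 1024 * 1024, EnumSet.of(ReadOption.SKIP_CHECKSUMS));
try {
  // ... consume buf ...
} finally {
  if (buf != null) {
    in.releaseBuffer(buf);   // hand the buffer back to the stream/pool
  }
  in.close();
}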
public void prefetch(LocatedBlock block) {
  // A background thread prefetches the following blocks
  executor.submit(() -> {
    for (Block next : getNextBlocks(block)) {
      // Establish connections ahead of time
      BlockReader reader = createBlockReader(next);
      reader.prefetch();
    }
  });
}
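A related knob on the public API is the readahead hint. A minimal sketch (the 4 MB value is arbitrary):

FSDataInputStream in = fs.open(new Path("/user/data.log"));
// Ask the underlying stream to read ahead this many bytes (best-effort hint)
in.setReadahead(4L * 1024 * 1024);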
public FSDataInputStream open(Path p) {
  // Cache open file descriptors
  if (cache.containsKey(p)) {
    return cache.get(p).duplicate();
  }
  FSDataInputStream in = rawOpen(p);
  cache.put(p, in);
  return in;
}
public void removeDatanode(DatanodeInfo node) {
  // Find the blocks affected by the lost node
  List<Block> blocks = node.getBlocks();
  for (Block block : blocks) {
    // Check the live replica count
    if (countReplicas(block) < replication) {
      // Schedule re-replication
      replicateBlock(block);
    }
  }
}
private void handleTimeout(Packet packet) throws IOException {
  if (retryCount.get() < maxRetries) {
    // Rebuild the pipeline
    closePipeline();
    setupPipeline();
    // Resend the packet
    resendPacket(packet);
    retryCount.incrementAndGet();
  } else {
    throw new IOException("Write timeout after retries");
  }
}
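The timeouts involved are configurable on both sides. A minimal sketch (values shown in milliseconds are the usual defaults, treat them as assumptions):

Configuration conf = new Configuration();
// Client-side socket read timeout
conf.setInt("dfs.client.socket-timeout", 60000);
// DataNode-side write timeout towards downstream pipeline nodes
conf.setInt("dfs.datanode.socket.write.timeout", 480000);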
Summary of the code analysis:
Tips for debugging in production:
- Enable TRACE logging for org.apache.hadoop.hdfs.protocol.datatransfer
- Pass the JVM option -Ddfs.client.read.shortcircuit=true to force short-circuit reads
- Run hdfs dfsadmin -metaSave to capture the NameNode's internal state