The main job of a DataNode (DN) is storing and serving data. Two kinds of peers exchange blocks with it: clients and other DNs. Shipping blocks back and forth is heavy work, and although a DN rarely faces extreme concurrency, serving requests one after another would still hurt throughput. So every time the DN receives a block operation request, it spawns a dedicated thread to handle it. The dataXceiverServer discussed here is like a restaurant owner: whenever a customer walks in, it sends out a waiter to serve that customer one on one, and DataXceiver plays the waiter. The dataXceiverServer is created as follows:
void startDataNode(Configuration conf,
                   AbstractList<File> dataDirs,
                   SecureResources resources) throws IOException {
  .............
  // Create a ServerSocket
  ServerSocket ss;
  if (secureResources == null) {
    // If a write timeout is configured, create it from a ServerSocketChannel;
    // that allows non-blocking mode, which helps a lot on poor networks
    ss = (socketWriteTimeout > 0) ?
        ServerSocketChannel.open().socket() : new ServerSocket();
    // Bind the ServerSocket to the streaming port, 50010 by default
    Server.bind(ss, socAddr, 0);
  } else {
    ss = resources.getStreamingSocket();
  }
  // Set the receive buffer size
  ss.setReceiveBufferSize(DEFAULT_DATA_SOCKET_SIZE);
  // Get the port that was actually bound
  tmpPort = ss.getLocalPort();
  selfAddr = new InetSocketAddress(ss.getInetAddress().getHostAddress(),
                                   tmpPort);
  // Update the registration record that will be reported to the NameNode
  this.dnRegistration.setName(machineName + ":" + tmpPort);
  LOG.info("Opened info server at " + tmpPort);
  // Create a thread group; both the listening DataXceiverServer and the
  // per-connection DataXceiver threads live in it, all as daemon threads
  this.threadGroup = new ThreadGroup("dataXceiverServer");
  // Create the dataXceiverServer; it is started right before the DN enters
  // its main service loop
  this.dataXceiverServer = new Daemon(threadGroup,
      new DataXceiverServer(ss, conf, this));
  this.threadGroup.setDaemon(true); // auto destroy when empty
  ................
}
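Note that the daemon is only constructed here; it is started later, right before the DataNode enters its main service loop. A rough, hedged sketch of where that happens (paraphrasing DataNode.run(), not quoting it):

// Hedged sketch: roughly what DataNode.run() does with the daemon created in
// startDataNode(); not a verbatim copy of the source.
public void run() {
  // start the listener that accepts block-transfer connections
  dataXceiverServer.start();
  // ... then register with the NameNode and fall into the heartbeat/offerService loop
}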
Now let's look at the thread body of dataXceiverServer itself (DataXceiverServer.run()); it looks much like any hand-written network server:
public void run() {
  // Loop until the DataNode is told to stop
  while (datanode.shouldRun) {
    try {
      // Block waiting for a connection
      Socket s = ss.accept();
      s.setTcpNoDelay(true);
      // Spawn a new thread to serve this client; note that it also lives
      // in the dataXceiverServer thread group
      new Daemon(datanode.threadGroup,
          new DataXceiver(s, datanode, this)).start();
    } catch (SocketTimeoutException ignored) {
      // wake up to see if should continue to run
    } catch (AsynchronousCloseException ace) {
      LOG.warn(datanode.dnRegistration + ":DataXceiveServer:"
          + StringUtils.stringifyException(ace));
      datanode.shouldRun = false;
    } catch (IOException ie) {
      LOG.warn(datanode.dnRegistration + ":DataXceiveServer: IOException due to:"
          + StringUtils.stringifyException(ie));
    } catch (Throwable te) {
      LOG.error(datanode.dnRegistration + ":DataXceiveServer: Exiting due to:"
          + StringUtils.stringifyException(te));
      datanode.shouldRun = false;
    }
  }
  try {
    // Close the listening socket
    ss.close();
  } catch (IOException ie) {
    LOG.warn(datanode.dnRegistration + ":DataXceiveServer: Close exception due to: "
        + StringUtils.stringifyException(ie));
  }
  LOG.info("Exiting DataXceiveServer");
}
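One detail worth calling out is the SocketTimeoutException branch: accept() can only time out if the listening socket has a timeout configured somewhere, and the catch body deliberately does nothing so that the while condition gets another chance to check datanode.shouldRun. The snippet below is a generic illustration of that shutdown pattern, not DataNode code; all the names in it are made up for the example.

import java.io.IOException;
import java.net.ServerSocket;
import java.net.Socket;
import java.net.SocketTimeoutException;

// Generic illustration (not Hadoop code) of why catching SocketTimeoutException
// and doing nothing is enough for an accept loop to notice a shutdown flag.
class StoppableAcceptLoop implements Runnable {
  private final ServerSocket ss;
  volatile boolean shouldRun = true; // flipped by another thread on shutdown

  StoppableAcceptLoop(int port) throws IOException {
    ss = new ServerSocket(port);
    ss.setSoTimeout(1000); // accept() wakes up at least once per second
  }

  @Override
  public void run() {
    while (shouldRun) {
      try (Socket s = ss.accept()) {   // blocks for at most one second
        // in real code the socket would be handed off to a per-connection
        // worker thread; here it is simply closed again
      } catch (SocketTimeoutException ignored) {
        // no client this round; loop around and re-check shouldRun
      } catch (IOException e) {
        shouldRun = false;             // unrecoverable error: stop the loop
      }
    }
  }
}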
This run() method is not where the real work happens; the actual serving is done by DataXceiver. Here is its thread body:
public void run() {
  DataInputStream in = null;
  try {
    // Build the input stream first, because some header and verification
    // information has to be read before anything else
    in = new DataInputStream(
        new BufferedInputStream(NetUtils.getInputStream(s), SMALL_BUFFER_SIZE));
    // Check the data transfer protocol version
    short version = in.readShort();
    if (version != DataTransferProtocol.DATA_TRANSFER_VERSION) {
      throw new IOException("Version Mismatch");
    }
    // Is this a local request? Used only to update the right metrics
    boolean local = s.getInetAddress().equals(s.getLocalAddress());
    // Read the operation code
    byte op = in.readByte();
    // Make sure the number of active xceiver threads does not exceed
    // dataXceiverServer.maxXceiverCount (256 by default, a fairly loose limit)
    int curXceiverCount = datanode.getXceiverCount();
    if (curXceiverCount > dataXceiverServer.maxXceiverCount) {
      throw new IOException("xceiverCount " + curXceiverCount
          + " exceeds the limit of concurrent xcievers "
          + dataXceiverServer.maxXceiverCount);
    }
    // Record the start time
    long startTime = DataNode.now();
    // Dispatch on the operation code
    switch (op) {
    // Read a block
    case DataTransferProtocol.OP_READ_BLOCK:
      readBlock(in);
      datanode.myMetrics.addReadBlockOp(DataNode.now() - startTime);
      if (local)
        datanode.myMetrics.incrReadsFromLocalClient();
      else
        datanode.myMetrics.incrReadsFromRemoteClient();
      break;
    // Write a block
    case DataTransferProtocol.OP_WRITE_BLOCK:
      writeBlock(in);
      datanode.myMetrics.addWriteBlockOp(DataNode.now() - startTime);
      if (local)
        datanode.myMetrics.incrWritesFromLocalClient();
      else
        datanode.myMetrics.incrWritesFromRemoteClient();
      break;
    // Replace a block
    case DataTransferProtocol.OP_REPLACE_BLOCK: // for balancing purpose; send to a destination
      replaceBlock(in);
      datanode.myMetrics.addReplaceBlockOp(DataNode.now() - startTime);
      break;
    // Copy a block
    case DataTransferProtocol.OP_COPY_BLOCK: // for balancing purpose; send to a proxy source
      copyBlock(in);
      datanode.myMetrics.addCopyBlockOp(DataNode.now() - startTime);
      break;
    // Checksum a block
    case DataTransferProtocol.OP_BLOCK_CHECKSUM: // get the checksum of a block
      getBlockChecksum(in);
      datanode.myMetrics.addBlockChecksumOp(DataNode.now() - startTime);
      break;
    default:
      throw new IOException("Unknown opcode " + op + " in data stream");
    }
  } catch (Throwable t) {
    LOG.error(datanode.dnRegistration + ":DataXceiver", t);
  } finally {
    LOG.debug(datanode.dnRegistration + ":Number of active connections is: "
        + datanode.getXceiverCount());
    // Close the streams and drop this socket from the server's child list
    IOUtils.closeStream(in);
    IOUtils.closeSocket(s);
    dataXceiverServer.childSockets.remove(s);
  }
}
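The maxXceiverCount checked above is the well-known dfs.datanode.max.xcievers limit (the misspelling is part of the real config key), which defaults to 256 in this generation of Hadoop. A hedged sketch of how the limit is picked up, paraphrasing the DataXceiverServer constructor rather than quoting it:

// Hedged sketch, not a verbatim quote of DataXceiverServer's constructor.
// The historically misspelled key "dfs.datanode.max.xcievers" controls how
// many concurrent DataXceiver threads a DN will allow.
int maxXceiverCount = conf.getInt("dfs.datanode.max.xcievers", 256);

Workloads that keep many readers open at once, HBase being the classic case, usually raise this value (for example to 4096) in hdfs-site.xml.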
Take reading a block as the example and see how it is handled:
private void readBlock(DataInputStream in) throws IOException {
  //
  // Read the request header
  //
  long blockId = in.readLong();                       // block id
  Block block = new Block(blockId, 0, in.readLong()); // generation stamp
  long startOffset = in.readLong();                   // offset to start reading from
  long length = in.readLong();                        // number of bytes to read
  String clientName = Text.readString(in);            // client name
  Token<BlockTokenIdentifier> accessToken = new Token<BlockTokenIdentifier>();
  accessToken.readFields(in);
  // Get the output stream used to send the block back
  OutputStream baseStream = NetUtils.getOutputStream(s,
      datanode.socketWriteTimeout);
  DataOutputStream out = new DataOutputStream(
      new BufferedOutputStream(baseStream, SMALL_BUFFER_SIZE));

  if (datanode.isBlockTokenEnabled) {
    try {
      datanode.blockTokenSecretManager.checkAccess(accessToken, null, block,
          BlockTokenSecretManager.AccessMode.READ);
    } catch (InvalidToken e) {
      try {
        out.writeShort(DataTransferProtocol.OP_STATUS_ERROR_ACCESS_TOKEN);
        out.flush();
        throw new IOException("Access token verification failed, for client "
            + remoteAddress + " for OP_READ_BLOCK for block " + block);
      } finally {
        IOUtils.closeStream(out);
      }
    }
  }
  // Build a BlockSender to ship the block
  BlockSender blockSender = null;
  // The trace format produces log lines like the following, which show up in
  // DN logs all the time:
  // src: /127.0.0.1:50010, dest: /127.0.0.1:50243, bytes: %d, op: HDFS_READ,
  // cliID: DFSClient_-880133444, offset: %d,
  // srvID: DS-1789183053-125.120.30.128-50010-1379249313769,
  // blockid: blk_5420252401562768646_1005, duration: %d
  // i.e. source address, destination address, bytes sent, operation type,
  // client name, offset, and so on
  final String clientTraceFmt =
      clientName.length() > 0 && ClientTraceLog.isInfoEnabled()
          ? String.format(DN_CLIENTTRACE_FORMAT, localAddress, remoteAddress,
              "%d", "HDFS_READ", clientName, "%d",
              datanode.dnRegistration.getStorageID(), block, "%d")
          : datanode.dnRegistration + " Served block " + block + " to "
              + s.getInetAddress();
  try {
    try {
      // If more than one block's worth of data is requested, length is capped
      // at one block, the I/O unit of HDFS
      blockSender = new BlockSender(block, startOffset, length,
          true, true, false, datanode, clientTraceFmt);
    } catch (IOException e) {
      out.writeShort(DataTransferProtocol.OP_STATUS_ERROR);
      throw e;
    }

    out.writeShort(DataTransferProtocol.OP_STATUS_SUCCESS); // send op status
    long read = blockSender.sendBlock(out, baseStream, null); // send the actual data

    if (blockSender.isBlockReadFully()) {
      // If the whole block was sent, wait for the client to report whether
      // its checksum verification succeeded
      try {
        if (in.readShort() == DataTransferProtocol.OP_STATUS_CHECKSUM_OK
            && datanode.blockScanner != null) {
          datanode.blockScanner.verifiedByClient(block);
        }
      } catch (IOException ignored) {}
    }
    // Update metrics
    datanode.myMetrics.incrBytesRead((int) read);
    datanode.myMetrics.incrBlocksRead();
  } catch (SocketException ignored) {
    // Its ok for remote side to close the connection anytime.
    datanode.myMetrics.incrBlocksRead();
  } catch (IOException ioe) {
    /* What exactly should we do here?
     * Earlier version shutdown() datanode if there is disk error.
     */
    LOG.warn(datanode.dnRegistration + ":Got exception while serving " + block
        + " to " + s.getInetAddress() + ":\n"
        + StringUtils.stringifyException(ioe));
    throw ioe;
  } finally {
    // Close the streams
    IOUtils.closeStream(out);
    IOUtils.closeStream(blockSender);
  }
}
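For reference, the order in which readBlock() consumes fields tells us exactly what a client has to put on the wire for OP_READ_BLOCK. Below is a hedged sketch of the matching request writer; it mirrors the reads above one for one, but it is not the actual DFSClient code, and the class and parameter names are made up for the example.

import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.net.Socket;

import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DataTransferProtocol;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.token.Token;

// Hedged sketch of the client side of OP_READ_BLOCK; NOT the real DFSClient.
class ReadBlockRequestSketch {
  static void writeReadRequest(Socket sock, Block block, long startOffset,
      long length, String clientName,
      Token<BlockTokenIdentifier> accessToken) throws IOException {
    DataOutputStream out = new DataOutputStream(new BufferedOutputStream(
        NetUtils.getOutputStream(sock, 0 /* no write timeout, for simplicity */)));

    out.writeShort(DataTransferProtocol.DATA_TRANSFER_VERSION); // protocol version
    out.writeByte(DataTransferProtocol.OP_READ_BLOCK);          // opcode
    out.writeLong(block.getBlockId());                          // block id
    out.writeLong(block.getGenerationStamp());                  // generation stamp
    out.writeLong(startOffset);                                 // where to start reading
    out.writeLong(length);                                      // how many bytes to read
    Text.writeString(out, clientName);                          // client name
    accessToken.write(out);                                     // block access token
    out.flush();
  }
}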
Sending the data is the part we actually care about, and that is BlockSender's main job. It first sends the checksum metadata to the client, such as the chunk size used for checksumming, then decides how big each packet will be, and once all of that is settled the real transfer begins:
long sendBlock(DataOutputStream out, OutputStream baseStream,
               BlockTransferThrottler throttler) throws IOException {
  if (out == null) {
    throw new IOException("out stream is null");
  }
  this.throttler = throttler;   // bandwidth throttler
  long initialOffset = offset;  // starting offset
  long totalRead = 0;           // total bytes read in this call, returned at the end
  OutputStream streamForSendChunks = out;
  // Record the start time
  final long startTime = ClientTraceLog.isInfoEnabled() ? System.nanoTime() : 0;
  try {
    try {
      // Write the checksum header so the client knows how to verify the data
      checksum.writeHeader(out);
      if (chunkOffsetOK) {
        out.writeLong(offset); // where checksumming starts
      }
      out.flush(); // push it to the client
    } catch (IOException e) { // socket error
      throw ioeToSocketException(e);
    }

    // The following block decides how large each packet will be
    int maxChunksPerPacket;
    int pktSize = DataNode.PKT_HEADER_LEN + SIZE_OF_INTEGER;

    if (transferToAllowed && !verifyChecksum
        && baseStream instanceof SocketOutputStream
        && blockIn instanceof FileInputStream) {
      FileChannel fileChannel = ((FileInputStream) blockIn).getChannel();
      // Remember the file position where sending starts
      blockInPosition = fileChannel.position();
      streamForSendChunks = baseStream;
      // How many chunks go into one packet
      maxChunksPerPacket = (Math.max(BUFFER_SIZE, MIN_BUFFER_WITH_TRANSFERTO)
          + bytesPerChecksum - 1) / bytesPerChecksum;
      // Packet size, which determines how much buffer gets allocated
      pktSize += (bytesPerChecksum + checksumSize) * maxChunksPerPacket;
    } else {
      maxChunksPerPacket = Math.max(1,
          (BUFFER_SIZE + bytesPerChecksum - 1) / bytesPerChecksum);
      pktSize += (bytesPerChecksum + checksumSize) * maxChunksPerPacket;
    }

    // Allocate the packet buffer
    ByteBuffer pktBuf = ByteBuffer.allocate(pktSize);

    // Send chunks in a loop until the end offset is reached
    while (endOffset > offset) {
      long len = sendChunks(pktBuf, maxChunksPerPacket, streamForSendChunks);
      offset += len;
      totalRead += len
          + ((len + bytesPerChecksum - 1) / bytesPerChecksum * checksumSize);
      seqno++;
    }
    try {
      // Write the end-of-block marker and flush
      out.writeInt(0); // mark the end of block
      out.flush();
    } catch (IOException e) { // socket error
      throw ioeToSocketException(e);
    }
  } catch (RuntimeException e) {
    LOG.error("unexpected exception sending block", e);
    throw new IOException("unexpected runtime exception", e);
  } finally {
    if (clientTraceFmt != null) {
      // Log how long the transfer took
      final long endTime = System.nanoTime();
      ClientTraceLog.info(String.format(clientTraceFmt, totalRead,
          initialOffset, endTime - startTime));
    }
    close();
  }
  // Record whether the whole block was sent
  blockReadFully = (initialOffset == 0 && offset >= blockLength);
  return totalRead;
}
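To make the packet sizing above concrete, here is a small worked example. The numbers are common defaults and are assumptions for illustration: 512 bytes per checksum chunk (io.bytes.per.checksum), 4-byte CRC32 checksums, and a 64 KB buffer on the transferTo path; the header length is written out by hand instead of being taken from DataNode.PKT_HEADER_LEN.

// Worked example with assumed defaults, for illustration only.
class PacketSizeExample {
  public static void main(String[] args) {
    int bytesPerChecksum = 512;        // io.bytes.per.checksum default
    int checksumSize = 4;              // a CRC32 checksum is 4 bytes per chunk
    int transferToBuffer = 64 * 1024;  // MIN_BUFFER_WITH_TRANSFERTO-sized buffer
    int pktHeaderLen = 4 + 8 + 8 + 1;  // packetLen + offset + seqno + last-packet flag
    int pktSize = pktHeaderLen + 4;    // plus the in-packet data length field = 25

    // transferTo path: (65536 + 511) / 512 = 128 chunks per packet
    int maxChunksPerPacket =
        (transferToBuffer + bytesPerChecksum - 1) / bytesPerChecksum;
    pktSize += (bytesPerChecksum + checksumSize) * maxChunksPerPacket;

    System.out.println(maxChunksPerPacket); // 128
    System.out.println(pktSize);            // 25 + 516 * 128 = 66073, roughly a 64 KB packet
  }
}

On the non-transferTo path the smaller BUFFER_SIZE is used instead, so packets are correspondingly smaller; with a 4 KB buffer that works out to 8 chunks and roughly 4 KB per packet.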
So how are the chunks actually sent?
private int sendChunks(ByteBuffer pkt, int maxChunks, OutputStream out)
    throws IOException {
  // Read at most one packet's worth of chunks
  int len = Math.min((int) (endOffset - offset), bytesPerChecksum * maxChunks);

  // Trim to a whole number of chunks so the client's verification lines up
  if (len > bytesPerChecksum && len % bytesPerChecksum != 0) {
    len -= len % bytesPerChecksum;
  }

  if (len == 0) {
    return 0;
  }
  // How many chunks this packet carries
  int numChunks = (len + bytesPerChecksum - 1) / bytesPerChecksum;
  int packetLen = len + numChunks * checksumSize + 4;
  pkt.clear();

  // Write the packet header: length, offset, sequence number, last-packet flag
  pkt.putInt(packetLen);
  pkt.putLong(offset);
  pkt.putLong(seqno);
  pkt.put((byte) ((offset + len >= endOffset) ? 1 : 0));
  // why no ByteBuf.putBoolean()?
  pkt.putInt(len);

  int checksumOff = pkt.position();
  int checksumLen = numChunks * checksumSize;
  // The backing array of the packet buffer, which holds the data to send
  byte[] buf = pkt.array();

  if (checksumSize > 0 && checksumIn != null) {
    try {
      // Read the checksums for this group of chunks
      checksumIn.readFully(buf, checksumOff, checksumLen);
    } catch (IOException e) {
      LOG.warn(" Could not read or failed to verify checksum for data"
          + " at offset " + offset + " for block " + block + " got : "
          + StringUtils.stringifyException(e));
      IOUtils.closeStream(checksumIn);
      checksumIn = null;
      if (corruptChecksumOk) {
        if (checksumOff < checksumLen) {
          // Just fill the array with zeros.
          Arrays.fill(buf, checksumOff, checksumLen, (byte) 0);
        }
      } else {
        throw e;
      }
    }
  }
  // Advance past the checksums and start reading the data
  int dataOff = checksumOff + checksumLen;

  if (blockInPosition < 0) {
    // normal transfer
    IOUtils.readFully(blockIn, buf, dataOff, len);

    if (verifyChecksum) {
      int dOff = dataOff;
      int cOff = checksumOff;
      int dLeft = len;

      for (int i = 0; i < numChunks; i++) {
        checksum.reset();
        int dLen = Math.min(dLeft, bytesPerChecksum);
        checksum.update(buf, dOff, dLen);
        if (!checksum.compare(buf, cOff)) {
          // Checksum mismatch; this gets logged and later picked up by
          // DataBlockScanner, covered in the next post
          throw new ChecksumException("Checksum failed at "
              + (offset + len - dLeft), len);
        }
        dLeft -= dLen;
        dOff += dLen;
        cOff += checksumSize;
      }
    }
    // only recompute checksum if we can't trust the meta data due to
    // concurrent writes
    if (memoizedBlock.hasBlockChanged(len)) {
      ChecksumUtil.updateChunkChecksum(buf, checksumOff, dataOff, len, checksum);
    }

    try {
      out.write(buf, 0, dataOff + len);
    } catch (IOException e) {
      throw ioeToSocketException(e);
    }
  } else {
    try {
      // use transferTo(). Checks on out and blockIn are already done.
      SocketOutputStream sockOut = (SocketOutputStream) out;
      FileChannel fileChannel = ((FileInputStream) blockIn).getChannel();

      if (memoizedBlock.hasBlockChanged(len)) {
        fileChannel.position(blockInPosition);
        IOUtils.readFileChannelFully(fileChannel, buf, dataOff, len);
        // Recompute the checksums
        ChecksumUtil.updateChunkChecksum(buf, checksumOff, dataOff, len, checksum);
        sockOut.write(buf, 0, dataOff + len);
      } else {
        // Write out the header and checksums
        sockOut.write(buf, 0, dataOff);
        // no need to flush. since we know out is not a buffered stream.
        sockOut.transferToFully(fileChannel, blockInPosition, len);
      }

      blockInPosition += len;
    } catch (IOException e) {
      /* exception while writing to the client (well, with transferTo(),
       * it could also be while reading from the local file).
       */
      throw ioeToSocketException(e);
    }
  }

  if (throttler != null) {
    // After each packet, the throttler checks whether the bandwidth budget
    // has been exceeded; if so it makes us wait, limiting network usage
    throttler.throttle(packetLen);
  }

  return len;
}
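Putting the header writes together, every packet that sendChunks() emits starts with a fixed header, followed by the checksums and then the chunk data. Below is a hedged receiver-side sketch of walking one packet; the field order is taken directly from the putInt/putLong calls above, but this is not the actual DFSClient/BlockReader code and the helper class is made up for the example.

import java.io.DataInputStream;
import java.io.IOException;

// Hedged receiver-side sketch (not DFSClient code): the field order mirrors
// the header written by sendChunks() above.
class PacketReaderSketch {
  static void readOnePacket(DataInputStream in, int bytesPerChecksum,
                            int checksumSize) throws IOException {
    int packetLen = in.readInt();          // dataLen + numChunks * checksumSize + 4
    long offsetInBlock = in.readLong();    // offset of this packet within the block
    long seqno = in.readLong();            // packet sequence number
    boolean lastPacket = in.readBoolean(); // nonzero byte marks the final packet
    int dataLen = in.readInt();            // bytes of real data in this packet

    int numChunks = (dataLen + bytesPerChecksum - 1) / bytesPerChecksum;
    byte[] checksums = new byte[numChunks * checksumSize];
    in.readFully(checksums);               // one checksum per chunk of data
    byte[] data = new byte[dataLen];
    in.readFully(data);                    // the chunk data itself
    // After the last packet, sendBlock() writes a trailing int 0 as the
    // end-of-block marker.
  }
}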