Datanode在启动过程中会对数据目录进行较多的校验,比如是否需要升级、格式化,是否和NN版本一致,是否需要rollback,并最终更新current目录下的VERSION文件信息。下面从startDataNode这个函数入手介绍整个校验流程。这个函数比较长、包含的内容很多,这里只针对校验部分详细分析,其余内容以后再介绍。
void startDataNode(Configuration conf, AbstractList<File>dataDirs, SecureResources resources ) throws IOException { if(UserGroupInformation.isSecurityEnabled()&& resources == null) throw new RuntimeException("Cannotstart secure cluster without " + "privilegedresources."); this.secureResources = resources; //获得本地主机名,如果没有设置slave.host.name则通过下面两个参数,通过网口和DNS来获得 if (conf.get("slave.host.name")!= null) { machineName =conf.get("slave.host.name"); } if (machineName == null) { //注意两个可配置参数,网口:ethX DNS服务器:1.1.1.1类似这种形式 machineName = DNS.getDefaultHost( conf.get("dfs.datanode.dns.interface","default"), conf.get("dfs.datanode.dns.nameserver","default")); } //获得配置文件中NN的地址,用于下面创建动态代理,因为在创建动态代理时要和NN通信 InetSocketAddress nameNodeAddr =NameNode.getServiceAddress(conf, true); //socket连接超时时间 this.socketTimeout = conf.getInt("dfs.socket.timeout", HdfsConstants.READ_TIMEOUT); //socket写超时时间 this.socketWriteTimeout =conf.getInt("dfs.datanode.socket.write.timeout", HdfsConstants.WRITE_TIMEOUT); //影响本datanode向客户端或其他datanode发送数据块的缓存分配尺寸,具体可见sendBlock函数 this.transferToAllowed =conf.getBoolean("dfs.datanode.transferTo.allowed", true); //写包的大小 this.writePacketSize =conf.getInt("dfs.write.packet.size", 64*1024); //创建注册体,用于DN向NN注册时,服务端的校验 InetSocketAddress socAddr =DataNode.getStreamingAddr(conf); int tmpPort = socAddr.getPort(); storage = new DataStorage(); // construct registration this.dnRegistration = newDatanodeRegistration(machineName + ":" + tmpPort); // 创建代理并通过握手获得NN的版本、ID信息 this.namenode = (DatanodeProtocol) RPC.waitForProxy(DatanodeProtocol.class, DatanodeProtocol.versionID, nameNodeAddr, conf); // get version and id info from thename-node NamespaceInfo nsInfo = handshake(); StartupOption startOpt =getStartupOption(conf); assert startOpt != null : "Startupoption must be set."; boolean simulatedFSDataset = conf.getBoolean("dfs.datanode.simulateddatastorage", false); if (simulatedFSDataset) { //因为我这里不是伪分布式,所以会走下面的逻辑 } else { // 这里才开始校验,也是我们重点关注的部分 
storage.recoverTransitionRead(nsInfo,dataDirs, startOpt); // adjust this.dnRegistration.setStorageInfo(storage); //initialize data node internal structure this.data = new FSDataset(storage, conf); } ................. }
void recoverTransitionRead(NamespaceInfo nsInfo, Collection<File> dataDirs, StartupOptionstartOpt ) throws IOException { assert FSConstants.LAYOUT_VERSION == nsInfo.getLayoutVersion() : "Data-node and name-node layout versions must be thesame."; // 1. For each data directory calculate its state and // check whether all is consistent before transitioning. // Format and recover. this.storageID = ""; this.storageDirs = new ArrayList<StorageDirectory>(dataDirs.size()); ArrayList<StorageState> dataDirStates = new ArrayList<StorageState>(dataDirs.size()); for(Iterator<File> it =dataDirs.iterator(); it.hasNext();) { File dataDir = it.next(); StorageDirectory sd = new StorageDirectory(dataDir); StorageState curState; try { //数据目录状态分析,是否存在,权限分析、是否需要升级 curState = sd.analyzeStorage(startOpt); // 根据检测后的状态分别执行不同操作,正常启动、格式化、恢复 switch(curState) { case NORMAL: break; case NON_EXISTENT: // 数据目录不存在,则直接忽略 LOG.info("Storage directory " + dataDir + " does not exist."); it.remove(); continue; case NOT_FORMATTED: // format LOG.info("Storage directory " + dataDir + " is not formatted."); LOG.info("Formatting ..."); //数据目录格式化 format(sd, nsInfo); break; default: // 从上一次升级或回滚的失败中恢复 sd.doRecover(curState); } } catch (IOException ioe) { sd.unlock(); throw ioe; } // add to the storage list addStorageDir(sd); dataDirStates.add(curState); } if (dataDirs.size() == 0) // none of the data dirs exist throw new IOException( "All specified directories are notaccessible or do not exist."); // 2.执行真正的升级或回滚操作 for(int idx = 0; idx < getNumStorageDirs(); idx++) { doTransition(getStorageDir(idx), nsInfo,startOpt); assert this.getLayoutVersion() == nsInfo.getLayoutVersion() : "Data-node and name-node layoutversions must be the same."; assert this.getCTime() == nsInfo.getCTime() : "Data-node and name-node CTimes mustbe the same."; } // 3. 更新所有目录的版本文件信息 this.writeAll(); }现在看如何分析数据目录的,这决定了后两步的操作
public StorageState analyzeStorage(StartupOptionstartOpt) throws IOException { assert root != null : "rootis null"; String rootPath = root.getCanonicalPath(); try { // 是否存在 if (!root.exists()) { // storage directory does not exist if (startOpt != StartupOption.FORMAT) { LOG.info("Storage directory " + rootPath + " does not exist."); return StorageState.NON_EXISTENT; } LOG.info(rootPath + " does not exist. Creating ..."); if (!root.mkdirs()) throw new IOException("Cannotcreate directory " + rootPath); } // 是否为一个目录 if (!root.isDirectory()) { LOG.info(rootPath + "is not a directory."); return StorageState.NON_EXISTENT; } //是否有些权限 if (!root.canWrite()) { LOG.info("Cannot access storage directory" + rootPath); return StorageState.NON_EXISTENT; } } catch(SecurityException ex) { LOG.info("Cannot access storage directory" + rootPath, ex); return StorageState.NON_EXISTENT; } this.lock(); // 对数据目录加锁,防止并发访问 if (startOpt == HdfsConstants.StartupOption.FORMAT) return StorageState.NOT_FORMATTED; if (startOpt != HdfsConstants.StartupOption.IMPORT) { //make sure no conversion is required checkConversionNeeded(this); } // 获得版本文件 File versionFile = getVersionFile(); boolean hasCurrent = versionFile.exists(); // 一系列的临时文件校验,如果这些临时目录存在,则说明这个存储是不正常的,下面会看到这些校验 boolean hasPrevious = getPreviousDir().exists(); boolean hasPreviousTmp = getPreviousTmp().exists(); boolean hasRemovedTmp = getRemovedTmp().exists(); boolean hasFinalizedTmp =getFinalizedTmp().exists(); boolean hasCheckpointTmp = getLastCheckpointTmp().exists(); if (!(hasPreviousTmp || hasRemovedTmp || hasFinalizedTmp ||hasCheckpointTmp)) { // no temp dirs - no recovery if (hasCurrent) return StorageState.NORMAL; if (hasPrevious) throw new InconsistentFSStateException(root, "version file in current directory ismissing."); return StorageState.NOT_FORMATTED; } if ((hasPreviousTmp?1:0) + (hasRemovedTmp?1:0) + (hasFinalizedTmp?1:0) +(hasCheckpointTmp?1:0) > 1) // more than one temp dirs throw new InconsistentFSStateException(root, 
"too many temporary directories."); // # of temp dirs == 1 should eitherrecover or complete a transition if (hasCheckpointTmp) { return hasCurrent ? StorageState.COMPLETE_CHECKPOINT : StorageState.RECOVER_CHECKPOINT; } if (hasFinalizedTmp) { if (hasPrevious) throw new InconsistentFSStateException(root, STORAGE_DIR_PREVIOUS + " and " + STORAGE_TMP_FINALIZED + "cannotexist together."); return StorageState.COMPLETE_FINALIZE; } if (hasPreviousTmp) { if (hasPrevious) throw new InconsistentFSStateException(root, STORAGE_DIR_PREVIOUS + " and " + STORAGE_TMP_PREVIOUS + "cannot exist together."); if (hasCurrent) return StorageState.COMPLETE_UPGRADE; return StorageState.RECOVER_UPGRADE; } assert hasRemovedTmp : "hasRemovedTmp must be true"; if (!(hasCurrent ^ hasPrevious)) throw new InconsistentFSStateException(root, "one and only one directory " + STORAGE_DIR_CURRENT + " or" + STORAGE_DIR_PREVIOUS + "must be present when " + STORAGE_TMP_REMOVED + "exists."); if (hasCurrent) return StorageState.COMPLETE_ROLLBACK; return StorageState.RECOVER_ROLLBACK; }在第二步中会做升级或回滚的操作,如果启动参数为NORMAL则直接返回
private void doTransition( StorageDirectory sd, NamespaceInfonsInfo, StartupOptionstartOpt ) throws IOException { //是否需要回滚 if (startOpt == StartupOption.ROLLBACK) doRollback(sd, nsInfo); // rollback if applicable //读取版本文件信息 sd.read(); //检测版本文件 checkVersionUpgradable(this.layoutVersion); assert this.layoutVersion >= FSConstants.LAYOUT_VERSION : "Future version is not allowed"; //namespaceid校验 if (getNamespaceID() !=nsInfo.getNamespaceID()) throw new IOException( "Incompatible namespaceIDs in " + sd.getRoot().getCanonicalPath() + ": namenode namespaceID = " + nsInfo.getNamespaceID() + "; datanode namespaceID = " + getNamespaceID()); //layout版本校验 if (this.layoutVersion == FSConstants.LAYOUT_VERSION && this.cTime == nsInfo.getCTime()) return; //regular startup // verify necessity of a distributed upgrade verifyDistributedUpgradeProgress(nsInfo); if (this.layoutVersion > FSConstants.LAYOUT_VERSION || this.cTime < nsInfo.getCTime()) { //执行升级操作 doUpgrade(sd, nsInfo); // upgrade return; } // layoutVersion == LAYOUT_VERSION && this.cTime> nsInfo.cTime // must shutdown throw new IOException("Datanodestate: LV = " + this.getLayoutVersion() + " CTime = " + this.getCTime() + " is newer than the namespace state:LV = " +nsInfo.getLayoutVersion() + " CTime = " + nsInfo.getCTime()); }至此数据目录的校验如果成功,则会继续执行,需要注意的是在升级和回滚阶段的操作还是比较复杂的,如果正常启动则比较简单。下一篇讲dn内部数据结构的初始化。