public synchronized List<Task> assignTasks(TaskTracker taskTracker)
    throws IOException {
  // Status of the TaskTracker that sent this heartbeat
  TaskTrackerStatus taskTrackerStatus = taskTracker.getStatus();
  // Cluster-wide status: live TaskTrackers, blacklist and graylist info, etc.
  ClusterStatus clusterStatus = taskTrackerManager.getClusterStatus();
  // Number of live TaskTrackers
  final int numTaskTrackers = clusterStatus.getTaskTrackers();
  // Cluster-wide map capacity (maximum number of concurrent map tasks)
  final int clusterMapCapacity = clusterStatus.getMaxMapTasks();
  // Same, but for reduces
  final int clusterReduceCapacity = clusterStatus.getMaxReduceTasks();

  // The job queue; as we saw during initialization in the previous post,
  // the job has already been added to it
  Collection<JobInProgress> jobQueue =
    jobQueueJobInProgressListener.getJobQueue();

  // Capacity of the current TaskTracker: how many maps and reduces it can
  // run at most, and how many it is running right now
  final int trackerMapCapacity = taskTrackerStatus.getMaxMapSlots();
  final int trackerReduceCapacity = taskTrackerStatus.getMaxReduceSlots();
  final int trackerRunningMaps = taskTrackerStatus.countMapTasks();
  final int trackerRunningReduces = taskTrackerStatus.countReduceTasks();

  // The list of tasks that will be returned to the TaskTracker
  List<Task> assignedTasks = new ArrayList<Task>();

  //
  // Compute (running + pending) map and reduce task numbers across pool
  //
  int remainingReduceLoad = 0;
  int remainingMapLoad = 0;
  synchronized (jobQueue) {
    for (JobInProgress job : jobQueue) { // walk every job in the queue
      if (job.getStatus().getRunState() == JobStatus.RUNNING) {
        // Remaining map load of this job
        remainingMapLoad += (job.desiredMaps() - job.finishedMaps());
        if (job.scheduleReduces()) { // should reduces be started yet?
          // Remaining reduce load of this job
          remainingReduceLoad +=
            (job.desiredReduces() - job.finishedReduces());
        }
      }
    }
  }

  // Map load factor (remaining map work / cluster map capacity)
  double mapLoadFactor = 0.0;
  if (clusterMapCapacity > 0) {
    mapLoadFactor = (double)remainingMapLoad / clusterMapCapacity;
  }
  // Reduce load factor
  double reduceLoadFactor = 0.0;
  if (clusterReduceCapacity > 0) {
    reduceLoadFactor = (double)remainingReduceLoad / clusterReduceCapacity;
  }

  //
  // In the below steps, we allocate first map tasks (if appropriate),
  // and then reduce tasks if appropriate.  We go through all jobs
  // in order of job arrival; jobs only get serviced if their
  // predecessors are serviced, too.
  //

  //
  // We assign tasks to the current taskTracker if the given machine
  // has a workload that's less than the maximum load of that kind of
  // task.
  // However, if the cluster is close to getting loaded i.e. we don't
  // have enough _padding_ for speculative executions etc., we only
  // schedule the "highest priority" task i.e. the task from the job
  // with the highest priority.
  //

  // Number of map slots this TaskTracker is allowed to use right now
  final int trackerCurrentMapCapacity =
    Math.min((int)Math.ceil(mapLoadFactor * trackerMapCapacity),
             trackerMapCapacity);
  // Map slots still available on this TaskTracker
  int availableMapSlots = trackerCurrentMapCapacity - trackerRunningMaps;
  boolean exceededMapPadding = false;
  if (availableMapSlots > 0) {
    // Check whether the reserved capacity (padding) has been used up
    exceededMapPadding =
      exceededPadding(true, clusterStatus, trackerMapCapacity);
  }

  int numLocalMaps = 0;
  int numNonLocalMaps = 0;
  scheduleMaps:
  for (int i = 0; i < availableMapSlots; ++i) { // one task per available slot
    synchronized (jobQueue) {
      for (JobInProgress job : jobQueue) { // only running jobs are served
        if (job.getStatus().getRunState() != JobStatus.RUNNING) {
          continue;
        }

        Task t = null;

        // Try to obtain a node-local or rack-local map task
        t = job.obtainNewNodeOrRackLocalMapTask(taskTrackerStatus,
            numTaskTrackers, taskTrackerManager.getNumberOfUniqueHosts());
        if (t != null) {
          assignedTasks.add(t); // add it to the returned list
          ++numLocalMaps;

          // Don't assign map tasks to the hilt!
          // Leave some free slots in the cluster for future task-failures,
          // speculative tasks etc. beyond the highest priority job
          if (exceededMapPadding) { // stop assigning once the padding is exceeded
            break scheduleMaps;
          }

          // Try all jobs again for the next Map task
          break;
        }

        // Try to obtain a non-local map task
        t = job.obtainNewNonLocalMapTask(taskTrackerStatus, numTaskTrackers,
            taskTrackerManager.getNumberOfUniqueHosts());
        if (t != null) {
          assignedTasks.add(t);
          ++numNonLocalMaps;

          // We assign at most 1 off-switch or speculative task
          // This is to prevent TaskTrackers from stealing local-tasks
          // from other TaskTrackers.
          break scheduleMaps;
        }
      }
    }
  }
  int assignedMaps = assignedTasks.size();

  // Reduce assignment; the logic largely mirrors the map case,
  // but at most one reduce task is handed out per heartbeat
  final int trackerCurrentReduceCapacity =
    Math.min((int)Math.ceil(reduceLoadFactor * trackerReduceCapacity),
             trackerReduceCapacity);
  final int availableReduceSlots =
    Math.min((trackerCurrentReduceCapacity - trackerRunningReduces), 1);
  boolean exceededReducePadding = false;
  if (availableReduceSlots > 0) {
    exceededReducePadding = exceededPadding(false, clusterStatus,
                                            trackerReduceCapacity);
    synchronized (jobQueue) {
      for (JobInProgress job : jobQueue) {
        if (job.getStatus().getRunState() != JobStatus.RUNNING ||
            job.numReduceTasks == 0) {
          continue;
        }

        Task t = job.obtainNewReduceTask(taskTrackerStatus, numTaskTrackers,
            taskTrackerManager.getNumberOfUniqueHosts());
        if (t != null) {
          assignedTasks.add(t);
          break;
        }

        // Don't assign reduce tasks to the hilt!
        // Leave some free slots in the cluster for future task-failures,
        // speculative tasks etc. beyond the highest priority job
        if (exceededReducePadding) {
          break;
        }
      }
    }
  }

  if (LOG.isDebugEnabled()) {
    LOG.debug("Task assignments for " + taskTrackerStatus.getTrackerName() +
        " --> " +
        "[" + mapLoadFactor + ", " + trackerMapCapacity + ", " +
        trackerCurrentMapCapacity + ", " + trackerRunningMaps + "] -> [" +
        (trackerCurrentMapCapacity - trackerRunningMaps) + ", " +
        assignedMaps + " (" + numLocalMaps + ", " + numNonLocalMaps +
        ")] [" + reduceLoadFactor + ", " + trackerReduceCapacity + ", " +
        trackerCurrentReduceCapacity + "," + trackerRunningReduces +
        "] -> [" + (trackerCurrentReduceCapacity - trackerRunningReduces) +
        ", " + (assignedTasks.size() - assignedMaps) + "]");
  }

  return assignedTasks;
}
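Before looking at the padding check, it helps to run the slot arithmetic with concrete numbers. The sketch below is standalone and the figures are made up for illustration, not taken from a real cluster: the load factor is the fraction of cluster capacity that the queued work actually needs, and the TaskTracker is only allowed to fill the same fraction of its own slots.

// Standalone illustration of the slot math used in assignTasks.
// All figures below are hypothetical.
public class SlotMathDemo {
    public static void main(String[] args) {
        int clusterMapCapacity = 100;   // total map slots in the cluster
        int remainingMapLoad   = 40;    // pending + running map work across all jobs
        int trackerMapCapacity = 4;     // map slots on this TaskTracker
        int trackerRunningMaps = 1;     // maps it is already running

        // Fraction of the cluster's map capacity that is actually needed
        double mapLoadFactor = clusterMapCapacity > 0
                ? (double) remainingMapLoad / clusterMapCapacity   // 0.4
                : 0.0;

        // The tracker may only fill the same fraction of its own slots,
        // rounded up, and never more than it physically has.
        int trackerCurrentMapCapacity =
                Math.min((int) Math.ceil(mapLoadFactor * trackerMapCapacity),
                         trackerMapCapacity);                      // ceil(1.6) = 2

        int availableMapSlots = trackerCurrentMapCapacity - trackerRunningMaps; // 1

        System.out.println("loadFactor=" + mapLoadFactor
                + " currentCapacity=" + trackerCurrentMapCapacity
                + " availableSlots=" + availableMapSlots);
    }
}

This throttling is why a lightly loaded cluster spreads work across many TaskTrackers instead of filling up whichever tracker happens to heartbeat first. Reduces go through the same computation, except that availableReduceSlots is additionally capped at 1 per heartbeat.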
The cluster reserves some capacity ("padding") for high-priority jobs and for speculative or retried tasks. Whether that reserve has been exceeded is computed as follows:

private boolean exceededPadding(boolean isMapTask,
                                ClusterStatus clusterStatus,
                                int maxTaskTrackerSlots) {
  // Number of live TaskTrackers
  int numTaskTrackers = clusterStatus.getTaskTrackers();
  // Tasks currently running in the cluster (maps or reduces)
  int totalTasks = (isMapTask) ? clusterStatus.getMapTasks() :
    clusterStatus.getReduceTasks();
  // Cluster-wide task capacity (maps or reduces)
  int totalTaskCapacity = isMapTask ? clusterStatus.getMaxMapTasks() :
    clusterStatus.getMaxReduceTasks();

  // The job queue
  Collection<JobInProgress> jobQueue =
    jobQueueJobInProgressListener.getJobQueue();

  // Walk all running jobs
  boolean exceededPadding = false;
  synchronized (jobQueue) {
    int totalNeededTasks = 0;
    for (JobInProgress job : jobQueue) {
      if (job.getStatus().getRunState() != JobStatus.RUNNING ||
          job.numReduceTasks == 0) {
        continue;
      }

      //
      // Beyond the highest-priority task, reserve a little
      // room for failures and speculative executions; don't
      // schedule tasks to the hilt.
      //
      totalNeededTasks +=
        isMapTask ? job.desiredMaps() : job.desiredReduces();
      int padding = 0;
      if (numTaskTrackers > MIN_CLUSTER_SIZE_FOR_PADDING) {
        // Reserve some slots for high-priority jobs and speculative execution
        padding = Math.min(maxTaskTrackerSlots,
                           (int) (totalNeededTasks * padFraction));
      }
      // If the running tasks plus the reserved slots reach the cluster's
      // capacity, report that the padding has been exceeded
      if (totalTasks + padding >= totalTaskCapacity) {
        exceededPadding = true;
        break;
      }
    }
  }

  return exceededPadding;
}
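Here is the same check run with made-up numbers. The constants mirror the scheduler's defaults as I read them (padFraction comes from mapred.jobtracker.taskalloc.capacitypad, 0.01 by default, and MIN_CLUSTER_SIZE_FOR_PADDING is 3); treat both values as assumptions rather than a verbatim quote of the source.

// Standalone illustration of the padding check; constants are assumed defaults.
public class PaddingDemo {
    static final int MIN_CLUSTER_SIZE_FOR_PADDING = 3;

    public static void main(String[] args) {
        double padFraction      = 0.01; // assumed default of mapred.jobtracker.taskalloc.capacitypad
        int numTaskTrackers     = 10;
        int maxTaskTrackerSlots = 4;    // map slots per TaskTracker
        int totalTaskCapacity   = 40;   // cluster-wide map slots
        int totalRunningTasks   = 38;   // maps currently running
        int totalNeededTasks    = 500;  // desiredMaps() summed over the queue so far

        int padding = 0;
        if (numTaskTrackers > MIN_CLUSTER_SIZE_FOR_PADDING) {
            // Reserve a slice of the needed work, capped at one tracker's slots
            padding = Math.min(maxTaskTrackerSlots,
                               (int) (totalNeededTasks * padFraction)); // min(4, 5) = 4
        }

        // 38 running + 4 reserved >= 40 slots -> stop handing out tasks freely
        boolean exceededPadding = totalRunningTasks + padding >= totalTaskCapacity;
        System.out.println("padding=" + padding + " exceeded=" + exceededPadding);
    }
}

Once exceededPadding turns true, the scheduleMaps loop above hands out at most one (node- or rack-local) map task and then breaks out entirely, so the reserved slots stay free for retries and speculative attempts of higher-priority work.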
Setup and cleanup tasks are not handled by the scheduler; the JobTracker assigns them itself. The relevant code is in the JobTracker's heartbeat method:
if (recoveryManager.shouldSchedule() && acceptNewTasks && !isBlacklisted) {
  TaskTrackerStatus taskTrackerStatus = getTaskTrackerStatus(trackerName);
  if (taskTrackerStatus == null) {
    LOG.warn("Unknown task tracker polling; ignoring: " + trackerName);
  } else {
    // First try to hand out a setup or cleanup task
    List<Task> tasks = getSetupAndCleanupTasks(taskTrackerStatus);
    if (tasks == null) {
      // Otherwise ask the task scheduler for tasks
      tasks = taskScheduler.assignTasks(taskTrackers.get(trackerName));
    }
    if (tasks != null) {
      for (Task task : tasks) {
        expireLaunchingTasks.addNewTask(task.getTaskID());
        if (LOG.isDebugEnabled()) {
          LOG.debug(trackerName + " -> LaunchTask: " + task.getTaskID());
        }
        actions.add(new LaunchTaskAction(task));
      }
    }
  }
}
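One last remark: the taskScheduler used here is pluggable. As I recall the 1.x JobTracker constructor, it is instantiated via reflection from the mapred.jobtracker.taskScheduler property, with JobQueueTaskScheduler (the FIFO scheduler analyzed above) as the default. The fragment below is a paraphrase from memory, not a verbatim quote, and lives inside the org.apache.hadoop.mapred package:

// Sketch of the JobTracker's scheduler wiring (assumed to mirror the 1.x constructor)
Class<? extends TaskScheduler> schedulerClass =
    conf.getClass("mapred.jobtracker.taskScheduler",
                  JobQueueTaskScheduler.class, TaskScheduler.class);
taskScheduler = (TaskScheduler) ReflectionUtils.newInstance(schedulerClass, conf);

Swapping in a different scheduler such as the FairScheduler therefore only changes what assignTasks returns; the setup/cleanup path above runs before any scheduler is consulted.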