聊聊PowerJob的ProcessorTracker

本文主要研究一下PowerJob的ProcessorTracker

ProcessorTracker

tech/powerjob/worker/core/tracker/processor/ProcessorTracker.java

@Slf4j
public class ProcessorTracker {

    /**
     * 记录创建时间
     */
    private long startTime;
    private WorkerRuntime workerRuntime;
    /**
     * 任务实例信息
     */
    private InstanceInfo instanceInfo;
    /**
     * 冗余 instanceId,方便日志
     */
    private Long instanceId;

    private ProcessorBean processorBean;
    /**
     * 在线日志
     */
    private OmsLogger omsLogger;
    /**
     * ProcessResult 上报失败的重试队列
     */
    private Queue statusReportRetryQueue;
    /**
     * 上一次空闲时间(用于闲置判定)
     */
    private long lastIdleTime;
    /**
     * 上次完成任务数量(用于闲置判定)
     */
    private long lastCompletedTaskCount;

    private String taskTrackerAddress;

    private ThreadPoolExecutor threadPool;

    private ScheduledExecutorService timingPool;

    private static final int THREAD_POOL_QUEUE_MAX_SIZE = 128;
    /**
     * 长时间空闲的 ProcessorTracker 会发起销毁请求
     */
    private static final long MAX_IDLE_TIME = 120000;
    /**
     * 当 ProcessorTracker 出现根本性错误(比如 Processor 创建失败,所有的任务直接失败)
     */
    private boolean lethal = false;

    private String lethalReason;

    /**
     * 创建 ProcessorTracker(其实就是创建了个执行用的线程池 T_T)
     */
    @SuppressWarnings("squid:S1181")
    public ProcessorTracker(TaskTrackerStartTaskReq request, WorkerRuntime workerRuntime) {
        try {
            // 赋值
            this.startTime = System.currentTimeMillis();
            this.workerRuntime = workerRuntime;
            this.instanceInfo = request.getInstanceInfo();
            this.instanceId = request.getInstanceInfo().getInstanceId();
            this.taskTrackerAddress = request.getTaskTrackerAddress();

            this.omsLogger = OmsLoggerFactory.build(instanceId, request.getLogConfig(), workerRuntime);
            this.statusReportRetryQueue = Queues.newLinkedBlockingQueue();
            this.lastIdleTime = -1L;
            this.lastCompletedTaskCount = 0L;

            // 初始化 线程池,TimingPool 启动的任务会检查 ThreadPool,所以必须先初始化线程池,否则NPE
            initThreadPool();
            // 初始化定时任务
            initTimingJob();
            // 初始化 Processor
            processorBean = workerRuntime.getProcessorLoader().load(new ProcessorDefinition().setProcessorType(instanceInfo.getProcessorType()).setProcessorInfo(instanceInfo.getProcessorInfo()));
            log.info("[ProcessorTracker-{}] ProcessorTracker was successfully created!", instanceId);
        } catch (Throwable t) {
            log.warn("[ProcessorTracker-{}] create ProcessorTracker failed, all tasks submitted here will fail.", instanceId, t);
            lethal = true;
            lethalReason = ExceptionUtils.getMessage(t);
        }
    }

    //......
}    

ProcessorTracker接收TaskTrackerStartTaskReq参数,然后初始化线程池、初始化定时任务、初始化processorBean,它提供了submitTask、destroy方法

initThreadPool

    /**
     * 初始化线程池
     */
    private void initThreadPool() {

        int poolSize = calThreadPoolSize();
        // 待执行队列,为了防止对内存造成较大压力,内存队列不能太大
        BlockingQueue queue = new ArrayBlockingQueue<>(THREAD_POOL_QUEUE_MAX_SIZE);
        // 自定义线程池中线程名称 (PowerJob Processor Pool -> PPP)
        ThreadFactory threadFactory = new ThreadFactoryBuilder().setNameFormat("PPP-%d").build();
        // 拒绝策略:直接抛出异常
        RejectedExecutionHandler rejectionHandler = new ThreadPoolExecutor.AbortPolicy();

        threadPool = new ThreadPoolExecutor(poolSize, poolSize, 60L, TimeUnit.SECONDS, queue, threadFactory, rejectionHandler);

        // 当没有任务执行时,允许销毁核心线程(即线程池最终存活线程个数可能为0)
        threadPool.allowCoreThreadTimeOut(true);
    }

    /**
     * 计算线程池大小
     */
    private int calThreadPoolSize() {
        ExecuteType executeType = ExecuteType.valueOf(instanceInfo.getExecuteType());
        ProcessorType processorType = ProcessorType.valueOf(instanceInfo.getProcessorType());

        // 脚本类自带线程池,不过为了少一点逻辑判断,还是象征性分配一个线程
        if (processorType == ProcessorType.PYTHON || processorType == ProcessorType.SHELL) {
            return 1;
        }

        if (executeType == ExecuteType.MAP_REDUCE || executeType == ExecuteType.MAP) {
            return instanceInfo.getThreadConcurrency();
        }
        if (TimeExpressionType.FREQUENT_TYPES.contains(instanceInfo.getTimeExpressionType())) {
            return instanceInfo.getThreadConcurrency();
        }
        return 2;
    }    

initThreadPool创建BlockingQueue大小为128的ThreadPoolExecutor

initTimingJob

    /**
     * 初始化定时任务
     */
    private void initTimingJob() {

        // PowerJob Processor TimingPool
        ThreadFactory threadFactory = new ThreadFactoryBuilder().setNameFormat("PPT-%d").build();
        timingPool = Executors.newSingleThreadScheduledExecutor(threadFactory);

        timingPool.scheduleAtFixedRate(new CheckerAndReporter(), 0, 10, TimeUnit.SECONDS);
    }

initTimingJob通过Executors.newSingleThreadScheduledExecutor创建ScheduledExecutorService,然后每隔10s调度CheckerAndReporter

ProcessorTrackerActor

tech/powerjob/worker/actors/ProcessorTrackerActor.java

@Slf4j
@Actor(path = RemoteConstant.WPT_PATH)
public class ProcessorTrackerActor {

    private final WorkerRuntime workerRuntime;

    public ProcessorTrackerActor(WorkerRuntime workerRuntime) {
        this.workerRuntime = workerRuntime;
    }

    /**
     * 处理来自TaskTracker的task执行请求
     * @param req 请求
     */
    @Handler(path = RemoteConstant.WPT_HANDLER_START_TASK, processType = ProcessType.NO_BLOCKING)
    public void onReceiveTaskTrackerStartTaskReq(TaskTrackerStartTaskReq req) {

        Long instanceId = req.getInstanceInfo().getInstanceId();

        // 创建 ProcessorTracker 一定能成功
        ProcessorTracker processorTracker = ProcessorTrackerManager.getProcessorTracker(
                instanceId,
                req.getTaskTrackerAddress(),
                () -> new ProcessorTracker(req, workerRuntime));

        TaskDO task = new TaskDO();

        task.setTaskId(req.getTaskId());
        task.setTaskName(req.getTaskName());
        task.setTaskContent(req.getTaskContent());
        task.setFailedCnt(req.getTaskCurrentRetryNums());
        task.setSubInstanceId(req.getSubInstanceId());

        processorTracker.submitTask(task);
    }

    /**
     * 处理来自TaskTracker停止任务的请求
     * @param req 请求
     */
    @Handler(path = RemoteConstant.WPT_HANDLER_STOP_INSTANCE)
    public void onReceiveTaskTrackerStopInstanceReq(TaskTrackerStopInstanceReq req) {

        Long instanceId = req.getInstanceId();
        List removedPts = ProcessorTrackerManager.removeProcessorTracker(instanceId);
        if (!CollectionUtils.isEmpty(removedPts)) {
            removedPts.forEach(ProcessorTracker::destroy);
        }
    }
}

ProcessorTrackerActor提供了onReceiveTaskTrackerStartTaskReq,用于处理startTask,这里会获取或者创建ProcessorTracker,然后执行processorTracker.submitTask

getSuitableWorkers

tech/powerjob/server/remote/worker/WorkerClusterQueryService.java

    public List getSuitableWorkers(JobInfoDO jobInfo) {

        List workers = Lists.newLinkedList(getWorkerInfosByAppId(jobInfo.getAppId()).values());

        workers.removeIf(workerInfo -> filterWorker(workerInfo, jobInfo));

        DispatchStrategy dispatchStrategy = DispatchStrategy.of(jobInfo.getDispatchStrategy());
        switch (dispatchStrategy) {
            case RANDOM:
                Collections.shuffle(workers);
                break;
            case HEALTH_FIRST:
                workers.sort((o1, o2) -> o2.getSystemMetrics().calculateScore() - o1.getSystemMetrics().calculateScore());
                break;
            default:
                // do nothing
        }

        // 限定集群大小(0代表不限制)
        if (!workers.isEmpty() && jobInfo.getMaxWorkerCount() > 0 && workers.size() > jobInfo.getMaxWorkerCount()) {
            workers = workers.subList(0, jobInfo.getMaxWorkerCount());
        }
        return workers;
    }

    private Map getWorkerInfosByAppId(Long appId) {
        ClusterStatusHolder clusterStatusHolder = getAppId2ClusterStatus().get(appId);
        if (clusterStatusHolder == null) {
            log.warn("[WorkerManagerService] can't find any worker for app(appId={}) yet.", appId);
            return Collections.emptyMap();
        }
        return clusterStatusHolder.getAllWorkers();
    }    

WorkerClusterQueryService提供了getSuitableWorkers方法,可以根据JobInfoDO信息来查找合适的worker;它首先根据appId来查找对应的clusterStatusHolder,再获取该集群所有worker信息;然后通过filterWorker方法移除(DesignatedWorkerFilter、DisconnectedWorkerFilter、SystemMetricsWorkerFilter)

小结

ProcessorTracker接收TaskTrackerStartTaskReq参数,然后初始化线程池、初始化定时任务、初始化processorBean,它提供了submitTask、destroy方法;ProcessorTrackerActor提供了onReceiveTaskTrackerStartTaskReq,用于处理startTask,这里会获取或者创建ProcessorTracker,然后执行processorTracker.submitTask;DispatchService的dispatch方法会调用getSuitableWorkers方法来确定ServerScheduleJobReq的allWorkerAddress,最后HeavyTaskTracker会根据ProcessorTrackerStatusHolder的getAvailableProcessorTrackers来进行任务派发。如果job的processBean在某些worker不存在的话,会报错PowerJobException: fetch Processor failed, please check your processorType and processorInfo config,然后ptReportTask的taskStatus为WORKER_PROCESS_FAILED。

你可能感兴趣的:(java)