7. Spark Source Code Analysis: org.apache.spark.deploy.SparkSubmit

We already covered the spark-submit submission flow when walking through launcher.Main; here is a quick recap before diving into the class itself.

When you package a jar and submit it to a cluster, you normally pass a few arguments. For example, submitting the SparkPi example from a local machine:

spark-submit \
--class org.apache.spark.examples.SparkPi \
--master spark://192.168.2.1:7077 \
D:\spark\spark-2.4.3\examples\target\original-spark-examples_2.11-2.4.3.jar

The spark-submit script runs first. It simply delegates to the spark-class script, passing the SparkSubmit class name plus all user arguments along as spark-class arguments:

exec "${SPARK_HOME}"/bin/spark-class org.apache.spark.deploy.SparkSubmit "$@"

spark-class then executes the shell command below, entering org.apache.spark.launcher.Main with the same arguments (the SparkSubmit class name and the user arguments):

# java -Xmx128m -cp ...jars org.apache.spark.launcher.Main "$@"
"$RUNNER" -Xmx128m -cp "$LAUNCH_CLASSPATH" org.apache.spark.launcher.Main "$@"

launcher.Main parses and filters the arguments, builds the final launch command, and returns it to the spark-class script, which finally runs exec "${CMD[@]}" to actually invoke the SparkSubmit class. Let's now look at the SparkSubmit class in detail.

First, the doc comment at the top of the file.

/**
 * Whether to submit, kill, or request the status of an application.
 * The latter two operations are currently supported only for standalone and Mesos cluster modes.
 * i.e. this class submits an application, kills it, or requests its status; kill and status
 * requests are currently only supported in standalone and Mesos cluster modes.
 */

// Extends Enumeration and defines four actions; PRINT_VERSION is the extra one beyond submit/kill/status
private[deploy] object SparkSubmitAction extends Enumeration {
  type SparkSubmitAction = Value
  val SUBMIT, KILL, REQUEST_STATUS, PRINT_VERSION = Value
}
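
To make the dispatch concrete: a Value from an Enumeration like this is later pattern-matched in doSubmit(). Below is a minimal, self-contained sketch (toy code, not from the Spark source) of that pattern:

object ActionDemo {
  object Action extends Enumeration {
    type Action = Value
    val SUBMIT, KILL, REQUEST_STATUS, PRINT_VERSION = Value
  }

  // Dispatch on the enumeration value, exactly the shape doSubmit() uses for appArgs.action
  def dispatch(action: Action.Action): String = action match {
    case Action.SUBMIT         => "submit"
    case Action.KILL           => "kill"
    case Action.REQUEST_STATUS => "request status"
    case Action.PRINT_VERSION  => "print version"
  }

  def main(args: Array[String]): Unit = {
    println(dispatch(Action.SUBMIT)) // prints "submit"
  }
}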

As usual, we start with object SparkSubmit.

object SparkSubmit extends CommandLineUtils with Logging {

  // Cluster managers
  // Cluster managers supported by Spark: YARN, standalone, Mesos, local, Kubernetes
  private val YARN = 1
  private val STANDALONE = 2
  private val MESOS = 4
  private val LOCAL = 8
  private val KUBERNETES = 16
  private val ALL_CLUSTER_MGRS = YARN | STANDALONE | MESOS | LOCAL | KUBERNETES

  // Deploy modes
  // The two deploy modes, client and cluster (not specific to YARN)
  private val CLIENT = 1
  private val CLUSTER = 2
  private val ALL_DEPLOY_MODES = CLIENT | CLUSTER

  // Special primary resource names that represent shells rather than application jars.
  // Constants for the various shells
  private val SPARK_SHELL = "spark-shell"
  private val PYSPARK_SHELL = "pyspark-shell"
  private val SPARKR_SHELL = "sparkr-shell"
  private val SPARKR_PACKAGE_ARCHIVE = "sparkr.zip"
  private val R_PACKAGE_ARCHIVE = "rpkg.zip"

  // Exit code used when the main class cannot be found
  private val CLASS_NOT_FOUND_EXIT_STATUS = 101

  // Following constants are visible for testing.
  // i.e. the wrapper/submit classes used for the various cluster deploy modes
  private[deploy] val YARN_CLUSTER_SUBMIT_CLASS =
    "org.apache.spark.deploy.yarn.YarnClusterApplication"
  private[deploy] val REST_CLUSTER_SUBMIT_CLASS = classOf[RestSubmissionClientApp].getName()
  private[deploy] val STANDALONE_CLUSTER_SUBMIT_CLASS = classOf[ClientApp].getName()
  private[deploy] val KUBERNETES_CLUSTER_SUBMIT_CLASS =
    "org.apache.spark.deploy.k8s.submit.KubernetesClientApplication"

  override def main(args: Array[String]): Unit = {
    // First create an anonymous subclass instance of class SparkSubmit
    val submit = new SparkSubmit() {
      self =>
      
      // Override class SparkSubmit's argument-parsing method
      override protected def parseArguments(args: Array[String]): SparkSubmitArguments = {
        new SparkSubmitArguments(args) {
          override protected def logInfo(msg: => String): Unit = self.logInfo(msg)

          override protected def logWarning(msg: => String): Unit = self.logWarning(msg)
        }
      }

      // Info logging goes to printMessage
      override protected def logInfo(msg: => String): Unit = printMessage(msg)

      // Warning logging goes to printMessage with a "Warning:" prefix
      override protected def logWarning(msg: => String): Unit = printMessage(s"Warning: $msg")

      // Override doSubmit to catch SparkUserAppException and exit with its code
      override def doSubmit(args: Array[String]): Unit = {
        try {
          // This calls into class SparkSubmit's doSubmit()
          super.doSubmit(args)
        } catch {
          case e: SparkUserAppException =>
            exitFn(e.exitCode)
        }
      }

    }

    // Invoke doSubmit() on the SparkSubmit instance created above
    submit.doSubmit(args)
  }

  /**
   * Return whether the given primary resource represents a user jar.
   */
  private[deploy] def isUserJar(res: String): Boolean = {
    !isShell(res) && !isPython(res) && !isInternal(res) && !isR(res)
  }

  /**
   * Return whether the given primary resource represents a shell.
   */
  private[deploy] def isShell(res: String): Boolean = {
    (res == SPARK_SHELL || res == PYSPARK_SHELL || res == SPARKR_SHELL)
  }

  /**
   * Return whether the given main class represents a sql shell.
   */
  private[deploy] def isSqlShell(mainClass: String): Boolean = {
    mainClass == "org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver"
  }

  /**
   * Return whether the given main class represents a thrift server.
   */
  private def isThriftServer(mainClass: String): Boolean = {
    mainClass == "org.apache.spark.sql.hive.thriftserver.HiveThriftServer2"
  }

  /**
   * Return whether the given primary resource requires running python.
   */
  private[deploy] def isPython(res: String): Boolean = {
    res != null && res.endsWith(".py") || res == PYSPARK_SHELL
  }

  /**
   * Return whether the given primary resource requires running R.
   */
  private[deploy] def isR(res: String): Boolean = {
    res != null && res.endsWith(".R") || res == SPARKR_SHELL
  }

  private[deploy] def isInternal(res: String): Boolean = {
    res == SparkLauncher.NO_RESOURCE
  }

}
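
The cluster-manager and deploy-mode constants above are powers of two so they can be combined with | and tested as bit masks, which is exactly how prepareSubmitEnvironment() later checks options with (clusterManager & opt.clusterManager) != 0. A small standalone sketch of that check (toy values, not Spark code):

object BitmaskDemo {
  // Same encoding as in SparkSubmit: each cluster manager gets its own bit
  val YARN = 1; val STANDALONE = 2; val MESOS = 4; val LOCAL = 8; val KUBERNETES = 16
  val ALL_CLUSTER_MGRS = YARN | STANDALONE | MESOS | LOCAL | KUBERNETES

  // A hypothetical option that only applies to standalone or Mesos deployments
  val optionMask = STANDALONE | MESOS

  // True when the chosen manager's bit is present in the option's mask
  def applies(clusterManager: Int, mask: Int): Boolean = (clusterManager & mask) != 0

  def main(args: Array[String]): Unit = {
    println(applies(STANDALONE, optionMask))  // true
    println(applies(YARN, optionMask))        // false
    println(applies(YARN, ALL_CLUSTER_MGRS))  // true
  }
}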

class SparkSubmit

As seen above, object SparkSubmit's main() creates an instance of class SparkSubmit; that class looks like this.

private[spark] class SparkSubmit extends Logging {

  import DependencyUtils._
  import SparkSubmit._

  // Entry method that performs the submission
  def doSubmit(args: Array[String]): Unit = {
    // Initialize logging if it hasn't been done yet. Keep track of whether logging needs to
    // be reset before the application starts.
    // Initialize the logging system if needed, and record whether it must be reset before the application starts
    val uninitLog = initializeLogIfNecessary(true, silent = true)

    // Call parseArguments() to parse the submitted arguments and the Spark properties file
    val appArgs = parseArguments(args)

    // In verbose mode, log the parsed arguments
    if (appArgs.verbose) {
      logInfo(appArgs.toString)
    }

    // Match the requested action: submit, kill, request status, or print version.
    // The action was wrapped into SparkSubmitAction during parsing; if none was given,
    // SparkSubmitArguments defaults it to SparkSubmitAction.SUBMIT.
    // A submission therefore enters submit() below.
    appArgs.action match {
      case SparkSubmitAction.SUBMIT => submit(appArgs, uninitLog)
      case SparkSubmitAction.KILL => kill(appArgs)
      case SparkSubmitAction.REQUEST_STATUS => requestStatus(appArgs)
      case SparkSubmitAction.PRINT_VERSION => printVersion()
    }
  }

 /**
   * Argument-parsing method.
   * On this code path we actually hit the parseArguments() overridden in object SparkSubmit,
   * which just builds a SparkSubmitArguments(args) instance.
   * SparkSubmitArguments extends the abstract class SparkSubmitArgumentsParser,
   * which in turn extends SparkSubmitOptionParser, the same parent class that OptionParser
   * in launcher.Main extends to parse arguments.
   * SparkSubmitArguments declares the parameters used by every run mode, so this step
   * parses all of the submit options plus the default Spark configuration.
   */
  protected def parseArguments(args: Array[String]): SparkSubmitArguments = {
    new SparkSubmitArguments(args)
  }

  /**
   * Kill an existing submission using the REST protocol. Standalone and Mesos cluster mode only.
   */
  // Kill a submission; only meaningful for standalone and Mesos cluster modes
  private def kill(args: SparkSubmitArguments): Unit = {
    new RestSubmissionClient(args.master)
      .killSubmission(args.submissionToKill)
  }

  /**
   * Request the status of an existing submission using the REST protocol.
   * Standalone and Mesos cluster mode only.
   */
  // Request submission status; only meaningful for standalone and Mesos cluster modes
  private def requestStatus(args: SparkSubmitArguments): Unit = {
    new RestSubmissionClient(args.master)
      .requestSubmissionStatus(args.submissionToRequestStatusFor)
  }

  /** Print version information to the log. */
  // Print version information
  private def printVersion(): Unit = {
    logInfo("""Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version %s
      /_/
                        """.format(SPARK_VERSION))
    logInfo("Using Scala %s, %s, %s".format(
      Properties.versionString, Properties.javaVmName, Properties.javaVersion))
    logInfo(s"Branch $SPARK_BRANCH")
    logInfo(s"Compiled by user $SPARK_BUILD_USER on $SPARK_BUILD_DATE")
    logInfo(s"Revision $SPARK_REVISION")
    logInfo(s"Url $SPARK_REPO_URL")
    logInfo("Type --help for more information.")
  }
}
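
One small Scala detail worth noting: the logInfo/logWarning overrides shown earlier are declared as logInfo(msg: => String), a by-name parameter, so the message expression is only evaluated if the logging method actually uses it. A self-contained illustration (toy code, not from Spark):

object ByNameDemo {
  var verbose = false

  // `msg` is passed by name: the argument expression is not evaluated until `msg` is referenced
  def logInfo(msg: => String): Unit = {
    if (verbose) println(msg)
  }

  def expensiveMessage(): String = {
    println("building message...") // side effect makes the evaluation visible
    "parsed arguments: ..."
  }

  def main(args: Array[String]): Unit = {
    logInfo(expensiveMessage()) // verbose is false: "building message..." is never printed
    verbose = true
    logInfo(expensiveMessage()) // now both lines are printed
  }
}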

 submit()

When the action matches SUBMIT we enter submit(), which first prepares the launch environment, then calls doRunMain(), which in turn calls runMain().

  /**
   * Submit the application using the provided parameters.
   *
   * This runs in two steps. First, we prepare the launch environment by setting up
   * the appropriate classpath, system properties, and application arguments for
   * running the child main class based on the cluster manager and the deploy mode.
   * Second, we use this launch environment to invoke the main method of the child
   * main class.
   */
 /**
   * submit() is entered when the action matches SUBMIT.
   * As the doc comment above says, it runs in two steps:
   * first, prepare the launch environment (classpath, system properties, and application
   * arguments for the child main class) according to the cluster manager and deploy mode;
   * second, use that environment to invoke the main method of the child main class.
   * submit() may call itself again (the REST-to-legacy fallback below calls submit(args, false)),
   * so it is annotated with @tailrec: the recursive call must be in tail position and the
   * compiler turns it into a loop.
   */
  @tailrec
  private def submit(args: SparkSubmitArguments, uninitLog: Boolean): Unit = {

 /** First prepare the launch environment from the parsed arguments.
   * Inside prepareSubmitEnvironment() a SecurityManager is created lazily:
   * lazy val secMgr = new SecurityManager(sparkConf)
   * prepareSubmitEnvironment() is fairly long, so it is analyzed at the end of this article.
   */
    val (childArgs, childClasspath, sparkConf, childMainClass) = prepareSubmitEnvironment(args)

    // doRunMain() wraps the call to runMain() that launches the child main() for the prepared
    // environment. It is only defined here; execution continues below and calls it once the
    // submission gateway has been chosen.
    def doRunMain(): Unit = {
      // --proxy-user may be given at submit time; if it is, run as a proxy of the current user
      if (args.proxyUser != null) {
        val proxyUser = UserGroupInformation.createProxyUser(args.proxyUser,
          UserGroupInformation.getCurrentUser())
        try {
          proxyUser.doAs(new PrivilegedExceptionAction[Unit]() {
            // The actual execution: runMain()
            override def run(): Unit = {
              runMain(childArgs, childClasspath, sparkConf, childMainClass, args.verbose)
            }
          })
        } catch {
          case e: Exception =>
            // Hadoop's AuthorizationException suppresses the exception's stack trace, which
            // makes the message printed to the output by the JVM not very helpful. Instead,
            // detect exceptions with empty stack traces here, and treat them differently.
            // Hadoop's AuthorizationException carries no stack trace, so an empty stack trace is used to detect it here
            if (e.getStackTrace().length == 0) {
              error(s"ERROR: ${e.getClass().getName()}: ${e.getMessage()}")
            } else {
              throw e
            }
        }
        // If no proxy user was specified, call runMain() directly
      } else {
        runMain(childArgs, childClasspath, sparkConf, childMainClass, args.verbose)
      }
    }

    // Let the main class re-initialize the logging system once it starts.
    // Uninitialize logging here so the main class can re-initialize it once it starts
    if (uninitLog) {
      Logging.uninitialize()
    }

    // In standalone cluster mode, there are two submission gateways:
    //   (1) The traditional RPC gateway using o.a.s.deploy.Client as a wrapper
    //   (2) The new REST-based gateway introduced in Spark 1.3
    // The latter is the default behavior as of Spark 1.3, but Spark submit will fail over
    // to use the legacy gateway if the master endpoint turns out to be not a REST server.
    // Standalone cluster mode has two submission gateways:
    // the legacy RPC gateway wrapped by o.a.s.deploy.Client, and the REST-based gateway added in Spark 1.3.
    // REST is the default since 1.3, but spark-submit falls back to the RPC gateway
    // if the master endpoint turns out not to be a REST server.
    // Here we check for standalone cluster mode with REST enabled.
    if (args.isStandaloneCluster && args.useRest) {

      // Try the REST gateway first: log a message and call doRunMain()
      try {
        logInfo("Running Spark using the REST application submission protocol.")
        doRunMain()
      } catch {

        // Fail over to use the legacy submission gateway
        // On a REST connection failure, warn, disable REST, and resubmit through the legacy gateway
        case e: SubmitRestConnectionException =>
          logWarning(s"Master endpoint ${args.master} was not a REST server. " +
            "Falling back to legacy submission gateway instead.")
          args.useRest = false
          submit(args, false)
      }

    // In all other modes, just run the main class as prepared
    // i.e. in every other mode, call doRunMain() above with the prepared environment
    } else {
      doRunMain()
    }
  }
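
The --proxy-user branch above relies on Hadoop's UserGroupInformation API. Below is a minimal sketch of that pattern in isolation, assuming hadoop-common is on the classpath; the object name and helper method are illustrative, not Spark code, and against a secured cluster the call only succeeds if impersonation is configured for the real user:

import java.security.PrivilegedExceptionAction
import org.apache.hadoop.security.UserGroupInformation

object ProxyUserDemo {
  // Run `body` as `proxy`, impersonated on behalf of the current user
  def runAs(proxy: String)(body: => Unit): Unit = {
    val proxyUser = UserGroupInformation.createProxyUser(proxy, UserGroupInformation.getCurrentUser())
    proxyUser.doAs(new PrivilegedExceptionAction[Unit]() {
      override def run(): Unit = body
    })
  }

  def main(args: Array[String]): Unit = {
    runAs("alice") {
      println(s"running as: ${UserGroupInformation.getCurrentUser().getShortUserName}")
    }
  }
}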

prepareSubmitEnvironment() 

Let's see what the environment preparation actually does. The code is long, but it boils down to assembling the configuration for each run mode.

  /**
   * Prepare the environment for submitting an application.
   *
   * @param args the parsed SparkSubmitArguments used for environment preparation.
   * @param conf the Hadoop Configuration, this argument will only be set in unit test.
   * @return a 4-tuple:
   *        (1) the arguments for the child process,
   *        (2) a list of classpath entries for the child,
   *        (3) a map of system properties, and
   *        (4) the main class for the child
   *
   * Exposed for testing.
   */
  /**
   * SparkSubmit's environment-preparation method.
   * args: the arguments parsed into SparkSubmitArguments
   * conf: a Hadoop Configuration, only supplied in unit tests
   * Returns a tuple: (child arguments, child classpath, Spark configuration, child main class)
   */
  private[deploy] def prepareSubmitEnvironment(
      args: SparkSubmitArguments,
      conf: Option[HadoopConfiguration] = None)
      : (Seq[String], Seq[String], SparkConf, String) = {
    // Return values
    val childArgs = new ArrayBuffer[String]()
    val childClasspath = new ArrayBuffer[String]()
    val sparkConf = new SparkConf()
    var childMainClass = ""

    // Set the cluster manager
    // Determine the cluster manager from --master:
    // local, yarn, yarn-client, yarn-cluster, spark://192.168.2.1:7077 (standalone),
    // mesos://..., k8s://..., etc.
    val clusterManager: Int = args.master match {
      case "yarn" => YARN
      // Since Spark 2.0, --master yarn with --deploy-mode client/cluster is the recommended form
      case "yarn-client" | "yarn-cluster" =>
        logWarning(s"Master ${args.master} is deprecated since 2.0." +
          " Please use master \"yarn\" with specified deploy mode instead.")
        YARN
      case m if m.startsWith("spark") => STANDALONE
      case m if m.startsWith("mesos") => MESOS
      case m if m.startsWith("k8s") => KUBERNETES
      case m if m.startsWith("local") => LOCAL
      // Anything else is an error
      case _ =>
        error("Master must either be yarn or start with spark, mesos, k8s, or local")
        -1
    }

    // Set the deploy mode; default is client mode
    // Determine --deploy-mode; the default is client mode.
    // Submitting with --master yarn and no --deploy-mode therefore runs in yarn-client mode;
    // only an explicit --deploy-mode cluster gives yarn-cluster mode.
    var deployMode: Int = args.deployMode match {
      case "client" | null => CLIENT
      case "cluster" => CLUSTER
      case _ =>
        error("Deploy mode must be either client or cluster")
        -1
    }

    // Because the deprecated way of specifying "yarn-cluster" and "yarn-client" encapsulate both
    // the master and deploy mode, we have some logic to infer the master and deploy mode
    // from each other if only one is specified, or exit early if they are at odds.
    // Reconcile the deprecated yarn-client/yarn-cluster masters with an explicit --deploy-mode
    if (clusterManager == YARN) {
      (args.master, args.deployMode) match {
        case ("yarn-cluster", null) =>
          deployMode = CLUSTER
          args.master = "yarn"
        // --master yarn-cluster combined with --deploy-mode client is a conflict and is an error
        case ("yarn-cluster", "client") =>
          error("Client deploy mode is not compatible with master \"yarn-cluster\"")
        case ("yarn-client", "cluster") =>
          error("Cluster deploy mode is not compatible with master \"yarn-client\"")
        // Any other combination (yarn-client with client, yarn-cluster with cluster, or plain
        // yarn) is normalized to the recommended post-2.0 form, --master yarn
        case (_, mode) =>
          args.master = "yarn"
      }

      // Make sure YARN is included in our build if we're trying to use it
      // Make sure the YARN classes are on the classpath (unless running tests)
      if (!Utils.classIsLoadable(YARN_CLUSTER_SUBMIT_CLASS) && !Utils.isTesting) {
        error(
          "Could not load YARN classes. " +
          "This copy of Spark may not have been compiled with YARN support.")
      }
    }

    // For Kubernetes, validate the k8s master URL and make sure the K8s classes are available (unless running tests)
    if (clusterManager == KUBERNETES) {
      args.master = Utils.checkAndGetK8sMasterUrl(args.master)
      // Make sure KUBERNETES is included in our build if we're trying to use it
      if (!Utils.classIsLoadable(KUBERNETES_CLUSTER_SUBMIT_CLASS) && !Utils.isTesting) {
        error(
          "Could not load KUBERNETES classes. " +
            "This copy of Spark may not have been compiled with KUBERNETES support.")
      }
    }

    // Fail fast, the following modes are not supported or applicable
    // Fail fast on unsupported (cluster manager, deploy mode) combinations
    (clusterManager, deployMode) match {
      // Python is not yet supported in standalone cluster mode
      case (STANDALONE, CLUSTER) if args.isPython =>
        error("Cluster deploy mode is currently not supported for python " +
          "applications on standalone clusters.")
      // R is not yet supported in standalone cluster mode
      case (STANDALONE, CLUSTER) if args.isR =>
        error("Cluster deploy mode is currently not supported for R " +
          "applications on standalone clusters.")
      // local cannot be combined with cluster deploy mode
      case (LOCAL, CLUSTER) =>
        error("Cluster deploy mode is not compatible with master \"local\"")
      // nor can the shells, the SQL CLI, or the thrift server below
      case (_, CLUSTER) if isShell(args.primaryResource) =>
        error("Cluster deploy mode is not applicable to Spark shells.")
      case (_, CLUSTER) if isSqlShell(args.mainClass) =>
        error("Cluster deploy mode is not applicable to Spark SQL shell.")
      case (_, CLUSTER) if isThriftServer(args.mainClass) =>
        error("Cluster deploy mode is not applicable to Spark Thrift server.")
      case _ =>
    }

    // Update args.deployMode if it is null. It will be passed down as a Spark property later.
    // i.e. backfill args.deployMode from the resolved deployMode (a null args.deployMode was already treated as client above)
    (args.deployMode, deployMode) match {
      case (null, CLIENT) => args.deployMode = "client"
      case (null, CLUSTER) => args.deployMode = "cluster"
      case _ =>
    }

    // Convenience flags for the (cluster manager, deploy mode) combinations
    val isYarnCluster = clusterManager == YARN && deployMode == CLUSTER
    val isMesosCluster = clusterManager == MESOS && deployMode == CLUSTER
    val isStandAloneCluster = clusterManager == STANDALONE && deployMode == CLUSTER
    val isKubernetesCluster = clusterManager == KUBERNETES && deployMode == CLUSTER
    val isMesosClient = clusterManager == MESOS && deployMode == CLIENT

    // Resolve extra dependencies, except in Mesos or standalone cluster mode where dependency
    // resolution happens on the driver side
    if (!isMesosCluster && !isStandAloneCluster) {
      // Resolve maven dependencies if there are any and add classpath to jars. Add them to py-files
      // too for packages that include Python code
      val resolvedMavenCoordinates = DependencyUtils.resolveMavenDependencies(
        args.packagesExclusions, args.packages, args.repositories, args.ivyRepoPath,
        args.ivySettingsPath)

      if (!StringUtils.isBlank(resolvedMavenCoordinates)) {
        args.jars = mergeFileLists(args.jars, resolvedMavenCoordinates)
        if (args.isPython || isInternal(args.primaryResource)) {
          args.pyFiles = mergeFileLists(args.pyFiles, resolvedMavenCoordinates)
        }
      }

      // install any R packages that may have been passed through --jars or --packages.
      // Spark Packages may contain R source code inside the jar.
      if (args.isR && !StringUtils.isBlank(args.jars)) {
        RPackageUtils.checkAndBuildRPackage(args.jars, printStream, args.verbose)
      }
    }

    // Merge args.sparkProperties into sparkConf
    args.sparkProperties.foreach { case (k, v) => sparkConf.set(k, v) }
    // Use the supplied Hadoop conf, or build one from the Spark conf if none was given
    val hadoopConf = conf.getOrElse(SparkHadoopUtil.newConfiguration(sparkConf))
    // Temporary working directory
    val targetDir = Utils.createTempDir()

    // assure a keytab is available from any place in a JVM
    // i.e. make keytab/principal settings visible from anywhere in this JVM
    if (clusterManager == YARN || clusterManager == LOCAL || isMesosClient) {
      // If a principal is given together with a keytab, the keytab file must exist
      if (args.principal != null) {
        if (args.keytab != null) {
          require(new File(args.keytab).exists(), s"Keytab file: ${args.keytab} does not exist")
          // Add keytab and principal configurations in sysProps to make them available
          // for later use; e.g. in spark sql, the isolated class loader used to talk
          // to HiveMetastore will use these settings. They will be set as Java system
          // properties and then loaded by SparkConf
          // Store the keytab and principal in sparkConf so they are available later; e.g. the
          // isolated class loader that talks to the Hive metastore reads them as Java system
          // properties loaded through SparkConf
          sparkConf.set(KEYTAB, args.keytab)
          sparkConf.set(PRINCIPAL, args.principal)
          UserGroupInformation.loginUserFromKeytab(args.principal, args.keytab)
        }
      }
    }

    // Resolve glob path for different resources.
    // i.e. expand glob patterns in the jar, file, pyFile, and archive paths using hadoopConf
    args.jars = Option(args.jars).map(resolveGlobPaths(_, hadoopConf)).orNull
    args.files = Option(args.files).map(resolveGlobPaths(_, hadoopConf)).orNull
    args.pyFiles = Option(args.pyFiles).map(resolveGlobPaths(_, hadoopConf)).orNull
    args.archives = Option(args.archives).map(resolveGlobPaths(_, hadoopConf)).orNull

    // SecurityManager used lazily by the download helpers below
    lazy val secMgr = new SecurityManager(sparkConf)

    // In client mode, download remote files.
    // i.e. fetch the primary resource, jars, and pyFiles to the local disk first
    var localPrimaryResource: String = null
    var localJars: String = null
    var localPyFiles: String = null
    if (deployMode == CLIENT) {
      localPrimaryResource = Option(args.primaryResource).map {
        downloadFile(_, targetDir, sparkConf, hadoopConf, secMgr)
      }.orNull
      localJars = Option(args.jars).map {
        downloadFileList(_, targetDir, sparkConf, hadoopConf, secMgr)
      }.orNull
      localPyFiles = Option(args.pyFiles).map {
        downloadFileList(_, targetDir, sparkConf, hadoopConf, secMgr)
      }.orNull
    }

    // Downloading resources in YARN mode
    // When running in YARN, for some remote resources with scheme:
    //   1. Hadoop FileSystem doesn't support them.
    //   2. We explicitly bypass Hadoop FileSystem with "spark.yarn.dist.forceDownloadSchemes".
    // We will download them to local disk prior to add to YARN's distributed cache.
    // For yarn client mode, since we already download them with above code, so we only need to
    // figure out the local path and replace the remote one.
    // i.e. resources whose scheme the Hadoop FileSystem cannot handle, or that are listed in
    // "spark.yarn.dist.forceDownloadSchemes", are downloaded to local disk before being added
    // to YARN's distributed cache; in yarn-client mode the files were already downloaded above,
    // so only the local paths need to replace the remote ones
    if (clusterManager == YARN) {
      // Schemes for which a forced local download is configured
      val forceDownloadSchemes = sparkConf.get(FORCE_DOWNLOAD_SCHEMES)

      // Decide whether a given scheme requires a local download
      def shouldDownload(scheme: String): Boolean = {
        forceDownloadSchemes.contains("*") || forceDownloadSchemes.contains(scheme) ||
          Try { FileSystem.getFileSystemClass(scheme, hadoopConf) }.isFailure
      }

      // Download one resource if required and return its (possibly local) URI
      def downloadResource(resource: String): String = {
        val uri = Utils.resolveURI(resource)
        uri.getScheme match {
          case "local" | "file" => resource
          case e if shouldDownload(e) =>
            val file = new File(targetDir, new Path(uri).getName)
            if (file.exists()) {
              file.toURI.toString
            } else {
              downloadFile(resource, targetDir, sparkConf, hadoopConf, secMgr)
            }
          case _ => uri.toString
        }
      }

      // Download the primary resource
      args.primaryResource = Option(args.primaryResource).map { downloadResource }.orNull
      // Download files and pyFiles
      args.files = Option(args.files).map { files =>
        Utils.stringToSeq(files).map(downloadResource).mkString(",")
      }.orNull
      args.pyFiles = Option(args.pyFiles).map { pyFiles =>
        Utils.stringToSeq(pyFiles).map(downloadResource).mkString(",")
      }.orNull
      // Download jars
      args.jars = Option(args.jars).map { jars =>
        Utils.stringToSeq(jars).map(downloadResource).mkString(",")
      }.orNull
      // Download archives
      args.archives = Option(args.archives).map { archives =>
        Utils.stringToSeq(archives).map(downloadResource).mkString(",")
      }.orNull
    }

    // If we're running a python app, set the main class to our specific python runner
    // i.e. in client mode the main class becomes the dedicated Python runner
    if (args.isPython && deployMode == CLIENT) {

      // Main class for pyspark-shell
      if (args.primaryResource == PYSPARK_SHELL) {
        args.mainClass = "org.apache.spark.api.python.PythonGatewayServer"
      } else {

        // If a python file is provided, add it to the child arguments and list of files to deploy.
        // Usage: PythonAppRunner <main python file> <extra python files> [app arguments]
        // Outside pyspark-shell, use PythonRunner as the main class and pass the python files
        // downloaded locally above (client mode) as child arguments
        args.mainClass = "org.apache.spark.deploy.PythonRunner"
        args.childArgs = ArrayBuffer(localPrimaryResource, localPyFiles) ++ args.childArgs
      }

      // Outside YARN, merge the python files into the file list
      if (clusterManager != YARN) {
        // The YARN backend handles python files differently, so don't merge the lists.
        args.files = mergeFileLists(args.files, args.pyFiles)
      }
    }

    // Record the locally downloaded python files in sparkConf
    if (localPyFiles != null) {
      sparkConf.set("spark.submit.pyFiles", localPyFiles)
    }

    // SparkR setup
    // In YARN mode for an R app, add the SparkR package archive and the R package
    // archive containing all of the built R libraries to archives so that they can
    // be distributed with the job
    if (args.isR && clusterManager == YARN) {
      // Locate the local SparkR package; fail if SPARK_HOME does not provide it
      val sparkRPackagePath = RUtils.localSparkRPackagePath
      if (sparkRPackagePath.isEmpty) {
        error("SPARK_HOME does not exist for R application in YARN mode.")
      }
      // The SparkR archive itself must exist
      val sparkRPackageFile = new File(sparkRPackagePath.get, SPARKR_PACKAGE_ARCHIVE)
      if (!sparkRPackageFile.exists()) {
        error(s"$SPARKR_PACKAGE_ARCHIVE does not exist for R application in YARN mode.")
      }
      // URI of the SparkR archive
      val sparkRPackageURI = Utils.resolveURI(sparkRPackageFile.getAbsolutePath).toString

      // Distribute the SparkR package.
      // Assigns a symbol link name "sparkr" to the shipped package.
      args.archives = mergeFileLists(args.archives, sparkRPackageURI + "#sparkr")

      // Distribute the R package archive containing all the built R packages.
      // Check that the archive exists before adding it
      if (!RUtils.rPackages.isEmpty) {
        val rPackageFile =
          RPackageUtils.zipRLibraries(new File(RUtils.rPackages.get), R_PACKAGE_ARCHIVE)
        if (!rPackageFile.exists()) {
          error("Failed to zip all the built R packages.")
        }
        val rPackageURI = Utils.resolveURI(rPackageFile.getAbsolutePath).toString

        // Assigns a symbol link name "rpkg" to the shipped package.
        args.archives = mergeFileLists(args.archives, rPackageURI + "#rpkg")
      }
    }

    // TODO: Support distributing R packages with standalone cluster
    if (args.isR && clusterManager == STANDALONE && !RUtils.rPackages.isEmpty) {
      error("Distributing R packages with standalone cluster is not supported.")
    }

    // TODO: Support distributing R packages with mesos cluster
    if (args.isR && clusterManager == MESOS && !RUtils.rPackages.isEmpty) {
      error("Distributing R packages with mesos cluster is not supported.")
    }

    // If we're running an R app, set the main class to our specific R runner
    // Client mode: main class for sparkr-shell
    if (args.isR && deployMode == CLIENT) {
      if (args.primaryResource == SPARKR_SHELL) {
        args.mainClass = "org.apache.spark.api.r.RBackend"
      } else {
        // If an R file is provided, add it to the child arguments and list of files to deploy.
        // Usage: RRunner <main R file> [app arguments]
        // Outside sparkr-shell, use RRunner as the main class, pass the locally downloaded R
        // file as a child argument, and add it to the file list
        args.mainClass = "org.apache.spark.deploy.RRunner"
        args.childArgs = ArrayBuffer(localPrimaryResource) ++ args.childArgs
        args.files = mergeFileLists(args.files, args.primaryResource)
      }
    }

    // yarn-cluster mode: merge the primary resource into the file list so it is distributed with the job
    if (isYarnCluster && args.isR) {
      // In yarn-cluster mode for an R app, add primary resource to files
      // that can be distributed with the job
      args.files = mergeFileLists(args.files, args.primaryResource)
    }

    // Special flag to avoid deprecation warnings at the client
    sys.props("SPARK_SUBMIT") = "true"

    // A list of rules to map each argument to system properties or command-line options in
    // each deploy mode; we iterate through these below
    // OptionAssigner has no methods; each entry just records which value goes to which conf key
    // or command-line option for which cluster managers and deploy modes
    val options = List[OptionAssigner](

      // All cluster managers
      OptionAssigner(args.master, ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES, confKey = "spark.master"),
      OptionAssigner(args.deployMode, ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES,
        confKey = "spark.submit.deployMode"),
      OptionAssigner(args.name, ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES, confKey = "spark.app.name"),
      OptionAssigner(args.ivyRepoPath, ALL_CLUSTER_MGRS, CLIENT, confKey = "spark.jars.ivy"),
      OptionAssigner(args.driverMemory, ALL_CLUSTER_MGRS, CLIENT,
        confKey = "spark.driver.memory"),
      OptionAssigner(args.driverExtraClassPath, ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES,
        confKey = "spark.driver.extraClassPath"),
      OptionAssigner(args.driverExtraJavaOptions, ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES,
        confKey = "spark.driver.extraJavaOptions"),
      OptionAssigner(args.driverExtraLibraryPath, ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES,
        confKey = "spark.driver.extraLibraryPath"),

      // Propagate attributes for dependency resolution at the driver side
      OptionAssigner(args.packages, STANDALONE | MESOS, CLUSTER, confKey = "spark.jars.packages"),
      OptionAssigner(args.repositories, STANDALONE | MESOS, CLUSTER,
        confKey = "spark.jars.repositories"),
      OptionAssigner(args.ivyRepoPath, STANDALONE | MESOS, CLUSTER, confKey = "spark.jars.ivy"),
      OptionAssigner(args.packagesExclusions, STANDALONE | MESOS, CLUSTER,
        confKey = "spark.jars.excludes"),

      // Yarn only
      // queue, executor instances, distributed jars/files/archives, principal and keytab
      OptionAssigner(args.queue, YARN, ALL_DEPLOY_MODES, confKey = "spark.yarn.queue"),
      OptionAssigner(args.numExecutors, YARN, ALL_DEPLOY_MODES,
        confKey = "spark.executor.instances"),
      OptionAssigner(args.pyFiles, YARN, ALL_DEPLOY_MODES, confKey = "spark.yarn.dist.pyFiles"),
      OptionAssigner(args.jars, YARN, ALL_DEPLOY_MODES, confKey = "spark.yarn.dist.jars"),
      OptionAssigner(args.files, YARN, ALL_DEPLOY_MODES, confKey = "spark.yarn.dist.files"),
      OptionAssigner(args.archives, YARN, ALL_DEPLOY_MODES, confKey = "spark.yarn.dist.archives"),
      OptionAssigner(args.principal, YARN, ALL_DEPLOY_MODES, confKey = "spark.yarn.principal"),
      OptionAssigner(args.keytab, YARN, ALL_DEPLOY_MODES, confKey = "spark.yarn.keytab"),

      // Other options
      // executor cores
      OptionAssigner(args.executorCores, STANDALONE | YARN | KUBERNETES, ALL_DEPLOY_MODES,
        confKey = "spark.executor.cores"),
      // executor memory
      OptionAssigner(args.executorMemory, STANDALONE | MESOS | YARN | KUBERNETES,
        ALL_DEPLOY_MODES, confKey = "spark.executor.memory"),
      // no spark.cores.max for YARN: the total there is governed by YARN's own resource limits
      OptionAssigner(args.totalExecutorCores, STANDALONE | MESOS | KUBERNETES, ALL_DEPLOY_MODES,
        confKey = "spark.cores.max"),
      OptionAssigner(args.files, LOCAL | STANDALONE | MESOS | KUBERNETES, ALL_DEPLOY_MODES,
        confKey = "spark.files"),
      // jars for local mode; in client mode they were downloaded above
      OptionAssigner(args.jars, LOCAL, CLIENT, confKey = "spark.jars"),
      OptionAssigner(args.jars, STANDALONE | MESOS | KUBERNETES, ALL_DEPLOY_MODES,
        confKey = "spark.jars"),
      // driver memory
      OptionAssigner(args.driverMemory, STANDALONE | MESOS | YARN | KUBERNETES, CLUSTER,
        confKey = "spark.driver.memory"),
      // driver cores
      OptionAssigner(args.driverCores, STANDALONE | MESOS | YARN | KUBERNETES, CLUSTER,
        confKey = "spark.driver.cores"),
      OptionAssigner(args.supervise.toString, STANDALONE | MESOS, CLUSTER,
        confKey = "spark.driver.supervise"),
      OptionAssigner(args.ivyRepoPath, STANDALONE, CLUSTER, confKey = "spark.jars.ivy"),

      // An internal option used only for spark-shell to add user jars to repl's classloader,
      // previously it uses "spark.jars" or "spark.yarn.dist.jars" which now may be pointed to
      // remote jars, so adding a new option to only specify local jars for spark-shell internally.
      OptionAssigner(localJars, ALL_CLUSTER_MGRS, CLIENT, confKey = "spark.repl.local.jars")
    )

    // In client mode, launch the application main class directly
    // In addition, add the main application jar and any added jars (if any) to the classpath
    if (deployMode == CLIENT) {
      childMainClass = args.mainClass
      if (localPrimaryResource != null && isUserJar(localPrimaryResource)) {
        childClasspath += localPrimaryResource
      }
      // localJars were downloaded locally above (client mode); ++= appends them all to the child classpath
      if (localJars != null) { childClasspath ++= localJars.split(",") }
    }

    // Add the main application jar and any added jars to classpath in case YARN client
    // requires these jars.
    // This assumes both primaryResource and user jars are local jars, or already downloaded
    // to local by configuring "spark.yarn.dist.forceDownloadSchemes", otherwise it will not be
    // added to the classpath of YARN client.
    if (isYarnCluster) {
      if (isUserJar(args.primaryResource)) {
        childClasspath += args.primaryResource
      }
      if (args.jars != null) { childClasspath ++= args.jars.split(",") }
    }

    // Client mode: pass the user's application arguments straight through
    if (deployMode == CLIENT) {
      if (args.childArgs != null) { childArgs ++= args.childArgs }
    }

    // Map all arguments to command-line options or system properties for our chosen mode
    for (opt <- options) {
      // The value must be set, and the option's deploy-mode and cluster-manager bit masks
      // must match the chosen mode
      if (opt.value != null &&
          (deployMode & opt.deployMode) != 0 &&
          (clusterManager & opt.clusterManager) != 0) {
        if (opt.clOption != null) { childArgs += (opt.clOption, opt.value) }
        if (opt.confKey != null) { sparkConf.set(opt.confKey, opt.value) }
      }
    }

    // In case of shells, spark.ui.showConsoleProgress can be true by default or by user.
    if (isShell(args.primaryResource) && !sparkConf.contains(UI_SHOW_CONSOLE_PROGRESS)) {
      sparkConf.set(UI_SHOW_CONSOLE_PROGRESS, true)
    }

    // Add the application jar automatically so the user doesn't have to call sc.addJar
    // For YARN cluster mode, the jar is already distributed on each node as "app.jar"
    // For python and R files, the primary resource is already distributed as a regular file
    if (!isYarnCluster && !args.isPython && !args.isR) {
      var jars = sparkConf.getOption("spark.jars").map(x => x.split(",").toSeq).getOrElse(Seq.empty)
      if (isUserJar(args.primaryResource)) {
        jars = jars ++ Seq(args.primaryResource)
      }
      sparkConf.set("spark.jars", jars.mkString(","))
    }

    // In standalone cluster mode, use the REST client to submit the application (Spark 1.3+).
    // All Spark parameters are expected to be passed to the client through system properties.
    if (args.isStandaloneCluster) {
      if (args.useRest) {
        childMainClass = REST_CLUSTER_SUBMIT_CLASS
        childArgs += (args.primaryResource, args.mainClass)
      } else {
        // In legacy standalone cluster mode, use Client as a wrapper around the user class
        childMainClass = STANDALONE_CLUSTER_SUBMIT_CLASS
        if (args.supervise) { childArgs += "--supervise" }
        Option(args.driverMemory).foreach { m => childArgs += ("--memory", m) }
        Option(args.driverCores).foreach { c => childArgs += ("--cores", c) }
        childArgs += "launch"
        childArgs += (args.master, args.primaryResource, args.mainClass)
      }
      if (args.childArgs != null) {
        childArgs ++= args.childArgs
      }
    }

    // Let YARN know it's a pyspark app, so it distributes needed libraries.
    if (clusterManager == YARN) {
      if (args.isPython) {
        sparkConf.set("spark.yarn.isPython", "true")
      }
    }

    // Mesos-specific conf when Hadoop security is enabled
    if (clusterManager == MESOS && UserGroupInformation.isSecurityEnabled) {
      setRMPrincipal(sparkConf)
    }

    // In yarn-cluster mode, use yarn.Client as a wrapper around the user class
    if (isYarnCluster) {
      childMainClass = YARN_CLUSTER_SUBMIT_CLASS
      // pyspark: --primary-py-file points at the primary resource, --class at PythonRunner
      if (args.isPython) {
        childArgs += ("--primary-py-file", args.primaryResource)
        childArgs += ("--class", "org.apache.spark.deploy.PythonRunner")
      // sparkR: --primary-r-file and --class
      } else if (args.isR) {
        val mainFile = new Path(args.primaryResource).getName
        childArgs += ("--primary-r-file", mainFile)
        childArgs += ("--class", "org.apache.spark.deploy.RRunner")
      // everything else (a normal JVM application)
      } else {
        // SparkLauncher.NO_RESOURCE == "spark-internal"
        // If the primary resource is not a Spark-internal one, pass it with --jar
        if (args.primaryResource != SparkLauncher.NO_RESOURCE) {
          childArgs += ("--jar", args.primaryResource)
        }
        // The user's main class is passed with --class
        childArgs += ("--class", args.mainClass)
      }
      // Forward all remaining application arguments as --arg entries
      if (args.childArgs != null) {
        args.childArgs.foreach { arg => childArgs += ("--arg", arg) }
      }
    }

    // Mesos and Kubernetes cluster modes work much the same way
    if (isMesosCluster) {
      assert(args.useRest, "Mesos cluster mode is only supported through the REST submission API")
      childMainClass = REST_CLUSTER_SUBMIT_CLASS
      if (args.isPython) {
        // Second argument is main class
        childArgs += (args.primaryResource, "")
        if (args.pyFiles != null) {
          sparkConf.set("spark.submit.pyFiles", args.pyFiles)
        }
      } else if (args.isR) {
        // Second argument is main class
        childArgs += (args.primaryResource, "")
      } else {
        childArgs += (args.primaryResource, args.mainClass)
      }
      if (args.childArgs != null) {
        childArgs ++= args.childArgs
      }
    }

    if (isKubernetesCluster) {
      childMainClass = KUBERNETES_CLUSTER_SUBMIT_CLASS
      if (args.primaryResource != SparkLauncher.NO_RESOURCE) {
        if (args.isPython) {
          childArgs ++= Array("--primary-py-file", args.primaryResource)
          childArgs ++= Array("--main-class", "org.apache.spark.deploy.PythonRunner")
          if (args.pyFiles != null) {
            childArgs ++= Array("--other-py-files", args.pyFiles)
          }
        } else if (args.isR) {
          childArgs ++= Array("--primary-r-file", args.primaryResource)
          childArgs ++= Array("--main-class", "org.apache.spark.deploy.RRunner")
        } else {
          childArgs ++= Array("--primary-java-resource", args.primaryResource)
          childArgs ++= Array("--main-class", args.mainClass)
        }
      } else {
        childArgs ++= Array("--main-class", args.mainClass)
      }
      if (args.childArgs != null) {
        args.childArgs.foreach { arg => childArgs += ("--arg", arg) }
      }
    }

    // Load any properties specified through --conf and the default properties file
    for ((k, v) <- args.sparkProperties) {
      sparkConf.setIfMissing(k, v)
    }

    // Ignore invalid spark.driver.host in cluster modes: the driver runs on a worker there
    if (deployMode == CLUSTER) {
      sparkConf.remove("spark.driver.host")
    }

    // Resolve paths in certain spark properties
    val pathConfigs = Seq(
      "spark.jars",
      "spark.files",
      "spark.yarn.dist.files",
      "spark.yarn.dist.archives",
      "spark.yarn.dist.jars")
    pathConfigs.foreach { config =>
      // Replace old URIs with resolved URIs, if they exist
      sparkConf.getOption(config).foreach { oldValue =>
        sparkConf.set(config, Utils.resolveURIs(oldValue))
      }
    }

    // Resolve and format python file paths properly before adding them to the PYTHONPATH.
    // The resolving part is redundant in the case of --py-files, but necessary if the user
    // explicitly sets `spark.submit.pyFiles` in his/her default properties file.
    sparkConf.getOption("spark.submit.pyFiles").foreach { pyFiles =>
      val resolvedPyFiles = Utils.resolveURIs(pyFiles)
      val formattedPyFiles = if (!isYarnCluster && !isMesosCluster) {
        PythonRunner.formatPaths(resolvedPyFiles).mkString(",")
      } else {
        // Ignoring formatting python path in yarn and mesos cluster mode, these two modes
        // support dealing with remote python files, they could distribute and add python files
        // locally.
        resolvedPyFiles
      }
      // Write the resolved paths back into sparkConf
      sparkConf.set("spark.submit.pyFiles", formattedPyFiles)
    }

    // The tuple finally returned by prepareSubmitEnvironment():
    // (child main-class arguments, child classpath, sparkConf, child main class)
    (childArgs, childClasspath, sparkConf, childMainClass)
  }
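
The OptionAssigner list is the core of this method: each entry says "if this argument is set, and the chosen cluster manager and deploy mode match these bit masks, write the value to this conf key and/or this command-line option". The stripped-down sketch below (a toy case class and toy values, not Spark's private OptionAssigner) shows just that filtering step:

object OptionAssignerDemo {
  val YARN = 1; val STANDALONE = 2; val CLIENT = 1; val CLUSTER = 2
  val ALL_DEPLOY_MODES = CLIENT | CLUSTER

  // Toy stand-in for Spark's OptionAssigner: no behavior, just fields
  case class Assigner(value: String, clusterManager: Int, deployMode: Int, confKey: String)

  def main(args: Array[String]): Unit = {
    val options = List(
      Assigner("2g", STANDALONE | YARN, CLUSTER, "spark.driver.memory"),
      Assigner("default", YARN, ALL_DEPLOY_MODES, "spark.yarn.queue"))

    val (clusterManager, deployMode) = (YARN, CLUSTER)
    val sparkConf = scala.collection.mutable.Map[String, String]()

    // Same filtering logic as the for-loop in prepareSubmitEnvironment()
    for (opt <- options) {
      if (opt.value != null &&
          (deployMode & opt.deployMode) != 0 &&
          (clusterManager & opt.clusterManager) != 0) {
        sparkConf(opt.confKey) = opt.value
      }
    }
    println(sparkConf) // for yarn-cluster both keys are set
  }
}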

runMain()

Once the environment is prepared, control returns to submit() in class SparkSubmit, which calls doRunMain() and eventually runMain(). This is the method that actually runs the child class's main() with the provided launch environment; it also lives in class SparkSubmit.

  /**
   * Run the main method of the child class using the provided launch environment.
   *
   * Note that this main class will not be the one provided by the user if we're
   * running cluster deploy mode or python applications.
   */
  // Run the child class's main() using the provided launch environment. In cluster deploy mode
  // or for Python apps, this main class is not the one the user supplied with --class.
  private def runMain(
      // Parameters: the child arguments, child classpath, sparkConf, the child main class name, and the verbose flag
      childArgs: Seq[String],
      childClasspath: Seq[String],
      sparkConf: SparkConf,
      childMainClass: String,
      verbose: Boolean): Unit = {
    if (verbose) {
      // In verbose mode, log the launch details
      logInfo(s"Main class:\n$childMainClass")
      logInfo(s"Arguments:\n${childArgs.mkString("\n")}")
      // sysProps may contain sensitive information, so redact before printing
      // sparkConf may contain sensitive values, so redact them before printing
      logInfo(s"Spark config:\n${Utils.redact(sparkConf.getAll.toMap).mkString("\n")}")
      logInfo(s"Classpath elements:\n${childClasspath.mkString("\n")}")
      logInfo("\n")
    }

    // Create the class loader
    val loader =
      // If spark.driver.userClassPathFirst is set, user classes take precedence via ChildFirstURLClassLoader
      if (sparkConf.get(DRIVER_USER_CLASS_PATH_FIRST)) {
        new ChildFirstURLClassLoader(new Array[URL](0),
          Thread.currentThread.getContextClassLoader)
      } else {
        // Otherwise use MutableURLClassLoader
        new MutableURLClassLoader(new Array[URL](0),
          Thread.currentThread.getContextClassLoader)
      }

    // Make the chosen class loader the context class loader for this thread
    Thread.currentThread.setContextClassLoader(loader)

    // Add the jars on the child classpath to the loader
    for (jar <- childClasspath) {
      addJarToClasspath(jar, loader)
    }
    
    // Variable that will hold the resolved main class
    var mainClass: Class[_] = null

    try {
      // Load the child main class by reflection
      mainClass = Utils.classForName(childMainClass)
    } catch {
      // If the class is not found and its name mentions the Hive thrift server, hint that Spark must be built with -Phive
      case e: ClassNotFoundException =>
        logWarning(s"Failed to load $childMainClass.", e)
        if (childMainClass.contains("thriftserver")) {
          logInfo(s"Failed to load main class $childMainClass.")
          logInfo("You need to build Spark with -Phive and -Phive-thriftserver.")
        }
        throw new SparkUserAppException(CLASS_NOT_FOUND_EXIT_STATUS)
      case e: NoClassDefFoundError =>
        logWarning(s"Failed to load $childMainClass: ${e.getMessage()}")
        if (e.getMessage.contains("org/apache/hadoop/hive")) {
          logInfo(s"Failed to load hive class.")
          logInfo("You need to build Spark with -Phive and -Phive-thriftserver.")
        }
        throw new SparkUserAppException(CLASS_NOT_FOUND_EXIT_STATUS)
    }

    // If the loaded mainClass implements the SparkApplication trait, instantiate it directly;
    // its start() method is what gets invoked below
    val app: SparkApplication = if (classOf[SparkApplication].isAssignableFrom(mainClass)) {
      mainClass.newInstance().asInstanceOf[SparkApplication]
    } else {
      // SPARK-4170
      if (classOf[scala.App].isAssignableFrom(mainClass)) {
        logWarning("Subclasses of scala.App may not work correctly. Use a main() method instead.")
      }
      // Otherwise wrap the plain main class in a JavaMainApplication
      new JavaMainApplication(mainClass)
    }

    @tailrec
    def findCause(t: Throwable): Throwable = t match {
      case e: UndeclaredThrowableException =>
        if (e.getCause() != null) findCause(e.getCause()) else e
      case e: InvocationTargetException =>
        if (e.getCause() != null) findCause(e.getCause()) else e
      case e: Throwable =>
        e
    }

    try {

      // Start the application
      app.start(childArgs.toArray, sparkConf)
    } catch {
      case t: Throwable =>
        throw findCause(t)
    }
  }
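
The app-construction step above boils down to a reflection check: if the loaded class implements SparkApplication, instantiate it; otherwise wrap it. A self-contained sketch of that pattern with a toy trait (illustrative names, not Spark code):

object AppLoadingDemo {
  trait App { def start(args: Array[String]): Unit }

  class DirectApp extends App {
    override def start(args: Array[String]): Unit = println("started as App")
  }

  class PlainMain // would normally expose a static main(); simplified here

  def instantiate(mainClass: Class[_]): App = {
    if (classOf[App].isAssignableFrom(mainClass)) {
      // The class implements our entry-point trait: build it directly
      mainClass.newInstance().asInstanceOf[App]
    } else {
      // Fall back to a wrapper, like JavaMainApplication does for plain main() classes
      new App {
        override def start(args: Array[String]): Unit =
          println(s"wrapping ${mainClass.getName} and invoking its main() reflectively")
      }
    }
  }

  def main(args: Array[String]): Unit = {
    instantiate(classOf[DirectApp]).start(args)  // "started as App"
    instantiate(classOf[PlainMain]).start(args)  // wrapper path
  }
}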

SparkApplication

/**
 * Entry point for a Spark application. Implementations must provide a no-argument constructor.
 */

// Entry-point trait for a Spark application; implementations must provide a no-argument constructor
private[spark] trait SparkApplication {

  def start(args: Array[String], conf: SparkConf): Unit

}


/**
 * Implementation of SparkApplication that wraps a standard Java class with a "main" method.
 * Configuration is propagated to the application via system properties, so running multiple
 * of these in the same JVM may lead to undefined behavior due to configuration leaks.
 * i.e. a SparkApplication implementation that wraps a standard Java class exposing a main
 * method. Configuration is propagated via system properties, so running several of these in
 * the same JVM may leak configuration between them and lead to undefined behavior.
 */
    
private[deploy] class JavaMainApplication(klass: Class[_]) extends SparkApplication {

  // Implement start()
  override def start(args: Array[String], conf: SparkConf): Unit = {
    // Look up the main method via reflection; it must be static
    val mainMethod = klass.getMethod("main", new Array[String](0).getClass)
    if (!Modifier.isStatic(mainMethod.getModifiers)) {
      throw new IllegalStateException("The main method in the given main class must be static")
    }

    // Copy the configuration into Java system properties
    val sysProps = conf.getAll.toMap
    sysProps.foreach { case (k, v) =>
      sys.props(k) = v
    }

    // Invoke the main() of the class given with --class, actually starting the user application
    mainMethod.invoke(null, args)
  }

}
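
For comparison, a class can also implement the trait directly, in which case runMain() instantiates it without the JavaMainApplication wrapper. A hypothetical minimal implementation is sketched below; since SparkApplication is private[spark], real implementations of this kind live inside the Spark code base itself (YarnClusterApplication being the relevant one here), so the package line and class name are purely illustrative:

package org.apache.spark.deploy  // only because SparkApplication is private[spark]

import org.apache.spark.SparkConf

// Hypothetical example: an implementation of SparkApplication receives the fully prepared
// SparkConf directly in start(), instead of reading system properties the way a
// JavaMainApplication-wrapped class does.
class MyApplication extends SparkApplication {
  override def start(args: Array[String], conf: SparkConf): Unit = {
    println(s"app name: ${conf.get("spark.app.name", "unset")}, args: ${args.mkString(" ")}")
  }
}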

In local mode, the walkthrough ends here: the user's main() is now running.

In yarn-cluster mode, however, a different instance is created: the class launched by start() is YarnClusterApplication, which also implements SparkApplication. That path will be covered in a later article.
