GitHub code link: GitHub repository
This post uses Spring Boot to build a REST API that remotely submits Spark jobs, saving database table data to HDFS. The Spark job itself lives in a separate project, decoupling it from the Spring Boot application.
pom.xml (key sections):
<properties>
    <java.version>1.8</java.version>
    <spark.version>2.3.3</spark.version>
    <scala.version>2.11</scala.version>
</properties>
<dependencies>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter</artifactId>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.46</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-launcher_${scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.49</version>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-test</artifactId>
        <scope>test</scope>
    </dependency>
</dependencies>
<build>
    <finalName>spark</finalName>
    <plugins>
        <plugin>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-maven-plugin</artifactId>
            <configuration>
                <mainClass>com.hrong.springbootspark.SpringbootSparkApplication</mainClass>
            </configuration>
            <executions>
                <execution>
                    <goals>
                        <goal>repackage</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.springframework.beans.factory.annotation.Value;
import java.util.Map;
/**
 * @Author hrong
 **/
@Data
@NoArgsConstructor
@AllArgsConstructor
public class SparkApplicationParam {
    /**
     * Main class of the Spark job
     */
    private String mainClass;
    /**
     * Path to the job jar
     */
    private String jarPath;
    @Value("${spark.master:yarn}")
    private String master;
    @Value("${spark.deploy.mode:cluster}")
    private String deployMode;
    @Value("${spark.driver.memory:1g}")
    private String driverMemory;
    @Value("${spark.executor.memory:1g}")
    private String executorMemory;
    @Value("${spark.executor.cores:1}")
    private String executorCores;
    /**
     * Other configuration: extra conf entries passed to the Spark job
     */
    private Map<String, String> otherConfParams;
    /**
     * Returns the Spark submission parameters configured on this object.
     * @return SparkApplicationParam
     */
    public SparkApplicationParam getSparkApplicationParam() {
        return new SparkApplicationParam(mainClass, jarPath, master, deployMode, driverMemory, executorMemory, executorCores, otherConfParams);
    }
}
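One caveat: Spring resolves @Value placeholders only on beans it manages, so when this object arrives through a @RequestBody the master/deployMode/memory fields must be supplied in the JSON payload rather than by the annotation defaults. For a Spring-managed instance, the referenced keys can be overridden in application.properties, for example (values are illustrative):

# Hypothetical overrides for the keys referenced by the @Value annotations above
spark.master=yarn
spark.deploy.mode=cluster
spark.driver.memory=2g
spark.executor.memory=2g
spark.executor.cores=2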
Every job must specify these runtime parameters when it is submitted, so each job's Vo extends SparkApplicationParam:
import com.hrong.springbootspark.entity.SparkApplicationParam;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * @Author hrong
 **/
@Data
@NoArgsConstructor
@AllArgsConstructor
public class DataBaseExtractorVo extends SparkApplicationParam {
    /**
     * JDBC connection URL
     */
    private String url;
    /**
     * Database user name
     */
    private String userName;
    /**
     * Database password
     */
    private String password;
    /**
     * Name of the source table
     */
    private String table;
    /**
     * Target file format (e.g. parquet, json)
     */
    private String targetFileType;
    /**
     * Target path on HDFS
     */
    private String targetFilePath;
}
Every Spark job needs runtime parameters, but the job-specific parameters differ from task to task. The first argument is therefore the common parameter object and the second is a varargs array, populated differently for each job:
import com.hrong.springbootspark.entity.SparkApplicationParam;
import java.io.IOException;
/**
 * @Author hrong
 * @description Service for submitting Spark jobs
 **/
public interface ISparkSubmitService {
    /**
     * Entry point for submitting a Spark job.
     * @param sparkAppParams parameters required to run the Spark job
     * @param otherParams job-specific parameters
     * @return submission result
     * @throws IOException io error
     * @throws InterruptedException interrupted while waiting for the job to finish
     */
    String submitApplication(SparkApplicationParam sparkAppParams, String... otherParams) throws IOException, InterruptedException;
}
import com.alibaba.fastjson.JSONObject;
import com.hrong.springbootspark.entity.SparkApplicationParam;
import com.hrong.springbootspark.service.ISparkSubmitService;
import com.hrong.springbootspark.util.HttpUtil;
import org.apache.spark.launcher.SparkAppHandle;
import org.apache.spark.launcher.SparkLauncher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import java.io.IOException;
import java.util.Arrays;
import java.util.Map;
import java.util.concurrent.CountDownLatch;
/**
 * @Author hrong
 **/
@Service
public class SparkSubmitServiceImpl implements ISparkSubmitService {
    private static Logger log = LoggerFactory.getLogger(SparkSubmitServiceImpl.class);
    @Value("${driver.name:n151}")
    private String driverName;

    @Override
    public String submitApplication(SparkApplicationParam sparkAppParams, String... otherParams) throws IOException, InterruptedException {
        log.info("Spark job parameters: {}", sparkAppParams.toString());
        CountDownLatch countDownLatch = new CountDownLatch(1);
        Map<String, String> confParams = sparkAppParams.getOtherConfParams();
        SparkLauncher launcher = new SparkLauncher()
                .setAppResource(sparkAppParams.getJarPath())
                .setMainClass(sparkAppParams.getMainClass())
                .setMaster(sparkAppParams.getMaster())
                .setDeployMode(sparkAppParams.getDeployMode())
                .setConf("spark.driver.memory", sparkAppParams.getDriverMemory())
                .setConf("spark.executor.memory", sparkAppParams.getExecutorMemory())
                .setConf("spark.executor.cores", sparkAppParams.getExecutorCores());
        if (confParams != null && !confParams.isEmpty()) {
            log.info("Setting extra Spark conf: {}", JSONObject.toJSONString(confParams));
            for (Map.Entry<String, String> conf : confParams.entrySet()) {
                log.info("{}: {}", conf.getKey(), conf.getValue());
                launcher.setConf(conf.getKey(), conf.getValue());
            }
        }
        if (otherParams.length != 0) {
            log.info("Setting job arguments: {}", Arrays.toString(otherParams));
            launcher.addAppArgs(otherParams);
        }
        log.info("All parameters set, submitting the Spark job");
        SparkAppHandle handle = launcher.setVerbose(true).startApplication(new SparkAppHandle.Listener() {
            @Override
            public void stateChanged(SparkAppHandle sparkAppHandle) {
                if (sparkAppHandle.getState().isFinal()) {
                    countDownLatch.countDown();
                }
                log.info("stateChanged: {}", sparkAppHandle.getState().toString());
            }

            @Override
            public void infoChanged(SparkAppHandle sparkAppHandle) {
                log.info("infoChanged: {}", sparkAppHandle.getState().toString());
            }
        });
        log.info("The task is executing, please wait ....");
        // Block until the job reaches a final state
        countDownLatch.await();
        log.info("The task is finished!");
        // Fetch the result from Spark's own monitoring REST API; this requires the
        // corresponding settings in spark-defaults.conf, spark-env.sh and yarn-site.xml
        String restUrl = "http://" + driverName + ":18080/api/v1/applications/" + handle.getAppId();
        return HttpUtil.httpGet(restUrl, null);
    }
}
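HttpUtil comes from the linked repository and is not shown in the post. Below is a minimal sketch of a compatible helper, assuming (this is an assumption, the real implementation may differ) that the second argument is an optional header map:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Map;

public class HttpUtil {
    /**
     * Minimal GET helper. The second argument is assumed to be an
     * optional header map; pass null for a plain request.
     */
    public static String httpGet(String url, Map<String, String> headers) throws IOException {
        HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection();
        conn.setRequestMethod("GET");
        conn.setConnectTimeout(5000);
        conn.setReadTimeout(5000);
        if (headers != null) {
            headers.forEach(conn::setRequestProperty);
        }
        StringBuilder body = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                body.append(line);
            }
        } finally {
            conn.disconnect();
        }
        return body.toString();
    }
}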
The controller's main responsibility is to accept the parameters from the page and pass them on to the service layer:
import com.hrong.springbootspark.service.ISparkSubmitService;
import com.hrong.springbootspark.vo.DataBaseExtractorVo;
import com.hrong.springbootspark.vo.Result;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.ResponseBody;
import javax.annotation.Resource;
import java.io.IOException;
/**
 * @Author hrong
 **/
@Slf4j
@Controller
public class SparkController {
    @Resource
    private ISparkSubmitService iSparkSubmitService;

    /**
     * Calls the service to remotely submit the Spark job.
     * @param vo parameters from the page
     * @return execution result
     */
    @ResponseBody
    @PostMapping("/extract/database")
    public Object dbExtractAndLoad2Hdfs(@RequestBody DataBaseExtractorVo vo) {
        try {
            return iSparkSubmitService.submitApplication(vo.getSparkApplicationParam(),
                    vo.getUrl(),
                    vo.getTable(),
                    vo.getUserName(),
                    vo.getPassword(),
                    vo.getTargetFileType(),
                    vo.getTargetFilePath());
        } catch (IOException | InterruptedException e) {
            log.error("Submission failed: {}", e.getMessage(), e);
            return Result.err(500, e.getMessage());
        }
    }
}
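A request to /extract/database might then look like the following (all values are illustrative, including the mainClass package and jar path; the field names follow DataBaseExtractorVo and SparkApplicationParam):

{
  "mainClass": "com.hrong.job.DbTableEtl",
  "jarPath": "hdfs:///user/spark/jobs/spark-job.jar",
  "master": "yarn",
  "deployMode": "cluster",
  "driverMemory": "1g",
  "executorMemory": "1g",
  "executorCores": "1",
  "url": "jdbc:mysql://localhost:3306/test",
  "userName": "root",
  "password": "123456",
  "table": "user_info",
  "targetFileType": "parquet",
  "targetFilePath": "/data/etl/user_info"
}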
spark-job module pom.xml (key sections):
<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
    <java.version>1.8</java.version>
    <hadoop.version>2.8.3</hadoop.version>
    <spark.version>2.3.3</spark.version>
    <scala.version>2.11</scala.version>
    <scala-library.version>2.11.8</scala-library.version>
    <mysql.version>5.1.46</mysql.version>
    <oracle.version>11g</oracle.version>
    <codehaus.version>3.0.10</codehaus.version>
</properties>
<dependencies>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>${mysql.version}</version>
    </dependency>
    <dependency>
        <groupId>com.oracle.driver</groupId>
        <artifactId>jdbc-driver</artifactId>
        <version>${oracle.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_${scala.version}</artifactId>
        <version>${spark.version}</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_${scala.version}</artifactId>
        <version>${spark.version}</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.codehaus.janino</groupId>
        <artifactId>commons-compiler</artifactId>
        <version>${codehaus.version}</version>
    </dependency>
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-library</artifactId>
        <version>${scala-library.version}</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>${hadoop.version}</version>
        <scope>provided</scope>
    </dependency>
</dependencies>
<build>
    <finalName>spark-job</finalName>
    <plugins>
        <plugin>
            <groupId>net.alchim31.maven</groupId>
            <artifactId>scala-maven-plugin</artifactId>
            <version>3.2.2</version>
            <executions>
                <execution>
                    <id>scala-compile-first</id>
                    <phase>process-resources</phase>
                    <goals>
                        <goal>add-source</goal>
                        <goal>compile</goal>
                    </goals>
                </execution>
                <execution>
                    <id>scala-test-compile</id>
                    <phase>process-test-resources</phase>
                    <goals>
                        <goal>testCompile</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <executions>
                <execution>
                    <phase>compile</phase>
                    <goals>
                        <goal>compile</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>2.4.3</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                    <configuration>
                        <filters>
                            <filter>
                                <artifact>*:*</artifact>
                                <excludes>
                                    <exclude>META-INF/*.SF</exclude>
                                    <exclude>META-INF/*.DSA</exclude>
                                    <exclude>META-INF/*.RSA</exclude>
                                </excludes>
                            </filter>
                        </filters>
                    </configuration>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
The job reads the external arguments, connects to the database, and dumps the specified table to HDFS in the given format and directory. Note that the positional arguments must be read in the same order the controller passes them (url, table, user, password, target file type, target file path):
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * @Author hrong
 * @Description Saves a database table to HDFS
 **/
public class DbTableEtl {
    private static Logger log = LoggerFactory.getLogger(DbTableEtl.class);

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName(DbTableEtl.class.getSimpleName())
                .getOrCreate();
        String url = args[0];
        String dbtable = args[1];
        String user = args[2];
        String password = args[3];
        String targetFileType = args[4];
        String targetFilePath = args[5];
        Dataset<Row> dbData = spark.read()
                .format("jdbc")
                .option("url", url)
                .option("dbtable", dbtable)
                .option("user", user)
                .option("password", password)
                .load();
        log.info("Showing a few sample rows before writing to HDFS");
        dbData.show(20, false);
        dbData.write().mode("overwrite").format(targetFileType).save(targetFilePath);
        // Release the session once the write completes
        spark.stop();
    }
}
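For reference, SparkLauncher forks a spark-submit process from $SPARK_HOME under the hood, so the submission assembled in SparkSubmitServiceImpl is roughly equivalent to the following command (class package, jar path and argument values are illustrative):

# rough spark-submit equivalent of the launcher call above
spark-submit \
  --master yarn \
  --deploy-mode cluster \
  --conf spark.driver.memory=1g \
  --conf spark.executor.memory=1g \
  --conf spark.executor.cores=1 \
  --class com.hrong.job.DbTableEtl \
  hdfs:///user/spark/jobs/spark-job.jar \
  "jdbc:mysql://localhost:3306/test" user_info root 123456 parquet /data/etl/user_info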
Package the job directly with IDEA's built-in packaging feature.
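Alternatively, since the shade plugin above is bound to the package phase, a standard Maven build (an assumption; any recent Maven should do) produces the same artifact from the command line:

mvn clean package -DskipTests

The fat jar lands in target/spark-job.jar, per the finalName setting in the pom.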