Table of Contents
1. Background
2. Requirements
3. Development steps
3.1 Download the DataX source code from GitHub
3.2 Unpack it locally and import it into IDEA
3.3 Create a new module named kafkareader
3.4 Copy the two resource files (plugin.json and plugin_job_template.json) from any existing module into the resources directory
3.5 Modify plugin.json
3.6 Modify pom.xml (copy the dependencies and plugins from one of the existing modules into it)
3.7 Copy the src/main/assembly folder from another module into the corresponding place in our module and modify package.xml
3.8 Add the snippet below to the outermost package.xml
4. Writing the code
4.1 Before coding, read the DataX plugin development guide carefully; it helps a lot.
4.2 Write the code (which classes to extend and which methods to implement are all covered in the guide from 4.1)
5. Packaging and running
5.1 Comment out the other modules, keeping only the common modules and your own module
5.2 Open a cmd window in the outermost project directory (this assumes the local Maven environment variables are configured)
5.3 Package with the Maven command
5.4 After packaging, upload the package from the directory shown below to the corresponding DataX directory on the cluster
5.5 Write the job configuration file and run it
Company requirement: build a unified data-ingestion platform on top of DataX. We need to collect data from Kafka, Elasticsearch, MySQL, SQL Server and other sources, and we only plan to use DataX.
So we develop a DataX kafkaReader plugin that reads data from Kafka and syncs it to other storage systems.
1. Requirements: it must handle JSON-formatted messages, support parsing messages with a regular expression, and support parsing messages with a configurable delimiter.
2. It must be able to sync data into Hive, MySQL and HBase.
Source code: https://github.com/alibaba/DataX
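You can download the zip from that page, or clone the repository, for example:

git clone https://github.com/alibaba/DataX.git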
File -> Open -> select the folder you just extracted, then wait through the (fairly long) import while IDEA downloads all of the dependencies.
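Steps 3.3 to 3.5: create a new module named kafkareader, copy plugin.json and plugin_job_template.json from any existing reader module into src/main/resources, and edit plugin.json so that the name and class point at the new plugin. A minimal sketch of what the edited plugin.json might look like (the description and developer values are placeholders, not from the original post):

{
    "name": "kafkareader",
    "class": "com.alibaba.datax.plugin.reader.kafkareader.KafkaReader",
    "description": "read data from kafka",
    "developer": "your-name"
}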
Copy the <dependencies> and <build><plugins> sections from one of the existing reader modules' pom.xml (if you don't feel like removing the unused dependencies, you can leave them in), then add your own dependencies. For kafkareader we add the two Kafka dependencies; the module's pom.xml ends up looking like this:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>datax-all</artifactId>
        <groupId>com.alibaba.datax</groupId>
        <version>0.0.1-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>
    <artifactId>kafkareader</artifactId>

    <dependencies>
        <dependency>
            <groupId>com.alibaba.datax</groupId>
            <artifactId>datax-common</artifactId>
            <version>${datax-project-version}</version>
            <exclusions>
                <exclusion>
                    <artifactId>slf4j-log4j12</artifactId>
                    <groupId>org.slf4j</groupId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
        </dependency>
        <dependency>
            <groupId>ch.qos.logback</groupId>
            <artifactId>logback-classic</artifactId>
        </dependency>
        <dependency>
            <groupId>com.alibaba.datax</groupId>
            <artifactId>plugin-rdbms-util</artifactId>
            <version>${datax-project-version}</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.34</version>
        </dependency>
        <!-- the two Kafka dependencies added for kafkareader -->
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka_2.11</artifactId>
            <version>2.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>2.0.0</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>1.6</source>
                    <target>1.6</target>
                    <encoding>${project-sourceEncoding}</encoding>
                </configuration>
            </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <descriptors>
                        <descriptor>src/main/assembly/package.xml</descriptor>
                    </descriptors>
                    <finalName>datax</finalName>
                </configuration>
                <executions>
                    <execution>
                        <id>dwzip</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
In the copied package.xml, the places to change are the output paths and the jar name: wherever the name of the reader you copied from appears, change it to kafkareader. The result looks like this:
<assembly
        xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
    <id></id>
    <formats>
        <format>dir</format>
    </formats>
    <includeBaseDirectory>false</includeBaseDirectory>
    <fileSets>
        <fileSet>
            <directory>src/main/resources</directory>
            <includes>
                <include>plugin.json</include>
                <include>plugin_job_template.json</include>
            </includes>
            <outputDirectory>plugin/reader/kafkareader</outputDirectory>
        </fileSet>
        <fileSet>
            <directory>target/</directory>
            <includes>
                <include>kafkareader-0.0.1-SNAPSHOT.jar</include>
            </includes>
            <outputDirectory>plugin/reader/kafkareader</outputDirectory>
        </fileSet>
    </fileSets>
    <dependencySets>
        <dependencySet>
            <useProjectArtifact>false</useProjectArtifact>
            <outputDirectory>plugin/reader/kafkareader/libs</outputDirectory>
            <scope>runtime</scope>
        </dependencySet>
    </dependencySets>
</assembly>
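Step 3.8: register the new module in the outermost package.xml as well, following the pattern used there for the existing readers. A sketch of the fileSet to add (assumed from that pattern; verify it against your checkout):

<fileSet>
    <directory>kafkareader/target/datax/</directory>
    <includes>
        <include>**/*.*</include>
    </includes>
    <outputDirectory>datax</outputDirectory>
</fileSet>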
Plugin development guide: https://github.com/alibaba/DataX/blob/master/dataxPluginDev.md
Main code (KafkaReader.java):
package com.alibaba.datax.plugin.reader.kafkareader;
import com.alibaba.datax.common.element.Record;
import com.alibaba.datax.common.element.StringColumn;
import com.alibaba.datax.common.exception.DataXException;
import com.alibaba.datax.common.plugin.RecordSender;
import com.alibaba.datax.common.spi.Reader;
import com.alibaba.datax.common.util.Configuration;
import com.alibaba.datax.plugin.rdbms.reader.CommonRdbmsReader;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class KafkaReader extends Reader {
public static class Job extends Reader.Job {
private static final Logger LOG = LoggerFactory
.getLogger(Job.class);
private Configuration originalConfig = null;
@Override
public void init() {
this.originalConfig = super.getPluginJobConf();
// warn: ignore case
String topic = this.originalConfig
.getString(Key.TOPIC);
Integer partitions = this.originalConfig
.getInt(Key.KAFKA_PARTITIONS);
String bootstrapServers = this.originalConfig
.getString(Key.BOOTSTRAP_SERVERS);
String groupId = this.originalConfig
.getString(Key.GROUP_ID);
Integer columnCount = this.originalConfig
.getInt(Key.COLUMNCOUNT);
String split = this.originalConfig.getString(Key.SPLIT);
String filterContaintsStr = this.originalConfig.getString(Key.CONTAINTS_STR);
String filterContaintsFlag = this.originalConfig.getString(Key.CONTAINTS_STR_FLAG);
String conditionAllOrOne = this.originalConfig.getString(Key.CONDITION_ALL_OR_ONE);
String parsingRules = this.originalConfig.getString(Key.PARSING_RULES);
String writerOrder = this.originalConfig.getString(Key.WRITER_ORDER);
String kafkaReaderColumnKey = this.originalConfig.getString(Key.KAFKA_READER_COLUMN_KEY);
System.out.println(topic);
System.out.println(partitions);
System.out.println(bootstrapServers);
System.out.println(groupId);
System.out.println(columnCount);
System.out.println(split);
System.out.println(parsingRules);
if (null == topic) {
throw DataXException.asDataXException(KafkaReaderErrorCode.TOPIC_ERROR,
"没有设置参数[topic].");
}
if (partitions == null) {
throw DataXException.asDataXException(KafkaReaderErrorCode.PARTITION_ERROR,
"没有设置参数[kafka.partitions].");
} else if (partitions < 1) {
throw DataXException.asDataXException(KafkaReaderErrorCode.PARTITION_ERROR,
"[kafka.partitions]不能小于1.");
}
if (null == bootstrapServers) {
throw DataXException.asDataXException(KafkaReaderErrorCode.ADDRESS_ERROR,
"没有设置参数[bootstrap.servers].");
}
if (null == groupId) {
throw DataXException.asDataXException(KafkaReaderErrorCode.KAFKA_READER_ERROR,
"没有设置参数[groupid].");
}
if (columnCount == null) {
throw DataXException.asDataXException(KafkaReaderErrorCode.PARTITION_ERROR,
"没有设置参数[columnCount].");
} else if (columnCount < 1) {
throw DataXException.asDataXException(KafkaReaderErrorCode.KAFKA_READER_ERROR,
"[columnCount]不能小于1.");
}
if (null == split) {
throw DataXException.asDataXException(KafkaReaderErrorCode.KAFKA_READER_ERROR,
"[split]不能为空.");
}
if (filterContaintsStr != null) {
if (conditionAllOrOne == null || filterContaintsFlag == null) {
throw DataXException.asDataXException(KafkaReaderErrorCode.KAFKA_READER_ERROR,
"设置了[filterContaintsStr],但是没有设置[conditionAllOrOne]或者[filterContaintsFlag]");
}
}
if (parsingRules == null) {
throw DataXException.asDataXException(KafkaReaderErrorCode.KAFKA_READER_ERROR,
"没有设置[parsingRules]参数");
} else if (!parsingRules.equals("regex") && !parsingRules.equals("json") && !parsingRules.equals("split")) {
throw DataXException.asDataXException(KafkaReaderErrorCode.KAFKA_READER_ERROR,
"[parsingRules]参数设置错误,不是regex,json,split其中一个");
}
if (writerOrder == null) {
throw DataXException.asDataXException(KafkaReaderErrorCode.KAFKA_READER_ERROR,
"没有设置[writerOrder]参数");
}
if (kafkaReaderColumnKey == null) {
throw DataXException.asDataXException(KafkaReaderErrorCode.KAFKA_READER_ERROR,
"没有设置[kafkaReaderColumnKey]参数");
}
}
@Override
public void preCheck() {
init();
}
@Override
public List split(int adviceNumber) {
List configurations = new ArrayList();
Integer partitions = this.originalConfig.getInt(Key.KAFKA_PARTITIONS);
for (int i = 0; i < partitions; i++) {
configurations.add(this.originalConfig.clone());
}
return configurations;
}
@Override
public void post() {
}
@Override
public void destroy() {
}
}
public static class Task extends Reader.Task {
private static final Logger LOG = LoggerFactory
.getLogger(CommonRdbmsReader.Task.class);
// Task configuration
private Configuration readerSliceConfig;
// Delimiter / regex used to split a Kafka message
private String split;
// Parsing rule: regex, json or split
private String parsingRules;
// Whether to stop pulling data
private boolean flag;
// Kafka address (bootstrap servers)
private String bootstrapServers;
// Kafka group id
private String groupId;
// Kafka topic
private String kafkaTopic;
// Total number of fields in a Kafka message
private int count;
// Whether a data_from column is needed (kafka ip:port + topic)
// Messages containing / not containing these strings are filtered out
private String filterContaintsStr;
// Whether the filter means "contains" or "does not contain":
// 1 = contains, 0 = does not contain
private int filterContaintsStrFlag;
// Match all of the filter strings, or any one of them
private int conditionAllOrOne;
// Column order required by the writer
private String writerOrder;
// Key of each field on the kafkareader side (used for JSON parsing)
private String kafkaReaderColumnKey;
// Path for writing unparsable (error) messages
private String exceptionPath;
@Override
public void init() {
flag = true;
this.readerSliceConfig = super.getPluginJobConf();
split = this.readerSliceConfig.getString(Key.SPLIT);
bootstrapServers = this.readerSliceConfig.getString(Key.BOOTSTRAP_SERVERS);
groupId = this.readerSliceConfig.getString(Key.GROUP_ID);
kafkaTopic = this.readerSliceConfig.getString(Key.TOPIC);
count = this.readerSliceConfig.getInt(Key.COLUMNCOUNT);
filterContaintsStr = this.readerSliceConfig.getString(Key.CONTAINTS_STR);
filterContaintsStrFlag = this.readerSliceConfig.getInt(Key.CONTAINTS_STR_FLAG);
conditionAllOrOne = this.readerSliceConfig.getInt(Key.CONDITION_ALL_OR_ONE);
parsingRules = this.readerSliceConfig.getString(Key.PARSING_RULES);
writerOrder = this.readerSliceConfig.getString(Key.WRITER_ORDER);
kafkaReaderColumnKey = this.readerSliceConfig.getString(Key.KAFKA_READER_COLUMN_KEY);
exceptionPath = this.readerSliceConfig.getString(Key.EXECPTION_PATH);
LOG.info(filterContaintsStr);
}
@Override
public void startRead(RecordSender recordSender) {
Properties props = new Properties();
props.put("bootstrap.servers", bootstrapServers);
props.put("group.id", groupId != null ? groupId : UUID.randomUUID().toString());
props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
props.put("enable.auto.commit", "false");
KafkaConsumer consumer = new KafkaConsumer(props);
consumer.subscribe(Collections.singletonList(kafkaTopic));
Record oneRecord = null;
while (flag) {
ConsumerRecords records = consumer.poll(100);
for (ConsumerRecord record : records) {
String value = record.value();
// Evaluate the filter conditions for this message
int ifNotContinue = filterMessage(value);
// If the flag was set to 1, skip (filter out) this message
if (ifNotContinue == 1) {
LOG.info("过滤数据: " + record.value());
continue;
}
oneRecord = buildOneRecord(recordSender, value);
// A non-null return value means the message parsed successfully (not an error record)
if (oneRecord != null) {
recordSender.sendToWriter(oneRecord);
}
}
consumer.commitSync();
// If the current hour is 00 (midnight), stop pulling so the task can exit
Date date = new Date();
if (DateUtil.targetFormat(date).split(" ")[1].substring(0, 2).equals("00")) {
destroy();
}
}
}
private int filterMessage(String value) {
// Only filter if filter conditions have been configured
int ifNotContinue = 0;
if (filterContaintsStr != null) {
String[] filterStrs = filterContaintsStr.split(",");
// "all" mode: every filter string must match
if (conditionAllOrOne == 1) {
// Drop messages that contain every one of the filter strings
if (filterContaintsStrFlag == 1) {
int i = 0;
for (; i < filterStrs.length; i++) {
if (!value.contains(filterStrs[i])) break;
}
if (i >= filterStrs.length) ifNotContinue = 1;
} else {
// Keep only messages that contain every one of the filter strings
int i = 0;
for (; i < filterStrs.length; i++) {
if (!value.contains(filterStrs[i])) break;
}
if (i < filterStrs.length) ifNotContinue = 1;
}
} else {
// Drop messages that contain any one of the filter strings
if (filterContaintsStrFlag == 1) {
int i = 0;
for (; i < filterStrs.length; i++) {
if (value.contains(filterStrs[i])) break;
}
if (i < filterStrs.length) ifNotContinue = 1;
}
// Keep only messages that contain at least one of the filter strings
else {
int i = 0;
for (; i < filterStrs.length; i++) {
if (value.contains(filterStrs[i])) break;
}
if (i >= filterStrs.length) ifNotContinue = 1;
}
}
}
return ifNotContinue;
}
private Record buildOneRecord(RecordSender recordSender, String value) {
Record record = null;
if (parsingRules.equals("regex")) {
record = parseRegex(value, recordSender);
} else if (parsingRules.equals("json")) {
record = parseJson(value, recordSender);
} else if (parsingRules.equals("split")) {
record = parseSplit(value, recordSender);
}
return record;
}
private Record parseSplit(String value, RecordSender recordSender) {
Record record = recordSender.createRecord();
String[] splits = value.split(this.split);
if (splits.length != count) {
writerErrorPath(value);
return null;
}
parseOrders(Arrays.asList(splits), record);
return record;
}
private Record parseJson(String value, RecordSender recordSender) {
Record record = recordSender.createRecord();
HashMap map = JsonUtilJava.parseJsonStrToMap(value);
String[] columns = kafkaReaderColumnKey.split(",");
ArrayList datas = new ArrayList();
for (String column : columns) {
datas.add(map.get(column).toString());
}
if (datas.size() != count) {
writerErrorPath(value);
return null;
}
parseOrders(datas, record);
return record;
}
private Record parseRegex(String value, RecordSender recordSender) {
Record record = recordSender.createRecord();
ArrayList datas = new ArrayList();
Pattern r = Pattern.compile(split);
Matcher m = r.matcher(value);
if (m.find()) {
// The number of capture groups must match the configured column count
if (m.groupCount() != count) {
writerErrorPath(value);
return null;
}
for (int i = 1; i <= count; i++) {
datas.add(m.group(i));
}
} else {
writerErrorPath(value);
return null;
}
parseOrders(datas, record);
return record;
}
private void writerErrorPath(String value) {
if (exceptionPath == null) return;
FileOutputStream fileOutputStream = null;
try {
fileOutputStream = getFileOutputStream();
fileOutputStream.write((value + "\n").getBytes());
fileOutputStream.close();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
private FileOutputStream getFileOutputStream() throws FileNotFoundException {
return new FileOutputStream(exceptionPath + "/" + kafkaTopic + "errordata" + DateUtil.targetFormat(new Date(), "yyyyMMdd"), true);
}
private void parseOrders(List datas, Record record) {
// Emit columns in the order required by the writer (writerOrder)
String[] orders = writerOrder.split(",");
for (String order : orders) {
if (order.equals("data_from")) {
record.addColumn(new StringColumn(bootstrapServers + "|" + kafkaTopic));
} else if (order.equals("uuid")) {
record.addColumn(new StringColumn(UUID.randomUUID().toString()));
} else if (order.equals("null")) {
record.addColumn(new StringColumn("null"));
} else if (order.equals("datax_time")) {
record.addColumn(new StringColumn(DateUtil.targetFormat(new Date())));
} else if (isNumeric(order)) {
record.addColumn(new StringColumn(datas.get(new Integer(order) - 1)));
}
}
}
public static boolean isNumeric(String str) {
for (int i = 0; i < str.length(); i++) {
if (!Character.isDigit(str.charAt(i))) {
return false;
}
}
return true;
}
@Override
public void post() {
}
@Override
public void destroy() {
flag = false;
}
}
}
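The class above also references a few small helpers that are not shown here: Key (the parameter-name constants), KafkaReaderErrorCode (an ErrorCode enum), DateUtil and JsonUtilJava. Below is a minimal sketch of what Key, DateUtil and JsonUtilJava might look like, inferred from how they are used above and from the parameter names in the job JSON further down; treat it as an assumption, not the author's original code.

// Key.java -- parameter names as they appear in the reader section of the job JSON (assumed)
package com.alibaba.datax.plugin.reader.kafkareader;

public final class Key {
    public static final String TOPIC = "topic";
    public static final String KAFKA_PARTITIONS = "kafkaPartitions";
    public static final String BOOTSTRAP_SERVERS = "bootstrapServers";
    public static final String GROUP_ID = "groupId";
    public static final String COLUMNCOUNT = "columnCount";
    public static final String SPLIT = "split";
    public static final String CONTAINTS_STR = "filterContaints";
    public static final String CONTAINTS_STR_FLAG = "filterContaintsFlag";
    public static final String CONDITION_ALL_OR_ONE = "conditionAllOrOne";
    public static final String PARSING_RULES = "parsingRules";
    public static final String WRITER_ORDER = "writerOrder";
    public static final String KAFKA_READER_COLUMN_KEY = "kafkaReaderColumnKey";
    public static final String EXECPTION_PATH = "execptionPath";
}

// DateUtil.java -- the reader splits the default format on the space and reads the hour,
// so the default pattern must put the time after the date
package com.alibaba.datax.plugin.reader.kafkareader;

import java.text.SimpleDateFormat;
import java.util.Date;

public class DateUtil {
    public static String targetFormat(Date date) {
        return targetFormat(date, "yyyy-MM-dd HH:mm:ss");
    }

    public static String targetFormat(Date date, String pattern) {
        return new SimpleDateFormat(pattern).format(date);
    }
}

// JsonUtilJava.java -- parses a JSON string into a HashMap; fastjson is assumed to be
// available transitively through the DataX dependencies
package com.alibaba.datax.plugin.reader.kafkareader;

import com.alibaba.fastjson.JSON;
import java.util.HashMap;

public class JsonUtilJava {
    public static HashMap<String, Object> parseJsonStrToMap(String json) {
        return JSON.parseObject(json, HashMap.class);
    }
}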
In the outermost pom.xml, comment out the other modules so that only the common modules and the kafkareader module remain.
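A sketch of what the trimmed <modules> section might then look like (the common module names are taken from the upstream DataX repository and may differ in your checkout):

<modules>
    <module>common</module>
    <module>core</module>
    <module>transformer</module>
    <!-- all other reader and writer modules commented out -->
    <module>kafkareader</module>
</modules>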
Then open a cmd window in the outermost project directory (with the local Maven environment variables configured) and package with:
mvn -U clean package assembly:assembly -Dmaven.test.skip=true
After packaging, upload the plugin from the local output directory to the corresponding DataX plugin directory on the cluster:
Local path: D:\DataX-master\kafkareader\target\datax\plugin\reader
Cluster path: /opt/module/datax/plugin/reader
Then write the job configuration file; the example below reads from Kafka and writes to HDFS:
{
"job": {
"content": [
{
"reader": {
"name": "kafkareader",
"parameter": {
"topic": "Event",
"bootstrapServers": "192.168.7.128:9092",
"kafkaPartitions": "1",
"columnCount":11,
"groupId":"ast",
"filterContaints":"5^1,6^5",
"filterContaintsFlag":1,
"conditionAllOrOne":0,
"parsingRules":"regex",
"writerOrder":"uuid,1,3,6,4,8,9,10,11,5,7,2,null,datax_time,data_from",
"kafkaReaderColumnKey":"a",
"execptionPath":"/opt/module/datax/log/errorlog"
}
},
"writer": {
"name": "hdfswriter",
"parameter": {
"defaultFS": "hdfs://master:8020",
"fileType": "orc",
"path": "${path}",
"fileName": "t_rsd_amber_agent_event_log",
"column": [
{
"name": "id",
"type": "string"
},
{
"name": "risk_level",
"type": "string"
},
{
"name": "device_uuid",
"type": "string"
},
{
"name": "event_device_id",
"type": "string"
},
{
"name": "device_type",
"type": "string"
},
{
"name": "event_type",
"type": "string"
},
{
"name": "event_sub_type",
"type": "string"
},
{
"name": "repeats",
"type": "string"
},
{
"name": "description",
"type": "string"
},
{
"name": "event_time",
"type": "string"
},
{
"name": "report_device_type",
"type": "string"
},
{
"name": "event_report_time",
"type": "string"
},
{
"name": "last_update_time",
"type": "string"
},
{
"name": "datax_time",
"type": "string"
}
, {
"name": "data_from",
"type": "string"
}
],
"writeMode": "append",
"fieldDelimiter": "\t",
"compress":"NONE",
"scrollFileTime":300000
}
}
}
],
"setting": {
"speed": {
"channel": 3,
"record": 20000,
"byte":5000 ,
"batchSize":2048
}
}
}
}
Run the job with:
python /opt/module/datax/bin/datax.py -p "-Dpath=/data/warehouse/rsd/t_rsd_amber_agent_event_log/2019/06/05" /opt/module/datax/job/kafkatohdfs.json