How to move data from HDFS to HBase with the Java API

The complete code is below:

**
Notes:
1. This program is submitted from the IDE; it does not need to be packaged as a jar and run on the Linux cluster.
2. The cluster is configured for high availability (HA).
3. The resources directory must contain the four Hadoop configuration files, plus log4j.properties for log output (a classpath check is sketched below).
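The four Hadoop configuration files are typically core-site.xml, hdfs-site.xml, mapred-site.xml and yarn-site.xml copied from the cluster. A minimal sketch (the class name is just for illustration) to confirm that they are actually visible on the classpath before submitting from the IDE:

**
// Hypothetical helper: prints the classpath location of each expected file,
// or null if the file is missing from the resources directory.
public class ResourceCheck {
    public static void main(String[] args) {
        String[] files = {"core-site.xml", "hdfs-site.xml", "mapred-site.xml",
                "yarn-site.xml", "log4j.properties"};
        for (String name : files) {
            System.out.println(name + " -> " + ResourceCheck.class.getClassLoader().getResource(name));
        }
    }
}
**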

Mapper stage:

**
package HDFSToHbase.answer_base.mapper;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;

/**
 * @description Migrate data from HDFS to HBase
 * @author: [email protected]
 * @create: 2018-12-06 21:20:59
 **/
public class HDFSToHbaseMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Pass each input line straight through to the reduce side
        context.write(value, NullWritable.get());
    }
}

**

Reducer stage:

**
package HDFSToHbase.answer_base.reducer;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * @description Migrate data from HDFS to HBase
 * @author: [email protected]
 * @create: 2018-12-06 21:25:40
 **/
public class HDFSToHbaseReducer extends TableReducer<Text, NullWritable, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        // Split the line on the "\001" delimiter
        String[] split = key.toString().split("\001");
        // Column names of the base table, used as the qualifiers of the key-value pairs below
        String[] student = {"exam_id", "start_date", "class_name", "examinee_id", "examinee_name",
                "question_id", "question_type", "subject", "question_stage", "question_diffculty_id",
                "is_objective", "question_mark", "score", "is_right"};
        // Rowkey = exam id + examinee id + question id
        Put put = new Put((split[0] + split[3] + split[5]).getBytes());
        // Write every remaining field into the info_s column family
        // (this family name must match the one created by the driver; fields 0, 3 and 5 are already in the rowkey)
        for (int i = 0; i < split.length; i++) {
            if (i != 0 && i != 3 && i != 5) {
                put.addColumn("info_s".getBytes(), student[i].getBytes(), split[i].getBytes());
            }
        }
        context.write(NullWritable.get(), put);
    }
}

**
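To make the rowkey and column layout concrete, here is a small standalone illustration of the same split/rowkey logic; the sample values are hypothetical and only the field order matters:

**
// Hypothetical demo of how one "\001"-delimited line becomes a rowkey plus info_s columns.
public class RowKeyDemo {
    public static void main(String[] args) {
        String line = String.join("\001",
                "E1001", "2018-12-01", "class01", "S2001", "zhangsan", "Q3001",
                "choice", "math", "stage1", "D1", "1", "5", "5", "1");
        String[] split = line.split("\001");
        // Rowkey = exam_id + examinee_id + question_id
        System.out.println("rowkey = " + split[0] + split[3] + split[5]); // E1001S2001Q3001
        String[] student = {"exam_id", "start_date", "class_name", "examinee_id", "examinee_name",
                "question_id", "question_type", "subject", "question_stage", "question_diffculty_id",
                "is_objective", "question_mark", "score", "is_right"};
        // Every field except the three rowkey components becomes an info_s column
        for (int i = 0; i < split.length; i++) {
            if (i != 0 && i != 3 && i != 5) {
                System.out.println("info_s:" + student[i] + " = " + split[i]);
            }
        }
    }
}
**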

Job (driver) stage:

**
package HDFSToHbase.answer_base.job;

import HDFSToHbase.answer_base.mapper.HDFSToHbaseMapper;
import HDFSToHbase.answer_base.reducer.HDFSToHbaseReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import util.HadoopUtil;

import java.io.IOException;

/**
 * @description Migrate data from HDFS to HBase
 * @author: [email protected]
 * @create: 2018-12-06 21:36:05
 **/
public class HDFSToHbaseJob {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Build the Configuration for the remote cluster
        Configuration conf = HadoopUtil.getRemoteHadoopConf();

        // ZooKeeper quorum, HBase root directory, output table name and column family
        conf.set("hbase.zookeeper.quorum", "bigdata1:2181,bigdata2:2181,bigdata3:2181");
        conf.set("hbase.rootdir", "hdfs://mycluster/hbase");
        conf.set(TableOutputFormat.OUTPUT_TABLE, args[1]);
        conf.set("info", args[2]); // record the column family name passed on the command line

        // Open a connection and get the Admin interface
        Connection connection = ConnectionFactory.createConnection(conf);
        Admin admin = connection.getAdmin();

        // If the table already exists, drop it first
        if (admin.tableExists(TableName.valueOf(args[1]))) {
            admin.disableTable(TableName.valueOf(args[1])); // disable the table
            admin.deleteTable(TableName.valueOf(args[1]));  // delete the table
        }
        // Create the table with one column family
        HTableDescriptor hbaseTable = new HTableDescriptor(TableName.valueOf(args[1]));
        hbaseTable.addFamily(new HColumnDescriptor(args[2]));
        admin.createTable(hbaseTable);

        // Close the admin and the connection
        admin.close();
        connection.close();

        // Create the job, name it, and add the HBase dependency jars
        Job job = Job.getInstance(conf, HDFSToHbaseJob.class.getSimpleName());
        TableMapReduceUtil.addDependencyJars(job);

        // Set the driver class
        job.setJarByClass(HDFSToHbaseJob.class);

        // Mapper class and its output key/value types
        job.setMapperClass(HDFSToHbaseMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        // Reducer class and the final output types (TableOutputFormat writes the Put objects)
        job.setReducerClass(HDFSToHbaseReducer.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Put.class);

        // Input path and output format
        FileInputFormat.addInputPath(job, new Path(args[0]));
        job.setOutputFormatClass(TableOutputFormat.class);

        // Submit the job to the cluster and wait for it to finish
        job.waitForCompletion(true);
    }
}

**
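Once the job finishes, a quick way to verify the load is to scan a few rows with the HBase client API. This is only a sketch: the class name HBaseScanCheck is hypothetical, it reuses the HadoopUtil shown below, and it expects the table name as its first argument:

**
package HDFSToHbase.answer_base.job;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import util.HadoopUtil;

import java.io.IOException;

// Hypothetical verification helper: print the first few rows of the imported table
public class HBaseScanCheck {
    public static void main(String[] args) throws IOException {
        Configuration conf = HadoopUtil.getRemoteHadoopConf();
        conf.set("hbase.zookeeper.quorum", "bigdata1:2181,bigdata2:2181,bigdata3:2181");
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Table table = connection.getTable(TableName.valueOf(args[0]));
             ResultScanner scanner = table.getScanner(new Scan())) {
            int shown = 0;
            for (Result result : scanner) {
                System.out.println(result); // prints the rowkey and its cells
                if (++shown >= 5) break;    // only look at the first five rows
            }
        }
    }
}
**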

Utility class:

**
package util;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;

/**
 * @description Utility for building the remote Hadoop/HBase configuration
 * @author:
 * @create: 2018-11-14 19:34:45
 **/
public class HadoopUtil {

    /**
     * Get the configuration for the remote Hadoop cluster, using the locally built project jar by default
     * @return configuration for the remote Hadoop cluster
     */
    public static Configuration getRemoteHadoopConf() {
        Configuration conf = getBaseRemoteHadoopConf();
        conf.set("mapreduce.job.jar", "B:\\Project-idea\\qilap\\target\\qilap-1.0-SNAPSHOT.jar");
        return conf;
    }

    /**
     * Get the configuration for the remote Hadoop cluster, using the given absolute path of the job jar
     * @param jarAbsPath absolute path of the job jar
     * @return configuration for the remote Hadoop cluster
     */
    public static Configuration getRemoteHadoopConf(String jarAbsPath) {
        Configuration conf = getBaseRemoteHadoopConf();
        conf.set("mapreduce.job.jar", jarAbsPath);
        return conf;
    }

    /**
     * Build the base configuration for the remote HA Hadoop cluster
     * @return base configuration
     */
    private static Configuration getBaseRemoteHadoopConf() {
        Configuration conf = HBaseConfiguration.create();
        // HDFS HA settings
        conf.set("fs.defaultFS", "hdfs://mycluster");
        conf.set("dfs.nameservices", "mycluster");
        conf.set("dfs.ha.namenodes.mycluster", "nn1,nn2");
        conf.set("dfs.namenode.rpc-address.mycluster.nn1", "bigdata1:8020");
        conf.set("dfs.namenode.rpc-address.mycluster.nn2", "bigdata2:8020");
        conf.set("dfs.client.failover.proxy.provider.mycluster",
                "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");
        // MapReduce & YARN settings
        conf.set("mapreduce.framework.name", "yarn");
        conf.set("yarn.resourcemanager.address", "bigdata2:8032");
        conf.set("yarn.resourcemanager.scheduler.address", "bigdata2:8030");
        // Allow cross-platform submission (submitting from Windows to a Linux cluster)
        conf.set("mapreduce.app-submission.cross-platform", "true");
        return conf;
    }
}
**
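Usage note: when the jar is built somewhere other than the hard-coded default path, the second overload can be used instead; a small sketch with a placeholder path:

**
package util;

import org.apache.hadoop.conf.Configuration;

// Hypothetical usage example for the jar-path overload of HadoopUtil
public class HadoopUtilExample {
    public static void main(String[] args) {
        // The path below is only a placeholder for wherever the project jar is actually built
        Configuration conf = HadoopUtil.getRemoteHadoopConf("D:\\build\\qilap-1.0-SNAPSHOT.jar");
        System.out.println(conf.get("mapreduce.job.jar"));
        System.out.println(conf.get("fs.defaultFS")); // hdfs://mycluster
    }
}
**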

pom.xml dependencies:

**


<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.qianfeng</groupId>
    <artifactId>qilap</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.8.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-service</artifactId>
            <version>2.3.4</version>
        </dependency>
    </dependencies>
</project>