Connecting Spark to Various Data Sources

 

Spark SQL connecting to a MySQL database

public static void main(String[] args) {
    SparkConf conf = new SparkConf().setMaster("local").setAppName("SparkSQLJDBC2Mysql");
    JavaSparkContext sc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(sc);
    DataFrameReader reader = sqlContext.read().format("jdbc");
    reader.option("url", "jdbc:mysql://192.168.1.243:3306/test");
    reader.option("dbtable", "address");
    reader.option("driver", "com.mysql.jdbc.Driver");
    reader.option("user", "root");
    reader.option("password", "cbbs");

    DataFrame nameandscoremysqlDataSourceDF = reader.load(); // the resulting DataFrame
    nameandscoremysqlDataSourceDF.show();

    reader.option("dbtable", "nameandage"); // switch to another table; the remaining options are reused
    DataFrame nameandagemysqlDataSourceDF = reader.load(); // the resulting DataFrame
    nameandagemysqlDataSourceDF.show();
}
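Once the tables are loaded, the DataFrames can also be queried with SQL by registering them as temporary tables; a minimal sketch on the Spark 1.x API (the view names below are chosen for illustration):

    nameandscoremysqlDataSourceDF.registerTempTable("address_view");     // illustrative view name
    nameandagemysqlDataSourceDF.registerTempTable("nameandage_view");    // illustrative view name
    DataFrame fromSql = sqlContext.sql("SELECT * FROM nameandage_view");
    fromSql.show();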

Spark SQL connecting to an Oracle database

public static void main(String[] args) {
    SparkConf conf = new SparkConf().setMaster("local").setAppName("SparkSQLJDBC2Oracle");
    JavaSparkContext sc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(sc);
    DataFrameReader reader = sqlContext.read().format("jdbc");
    reader.option("url", "jdbc:oracle:thin:@192.168.1.243:1521:orcl");
    reader.option("dbtable", "bb");
    reader.option("driver", "oracle.jdbc.driver.OracleDriver");
    reader.option("user", "testdata");
    reader.option("password", "cbbs");

    DataFrame bbOracleDataSourceDF = reader.load(); // the resulting DataFrame
    bbOracleDataSourceDF.show();

    reader.option("dbtable", "nameandage"); // switch to another table; the remaining options are reused
    DataFrame nameandageOracleDataSourceDF = reader.load(); // the resulting DataFrame
    nameandageOracleDataSourceDF.show();
}

 

Spark connecting to MongoDB

Example 1:

JavaSparkContext sc = new JavaSparkContext("local", "test");
Configuration config = new Configuration();
// URI format: host:port/database.collection
config.set("mongo.input.uri", "mongodb://192.168.1.243:27017/cbbstest.testData");  // collection to read
config.set("mongo.output.uri", "mongodb://192.168.1.243:27017/cbbstest.test");     // collection to write
JavaPairRDD<Object, BSONObject> mongoRDD = sc.newAPIHadoopRDD(config,
        com.mongodb.hadoop.MongoInputFormat.class, Object.class, BSONObject.class);
// BSONObject -> text
JavaRDD<text> result = mongoRDD.map(
        new Function<Tuple2<Object, BSONObject>, text>() {
            public text call(Tuple2<Object, BSONObject> v1) throws Exception {
                String title = (String) v1._2().get("title");
                Date date = (Date) v1._2().get("date");
                List paragraph = (List) v1._2().get("paragraph");
                return new text(title, date, paragraph);
            }
        });
// write the original documents back out via MongoOutputFormat (to the collection given by mongo.output.uri)
mongoRDD.saveAsNewAPIHadoopFile("file:///copy", Object.class, Object.class, MongoOutputFormat.class, config);
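The text class used above is not shown in the original snippet; a minimal sketch of what it is assumed to look like (a plain serializable holder for the three extracted fields):

    public class text implements java.io.Serializable {
        private String title;
        private java.util.Date date;
        private java.util.List paragraph;

        public text(String title, java.util.Date date, java.util.List paragraph) {
            this.title = title;
            this.date = date;
            this.paragraph = paragraph;
        }

        public String getTitle() { return title; }
        public java.util.Date getDate() { return date; }
        public java.util.List getParagraph() { return paragraph; }
    }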

Example 2:

JavaSparkContext sc = new JavaSparkContext("local", "test");
Configuration mongodbConfig = new Configuration();
mongodbConfig.set("mongo.job.input.format", "com.mongodb.hadoop.MongoInputFormat");
mongodbConfig.set("mongo.job.output.format", "com.mongodb.hadoop.MongoOutputFormat");
mongodbConfig.set("mongo.input.uri", "mongodb://localhost:27017/hong.test");
JavaPairRDD<Object, BSONObject> documents = sc.newAPIHadoopRDD(
        mongodbConfig,            // Configuration
        MongoInputFormat.class,   // InputFormat: read from a live cluster
        Object.class,             // Key class
        BSONObject.class          // Value class
);

Configuration outputConfig = new Configuration();
outputConfig.set("mongo.output.uri", "mongodb://localhost:27017/hong.44");

documents.saveAsNewAPIHadoopFile(
        "f:/a/abc.json",          // path required by the API; MongoOutputFormat writes to mongo.output.uri instead
        Object.class,
        BSONObject.class,
        MongoOutputFormat.class,
        outputConfig
);
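Before writing the documents back out, individual fields can be pulled from the BSON values; a minimal sketch, assuming each document carries a "name" field (adjust the field name to the real schema):

    JavaRDD<Object> names = documents.values().map(
            new Function<BSONObject, Object>() {
                public Object call(BSONObject doc) throws Exception {
                    return doc.get("name");   // "name" is an assumed field
                }
            });
    System.out.println(names.collect());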

Spark connecting to a JSON data source

Dataset<Row> people = spark.read().json("src/main/resources/people.json");
people.printSchema();
people.createOrReplaceTempView("people");
// SQL statements can be run by using the sql methods provided by spark
Dataset<Row> namesDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19");
namesDF.show();
List<String> jsonData = Arrays.asList(
        "{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}");
JavaRDD<String> anotherPeopleRDD =
        new JavaSparkContext(spark.sparkContext()).parallelize(jsonData);
Dataset<Row> anotherPeople = spark.read().json(anotherPeopleRDD);
anotherPeople.show();
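Nested JSON fields such as the address object above can be addressed with dot notation in SQL; a minimal sketch (the view name another_people is chosen here for illustration):

    anotherPeople.createOrReplaceTempView("another_people");
    Dataset<Row> cityDF = spark.sql("SELECT name, address.city FROM another_people");
    cityDF.show();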

Spark connecting to a Parquet data source

private static void runBasicParquetExample(SparkSession spark) {
    Dataset<Row> peopleDF = spark.read().json("src/main/resources/people.json");
    // DataFrames can be saved as Parquet files, preserving the schema information
    peopleDF.write().parquet("people.parquet");
    // Read the Parquet file written above; the result is also a DataFrame
    Dataset<Row> parquetFileDF = spark.read().parquet("people.parquet");
    parquetFileDF.createOrReplaceTempView("parquetFile");
    Dataset<Row> namesDF = spark.sql("SELECT name FROM parquetFile WHERE age BETWEEN 13 AND 19");
    Dataset<String> namesDS = namesDF.map(new MapFunction<Row, String>() {
        public String call(Row row) {
            return "Name: " + row.getString(0);
        }
    }, Encoders.STRING());
    namesDS.show();
}

private static void runParquetSchemaMergingExample(SparkSession spark) {
    List<Square> squares = new ArrayList<>();
    for (int value = 1; value <= 5; value++) {
        Square square = new Square();
        square.setValue(value);
        square.setSquare(value * value);
        squares.add(square);
    }

    // Create a simple DataFrame and store it in a partition directory
    Dataset<Row> squaresDF = spark.createDataFrame(squares, Square.class);
    squaresDF.write().parquet("data/test_table/key=1");

    List<Cube> cubes = new ArrayList<>();
    for (int value = 6; value <= 10; value++) {
        Cube cube = new Cube();
        cube.setValue(value);
        cube.setCube(value * value * value);
        cubes.add(cube);
    }
    // Create a second DataFrame in another partition directory with a different column
    Dataset<Row> cubesDF = spark.createDataFrame(cubes, Cube.class);
    cubesDF.write().parquet("data/test_table/key=2");

    // Read the partitioned table with schema merging enabled
    Dataset<Row> mergedDF = spark.read().option("mergeSchema", true).parquet("data/test_table");
    mergedDF.printSchema();
}
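The Square and Cube classes referenced above are not shown; a minimal sketch of what they are assumed to look like (plain serializable JavaBeans, which is what createDataFrame expects):

    public static class Square implements java.io.Serializable {
        private int value;
        private int square;
        public int getValue() { return value; }
        public void setValue(int value) { this.value = value; }
        public int getSquare() { return square; }
        public void setSquare(int square) { this.square = square; }
    }
    // Cube is analogous, with value/cube fields and getCube()/setCube() accessors.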

Spark parsing CSV files

SparkConf conf = new SparkConf().setMaster("local").setAppName("SparkSQLCSV");
JavaSparkContext sc = new JavaSparkContext(conf);
SQLContext sqlContext = new SQLContext(sc);
HashMap<String, String> options = new HashMap<String, String>();
options.put("header", "true");
options.put("path", "cars.csv");
DataFrame df = sqlContext.load("com.databricks.spark.csv", options);
df.select("year", "model").save("newcars.csv", "com.databricks.spark.csv");
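The snippet above relies on the external com.databricks.spark.csv package (Spark 1.x). On Spark 2.x the CSV reader is built in; a minimal sketch, assuming a SparkSession named spark:

    Dataset<Row> cars = spark.read()
            .option("header", "true")
            .option("inferSchema", "true")
            .csv("cars.csv");
    cars.select("year", "model")
            .write()
            .option("header", "true")
            .csv("newcars.csv");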

Spark processing HBase data

public class Spark2Hbase implements Serializable {

    public Log log = LogFactory.getLog(Spark2Hbase.class);

    static String convertScanToString(Scan scan) throws IOException {
        ClientProtos.Scan proto = ProtobufUtil.toScan(scan);
        return Base64.encodeBytes(proto.toByteArray());
    }

    public void start() {
        // Initialize the SparkContext. The HBase jar must be included in the jars parameter,
        // otherwise an "unread block data" exception is thrown.
        JavaSparkContext sc = new JavaSparkContext("spark://nowledgedata-n3:7077", "hbaseTest",
                "/home/hadoop/software/spark-0.8.1",
                new String[]{"target/ndspark.jar", "target\\dependency\\hbase-0.94.6.jar"});

        // Use HBaseConfiguration.create() to build the Configuration.
        // The hadoop and hbase configuration files must be on the project classpath.
        Configuration conf = HBaseConfiguration.create();

        // Set the query conditions; here only the user's level is returned.
        Scan scan = new Scan();
        scan.setStartRow(Bytes.toBytes("195861-1035177490"));
        scan.setStopRow(Bytes.toBytes("195861-1072173147"));
        scan.addFamily(Bytes.toBytes("info"));
        scan.addColumn(Bytes.toBytes("info"), Bytes.toBytes("levelCode"));

        try {
            // Name of the HBase table to read
            String tableName = "usertable";
            conf.set(TableInputFormat.INPUT_TABLE, tableName);
            conf.set(TableInputFormat.SCAN, convertScanToString(scan));

            // Obtain the HBase query Results
            JavaPairRDD<ImmutableBytesWritable, Result> hBaseRDD = sc.newAPIHadoopRDD(conf,
                    TableInputFormat.class, ImmutableBytesWritable.class,
                    Result.class);

            // Extract the user's level from each Result and count each occurrence as one
            JavaPairRDD<Integer, Integer> levels = hBaseRDD.map(
                    new PairFunction<Tuple2<ImmutableBytesWritable, Result>, Integer, Integer>() {
                        @Override
                        public Tuple2<Integer, Integer> call(
                                Tuple2<ImmutableBytesWritable, Result> immutableBytesWritableResultTuple2)
                                throws Exception {
                            byte[] o = immutableBytesWritableResultTuple2._2().getValue(
                                    Bytes.toBytes("info"), Bytes.toBytes("levelCode"));
                            if (o != null) {
                                return new Tuple2<Integer, Integer>(Bytes.toInt(o), 1);
                            }
                            // use -1 for rows without a levelCode so that no nulls reach reduceByKey
                            return new Tuple2<Integer, Integer>(-1, 1);
                        }
                    });

            // Aggregate the counts
            JavaPairRDD<Integer, Integer> counts = levels.reduceByKey(new Function2<Integer, Integer, Integer>() {
                public Integer call(Integer i1, Integer i2) {
                    return i1 + i2;
                }
            });

            // Print the final result
            List<Tuple2<Integer, Integer>> output = counts.collect();
            for (Tuple2<Integer, Integer> tuple : output) {
                System.out.println(tuple._1 + ": " + tuple._2);
            }
        } catch (Exception e) {
            log.warn(e);
        }
    }

    /**
     * If the Spark computation is not written inside main, the implementing class must
     * implement the Serializable interface, otherwise a
     * "Task not serializable: java.io.NotSerializableException" is thrown.
     */
    public static void main(String[] args) throws InterruptedException {
        new Spark2Hbase().start();
        System.exit(0);
    }
}

Connecting to the Hive data warehouse (Scala example)

Reference: http://blog.csdn.net/u013468917/article/details/52748342

object SparkSQL2Hive {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()  // create the SparkConf object
    conf.setAppName("SparkSQL2Hive")  // set the application name
    conf.setMaster("spark://slq1:7077")  // set the cluster Master
    val sc = new SparkContext(conf)  // create the SparkContext object

    // In enterprise Spark development, Hive is used as the data warehouse in the vast majority of cases.
    // Spark provides Hive support: through HiveContext, Spark can operate on Hive data directly.
    // First: with HiveContext we can write SQL/HQL statements against Hive,
    // including creating tables, dropping tables, loading data into tables, and running CRUD statements on table data.
    // Second: a DataFrame can be saved into the Hive warehouse directly via saveAsTable.
    // Third: HiveContext.table can load a Hive table directly and produce a DataFrame.
    val hiveContext = new HiveContext(sc)
    hiveContext.sql("use hive")
    hiveContext.sql("DROP TABLE IF EXISTS people")
    hiveContext.sql("CREATE TABLE IF NOT EXISTS people(name STRING, age INT)")
    hiveContext.sql("LOAD DATA LOCAL INPATH '/home/richard/slq/spark/people.txt' INTO TABLE people")
    // Load local data into Hive (behind the scenes the data is actually copied).
    // LOAD DATA INPATH (without LOCAL) can also be used to load data from HDFS into Hive (in that case the data is moved).
    hiveContext.sql("DROP TABLE IF EXISTS peoplescores")
    hiveContext.sql("CREATE TABLE IF NOT EXISTS peoplescores(name STRING, score INT)")
    hiveContext.sql("LOAD DATA LOCAL INPATH '/home/richard/slq/spark/peoplescores.txt' INTO TABLE peoplescores")

    // Join the two Hive tables through HiveContext to get the name, age and score of everyone scoring above 90
    val resultDF = hiveContext.sql("SELECT pi.name, pi.age, ps.score "
      + "FROM people pi JOIN peoplescores ps ON pi.name=ps.name WHERE ps.score > 90")

    // saveAsTable creates a Hive managed table: both the data location and the metadata are managed by Hive.
    // When the table is dropped, its data is deleted as well (it no longer exists on disk).
    hiveContext.sql("DROP TABLE IF EXISTS peopleinformationresult")
    resultDF.saveAsTable("peopleinformationresult")

    // HiveContext's table method reads a Hive table directly and produces a DataFrame,
    // which can then be used for machine learning, graph computation, complex ETL and so on.
    val dataFrameHive = hiveContext.table("peopleinformationresult")

    dataFrameHive.show()
  }
}
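On Spark 2.x the same Hive access goes through SparkSession with Hive support enabled instead of HiveContext; a minimal Java sketch (the warehouse directory below is an assumed example path):

    SparkSession spark = SparkSession.builder()
            .appName("SparkSQL2Hive")
            .config("spark.sql.warehouse.dir", "/user/hive/warehouse")  // assumed warehouse location
            .enableHiveSupport()
            .getOrCreate();
    Dataset<Row> result = spark.sql(
            "SELECT pi.name, pi.age, ps.score "
            + "FROM people pi JOIN peoplescores ps ON pi.name = ps.name WHERE ps.score > 90");
    result.write().saveAsTable("peopleinformationresult");
    spark.table("peopleinformationresult").show();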

 

 

 
