import org.apache.hadoop.conf.Configuration;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.bson.BSONObject;
import com.mongodb.hadoop.MongoInputFormat;
import com.mongodb.hadoop.MongoOutputFormat;

JavaSparkContext sc = new JavaSparkContext("local", "test");

// Hadoop configuration for the mongo-hadoop connector: which collection to read.
Configuration mongodbConfig = new Configuration();
mongodbConfig.set("mongo.job.input.format", "com.mongodb.hadoop.MongoInputFormat");
mongodbConfig.set("mongo.job.output.format", "com.mongodb.hadoop.MongoOutputFormat");
mongodbConfig.set("mongo.input.uri", "mongodb://localhost:27017/hong.test");

// Read the collection from the live MongoDB instance as (key, BSONObject) pairs.
JavaPairRDD<Object, BSONObject> documents = sc.newAPIHadoopRDD(
        mongodbConfig,          // Configuration
        MongoInputFormat.class, // InputFormat: read from a live cluster
        Object.class,           // Key class
        BSONObject.class        // Value class
);

// Write the RDD back to MongoDB. With MongoOutputFormat the target is taken from
// mongo.output.uri; the path argument is required by the API but effectively unused.
Configuration outputConfig = new Configuration();
outputConfig.set("mongo.output.uri", "mongodb://localhost:27017/hong.44");
documents.saveAsNewAPIHadoopFile(
        "f:/a/abc.json",
        Object.class,
        BSONObject.class,
        MongoOutputFormat.class,
        outputConfig
);
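Once the collection has been loaded, the pair RDD can be processed like any other Spark RDD. Below is a minimal sketch that counts the documents and prints one field of the first few records; it assumes the documents RDD from above, Java 8 lambdas, and a field named "name" that is purely an assumption for illustration.

// Count the documents that were read and print one field of the first few records.
// The field name "name" is assumed for illustration; use a field that actually
// exists in your collection.
long count = documents.count();
System.out.println("Documents read from MongoDB: " + count);

documents.values()                                    // JavaRDD<BSONObject>
         .map(bson -> String.valueOf(bson.get("name")))
         .take(10)
         .forEach(System.out::println);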
Connecting Spark to a JSON data source
Dataset<Row> people = spark.read().json("src/main/resources/people.json");
people.printSchema();
people.createOrReplaceTempView("people");
// SQL statements can be run by using the sql method provided by spark
Dataset<Row> namesDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19");
namesDF.show();

// A Dataset can also be created from an RDD of JSON strings.
List<String> jsonData = Arrays.asList(
        "{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}");
JavaRDD<String> anotherPeopleRDD =
        new JavaSparkContext(spark.sparkContext()).parallelize(jsonData);
Dataset<Row> anotherPeople = spark.read().json(anotherPeopleRDD);
anotherPeople.show();
Connecting Spark to a Parquet source
private static void runBasicParquetExample(SparkSession spark) {
  Dataset<Row> peopleDF = spark.read().json("src/main/resources/people.json");
  // DataFrames can be saved as Parquet files, preserving the schema information.
  peopleDF.write().parquet("people.parquet");
  // Read the Parquet file created above; the result is also a DataFrame.
  Dataset<Row> parquetFileDF = spark.read().parquet("people.parquet");
  parquetFileDF.createOrReplaceTempView("parquetFile");
  Dataset<Row> namesDF = spark.sql("SELECT name FROM parquetFile WHERE age BETWEEN 13 AND 19");
  Dataset<String> namesDS = namesDF.map(new MapFunction<Row, String>() {
    @Override
    public String call(Row row) {
      return "Name: " + row.getString(0);
    }
  }, Encoders.STRING());
  namesDS.show();
}
private static void runParquetSchemaMergingExample(SparkSession spark) {
  List<Square> squares = new ArrayList<>();
  for (int value = 1; value <= 5; value++) {
    Square square = new Square();
    square.setValue(value);
    square.setSquare(value * value);
    squares.add(square);
  }
  // Create a simple DataFrame and store it in one partition directory.
  Dataset<Row> squaresDF = spark.createDataFrame(squares, Square.class);
  squaresDF.write().parquet("data/test_table/key=1");

  List<Cube> cubes = new ArrayList<>();
  for (int value = 6; value <= 10; value++) {
    Cube cube = new Cube();
    cube.setValue(value);
    cube.setCube(value * value * value);
    cubes.add(cube);
  }
  // Create a second DataFrame with a different column in another partition directory.
  Dataset<Row> cubesDF = spark.createDataFrame(cubes, Cube.class);
  cubesDF.write().parquet("data/test_table/key=2");

  // Read the partitioned table; mergeSchema combines the two Parquet schemas.
  Dataset<Row> mergedDF = spark.read().option("mergeSchema", true).parquet("data/test_table");
  mergedDF.printSchema();
}
Parsing CSV files with Spark
SparkConf conf = new SparkConf().setMaster("local").setAppName("SparkSQLCSV");
JavaSparkContext sc = new JavaSparkContext(conf);
SQLContext sqlContext = new SQLContext(sc);

// Options for the spark-csv (com.databricks.spark.csv) data source.
HashMap<String, String> options = new HashMap<>();
options.put("header", "true");
options.put("path", "cars.csv");

DataFrame df = sqlContext.load("com.databricks.spark.csv", options);
df.select("year", "model").save("newcars.csv", "com.databricks.spark.csv");
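The snippet above relies on the external spark-csv package and the Spark 1.x SQLContext.load/save API. Since Spark 2.x, CSV is a built-in data source; a roughly equivalent sketch, assuming Spark 2.x and an existing SparkSession named spark, looks like this.

// Built-in CSV reader (Spark 2.x+): the header option replaces the spark-csv package.
Dataset<Row> cars = spark.read()
        .option("header", "true")
        .csv("cars.csv");

// Write the selected columns back out as CSV.
cars.select("year", "model")
    .write()
    .option("header", "true")
    .csv("newcars.csv");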
object SparkSQL2Hive {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()          // create the SparkConf object
    conf.setAppName("SparkSQL2Hive")    // set the application name
    conf.setMaster("spark://slq1:7077") // set the cluster master
    val sc = new SparkContext(conf)     // create the SparkContext
    // In enterprise Spark development today, Hive is used as the data warehouse in the vast majority of cases.
    // Spark ships with Hive support: through HiveContext, Spark can operate on Hive data directly.
    // First, with HiveContext we can issue SQL statements (sql/hql) against Hive, including creating
    // and dropping tables, loading data into tables, and running arbitrary SQL for CRUD operations on table data.
    // Second, a DataFrame can be saved directly into the Hive warehouse via saveAsTable.
    // Third, HiveContext.table can load a Hive table directly and produce a DataFrame.
    val hiveContext = new HiveContext(sc)
    hiveContext.sql("use hive")
    hiveContext.sql("DROP TABLE IF EXISTS people")
    hiveContext.sql("CREATE TABLE IF NOT EXISTS people(name STRING, age INT)")
    hiveContext.sql("LOAD DATA LOCAL INPATH '/home/richard/slq/spark/people.txt' INTO TABLE people")
    // Load local data into Hive (under the hood the data is actually copied).
    // LOAD DATA INPATH can likewise pull data from HDFS into Hive (in that case the data is moved, not copied).
    hiveContext.sql("DROP TABLE IF EXISTS peoplescores")
    hiveContext.sql("CREATE TABLE IF NOT EXISTS peoplescores(name STRING, score INT)")
    hiveContext.sql("LOAD DATA LOCAL INPATH '/home/richard/slq/spark/peoplescores.txt' INTO TABLE peoplescores")
    // Join the two Hive tables directly through HiveContext to get the name, age and score
    // of everyone who scored above 90.
    val resultDF = hiveContext.sql("SELECT pi.name, pi.age, ps.score "
      + "FROM people pi JOIN peoplescores ps ON pi.name = ps.name WHERE ps.score > 90")
    // saveAsTable creates a Hive managed table: both the data location and the metadata are managed by Hive.
    // When the table is dropped, its data is deleted with it (it no longer exists on disk).
    hiveContext.sql("DROP TABLE IF EXISTS peopleinformationresult")
    resultDF.saveAsTable("peopleinformationresult")
    // HiveContext.table reads a Hive table directly and produces a DataFrame.
    // The resulting data can then be used for machine learning, graph computation, complex ETL, and so on.
    val dataFrameHive = hiveContext.table("peopleinformationresult")
    dataFrameHive.show()
  }
}
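For reference, on Spark 2.x the same flow goes through SparkSession with Hive support instead of HiveContext. The following is a minimal Java sketch, assuming the people and peoplescores tables created above already exist and that the cluster has a Hive-enabled Spark build with hive-site.xml on the classpath; only the API surface changes.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

SparkSession spark = SparkSession.builder()
        .appName("SparkSQL2Hive")
        .enableHiveSupport()   // replaces HiveContext
        .getOrCreate();

spark.sql("use hive");
Dataset<Row> resultDF = spark.sql(
        "SELECT pi.name, pi.age, ps.score "
      + "FROM people pi JOIN peoplescores ps ON pi.name = ps.name WHERE ps.score > 90");

// write().saveAsTable replaces the deprecated DataFrame.saveAsTable.
resultDF.write().mode(SaveMode.Overwrite).saveAsTable("peopleinformationresult");

// spark.table replaces HiveContext.table.
spark.table("peopleinformationresult").show();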
What is Thrift
The Apache Thrift software framework, for scalable cross-language services development, combines a software stack with a code generation engine to build services that work efficiently and seamlessly across a wide range of programming languages.
A very common error you will run into when parsing JSON with org.json is:
org.json.JSONException: No value for items
It is thrown when the code asks for a key (here "items") that the response simply does not contain, as in this (truncated) logcat output:
06-21 12:19:08.714 2098-2127/com.jikexueyuan.secret I/System.out: Result:{"status":1,"page":1,&
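One defensive way to handle this with org.json is to probe for the key instead of calling getJSONArray blindly, e.g. with has() or optJSONArray(). A small self-contained sketch follows; the response body is invented for illustration, shortened to just the fields visible in the log line above.

import org.json.JSONArray;
import org.json.JSONObject;

public class MissingKeyDemo {
    public static void main(String[] args) throws Exception {
        // A response that carries "status" and "page" but no "items" key
        // (payload invented for illustration).
        String result = "{\"status\":1,\"page\":1}";
        JSONObject root = new JSONObject(result);

        // root.getJSONArray("items") would throw: org.json.JSONException: No value for items
        JSONArray items = root.optJSONArray("items");   // returns null instead of throwing
        if (items == null) {
            System.out.println("no 'items' in response, status=" + root.getInt("status"));
        } else {
            for (int i = 0; i < items.length(); i++) {
                System.out.println(items.get(i));
            }
        }
    }
}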