Dataset API (DSL)
1. Spark 1.x style: converting an RDD to a DataFrame
package org.yonggan

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * DataFrame creation in Spark 1.x.
 */
object WordCountDemo {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("app").setMaster("local")
    val sc = new SparkContext(conf)
    val sqlc = new SQLContext(sc)

    val wcRdd = sc.textFile("wc.txt")

    // Split each line into words and wrap every word in a case class.
    val wordAndOne: RDD[Word] = wcRdd.flatMap(_.split(" ")).map(Word(_))

    import sqlc.implicits._

    // Create the word DataFrame; the schema is inferred from the case class by reflection.
    val wordDf: DataFrame = wordAndOne.toDF()

    // Register an intermediate (temporary) table.
    wordDf.registerTempTable("t_word")

    // Group by word and count the occurrences.
    val resFrame = sqlc.sql("select word, count(1) from t_word group by word")
    resFrame.show()

    sc.stop()
  }
}

case class Word(word: String)
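The case class exists only to drive schema inference. When a single string column is all that is needed, a minimal sketch (assuming the same sqlc, implicits import, and wc.txt as above; t_word2 is an illustrative name) can convert the RDD[String] directly and name the column in toDF:

// Skip the case class: the implicits also lift an RDD[String] to a DataFrame.
val wordsDf = wcRdd.flatMap(_.split(" ")).toDF("word")
wordsDf.registerTempTable("t_word2")
sqlc.sql("select word, count(1) from t_word2 group by word").show()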
package org.yonggan

import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}

object WordCountDemo02 {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("app").setMaster("local")
    val sc = new SparkContext(conf)
    val sqlc = new SQLContext(sc)

    val wcRdd = sc.textFile("wc.txt")

    // Split each line into words and wrap every word in a Row.
    val rowRdd: RDD[Row] = wcRdd.flatMap(_.split(" ")).map(Row(_))

    // Define the schema programmatically: a single nullable string column.
    val schema = StructType(List(StructField("name", StringType, true)))

    // Create the DataFrame from the Row RDD and the schema.
    val wdf: DataFrame = sqlc.createDataFrame(rowRdd, schema)

    // Build the query with the DataFrame API instead of SQL.
    val resDf = wdf.select("name").groupBy("name").count()
    resDf.show()

    sc.stop()
  }
}
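The programmatic route pays off when the schema is not known at compile time or spans several columns. A sketch under the same setup (sqlc and wcRdd as above; the extra length column is purely illustrative):

import org.apache.spark.sql.types.IntegerType

// Hypothetical two-column schema: each word plus its character length.
val rowRdd2: RDD[Row] = wcRdd.flatMap(_.split(" ")).map(w => Row(w, w.length))
val schema2 = StructType(List(
  StructField("word", StringType, true),
  StructField("len", IntegerType, true)
))
sqlc.createDataFrame(rowRdd2, schema2).show()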
Initialization now goes through the new SparkSession API, the unified entry point introduced in Spark 2.x:

val session = SparkSession.builder().getOrCreate()
package org.yonggan

import org.apache.spark.sql.{Dataset, SparkSession}

object DatasetWordCount01 {

  def main(args: Array[String]): Unit = {
    // Spark 2.x and later use the new SparkSession API.
    val session = SparkSession.builder()
      .appName("app")
      .master("local")
      .getOrCreate()

    val sqlContext = session.sqlContext
    val wcDs = sqlContext.read.textFile("wc.txt")

    // Import the implicit conversions bound to this session.
    import session.implicits._

    val wordData: Dataset[String] = wcDs.flatMap(_.split(" "))
    wordData.createTempView("t_word")
    // wordData.printSchema()

    // Query the view; the single column of a Dataset[String] is named "value".
    val resDf = session.sql("select value, count(1) from t_word group by value")
    resDf.show()

    session.stop()
  }
}
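One readability tweak (a stylistic suggestion, not part of the original example): alias the aggregate so it can be referenced downstream, and sort in the same statement. "word" and "cnt" are illustrative names:

session.sql(
  "select value as word, count(1) as cnt from t_word group by value order by cnt desc"
).show()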
2. Using the DSL (DataFrame/Dataset API)
package org.yonggan

import org.apache.spark.sql.{Dataset, Row, SparkSession}

object DatasetWordCount02 {

  def main(args: Array[String]): Unit = {
    // Spark 2.x and later use the new SparkSession API.
    val session = SparkSession.builder()
      .appName("app")
      .master("local")
      .getOrCreate()

    val sqlContext = session.sqlContext
    val wcDs = sqlContext.read.textFile("wc.txt")

    // Import the implicit conversions bound to this session.
    import session.implicits._

    val wordData: Dataset[String] = wcDs.flatMap(_.split(" "))

    // Aggregate with the DSL: group on the implicit "value" column
    // and count, aliasing the result as "cts".
    import org.apache.spark.sql.functions._
    val wDs = wordData.groupBy("value").agg(count("value") as "cts")

    // Sort by the count, descending.
    val resDataset: Dataset[Row] = wDs.orderBy($"cts".desc)
    resDataset.show()

    session.stop()
  }
}
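For comparison, the same aggregation can stay in the typed Dataset API. A minimal sketch, assuming the same session, implicits, and wordData as above (toDF here only renames the output columns):

// groupByKey keeps the typed API; count() yields a Dataset[(String, Long)].
val typedCounts = wordData.groupByKey(identity).count().toDF("word", "cnt")
typedCounts.orderBy($"cnt".desc).show()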