Working through the official SparkSQLExample.scala example

$ bin/spark-shell --master local[4]

scala> spark.

baseRelationToDataFrame   conf              emptyDataFrame   implicits         range        sparkContext   stop      time      

catalog                   createDataFrame   emptyDataset     listenerManager   read         sql            streams   udf       

close                     createDataset     experimental     newSession        readStream   sqlContext     table     version

scala> spark.conf

18/03/19 15:22:48 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException

res0: org.apache.spark.sql.RuntimeConfig = org.apache.spark.sql.RuntimeConfig@4138af7

// global_temp is the database that holds global temp views, which can be shared across sessions

scala> spark.read.json("examples/src/main/resources/people.json")

org.apache.spark.sql.AnalysisException: 

Path does not exist: hdfs://xxxxxxx1:8020/user/YYYYYYYYY/examples/src/main/resources/people.json;

// With no scheme, the path is resolved against the HDFS home directory (the one that also holds .sparkStaging), so examples/ is looked for there
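// A sketch (not from the original session) of how one could check which filesystem a
// schemeless path is resolved against:
val defaultFS = spark.sparkContext.hadoopConfiguration.get("fs.defaultFS")
// e.g. hdfs://<namenode>:8020; relative paths then resolve under the user's home directory there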

scala> spark.read.json("file:///examples/src/main/resources/people.json")

org.apache.spark.sql.AnalysisException: Path does not exist: file:/examples/src/main/resources/people.json;

......

// The local filesystem root has no examples/ directory; the file actually lives under $SPARK_HOME

scala> spark.read.json("file:///$SPARK_HOME/examples/src/main/resources/people.json")

org.apache.spark.sql.AnalysisException: Path does not exist: file:/$SPARK_HOME/examples/src/main/resources/people.json;

......

// $SPARK_HOME is not expanded here; after all, this is not the same shell. A silly attempt!

scala> spark.read.json("file:////opt/bigdata/nfs/spark-2.1.2-bin-hadoop2.7/examples/src/main/resources/people.json")

res4: org.apache.spark.sql.DataFrame = [age: bigint, name: string]

// Finally correct: with the full absolute path, read.json returns a DataFrame
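// A sketch (assuming the SPARK_HOME environment variable is visible to the JVM running
// spark-shell): the shell-style $SPARK_HOME is not expanded inside a Scala string, but the
// value can be read with sys.env and interpolated instead.
val sparkHome   = sys.env.getOrElse("SPARK_HOME", "/opt/bigdata/nfs/spark-2.1.2-bin-hadoop2.7")
val peoplePath  = s"file://$sparkHome/examples/src/main/resources/people.json"
val dfFromEnv   = spark.read.json(peoplePath)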

scala> res4.show

+----+-------+

| age|   name|

+----+-------+

|null|Michael|

|  30|   Andy|

|  19| Justin|

+----+-------+

 

scala> val df =res4

df: org.apache.spark.sql.DataFrame = [age: bigint, name: string]

 

scala> df.printSchema

root

 |-- age: long (nullable = true)

 |-- name: string (nullable = true)

 

scala> df.select("name").show

+-------+

|   name|

+-------+

|Michael|

|   Andy|

| Justin|

+-------+

// DF.select("[column_name]")

scala> results.select("name", "age").show

+-------+---+

|   name|age|

+-------+---+

|Michael| 29|

|   Andy| 30|

| Justin| 19|

+-------+---+

// Several columns can be selected at once

scala> df.select($"name", $"age" + 100).show

+-------+-----------+

|   name|(age + 100)|

+-------+-----------+

|Michael|       null|

|   Andy|        130|

| Justin|        119|

+-------+-----------+

// DF.select($"[column_name]") 

// select的方法重载,取值需要 $ 符号;不能混用

scala> results.select("name", "age"+100).show

org.apache.spark.sql.AnalysisException: cannot resolve '`age100`' given input columns: [name, age];;

'Project [name#172, 'age100]

+- Project [name#172, age#173]

   +- SubqueryAlias people4

      +- LogicalRDD [name#172, age#173]

......

scala> results.select("name", $"age"+100).show

<console>:38: error: overloaded method value select with alternatives:
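// A sketch of the consistent alternatives: pass either plain column names or Column
// expressions throughout ($ comes from the implicits, col from org.apache.spark.sql.functions);
// selectExpr is a third option that accepts SQL expression strings.
import org.apache.spark.sql.functions.col

df.select($"name", $"age" + 100).show           // all arguments are Column expressions
df.select(col("name"), col("age") + 100).show   // the same without the $ shorthand
df.selectExpr("name", "age + 100").show         // SQL expression strings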

scala> df.filter($"age" > 21).show

+---+----+

|age|name|

+---+----+

| 30|Andy|

+---+----+

 

scala> df.groupBy("age").count.show

+----+-----+

| age|count|

+----+-----+

|  19|    1|

|null|    1|

|  30|    1|

+----+-----+

 

scala> df.

agg             count                     except             inputFiles      orderBy             sample                 take              where               

alias           createGlobalTempView      explain            intersect       persist             schema                 takeAsList        withColumn          

apply           createOrReplaceTempView   explode            isLocal         printSchema         select                 toDF              withColumnRenamed   

as              createTempView            filter             isStreaming     queryExecution      selectExpr             toJSON            withWatermark       

cache           crossJoin                 first              javaRDD         randomSplit         show                   toJavaRDD         write               

checkpoint      cube                      flatMap            join            randomSplitAsList   sort                   toLocalIterator   writeStream         

coalesce        describe                  foreach            joinWith        rdd                 sortWithinPartitions   toString                              

col             distinct                  foreachPartition   limit           reduce              sparkSession           transform                             

collect         drop                      groupBy            map             registerTempTable   sqlContext             union                                 

collectAsList   dropDuplicates            groupByKey         mapPartitions   repartition         stat                   unionAll                              

columns         dtypes                    head               na              rollup              storageLevel           unpersist                             

// The operations available on a DataFrame (tab completion)

 

scala> df.create

createGlobalTempView   createOrReplaceTempView   createTempView

 

scala> df.createOrReplaceTempView("people")

 

scala> val sqlDF=spark.sql("select * from people")

sqlDF: org.apache.spark.sql.DataFrame = [age: bigint, name: string]

// The DataFrame must first be registered as a (temp) view before it can be queried with SQL; see the sketch below for querying a file directly
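// A sketch of an alternative (not in the original session): Spark SQL can also query a file
// directly in the FROM clause, without registering a view, by prefixing the path with the
// data source name.
val direct = spark.sql(
  "SELECT * FROM json.`file:///opt/bigdata/nfs/spark-2.1.2-bin-hadoop2.7/examples/src/main/resources/people.json`")
direct.show()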

scala> sqlDF.show

+----+-------+

| age|   name|

+----+-------+

|null|Michael|

|  30|   Andy|

|  19| Justin|

+----+-------+

 

scala> df.createGlobalTempView("people2")

 

scala> spark.sql("select * from global_temp.people2").show

+----+-------+

| age|   name|

+----+-------+

|null|Michael|

|  30|   Andy|

|  19| Justin|

+----+-------+

 

scala> spark.newSession().sql("select * from global_temp.people2").show

+----+-------+

| age|   name|

+----+-------+

|null|Michael|

|  30|   Andy|

|  19| Justin|

+----+-------+

// A global temp view can be used across sessions (it lives in the global_temp database); see the sketch below for cleaning views up via the catalog
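// Sketch: a temp view lives only in its SparkSession, a global temp view for the whole
// application; both can be removed through the catalog when no longer needed.
spark.catalog.dropTempView("people")
spark.catalog.dropGlobalTempView("people2")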

scala> case class Person(name:String, age:Long)

defined class Person

 

scala> val caseClassDS=Seq(Person("Andy",32)).toDS

caseClassDS: org.apache.spark.sql.Dataset[Person] = [name: string, age: bigint]

 

scala> caseClassDS.show

+----+---+

|name|age|

+----+---+

|Andy| 32|

+----+---+

 

scala> val primitiveDS=Seq(1,2,3).toDS

primitiveDS: org.apache.spark.sql.Dataset[Int] = [value: int]

// Primitive element types need no explicit type annotation; their encoders come from the implicits

scala> primitiveDS.map(_*2).show

+-----+

|value|

+-----+

|    2|

|    4|

|    6|

+-----+

 

scala> primitiveDS.map(_*2).collect

res19: Array[Int] = Array(2, 4, 6)

 

scala> val path="file:/opt/bigdata/nfs/spark-2.1.2-bin-hadoop2.7/examples/src/main/resources/people.json"

path: String = file:/opt/bigdata/nfs/spark-2.1.2-bin-hadoop2.7/examples/src/main/resources/people.json

 

scala> val peopleDS=spark.read.json(path).as[Person]

peopleDS: org.apache.spark.sql.Dataset[Person] = [age: bigint, name: string]

 

scala> peopleDS.show

+----+-------+

| age|   name|

+----+-------+

|null|Michael|

|  30|   Andy|

|  19| Justin|

+----+-------+

 

scala> peopleDS.collect

java.lang.RuntimeException: Error while decoding: java.lang.NullPointerException: Null value appeared in non-nullable field:

- field (class: "scala.Long", name: "age")

- root class: "Person"

If the schema is inferred from a Scala tuple/case class, or a Java bean, please try to use scala.Option[_] or other nullable types (e.g. java.lang.Integer instead of int/scala.Int).

newInstance(class Person)

:- input[1, string, true].toString

:  +- input[1, string, true]

+- assertnotnull(input[0, bigint, true])

   +- input[0, bigint, true]
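// A sketch of one fix (PersonOpt is a hypothetical class, not part of the official example):
// declaring the nullable column as Option[Long] lets the encoder map Michael's missing age
// to None instead of failing on a null primitive.
case class PersonOpt(name: String, age: Option[Long])

val peopleOptDS = spark.read.json(path).as[PersonOpt]   // path still points at people.json here
peopleOptDS.collect()                                   // no NullPointerException; age is Option[Long]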

scala> val path = "file:////opt/bigdata/nfs/spark-2.1.2-bin-hadoop2.7/examples/src/main/resources/people.txt"

scala> val peopleDF2 = spark.sparkContext.textFile(path).map(_.split(",")).map(atts => Person(atts(0), atts(1).trim.toInt)).toDF

peopleDF2: org.apache.spark.sql.DataFrame = [name: string, age: bigint]

 

scala> peopleDF2.createOrReplaceTempView("people3")

 

scala> val teenagersDF=spark.sql("select name, age from people3 where age between 13 and 19")

teenagersDF: org.apache.spark.sql.DataFrame = [name: string, age: bigint]

 

scala> teenagersDF.show

+------+---+

|  name|age|

+------+---+

|Justin| 19|

+------+---+

 

scala> teenagersDF.map(teenager => "Name ->" + teenager(0)).show

+-------------+

|        value|

+-------------+

|Name ->Justin|

+-------------+

 

scala> teenagersDF.map(teen => "Name -> " + teen.getAs[String]("name")).show

+--------------+

|         value|

+--------------+

|Name -> Justin|

+--------------+

 

scala> implicit val mapEncoder = org.apache.spark.sql.Encoders.kryo[Map[String, Any]]

mapEncoder: org.apache.spark.sql.Encoder[Map[String,Any]] = class[value[0]: binary]

// A custom (kryo) Encoder for Map[String, Any]

// Primitive types and case classes can also be defined as

// implicit val stringIntMapEncoder: Encoder[Map[String, Any]] = ExpressionEncoder()

scala> teenagersDF.map(teen => teen.getValuesMap[Any](List("name", "age"))).collect

res26: Array[Map[String,Any]] = Array(Map(name -> Justin, age -> 19))

// row.getValuesMap[T] retrieves multiple columns at once into a Map[String, T]

// In other words: several columns are retrieved in one go and returned as a Map[String, T]

scala> teenagersDF.map(teen => teen.getValuesMap[Any](List("name", "age"))).show

+--------------------+

|               value|

+--------------------+

|[2E 01 02 39 01 0...|

+--------------------+
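// With the implicit kryo encoder in scope, the Map is stored as a single binary column
// (class[value[0]: binary] above), so show prints serialized bytes rather than readable key/value pairs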

scala> val path = "file:////opt/bigdata/nfs/spark-2.1.2-bin-hadoop2.7/examples/src/main/resources/people.txt"

scala> val peopleRDD = spark.sparkContext.textFile(path)

peopleRDD: org.apache.spark.rdd.RDD[String] = file:////opt/bigdata/nfs/spark-2.1.2-bin-hadoop2.7/examples/src/main/resources/people.txt MapPartitionsRDD[70] at textFile at <console>:25

 

scala> peopleRDD.show

<console>:34: error: value show is not a member of org.apache.spark.rdd.RDD[String]

       peopleRDD.show

                 ^

scala> peopleRDD.collect

res29: Array[String] = Array(Michael, 29, Andy, 30, Justin, 19)

 

scala> val schemaString = "name age"

schemaString: String = name age

 

scala> import org.apache.spark.sql.types._

import org.apache.spark.sql.types._

// In spark-shell this import has to be done manually

scala>  val fields = schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, nullable = true))

fields: Array[org.apache.spark.sql.types.StructField] = Array(StructField(name,StringType,true), StructField(age,StringType,true))

 

scala> fields

res31: Array[org.apache.spark.sql.types.StructField] = Array(StructField(name,StringType,true), StructField(age,StringType,true))

// The array holds StructFields; a StructField can be thought of as a column

scala> val schema = StructType(fields)

schema: org.apache.spark.sql.types.StructType = StructType(StructField(name,StringType,true), StructField(age,StringType,true))

// A StructType can be thought of as the schema (the table structure); a StructType is made up of StructFields

scala> import org.apache.spark.sql.Row

import org.apache.spark.sql.Row

 

scala> val rowRDD = peopleRDD.map(_.split(",")).map(att => Row(att(0), att(1).trim))

rowRDD: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[72] at map at <console>:37

// The RDD's elements are Rows

scala> val peopleDF = spark.createDataFrame(rowRDD, schema)

peopleDF: org.apache.spark.sql.DataFrame = [name: string, age: string]
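// A sketch of an alternative (an assumption, not part of the original run): the programmatic
// schema could declare age as an integer, so the resulting DataFrame is typed instead of all-string.
import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType}
import org.apache.spark.sql.Row

val typedSchema = StructType(Seq(
  StructField("name", StringType, nullable = true),
  StructField("age", IntegerType, nullable = true)))
val typedRowRDD   = peopleRDD.map(_.split(",")).map(att => Row(att(0), att(1).trim.toInt))
val typedPeopleDF = spark.createDataFrame(typedRowRDD, typedSchema)   // [name: string, age: int]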

// Four ways to create a DataFrame seen so far:

// 1) sparkSession.read.json(path)

// 2) sparkSession.sql("[sql]")

// 3) rdd.toDF

// 4) sparkSession.createDataFrame(rowRDD, schema)

 

// Two ways (so far) to create a Dataset; a compact recap sketch of both lists follows below:

// 1) Seq(...).toDS, where the Seq can hold primitive values or case class instances

// --Seq(Person("Andy",32)).toDS

// --Seq(1,2,3).toDS

// 2) spark.read.json(path).as[Person]
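// A compact recap sketch of the creation paths listed above (jsonPath is an assumed name;
// the other values reuse definitions from earlier in this session):
val jsonPath = "file:///opt/bigdata/nfs/spark-2.1.2-bin-hadoop2.7/examples/src/main/resources/people.json"

val df1 = spark.read.json(jsonPath)                                               // 1) read a data source
val df2 = spark.sql("select * from people")                                       // 2) SQL over a registered view
val df3 = peopleRDD.map(_.split(",")).map(a => Person(a(0), a(1).trim.toInt)).toDF // 3) RDD + toDF via a case class
val df4 = spark.createDataFrame(rowRDD, schema)                                   // 4) Row RDD + explicit schema

val ds1 = Seq(Person("Andy", 32)).toDS                                            // Dataset from a Seq
val ds2 = spark.read.json(jsonPath).as[Person]                                    // Dataset via as[T]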

 

scala> peopleDF.createOrReplaceTempView("people4")

 

scala> val results = spark.sql("select * from people4")

results: org.apache.spark.sql.DataFrame = [name: string, age: string]

scala> results.show

+-------+---+

|   name|age|

+-------+---+

|Michael| 29|

|   Andy| 30|

| Justin| 19|

+-------+---+

 

scala> results.map(atts => "Name -> "+atts(0) + ", age -> " + atts(1)).show

+--------------------+

|               value|

+--------------------+

|Name -> Michael, ...|

|Name -> Andy, age...|

|Name -> Justin, a...|

+--------------------+

 

scala> results.map(atts => "Name->"+atts(0) + ",age->" + atts(1)).show

+--------------------+

|               value|

+--------------------+

|Name->Michael,age...|

|  Name->Andy,age->30|

|Name->Justin,age->19|

+--------------------+

 
