$ bin/spark-shell --master local[4]
scala> spark.
baseRelationToDataFrame conf emptyDataFrame implicits range sparkContext stop time
catalog createDataFrame emptyDataset listenerManager read sql streams udf
close createDataset experimental newSession readStream sqlContext table version
scala> spark.conf
18/03/19 15:22:48 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
res0: org.apache.spark.sql.RuntimeConfig = org.apache.spark.sql.RuntimeConfig@4138af7
// global_temp can be used across sessions
scala> spark.read.json("examples/src/main/resources/people.json")
org.apache.spark.sql.AnalysisException:
Path does not exist: hdfs://xxxxxxx1:8020/user/YYYYYYYYY/examples/src/main/resources/people.json;
// The system searches for examples/ under the HDFS directory that holds .sparkStaging (the user's HDFS home), not the local filesystem
scala> spark.read.json("file:///examples/src/main/resources/people.json")
org.apache.spark.sql.AnalysisException: Path does not exist: file:/examples/src/main/resources/people.json;
......
// Not under the $SPARK_HOME directory either
scala> spark.read.json("file:///$SPARK_HOME/examples/src/main/resources/people.json")
org.apache.spark.sql.AnalysisException: Path does not exist: file:/$SPARK_HOME/examples/src/main/resources/people.json;
......
// $SPARK_HOME is not recognized; after all, this is not the same shell... a silly attempt!
scala> spark.read.json("file:////opt/bigdata/nfs/spark-2.1.2-bin-hadoop2.7/examples/src/main/resources/people.json")
res4: org.apache.spark.sql.DataFrame = [age: bigint, name: string]
// Finally right; json returns a DataFrame
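// A minimal sketch of a less brittle way to build that path: read SPARK_HOME
// from the JVM's environment instead of expecting shell expansion (assumes
// SPARK_HOME was exported before launching spark-shell; not run above):
val sparkHome = sys.env("SPARK_HOME")   // throws if the variable is not set
val people = spark.read.json(s"file://$sparkHome/examples/src/main/resources/people.json")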
scala> res4.show
+----+-------+
| age| name|
+----+-------+
|null|Michael|
| 30| Andy|
| 19| Justin|
+----+-------+
scala> val df =res4
df: org.apache.spark.sql.DataFrame = [age: bigint, name: string]
scala> df.printSchema
root
|-- age: long (nullable = true)
|-- name: string (nullable = true)
scala> df.select("name").show
+-------+
| name|
+-------+
|Michael|
| Andy|
| Justin|
+-------+
// DF.select("[column_name]")
// (results is the people4 DataFrame built from people.txt near the end of this
// session; its output is pasted here to show selecting multiple columns)
scala> results.select("name", "age").show
+-------+---+
| name|age|
+-------+---+
|Michael| 29|
| Andy| 30|
| Justin| 19|
+-------+---+
// Multiple columns can be selected at once
scala> df.select($"name", $"age" + 100).show
+-------+-----------+
| name|(age + 100)|
+-------+-----------+
|Michael| null|
| Andy| 130|
| Justin| 119|
+-------+-----------+
// DF.select($"[column_name]")
// select is overloaded; column arithmetic needs the $ (Column) syntax, and String and Column arguments cannot be mixed
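// A minimal sketch of the two overloads side by side ($ comes from
// spark.implicits._, which spark-shell imports automatically):
df.select("name", "age")           // select(col: String, cols: String*)
df.select($"name", $"age" + 100)   // select(cols: Column*), where $"x" builds a Column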
scala> results.select("name", "age"+100).show
org.apache.spark.sql.AnalysisException: cannot resolve '`age100`' given input columns: [name, age];;
'Project [name#172, 'age100]
+- Project [name#172, age#173]
+- SubqueryAlias people4
+- LogicalRDD [name#172, age#173]
......
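// "age"+100 is plain String concatenation, so Spark looks for a column literally named age100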
scala> results.select("name", $"age"+100).show
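// This variant does not even compile: neither select overload accepts a mix of String and Column arguments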
scala> df.filter($"age" > 21).show
+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+
scala> df.groupBy("age").count.show
+----+-----+
| age|count|
+----+-----+
| 19| 1|
|null| 1|
| 30| 1|
+----+-----+
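// A minimal sketch going one step further: agg can apply several aggregate
// functions per group (needs the functions import; not run in this session):
import org.apache.spark.sql.functions.{count, avg}
df.groupBy("age").agg(count("name"), avg("age")).show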
scala> df.
agg count except inputFiles orderBy sample take where
alias createGlobalTempView explain intersect persist schema takeAsList withColumn
apply createOrReplaceTempView explode isLocal printSchema select toDF withColumnRenamed
as createTempView filter isStreaming queryExecution selectExpr toJSON withWatermark
cache crossJoin first javaRDD randomSplit show toJavaRDD write
checkpoint cube flatMap join randomSplitAsList sort toLocalIterator writeStream
coalesce describe foreach joinWith rdd sortWithinPartitions toString
col distinct foreachPartition limit reduce sparkSession transform
collect drop groupBy map registerTempTable sqlContext union
collectAsList dropDuplicates groupByKey mapPartitions repartition stat unionAll
columns dtypes head na rollup storageLevel unpersist
// The various operations a DataFrame supports
scala> df.create
createGlobalTempView createOrReplaceTempView createTempView
scala> df.createOrReplaceTempView("people")
scala> val sqlDF=spark.sql("select * from people")
sqlDF: org.apache.spark.sql.DataFrame = [age: bigint, name: string]
// It must first become a view before SQL can be run against it
scala> sqlDF.show
+----+-------+
| age| name|
+----+-------+
|null|Michael|
| 30| Andy|
| 19| Justin|
+----+-------+
scala> df.createGlobalTempView("people2")
scala> spark.sql("select * from global_temp.people2").show
+----+-------+
| age| name|
+----+-------+
|null|Michael|
| 30| Andy|
| 19| Justin|
+----+-------+
scala> spark.newSession().sql("select * from global_temp.people2").show
+----+-------+
| age| name|
+----+-------+
|null|Michael|
| 30| Andy|
| 19| Justin|
+----+-------+
// A global view works across sessions
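// A minimal recap sketch of the scoping rules demonstrated above:
df.createOrReplaceTempView("people")    // session-scoped, dropped with this SparkSession
df.createGlobalTempView("people2")      // application-scoped, shared across sessions
spark.newSession().sql("select * from global_temp.people2")   // must qualify with global_temp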
scala> case class Person(name:String, age:Long)
defined class Person
scala> val caseClassDS=Seq(Person("Andy",32)).toDS
caseClassDS: org.apache.spark.sql.Dataset[Person] = [name: string, age: bigint]
scala> caseClassDS.show
+----+---+
|name|age|
+----+---+
|Andy| 32|
+----+---+
scala> val primitiveDS=Seq(1,2,3).toDS
primitiveDS: org.apache.spark.sql.Dataset[Int] = [value: int]
// Primitive types need no explicit type annotation
scala> primitiveDS.map(_*2).show
+-----+
|value|
+-----+
| 2|
| 4|
| 6|
+-----+
scala> primitiveDS.map(_*2).collect
res19: Array[Int] = Array(2, 4, 6)
scala> val path="file:/opt/bigdata/nfs/spark-2.1.2-bin-hadoop2.7/examples/src/main/resources/people.json"
path: String = file:/opt/bigdata/nfs/spark-2.1.2-bin-hadoop2.7/examples/src/main/resources/people.json
scala> val peopleDS=spark.read.json(path).as[Person]
peopleDS: org.apache.spark.sql.Dataset[Person] = [age: bigint, name: string]
scala> peopleDS.show
+----+-------+
| age| name|
+----+-------+
|null|Michael|
| 30| Andy|
| 19| Justin|
+----+-------+
scala> peopleDS.collect
java.lang.RuntimeException: Error while decoding: java.lang.NullPointerException: Null value appeared in non-nullable field:
- field (class: "scala.Long", name: "age")
- root class: "Person"
If the schema is inferred from a Scala tuple/case class, or a Java bean, please try to use scala.Option[_] or other nullable types (e.g. java.lang.Integer instead of int/scala.Int).
newInstance(class Person)
:- input[1, string, true].toString
: +- input[1, string, true]
+- assertnotnull(input[0, bigint, true])
+- input[0, bigint, true]
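// A minimal sketch of the fix the exception itself suggests: declare the
// nullable field as an Option so a JSON null decodes to None (PersonOpt is a
// made-up name):
case class PersonOpt(name: String, age: Option[Long])
spark.read.json(path).as[PersonOpt].collect   // Michael's null age becomes None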
scala> val path = "file:////opt/bigdata/nfs/spark-2.1.2-bin-hadoop2.7/examples/src/main/resources/people.txt"
scala> val peopleDF2 = spark.sparkContext.textFile(path).map(_.split(",")).map(atts => Person(atts(0), atts(1).trim.toInt)).toDF
peopleDF2: org.apache.spark.sql.DataFrame = [name: string, age: bigint]
scala> peopleDF2.createOrReplaceTempView("people3")
scala> val teenagersDF=spark.sql("select name, age from people3 where age between 13 and 19")
teenagersDF: org.apache.spark.sql.DataFrame = [name: string, age: bigint]
scala> teenagersDF.show
+------+---+
| name|age|
+------+---+
|Justin| 19|
+------+---+
scala> teenagersDF.map(teenager => "Name ->" + teenager(0)).show
+-------------+
| value|
+-------------+
|Name ->Justin|
+-------------+
scala> teenagersDF.map(teen => "Name -> " + teen.getAs[String]("name")).show
+--------------+
| value|
+--------------+
|Name -> Justin|
+--------------+
scala> implicit val mapEncoder = org.apache.spark.sql.Encoders.kryo[Map[String, Any]]
mapEncoder: org.apache.spark.sql.Encoder[Map[String,Any]] = class[value[0]: binary]
// A custom Encoder
// Primitive types and case classes can be also defined as
// implicit val stringIntMapEncoder: Encoder[Map[String, Any]] = ExpressionEncoder()
scala> teenagersDF.map(teen => teen.getValuesMap[Any](List("name", "age"))).collect
res26: Array[Map[String,Any]] = Array(Map(name -> Justin, age -> 19))
// row.getValuesMap[T] retrieves multiple columns at once into a Map[String, T]
// In other words: several columns are fetched in one operation, collected into a Map[String, T]
scala> teenagersDF.map(teen => teen.getValuesMap[Any](List("name", "age"))).show
+--------------------+
| value|
+--------------------+
|[2E 01 02 39 01 0...|
+--------------------+
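// The kryo encoder serializes the whole Map to binary (the encoder's schema above says value: binary), hence the raw bytes in show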
scala> val path = "file:////opt/bigdata/nfs/spark-2.1.2-bin-hadoop2.7/examples/src/main/resources/people.txt"
scala> val peopleRDD = spark.sparkContext.textFile(path)
peopleRDD: org.apache.spark.rdd.RDD[String] = file:////opt/bigdata/nfs/spark-2.1.2-bin-hadoop2.7/examples/src/main/resources/people.txt MapPartitionsRDD[70] at textFile at
scala> peopleRDD.show
<console>: error: value show is not a member of org.apache.spark.rdd.RDD[String]
       peopleRDD.show
                 ^
// show belongs to DataFrame/Dataset; a plain RDD has no such method
scala> peopleRDD.collect
res29: Array[String] = Array(Michael, 29, Andy, 30, Justin, 19)
scala> val schemaString = "name age"
schemaString: String = name age
scala> import org.apache.spark.sql.types._
import org.apache.spark.sql.types._
// In spark-shell this import has to be done by hand
scala> val fields = schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, nullable = true))
fields: Array[org.apache.spark.sql.types.StructField] = Array(StructField(name,StringType,true), StructField(age,StringType,true))
scala> fields
res31: Array[org.apache.spark.sql.types.StructField] = Array(StructField(name,StringType,true), StructField(age,StringType,true))
// The array contains StructFields; a StructField can be understood as a column
scala> val schema = StructType(fields)
schema: org.apache.spark.sql.types.StructType = StructType(StructField(name,StringType,true), StructField(age,StringType,true))
// A StructType can be understood as the schema (the table structure); a StructType contains StructFields
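// A sketch of the same schema with a numeric age column instead of
// all-StringType (an assumption on my part; the Rows built below would then
// need att(1).trim.toInt):
val typedSchema = StructType(Seq(
  StructField("name", StringType, nullable = true),
  StructField("age", IntegerType, nullable = true)))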
scala> import org.apache.spark.sql.Row
import org.apache.spark.sql.Row
scala> val rowRDD = peopleRDD.map(_.split(",")).map(att => Row(att(0), att(1).trim))
rowRDD: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[72] at map at
// The RDD contains Rows
scala> val peopleDF = spark.createDataFrame(rowRDD, schema)
peopleDF: org.apache.spark.sql.DataFrame = [name: string, age: string]
// Four ways to create a DataFrame seen so far:
// 1) sparkSession.read.json(path)
// 2) sparkSession.sql("[sql]")
// 3) rdd.toDF
// 4) sparkSession.createDataFrame(rowRDD, schema)
// Two ways to create a Dataset (so far; a third is sketched below):
// 1) Seq(...).toDS, where the Seq may hold primitive types or case classes
//    -- Seq(Person("Andy",32)).toDS
//    -- Seq(1,2,3).toDS
// 2) spark.read.json(path).as[Person]
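// For completeness, the completion list at the top of this session also shows
// spark.createDataset, which builds the same Dataset without the toDS sugar:
val caseClassDS2 = spark.createDataset(Seq(Person("Andy", 32)))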
scala> peopleDF.createOrReplaceTempView("people4")
scala> val results = spark.sql("select * from people4")
results: org.apache.spark.sql.DataFrame = [name: string, age: string]
scala> results.show
+-------+---+
| name|age|
+-------+---+
|Michael| 29|
| Andy| 30|
| Justin| 19|
+-------+---+
scala> results.map(atts => "Name -> "+atts(0) + ", age -> " + atts(1)).show
+--------------------+
| value|
+--------------------+
|Name -> Michael, ...|
|Name -> Andy, age...|
|Name -> Justin, a...|
+--------------------+
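// show truncates each cell to 20 characters by default, hence the "..." above;
// passing truncate = false prints the full values (a sketch, not run here):
results.map(atts => "Name -> " + atts(0) + ", age -> " + atts(1)).show(false)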
scala> results.map(atts => "Name->"+atts(0) + ",age->" + atts(1)).show
+--------------------+
| value|
+--------------------+
|Name->Michael,age...|
| Name->Andy,age->30|
|Name->Justin,age->19|
+--------------------+