scala> import spark.implicits._
import spark.implicits._
scala> val df = spark.read.json("file:///usr/local/spark/examples/src/main/resources/people.json")
df: org.apache.spark.sql.DataFrame = [age: bigint, name: string]
scala> df.show()
+----+-------+
| age| name|
+----+-------+
|null|Michael|
| 30| Andy|
| 19| Justin|
+----+-------+
Create a DataFrame from the file people.json, then save it as a CSV file and as a plain-text (txt) file:
scala> val peopleDF = spark.read.format("json").load("file:///usr/local/spark/examples/src/main/resources/people.json")
peopleDF: org.apache.spark.sql.DataFrame = [age: bigint, name: string]
scala> peopleDF.select("name","age").write.format("csv").save("file:///usr/local/spark/newpeople.csv")
scala> peopleDF.rdd.saveAsTextFile("file:///usr/local/spark/newpeople.txt")
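Note that save() and saveAsTextFile() each write a directory of part files rather than a single file. A minimal sketch for reading the saved CSV back (the CSV was written without a header row, so a schema is supplied; names such as reloadedDF are only illustrative):
import org.apache.spark.sql.types._

val csvSchema = StructType(Array(
  StructField("name", StringType, true),
  StructField("age", LongType, true)))
// "newpeople.csv" is a directory of part-* files, not a single file
val reloadedDF = spark.read.schema(csvSchema).format("csv").load("file:///usr/local/spark/newpeople.csv")
reloadedDF.show()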
1、printSchema()
The printSchema() operation prints the schema of a DataFrame:
scala> df.printSchema()
root
|-- age: long (nullable = true)
|-- name: string (nullable = true)
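Besides printing it, the schema can also be inspected programmatically; a small sketch:
println(df.schema)   // the StructType behind the printed tree
df.dtypes.foreach { case (col, tpe) => println(s"$col -> $tpe") }   // (column name, type name) pairs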
2、select()
The select() operation selects a subset of the columns of a DataFrame. The example below selects the name and age columns and adds 1 to every age value:
scala> df.select(df("name"),df("age")+1).show()
+-------+---------+
| name|(age + 1)|
+-------+---------+
|Michael| null|
| Andy| 31|
| Justin| 20|
+-------+---------+
select() can also rename a column, for example renaming the name column to username:
scala> df.select(df("name").as("username"),df("age")).show()
+--------+----+
|username| age|
+--------+----+
| Michael|null|
| Andy| 30|
| Justin| 19|
+--------+----+
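The same selections can also be written with selectExpr(), which accepts SQL expressions, or withColumnRenamed(), which renames one column while keeping the others; a short sketch (the column aliases chosen here are only illustrative):
df.selectExpr("name AS username", "age + 1 AS age_plus_1").show()
df.withColumnRenamed("name", "username").show()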
3、filter()
The filter() operation implements conditional queries, returning the records that satisfy a condition. The example below finds all records where age is greater than 20:
scala> df.filter(df("age")>20).show()
+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+
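filter() also accepts a SQL-style condition string, and where() is an alias for filter(); a minimal sketch:
df.filter("age > 20").show()                     // same result as df.filter(df("age") > 20)
df.where("age > 20 AND name = 'Andy'").show()    // combining conditions in one expression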
4、groupBy()
The groupBy() operation groups records. For example, the records can be grouped by the age field and the number of records in each group counted:
scala> df.groupBy("age").count().show()
+----+-----+
| age|count|
+----+-----+
| 30| 1|
| 19| 1|
|null| 1|
+----+-----+
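groupBy() can also be combined with other aggregate functions through agg(); a sketch using the same df (the alias cnt is only illustrative):
import org.apache.spark.sql.functions._
df.groupBy("age").agg(count("name").as("cnt")).show()
df.agg(avg("age"), max("age"), min("age")).show()   // aggregates over the whole DataFrame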
5、sort()
The sort() operation sorts records. For example, df.sort(df("age").desc) sorts by age in descending order, while df.sort(df("age").desc,df("name").asc) sorts by age in descending order and, for records with the same age, by name in ascending order:
scala> df.sort(df("age").desc).show()
+----+-------+
| age| name|
+----+-------+
| 30| Andy|
| 19| Justin|
|null|Michael|
+----+-------+
scala> df.sort(df("age").desc,df("name").asc).show()
+----+-------+
| age| name|
+----+-------+
| 30| Andy|
| 19| Justin|
|null|Michael|
+----+-------+
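As the output shows, null sorts after the non-null values in a descending sort. Since Spark 2.1 the placement of nulls can be controlled explicitly; a hedged sketch:
import org.apache.spark.sql.functions._
df.sort(desc_nulls_first("age")).show()    // null, 30, 19
df.orderBy(asc_nulls_last("age")).show()   // 19, 30, null; orderBy() is an alias for sort()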
1、Inferring the RDD schema with the reflection mechanism
scala> import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
scala> import org.apache.spark.sql.Encoder
import org.apache.spark.sql.Encoder
scala> import spark.implicits._
import spark.implicits._
scala> case class Person(name:String,age:Long)
defined class Person
scala> val peopleDF = spark.sparkContext.
| textFile("file:///usr/local/spark/examples/src/main/resources/people.txt").
| map(_.split(",")).
| map(attributes => Person(attributes(0),attributes(1).trim.toInt)).toDF()
peopleDF: org.apache.spark.sql.DataFrame = [name: string, age: bigint]
scala> peopleDF.createOrReplaceTempView("people")
scala> val personsRDD = spark.sql("select name, age from people where age>20")
personsRDD: org.apache.spark.sql.DataFrame = [name: string, age: bigint]
scala> personsRDD.map(t=>"Name:"+t(0)+ ","+"Age:"+t(1)).show()
+-------------------+
| value|
+-------------------+
|Name:Michael,Age:29|
| Name:Andy,Age:30|
+-------------------+
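With the Person case class and import spark.implicits._ already in scope, the same query can also return a strongly typed Dataset[Person] rather than a DataFrame; a minimal sketch (the name personsDS is only illustrative):
val personsDS = spark.sql("select name, age from people where age > 20").as[Person]
personsDS.map(p => "Name:" + p.name + ",Age:" + p.age).show()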
2、Defining the RDD schema programmatically
Querying people.txt with Spark SQL
scala> import org.apache.spark.sql.types._
import org.apache.spark.sql.types._
scala> import org.apache.spark.sql.Row
import org.apache.spark.sql.Row
//Define the fields
scala> val fields = Array(StructField("name",StringType,true),StructField("age",IntegerType,true))
fields: Array[org.apache.spark.sql.types.StructField] = Array(StructField(name,StringType,true), StructField(age,IntegerType,true))
//The schema describes the structure of the data: it contains the two fields name and age
scala> val schema = StructType(fields)
schema: org.apache.spark.sql.types.StructType = StructType(StructField(name,StringType,true), StructField(age,IntegerType,true))
//Load the file to create an RDD
scala> val peopleRDD = spark.sparkContext.
| textFile("file:///usr/local/spark/examples/src/main/resources/people.txt")
peopleRDD: org.apache.spark.rdd.RDD[String] = file:///usr/local/spark/examples/src/main/resources/people.txt MapPartitionsRDD[12] at textFile at <console>:32
//Parse every line of peopleRDD (the resulting rowRDD holds the "table records")
scala> val rowRDD = peopleRDD.map(_.split(",")).
| map(attributes => Row(attributes(0),attributes(1).trim.toInt))
rowRDD: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[15] at map at <console>:35
//Put the "table header" (the schema) and the "table records" together
scala> val peopleDF = spark.createDataFrame(rowRDD,schema)
peopleDF: org.apache.spark.sql.DataFrame = [name: string, age: int]
//The DataFrame must be registered as a temporary view before it can be queried below
scala> peopleDF.createOrReplaceTempView("people")
scala> val results = spark.sql("SELECT name,age from people")
results: org.apache.spark.sql.DataFrame = [name: string, age: int]
scala> results.
| map(attributes => "name:"+attributes(0)+","+"age:"+attributes(1)).show()
+-------------------+
| value|
+-------------------+
|name:Michael,age:29|
| name:Andy,age:30|
| name:Justin,age:19|
+-------------------+
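Since people.txt is comma separated, the same StructType can also be applied directly through the CSV reader, which avoids the manual Row construction; a sketch assuming the file layout stays the same (peopleDF2 is just an illustrative name):
val peopleDF2 = spark.read.
  schema(schema).                                  // reuse the StructType defined above
  option("ignoreLeadingWhiteSpace", "true").       // people.txt has a space after each comma
  csv("file:///usr/local/spark/examples/src/main/resources/people.txt")
peopleDF2.show()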
1、In the MySQL shell, create the database spark and the table student
mysql> create database spark;
Query OK, 1 row affected (0.01 sec)
mysql> use spark;
Database changed
mysql> create table student(id int(4),name char(20),gender char(4),age int(4));
Query OK, 0 rows affected (0.02 sec)
mysql> insert into student values(1,"zhangsan","F",20);
Query OK, 1 row affected (0.01 sec)
mysql> insert into student values(2,"lisi","M",22);
Query OK, 1 row affected (0.00 sec)
mysql> select * from student;
+------+----------+--------+------+
| id | name | gender | age |
+------+----------+--------+------+
| 1 | zhangsan | F | 20 |
| 2 | lisi | M | 22 |
+------+----------+--------+------+
2 rows in set (0.00 sec)
2、Start spark-shell; the MySQL JDBC driver JAR must be specified:
./bin/spark-shell --jars /usr/local/spark/jars/mysql-connector-java-5.1.27-bin.jar --driver-class-path /usr/local/spark/jars/mysql-connector-java-5.1.27-bin.jar
3、Reading data from the MySQL database
scala> val jdbcDF = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/spark").option("driver","com.mysql.jdbc.Driver").option("dbtable", "student").option("user", "hive").option("password", "123456").load()
jdbcDF: org.apache.spark.sql.DataFrame = [id: int, name: string ... 2 more fields]
scala> jdbcDF.show()
+---+--------+------+---+
| id| name|gender|age|
+---+--------+------+---+
| 1|zhangsan| F| 20|
| 2| lisi| M| 22|
+---+--------+------+---+
scala> jdbcDF.printSchema()
root
|-- id: integer (nullable = true)
|-- name: string (nullable = true)
|-- gender: string (nullable = true)
|-- age: integer (nullable = true)
scala> println(jdbcDF.count())
2
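The same read can be written with spark.read.jdbc() and a java.util.Properties object, mirroring the write calls used below; a sketch with the same connection settings (the names connProps and jdbcDF2 are only illustrative):
import java.util.Properties
val connProps = new Properties()
connProps.put("user", "hive")
connProps.put("password", "123456")
connProps.put("driver", "com.mysql.jdbc.Driver")
val jdbcDF2 = spark.read.jdbc("jdbc:mysql://localhost:3306/spark", "student", connProps)
jdbcDF2.show()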
4、Writing a DataFrame to MySQL
scala> import java.util.Properties
import java.util.Properties
scala> val prop = new Properties()
prop: java.util.Properties = {}
scala> prop.put("user", "hive")
res6: Object = null
scala> prop.put("password", "123456")
res7: Object = null
scala> prop.put("driver","com.mysql.jdbc.Driver")
res8: Object = null
scala> val url = "jdbc:mysql://localhost:3306/spark"
url: String = jdbc:mysql://localhost:3306/spark
scala> jdbcDF.write.jdbc(url,"student1",prop)
scala> jdbcDF.write.jdbc(url,"student2",prop)
Query the student2 table in the MySQL shell:
mysql> select * from student2;
+------+----------+--------+------+
| id | name | gender | age |
+------+----------+--------+------+
| 1 | zhangsan | F | 20 |
| 2 | lisi | M | 22 |
+------+----------+--------+------+
2 rows in set (0.00 sec)
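By default write.jdbc() fails if the target table already exists (SaveMode.ErrorIfExists); the behaviour can be chosen explicitly with mode(); a hedged sketch reusing url and prop from above:
jdbcDF.write.mode("overwrite").jdbc(url, "student2", prop)   // drop and recreate student2
jdbcDF.write.mode("append").jdbc(url, "student2", prop)      // append the same rows again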
5、Writing data into the MySQL database
scala> import java.util.Properties
import java.util.Properties
scala> import org.apache.spark.sql.types._
import org.apache.spark.sql.types._
scala> import org.apache.spark.sql.Row
import org.apache.spark.sql.Row
//The two records below represent two students
scala> val studentRDD = spark.sparkContext.parallelize(Array("3 wangwu M 24","4 liuliu F 27")).map(_.split(" "))
studentRDD: org.apache.spark.rdd.RDD[Array[String]] = MapPartitionsRDD[1] at map at <console>:28
//Define the schema
scala> val schema = StructType(List(StructField("id",IntegerType,true),StructField("name",StringType,true),StructField("gender",StringType,true),StructField("age",IntegerType,true)))
schema: org.apache.spark.sql.types.StructType = StructType(StructField(id,IntegerType,true), StructField(name,StringType,true), StructField(gender,StringType,true), StructField(age,IntegerType,true))
//Create Row objects; each one becomes a row of rowRDD
scala> val rowRDD = studentRDD.map(p=>Row(p(0).toInt,p(1).trim,p(2).trim,p(3).toInt))
rowRDD: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[2] at map at <console>:30
//Associate the Row objects with the schema, i.e. bind the data to its schema
scala> val studentDF = spark.createDataFrame(rowRDD,schema)
studentDF: org.apache.spark.sql.DataFrame = [id: int, name: string ... 2 more fields]
//Create a prop variable to hold the JDBC connection parameters
scala> val prop = new Properties()
prop: java.util.Properties = {}
scala> prop.put("user","hive") //用户名是hive(自己mysql的用户名)
res0: Object = null
scala> prop.put("password","123456")
res1: Object = null
scala> prop.put("driver","com.mysql.jdbc.Driver") //表示驱动程序是com.mysql.jdbc.Driver
res2: Object = null
//Connect to the database in append mode, appending the records to the student table of the spark database
scala> studentDF.write.mode("append").jdbc("jdbc:mysql://localhost:3306/spark","spark.student",prop)
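To confirm the append, the table can be read back in the same spark-shell session with the connection settings defined above; a minimal sketch (the name checkDF is only illustrative):
val checkDF = spark.read.jdbc("jdbc:mysql://localhost:3306/spark", "spark.student", prop)
checkDF.show()   // should now list zhangsan, lisi, wangwu and liuliu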