PySpark 之 RDD、DataFrame、SQLContext 的转换与操作

# Load the raw user file into an RDD of text lines.
# `sc` is expected to already be in scope (it is not defined in this snippet).
Path = "file:/home/swt/pythonwork/PythonProject/"
RawUserRDD = sc.textFile(Path + "data/u.user")
RawUserRDD.count()
# Peek at the first five raw lines.
RawUserRDD.take(5)
['1|24|M|technician|85711',
 '2|53|F|other|94043',
 '3|23|M|writer|32067',
 '4|24|M|technician|43537',
 '5|33|F|other|15213']
# Split every line on the '|' delimiter, giving one list of string fields per record.
userRDD = RawUserRDD.map(lambda line: line.split("|"))
userRDD.take(5)
[['1', '24', 'M', 'technician', '85711'],
 ['2', '53', 'F', 'other', '94043'],
 ['3', '23', 'M', 'writer', '32067'],
 ['4', '24', 'M', 'technician', '43537'],
 ['5', '33', 'F', 'other', '15213']]
# Convert each parsed record (a list of strings) into a pyspark.sql.Row.
# SparkSession is imported explicitly here: the original snippet used it
# without importing it, which only works when the name is already in scope.
from pyspark.sql import Row, SparkSession

# NOTE: the variable is called sqlContext, but it holds a SparkSession.
# The name is kept because the later snippets call sqlContext.sql(...).
sqlContext = SparkSession.builder.getOrCreate()

# userid and age are cast to int; gender, occupation and zipcode stay strings.
user_roes = userRDD.map(
    lambda x: Row(
        userid=int(x[0]),
        age=int(x[1]),
        gender=x[2],
        occupation=x[3],
        zipcode=x[4],
    )
)
user_roes.take(5)
[Row(age=24, gender='M', occupation='technician', userid=1, zipcode='85711'),
 Row(age=53, gender='F', occupation='other', userid=2, zipcode='94043'),
 Row(age=23, gender='M', occupation='writer', userid=3, zipcode='32067'),
 Row(age=24, gender='M', occupation='technician', userid=4, zipcode='43537'),
 Row(age=33, gender='F', occupation='other', userid=5, zipcode='15213')]
# Build a DataFrame from the RDD of Row objects, then inspect its
# schema and its first five rows.
user_df = sqlContext.createDataFrame(user_roes)
user_df.printSchema()
user_df.show(n=5)
root
 |-- age: long (nullable = true)
 |-- gender: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- userid: long (nullable = true)
 |-- zipcode: string (nullable = true)

+---+------+----------+------+-------+
|age|gender|occupation|userid|zipcode|
+---+------+----------+------+-------+
| 24|     M|technician|     1|  85711|
| 53|     F|     other|     2|  94043|
| 23|     M|    writer|     3|  32067|
| 24|     M|technician|     4|  43537|
| 33|     F|     other|     5|  15213|
+---+------+----------+------+-------+
only showing top 5 rows
# Give the DataFrame an alias; showing it prints the same rows as before.
df = user_df.alias("df")
df.show(n=5)
# Register the DataFrame as a temporary view named "user_table".
df.createOrReplaceTempView("user_table")

创建临时视图之后，就可以用类似 SQL 的方式对数据进行查询了。
RDD查询数据

# RDD "select": keep fields 3, 2 and 1 of every record, in that order.
userRDDnew = userRDD.map(lambda rec: (rec[3], rec[2], rec[1]))
userRDDnew.take(5)

# Computed field: append 2016 minus int(field 1) as an extra element.
userRDDnew1 = userRDD.map(
    lambda rec: (rec[0], rec[3], rec[2], rec[1], 2016 - int(rec[1]))
)
userRDDnew1.take(5)

# Filter: fields are still strings here, so field 1 is compared with "24".
userRDD.filter(
    lambda rec: rec[1] == "24" and rec[2] == "M" and rec[3] == "technician"
).take(5)

# Sort ascending by int(field 1).
userRDD.takeOrdered(5, key=lambda rec: int(rec[1]))
# Sort descending by negating the key.
userRDD.takeOrdered(5, key=lambda rec: -int(rec[1]))

# Sort on several fields: field 1 descending, then field 2 ascending.
userRDD.takeOrdered(5, key=lambda rec: (-int(rec[1]), rec[2]))

# Distinct values of field 2.
userRDD.map(lambda rec: rec[2]).distinct().collect()

# Group and count: emit (field 2, 1) pairs and sum the ones per key.
userRDD.map(lambda rec: (rec[2], 1)).reduceByKey(lambda a, b: a + b).collect()

DataFrame 查询数据

# Select columns by name, or through attribute access on the DataFrame.
df.select("age", "gender").show(5)
df.select(df.userid, df.age, df.gender).show(5)
# Select columns with [] indexing.
df[df["userid"], df["age"], df["gender"]].show(5)

# Computed column: 2016 minus age.
# NOTE: the alias spelling is kept unchanged so it matches the captured output.
df.select(
    "userid",
    "occupation",
    "gender",
    "age",
    (2016 - df.age).alias("bithyaer"),
).show(5)

# Filtering: SQL-expression strings, attribute-style columns, and []-style columns.
df.filter("occupation='technician'").filter("gender='M'").filter("age=24").show(5)
df.filter(
    (df.occupation == "technician") & (df.age == 24) & (df.gender == "M")
).show(5)
df.filter(
    (df["occupation"] == "technician") & (df["age"] == 24) & (df["gender"] == "M")
).show(5)

# Sort ascending by age.
df.select("userid", "age", "occupation").orderBy("age").show(5)
# Sort descending by age.
df.select("userid", "age", "occupation").orderBy("age", ascending=False).show(5)

# Sort on several columns: age descending, gender ascending.
df.orderBy(["age", "gender"], ascending=[False, True]).show(5)

# Distinct values of a column.
df.select("gender").distinct().show(5)

# Group and count rows per gender.
df.select("gender").groupBy("gender").count().show()
+---+------+
|age|gender|
+---+------+
| 24|     M|
| 53|     F|
| 23|     M|
| 24|     M|
| 33|     F|
+---+------+
only showing top 5 rows

+------+---+------+
|userid|age|gender|
+------+---+------+
|     1| 24|     M|
|     2| 53|     F|
|     3| 23|     M|
|     4| 24|     M|
|     5| 33|     F|
+------+---+------+
only showing top 5 rows

+------+---+------+
|userid|age|gender|
+------+---+------+
|     1| 24|     M|
|     2| 53|     F|
|     3| 23|     M|
|     4| 24|     M|
|     5| 33|     F|
+------+---+------+
only showing top 5 rows

+------+----------+------+---+--------+
|userid|occupation|gender|age|bithyaer|
+------+----------+------+---+--------+
|     1|technician|     M| 24|    1992|
|     2|     other|     F| 53|    1963|
|     3|    writer|     M| 23|    1993|
|     4|technician|     M| 24|    1992|
|     5|     other|     F| 33|    1983|
+------+----------+------+---+--------+
only showing top 5 rows

+---+------+----------+------+-------+
|age|gender|occupation|userid|zipcode|
+---+------+----------+------+-------+
| 24|     M|technician|     1|  85711|
| 24|     M|technician|     4|  43537|
| 24|     M|technician|   456|  31820|
| 24|     M|technician|   717|  84105|
| 24|     M|technician|   832|  77042|
+---+------+----------+------+-------+
only showing top 5 rows

+---+------+----------+------+-------+
|age|gender|occupation|userid|zipcode|
+---+------+----------+------+-------+
| 24|     M|technician|     1|  85711|
| 24|     M|technician|     4|  43537|
| 24|     M|technician|   456|  31820|
| 24|     M|technician|   717|  84105|
| 24|     M|technician|   832|  77042|
+---+------+----------+------+-------+
only showing top 5 rows

+---+------+----------+------+-------+
|age|gender|occupation|userid|zipcode|
+---+------+----------+------+-------+
| 24|     M|technician|     1|  85711|
| 24|     M|technician|     4|  43537|
| 24|     M|technician|   456|  31820|
| 24|     M|technician|   717|  84105|
| 24|     M|technician|   832|  77042|
+---+------+----------+------+-------+
only showing top 5 rows

+------+---+----------+
|userid|age|occupation|
+------+---+----------+
|    30|  7|   student|
|   471| 10|   student|
|   289| 11|      none|
|   880| 13|   student|
|   628| 13|      none|
+------+---+----------+
only showing top 5 rows

+------+---+-------------+
|userid|age|   occupation|
+------+---+-------------+
|   481| 73|      retired|
|   860| 70|      retired|
|   767| 70|     engineer|
|   803| 70|administrator|
|   559| 69|    executive|
+------+---+-------------+
only showing top 5 rows

+---+------+-------------+------+-------+
|age|gender|   occupation|userid|zipcode|
+---+------+-------------+------+-------+
| 73|     M|      retired|   481|  37771|
| 70|     F|      retired|   860|  48322|
| 70|     M|administrator|   803|  78212|
| 70|     M|     engineer|   767|  00000|
| 69|     M|    executive|   559|  10022|
+---+------+-------------+------+-------+
only showing top 5 rows

+------+
|gender|
+------+
|     F|
|     M|
+------+

+------+-----+
|gender|count|
+------+-----+
|     F|  273|
|     M|  670|
+------+-----+

sqlContext 查询数据（利用 SQL 语句）

# Spark SQL: select every column (the same query in upper- and lower-case keywords).
sqlContext.sql(
    "SELECT * FROM user_table"
).show(5)
sqlContext.sql(
    "select * from user_table"
).show(5)

# LIMIT query.
sqlContext.sql(
    "SELECT * FROM user_table LIMIT 5"
).show()

# Select specific columns.
sqlContext.sql(
    "SELECT userid,age,gender FROM user_table"
).show(5)

# Computed column.
sqlContext.sql(
    "SELECT userid,occupation,gender,age,2016-age birthyear FROM user_table"
).show(5)

# WHERE clause.
sqlContext.sql(
    "select age,gender,occupation, userid from user_table where age=24 and gender='M'"
).show(5)

# ORDER BY, ascending.
sqlContext.sql(
    "select userid,age,occupation,gender from user_table order by age"
).show(5)
# ORDER BY, descending.
sqlContext.sql(
    "select userid,age,occupation,gender from user_table order by age desc"
).show(5)

# ORDER BY on several columns.
sqlContext.sql(
    "select userid,age,occupation,gender from user_table order by age desc,gender"
).show(5)

# DISTINCT values.
sqlContext.sql(
    "select distinct gender from user_table"
).show(5)

# GROUP BY with counts, on one column and then on two.
sqlContext.sql(
    "select gender,count(*) counts from user_table group by gender"
).show(5)
sqlContext.sql(
    "select gender,occupation,count(*) counts from user_table group by gender,occupation"
).show(5)

# Self-join of user_table on userid, restricted to rows where u1.age is 24.
sqlContext.sql(
    "select u1.*,u2.userid from user_table u1 join user_table u2 on u1.userid=u2.userid where u1.age=24"
).show(5)
+---+------+----------+------+-------+
|age|gender|occupation|userid|zipcode|
+---+------+----------+------+-------+
| 24|     M|technician|     1|  85711|
| 53|     F|     other|     2|  94043|
| 23|     M|    writer|     3|  32067|
| 24|     M|technician|     4|  43537|
| 33|     F|     other|     5|  15213|
+---+------+----------+------+-------+
only showing top 5 rows

+---+------+----------+------+-------+
|age|gender|occupation|userid|zipcode|
+---+------+----------+------+-------+
| 24|     M|technician|     1|  85711|
| 53|     F|     other|     2|  94043|
| 23|     M|    writer|     3|  32067|
| 24|     M|technician|     4|  43537|
| 33|     F|     other|     5|  15213|
+---+------+----------+------+-------+
only showing top 5 rows

+---+------+----------+------+-------+
|age|gender|occupation|userid|zipcode|
+---+------+----------+------+-------+
| 24|     M|technician|     1|  85711|
| 53|     F|     other|     2|  94043|
| 23|     M|    writer|     3|  32067|
| 24|     M|technician|     4|  43537|
| 33|     F|     other|     5|  15213|
+---+------+----------+------+-------+

+------+---+------+
|userid|age|gender|
+------+---+------+
|     1| 24|     M|
|     2| 53|     F|
|     3| 23|     M|
|     4| 24|     M|
|     5| 33|     F|
+------+---+------+
only showing top 5 rows

+------+----------+------+---+---------+
|userid|occupation|gender|age|birthyear|
+------+----------+------+---+---------+
|     1|technician|     M| 24|     1992|
|     2|     other|     F| 53|     1963|
|     3|    writer|     M| 23|     1993|
|     4|technician|     M| 24|     1992|
|     5|     other|     F| 33|     1983|
+------+----------+------+---+---------+
only showing top 5 rows

+---+------+----------+------+
|age|gender|occupation|userid|
+---+------+----------+------+
| 24|     M|technician|     1|
| 24|     M|technician|     4|
| 24|     M|    artist|    31|
| 24|     M|  engineer|    69|
| 24|     M|   student|    73|
+---+------+----------+------+
only showing top 5 rows

+------+---+----------+------+
|userid|age|occupation|gender|
+------+---+----------+------+
|    30|  7|   student|     M|
|   471| 10|   student|     M|
|   289| 11|      none|     M|
|   880| 13|   student|     M|
|   628| 13|      none|     M|
+------+---+----------+------+
only showing top 5 rows

+------+---+-------------+------+
|userid|age|   occupation|gender|
+------+---+-------------+------+
|   481| 73|      retired|     M|
|   860| 70|      retired|     F|
|   767| 70|     engineer|     M|
|   803| 70|administrator|     M|
|   559| 69|    executive|     M|
+------+---+-------------+------+
only showing top 5 rows

+------+---+-------------+------+
|userid|age|   occupation|gender|
+------+---+-------------+------+
|   481| 73|      retired|     M|
|   860| 70|      retired|     F|
|   803| 70|administrator|     M|
|   767| 70|     engineer|     M|
|   559| 69|    executive|     M|
+------+---+-------------+------+
only showing top 5 rows

+------+
|gender|
+------+
|     F|
|     M|
+------+

+------+------+
|gender|counts|
+------+------+
|     F|   273|
|     M|   670|
+------+------+

+------+-------------+------+
|gender|   occupation|counts|
+------+-------------+------+
|     M|    executive|    29|
|     M|     educator|    69|
|     F|         none|     4|
|     F|entertainment|     2|
|     F|      retired|     1|
+------+-------------+------+
only showing top 5 rows

+---+------+----------+------+-------+------+
|age|gender|occupation|userid|zipcode|userid|
+---+------+----------+------+-------+------+
| 24|     M|    writer|   293|  60804|   293|
| 24|     F|   student|   348|  45660|   348|
| 24|     M|programmer|   414|  38115|   414|
| 24|     M|technician|   889|  78704|   889|
| 24|     M|     other|   936|  32789|   936|
+---+------+----------+------+-------+------+
only showing top 5 rows
# Run the SQL query, convert the result with toPandas(), and use the
# userid column as the index.
sqlContext_df = (
    sqlContext.sql("select userid,age,occupation,gender from user_table order by age desc,gender")
    .toPandas()
    .set_index("userid")
)
sqlContext_df
 	age 	occupation 	gender
userid 			
481 	73 	retired 	M
860 	70 	retired 	F
767 	70 	engineer 	M
803 	70 	administrator 	M
559 	69 	executive 	M
585 	69 	librarian 	M
349 	68 	retired 	M
573 	68 	retired 	M
211 	66 	salesman 	M
318 	65 	retired 	M
564 	65 	retired 	M
651 	65 	retired 	M
423 	64 	other 	M
845 	64 	doctor 	M
364 	63 	engineer 	M
777 	63 	programmer 	M
858 	63 	educator 	M
266 	62 	administrator 	F

你可能感兴趣的:(虚拟机+大数据)