# Load the raw user file into an RDD of text lines.
# NOTE(review): `sc` is not defined in this snippet; presumably it is the
# SparkContext supplied by the PySpark shell/notebook - confirm.
Path='file:/home/swt/pythonwork/PythonProject/'
RawUserRDD = sc.textFile(Path+'data/u.user')
RawUserRDD.count()
# Inspect the first five raw lines of the RDD
RawUserRDD.take(5)
['1|24|M|technician|85711',
'2|53|F|other|94043',
'3|23|M|writer|32067',
'4|24|M|technician|43537',
'5|33|F|other|15213']
# Split every '|'-delimited line into a list of string fields
userRDD = RawUserRDD.map(lambda line: line.split('|'))
userRDD.take(5)
[['1', '24', 'M', 'technician', '85711'],
['2', '53', 'F', 'other', '94043'],
['3', '23', 'M', 'writer', '32067'],
['4', '24', 'M', 'technician', '43537'],
['5', '33', 'F', 'other', '15213']]
# Convert each parsed record (a list of strings) into a pyspark.sql.Row.
# SparkSession is imported explicitly, and before its first use, so this
# snippet does not depend on the PySpark shell having pre-imported it.
from pyspark.sql import Row, SparkSession

# NOTE: despite its name, `sqlContext` holds a SparkSession; the name is kept
# because later statements in this file reference it.
sqlContext = SparkSession.builder.getOrCreate()
# userid and age are cast to int; gender, occupation and zipcode stay strings.
user_roes = userRDD.map(lambda x:Row(userid=int(x[0]),age=int(x[1]),gender=x[2],occupation=x[3],zipcode=x[4]))
user_roes.take(5)
[Row(age=24, gender='M', occupation='technician', userid=1, zipcode='85711'),
Row(age=53, gender='F', occupation='other', userid=2, zipcode='94043'),
Row(age=23, gender='M', occupation='writer', userid=3, zipcode='32067'),
Row(age=24, gender='M', occupation='technician', userid=4, zipcode='43537'),
Row(age=33, gender='F', occupation='other', userid=5, zipcode='15213')]
# Build a DataFrame from the RDD of Row objects (no explicit schema is
# passed, so the column types are inferred from the Row values)
user_df = sqlContext.createDataFrame(user_roes)
# Print the resulting schema, then the first five rows
user_df.printSchema()
user_df.show(5)
root
|-- age: long (nullable = true)
|-- gender: string (nullable = true)
|-- occupation: string (nullable = true)
|-- userid: long (nullable = true)
|-- zipcode: string (nullable = true)
+---+------+----------+------+-------+
|age|gender|occupation|userid|zipcode|
+---+------+----------+------+-------+
| 24| M|technician| 1| 85711|
| 53| F| other| 2| 94043|
| 23| M| writer| 3| 32067|
| 24| M|technician| 4| 43537|
| 33| F| other| 5| 15213|
+---+------+----------+------+-------+
only showing top 5 rows
# Give the DataFrame an alias; show(5) prints the same rows as above
df = user_df.alias("df")
df.show(5)
# Register the DataFrame as a temporary view so it can be queried with SQL
df.createOrReplaceTempView("user_table")
创建表之后就可以进行类似 SQL 的数据操作了……
RDD查询数据
# RDD "select": keep only occupation, gender and age (in that order)
userRDDnew = userRDD.map(lambda rec: (rec[3], rec[2], rec[1]))
userRDDnew.take(5)
# Computed field: append 2016 - age as a fifth element
userRDDnew1 = userRDD.map(
    lambda rec: (rec[0], rec[3], rec[2], rec[1], 2016 - int(rec[1]))
)
userRDDnew1.take(5)
# Filter: male technicians aged 24 (fields are still strings, hence "24")
userRDD.filter(
    lambda rec: rec[3] == 'technician' and rec[2] == 'M' and rec[1] == "24"
).take(5)
# Sort ascending by age (compared numerically)
userRDD.takeOrdered(5, key=lambda rec: int(rec[1]))
# Sort descending by age
userRDD.takeOrdered(5, key=lambda rec: -int(rec[1]))
# Multi-key sort: age descending, then gender ascending
userRDD.takeOrdered(5, key=lambda rec: (-int(rec[1]), rec[2]))
# Distinct gender values
userRDD.map(lambda rec: rec[2]).distinct().collect()
# Group and count: number of records per gender
userRDD.map(lambda rec: (rec[2], 1)).reduceByKey(lambda left, right: left + right).collect()
DataFrame 查询数据
# DataFrame column selection: by column name ...
df.select("age", "gender").show(5)
# ... by attribute access ...
df.select(df.userid, df.age, df.gender).show(5)
# ... or by indexing the DataFrame with Column objects
df[df['userid'], df['age'], df['gender']].show(5)
# Computed column: 2016 - age.
# NOTE: the alias 'bithyaer' looks like a misspelling of 'birthyear' (the
# spelling used by the SQL query later in this file); it is left unchanged so
# the column header matches the recorded output.
df.select(
    'userid', 'occupation', 'gender', 'age',
    (2016 - df.age).alias('bithyaer'),
).show(5)
# Filtering, written three ways: chained SQL-expression strings ...
df.filter("occupation='technician'").filter("gender='M'").filter("age=24").show(5)
# ... Column attributes combined with & ...
df.filter(
    (df.occupation == 'technician') & (df.age == 24) & (df.gender == 'M')
).show(5)
# ... and Column lookups by name combined with &
df.filter(
    (df['occupation'] == 'technician') & (df['age'] == 24) & (df['gender'] == 'M')
).show(5)
# Sort ascending by age
df.select('userid', 'age', 'occupation').orderBy('age').show(5)
# Sort descending by age
df.select('userid', 'age', 'occupation').orderBy('age', ascending=False).show(5)
# Multi-column sort: age descending, then gender ascending
df.orderBy(['age', 'gender'], ascending=[False, True]).show(5)
# Distinct gender values
df.select('gender').distinct().show(5)
# Group and count: number of rows per gender
df.select('gender').groupBy('gender').count().show()
+---+------+
|age|gender|
+---+------+
| 24| M|
| 53| F|
| 23| M|
| 24| M|
| 33| F|
+---+------+
only showing top 5 rows
+------+---+------+
|userid|age|gender|
+------+---+------+
| 1| 24| M|
| 2| 53| F|
| 3| 23| M|
| 4| 24| M|
| 5| 33| F|
+------+---+------+
only showing top 5 rows
+------+---+------+
|userid|age|gender|
+------+---+------+
| 1| 24| M|
| 2| 53| F|
| 3| 23| M|
| 4| 24| M|
| 5| 33| F|
+------+---+------+
only showing top 5 rows
+------+----------+------+---+--------+
|userid|occupation|gender|age|bithyaer|
+------+----------+------+---+--------+
| 1|technician| M| 24| 1992|
| 2| other| F| 53| 1963|
| 3| writer| M| 23| 1993|
| 4|technician| M| 24| 1992|
| 5| other| F| 33| 1983|
+------+----------+------+---+--------+
only showing top 5 rows
+---+------+----------+------+-------+
|age|gender|occupation|userid|zipcode|
+---+------+----------+------+-------+
| 24| M|technician| 1| 85711|
| 24| M|technician| 4| 43537|
| 24| M|technician| 456| 31820|
| 24| M|technician| 717| 84105|
| 24| M|technician| 832| 77042|
+---+------+----------+------+-------+
only showing top 5 rows
+---+------+----------+------+-------+
|age|gender|occupation|userid|zipcode|
+---+------+----------+------+-------+
| 24| M|technician| 1| 85711|
| 24| M|technician| 4| 43537|
| 24| M|technician| 456| 31820|
| 24| M|technician| 717| 84105|
| 24| M|technician| 832| 77042|
+---+------+----------+------+-------+
only showing top 5 rows
+---+------+----------+------+-------+
|age|gender|occupation|userid|zipcode|
+---+------+----------+------+-------+
| 24| M|technician| 1| 85711|
| 24| M|technician| 4| 43537|
| 24| M|technician| 456| 31820|
| 24| M|technician| 717| 84105|
| 24| M|technician| 832| 77042|
+---+------+----------+------+-------+
only showing top 5 rows
+------+---+----------+
|userid|age|occupation|
+------+---+----------+
| 30| 7| student|
| 471| 10| student|
| 289| 11| none|
| 880| 13| student|
| 628| 13| none|
+------+---+----------+
only showing top 5 rows
+------+---+-------------+
|userid|age| occupation|
+------+---+-------------+
| 481| 73| retired|
| 860| 70| retired|
| 767| 70| engineer|
| 803| 70|administrator|
| 559| 69| executive|
+------+---+-------------+
only showing top 5 rows
+---+------+-------------+------+-------+
|age|gender| occupation|userid|zipcode|
+---+------+-------------+------+-------+
| 73| M| retired| 481| 37771|
| 70| F| retired| 860| 48322|
| 70| M|administrator| 803| 78212|
| 70| M| engineer| 767| 00000|
| 69| M| executive| 559| 10022|
+---+------+-------------+------+-------+
only showing top 5 rows
+------+
|gender|
+------+
| F|
| M|
+------+
+------+-----+
|gender|count|
+------+-----+
| F| 273|
| M| 670|
+------+-----+
sqlContext 查询数据(利用 SQL 语句)
# Spark SQL: select all columns from the temporary view
# (the same query is issued in upper-case and lower-case keywords)
sqlContext.sql("SELECT * FROM user_table").show(5)
sqlContext.sql("select * from user_table").show(5)
# LIMIT query: the row limit is applied in SQL instead of in show()
sqlContext.sql("SELECT * FROM user_table LIMIT 5").show()
# Select specific columns
sqlContext.sql("SELECT userid,age,gender FROM user_table").show(5)
# Computed column: 2016 - age, named birthyear
sqlContext.sql("SELECT userid,occupation,gender,age,2016-age birthyear FROM user_table").show(5)
# WHERE filter: males aged 24
sqlContext.sql("select age,gender,occupation, userid from user_table where age=24 and gender='M'").show(5)
# ORDER BY, ascending by age
sqlContext.sql("select userid,age,occupation,gender from user_table order by age").show(5)
# Descending by age
sqlContext.sql("select userid,age,occupation,gender from user_table order by age desc").show(5)
# Multi-column sort: age descending, then gender
sqlContext.sql("select userid,age,occupation,gender from user_table order by age desc,gender").show(5)
# Distinct gender values
sqlContext.sql("select distinct gender from user_table").show(5)
# GROUP BY with counts: per gender, then per gender and occupation
sqlContext.sql("select gender,count(*) counts from user_table group by gender").show(5)
sqlContext.sql("select gender,occupation,count(*) counts from user_table group by gender,occupation").show(5)
# Self-join of user_table on userid, restricted to rows where u1.age is 24
sqlContext.sql("select u1.*,u2.userid from user_table u1 join user_table u2 on u1.userid=u2.userid where u1.age=24").show(5)
+---+------+----------+------+-------+
|age|gender|occupation|userid|zipcode|
+---+------+----------+------+-------+
| 24| M|technician| 1| 85711|
| 53| F| other| 2| 94043|
| 23| M| writer| 3| 32067|
| 24| M|technician| 4| 43537|
| 33| F| other| 5| 15213|
+---+------+----------+------+-------+
only showing top 5 rows
+---+------+----------+------+-------+
|age|gender|occupation|userid|zipcode|
+---+------+----------+------+-------+
| 24| M|technician| 1| 85711|
| 53| F| other| 2| 94043|
| 23| M| writer| 3| 32067|
| 24| M|technician| 4| 43537|
| 33| F| other| 5| 15213|
+---+------+----------+------+-------+
only showing top 5 rows
+---+------+----------+------+-------+
|age|gender|occupation|userid|zipcode|
+---+------+----------+------+-------+
| 24| M|technician| 1| 85711|
| 53| F| other| 2| 94043|
| 23| M| writer| 3| 32067|
| 24| M|technician| 4| 43537|
| 33| F| other| 5| 15213|
+---+------+----------+------+-------+
+------+---+------+
|userid|age|gender|
+------+---+------+
| 1| 24| M|
| 2| 53| F|
| 3| 23| M|
| 4| 24| M|
| 5| 33| F|
+------+---+------+
only showing top 5 rows
+------+----------+------+---+---------+
|userid|occupation|gender|age|birthyear|
+------+----------+------+---+---------+
| 1|technician| M| 24| 1992|
| 2| other| F| 53| 1963|
| 3| writer| M| 23| 1993|
| 4|technician| M| 24| 1992|
| 5| other| F| 33| 1983|
+------+----------+------+---+---------+
only showing top 5 rows
+---+------+----------+------+
|age|gender|occupation|userid|
+---+------+----------+------+
| 24| M|technician| 1|
| 24| M|technician| 4|
| 24| M| artist| 31|
| 24| M| engineer| 69|
| 24| M| student| 73|
+---+------+----------+------+
only showing top 5 rows
+------+---+----------+------+
|userid|age|occupation|gender|
+------+---+----------+------+
| 30| 7| student| M|
| 471| 10| student| M|
| 289| 11| none| M|
| 880| 13| student| M|
| 628| 13| none| M|
+------+---+----------+------+
only showing top 5 rows
+------+---+-------------+------+
|userid|age| occupation|gender|
+------+---+-------------+------+
| 481| 73| retired| M|
| 860| 70| retired| F|
| 767| 70| engineer| M|
| 803| 70|administrator| M|
| 559| 69| executive| M|
+------+---+-------------+------+
only showing top 5 rows
+------+---+-------------+------+
|userid|age| occupation|gender|
+------+---+-------------+------+
| 481| 73| retired| M|
| 860| 70| retired| F|
| 803| 70|administrator| M|
| 767| 70| engineer| M|
| 559| 69| executive| M|
+------+---+-------------+------+
only showing top 5 rows
+------+
|gender|
+------+
| F|
| M|
+------+
+------+------+
|gender|counts|
+------+------+
| F| 273|
| M| 670|
+------+------+
+------+-------------+------+
|gender| occupation|counts|
+------+-------------+------+
| M| executive| 29|
| M| educator| 69|
| F| none| 4|
| F|entertainment| 2|
| F| retired| 1|
+------+-------------+------+
only showing top 5 rows
+---+------+----------+------+-------+------+
|age|gender|occupation|userid|zipcode|userid|
+---+------+----------+------+-------+------+
| 24| M| writer| 293| 60804| 293|
| 24| F| student| 348| 45660| 348|
| 24| M|programmer| 414| 38115| 414|
| 24| M|technician| 889| 78704| 889|
| 24| M| other| 936| 32789| 936|
+---+------+----------+------+-------+------+
only showing top 5 rows
# Run the sorted query, convert the result to a pandas DataFrame and use
# userid as its index
sqlContext_df = (
    sqlContext
    .sql("select userid,age,occupation,gender from user_table order by age desc,gender")
    .toPandas()
    .set_index("userid")
)
sqlContext_df
age occupation gender
userid
481 73 retired M
860 70 retired F
767 70 engineer M
803 70 administrator M
559 69 executive M
585 69 librarian M
349 68 retired M
573 68 retired M
211 66 salesman M
318 65 retired M
564 65 retired M
651 65 retired M
423 64 other M
845 64 doctor M
364 63 engineer M
777 63 programmer M
858 63 educator M
266 62 administrator F