sparkSQL面试题
一、查询所有数学课程成绩大于语文课程成绩的学生学号
数据
1,yuwen,43
1,shuxue,55
2,yuwen,77
2,shuxue,88
3,yuwen,98
3,shuxue,65
3,yingyu,88
基本步骤:
- 进行行转列
- 比较语文与数学的成绩
SQL代码:
with t1 as(
SELECT id,sum(if(name = 'yuwen',score,0)) chinese,sum(if(name = 'shuxue',score,0)) math,sum(if(name = 'yingyu',score,0)) english FROM zuoye1 group by id
)
select id from t1 where math>chinese
sparkSQL代码
import os
import re
from pyspark.sql import SparkSession
"""
------------------------------------------
Description : TODO:
SourceFile : _04-zuoye1
Author : zxx
Date : 2024/11/4
-------------------------------------------
"""
# 查询所有数学课程成绩大于语文课程成绩的学生学号
if __name__ == '__main__':
os.environ['JAVA_HOME'] = 'D:/bigdata/03-java/java-8/jdk'
# 配置Hadoop的路径,就是前面解压的那个路径
os.environ['HADOOP_HOME'] = 'D:/bigdata/04-Hadoop/hadoop/hadoop-3.3.1/hadoop-3.3.1'
# 配置base环境Python解析器的路径
os.environ['PYSPARK_PYTHON'] = 'D:/bigdata/22-spark/Miniconda3/python.exe' # 配置base环境Python解析器的路径
os.environ['PYSPARK_DRIVER_PYTHON'] = 'D:/bigdata/22-spark/Miniconda3/python.exe'
spark = SparkSession.builder.master("local[2]").appName("第一题").config(
"spark.sql.shuffle.partitions", 2).getOrCreate()
df = (spark.sparkContext.textFile("../../datas/zuoye2/zuoye1.txt")
.map(lambda line: (re.split(",",line)[0],re.split(",",line)[1],re.split(",",line)[2])).toDF(["id","name","score"]))
df.createOrReplaceTempView("zuoye1")
spark.sql("""
with t1 as(
SELECT id,sum(if(name = 'yuwen',score,0)) chinese,sum(if(name = 'shuxue',score,0)) math,sum(if(name = 'yingyu',score,0)) english FROM zuoye1 group by id
)
select id from t1 where math>english
""").show()
# 关闭
spark.stop()
运行结果:
二、查询每种爱好中年龄最大的人,如果有相同的年龄,并列显示。
数据
id;name;age;favor
1;huangbo;33;a,b,c,d,e
2;xuzheng;44;b,c
3;wangbaoqiang;33;c,d,e
4;fanbingbing;32;a,b,d
基本步骤:
- 把favor列用炸裂函数explode炸开
- 用排名函数进行排序,根据题意相同年龄要并列显示,故选用rank()
- 用where筛选出排名第一的人
SQL代码:
with t1 as(
select name,age,aihao from zuoye2 lateral view explode(split(favor,',')) t1 as aihao)
,t2 as(
select aihao,name,rank() over(partition by aihao order by age) r1 from t1
)
select * from t2 where r1 = 1 order by aihao
sparkSQL代码:
import os
import re
from pyspark.sql import SparkSession
"""
------------------------------------------
Description : TODO:
SourceFile : _04-zuoye1
Author : zxx
Date : 2024/11/4
-------------------------------------------
"""
# 查询每种爱好中年龄最大的人,如果有相同的年龄,并列显示。
if __name__ == '__main__':
os.environ['JAVA_HOME'] = 'D:/bigdata/03-java/java-8/jdk'
# 配置Hadoop的路径,就是前面解压的那个路径
os.environ['HADOOP_HOME'] = 'D:/bigdata/04-Hadoop/hadoop/hadoop-3.3.1/hadoop-3.3.1'
# 配置base环境Python解析器的路径
os.environ['PYSPARK_PYTHON'] = 'D:/bigdata/22-spark/Miniconda3/python.exe' # 配置base环境Python解析器的路径
os.environ['PYSPARK_DRIVER_PYTHON'] = 'D:/bigdata/22-spark/Miniconda3/python.exe'
spark = SparkSession.builder.master("local[2]").appName("第一次构建SparkSession").config(
"spark.sql.shuffle.partitions", 2).getOrCreate()
df = (spark.sparkContext.textFile("../../datas/zuoye2/zuoye2.txt").filter(lambda x:re.split(";",x)[0] !="id")
.map(lambda line: (re.split(";",line)[0],re.split(";",line)[1],re.split(";",line)[2],
re.split(";",line)[3]
)
).toDF(["id","name","age","favor"]))
df.createOrReplaceTempView("zuoye2")
spark.sql("""
with t1 as(
select name,age,aihao from zuoye2 lateral view explode(split(favor,',')) t1 as aihao)
,t2 as(
select aihao,name,rank() over(partition by aihao order by age) r1 from t1
)
select * from t2 where r1 = 1 order by aihao
""").show()
# 关闭
spark.stop()
运行结果: