hive sql多表练习
hive sql多表练习
准备原始数据集
学生表 student.csv
讲师表 teacher.csv
课程表 course.csv
分数表 score.csv
学生表 student.csv
001,彭于晏,1995-05-16,男
002,胡歌,1994-03-20,男
003,周杰伦,1995-04-30,男
004,刘德华,1998-08-28,男
005,唐国强,1993-09-10,男
006,陈道明,1992-11-12,男
007,陈坤,1999-04-09,男
008,吴京,1994-02-06,男
009,郭德纲,1992-12-05,男
010,于谦,1998-08-23,男
011,潘长江,1995-05-27,男
012,杨紫,1996-12-21,女
013,蒋欣,1997-11-08,女
014,赵丽颖,1990-01-09,女
015,刘亦菲,1993-01-14,女
016,周冬雨,1990-06-18,女
017,范冰冰,1992-07-04,女
018,李冰冰,1993-09-24,女
019,邓紫棋,1994-08-31,女
020,宋丹丹,1991-03-01,女
讲师表 teacher.csv
1001,张高数
1002,李体音
1003,王子文
1004,刘丽英
课程表 course.csv
01,语文,1003
02,数学,1001
03,英语,1004
04,体育,1002
05,音乐,1002
分数表 score.csv
001,01,94
002,01,74
004,01,85
005,01,64
006,01,71
007,01,48
008,01,56
009,01,75
010,01,84
011,01,61
012,01,44
013,01,47
014,01,81
015,01,90
016,01,71
017,01,58
018,01,38
019,01,46
020,01,89
001,02,63
002,02,84
004,02,93
005,02,44
006,02,90
007,02,55
008,02,34
009,02,78
010,02,68
011,02,49
012,02,74
013,02,35
014,02,39
015,02,48
016,02,89
017,02,34
018,02,58
019,02,39
020,02,59
001,03,79
002,03,87
004,03,89
005,03,99
006,03,59
007,03,70
008,03,39
009,03,60
010,03,47
011,03,70
012,03,62
013,03,93
014,03,32
015,03,84
016,03,71
017,03,55
018,03,49
019,03,93
020,03,81
001,04,54
002,04,100
004,04,59
005,04,85
007,04,63
009,04,79
010,04,34
013,04,69
014,04,40
016,04,94
017,04,34
020,04,50
005,05,85
007,05,63
009,05,79
015,05,59
018,05,87
创建数据库和数据表
create database chap05;
use chap05;
-- 学生表 student.csv
create external table student (
stu_id string comment '学生ID',
stu_name string comment '学生姓名',
birthday string comment '出生日期',
gender string comment '学生性别'
)
row format delimited fields terminated by ','
lines terminated by '\n'
stored as textfile
location '/quiz03/student';
load data local inpath '/root/data/data02/student.csv' overwrite into table student;
select * from student;
-- 讲师表 teacher.csv
create external table teacher (
tea_id string comment '课程ID',
tea_name string comment '课程名称'
)
row format delimited fields terminated by ','
lines terminated by '\n'
stored as textfile
location '/quiz03/teacher';
load data local inpath '/root/data/data02/teacher.csv' overwrite into table teacher;
select * from teacher;
-- 课程表 course.csv
create external table course (
course_id string comment '课程ID',
course_name string comment '课程名称',
tea_id string comment '讲师ID'
)
row format delimited fields terminated by ','
lines terminated by '\n'
stored as textfile
location '/quiz03/course';
load data local inpath '/root/data/data02/course.csv' overwrite into table course;
select * from course;
-- 分数表 score.csv
create external table score (
stu_id string comment '学生ID',
course_id string comment '课程ID',
score int comment '成绩'
)
row format delimited fields terminated by ','
lines terminated by '\n'
stored as textfile
location '/quiz03/score';
load data local inpath '/root/data/data02/score.csv' overwrite into table score;
SQL练习
-- 查询所有学生信息
select * from student;
-- 查询周姓学生信息
select * from student where stu_name like '周%';
-- 查询周姓学生数量
select count(*) from student where stu_name like '周%';
-- 查询 学生ID 004 的分数 超过 85 的成绩
select stu.stu_id, stu_name, birthday, gender, course_id, score from student stu inner join score s
on stu.stu_id = 004 and stu.stu_id = s.stu_id and score > 85;
-- 查询 学生程ID 004 的成绩降序
select stu.stu_id, stu_name, birthday, gender, course_id, score from student stu inner join score s
on stu.stu_id = 004 and stu.stu_id = s.stu_id order by score desc;
-- 查询 数学成绩不及格学生及其对应的成绩 学生成绩
select stu.stu_id, stu_name, birthday, gender, course_name, score from student stu
inner join score s inner join course c on s.course_id = c.course_id and stu.stu_id = s.stu_id
and c.course_name = '数学' and score < 60 order by score;
-- 查询男女生人数
select gender,count(*) from student group by gender;
-- 查询编号为 02 的课程平均成绩
select round(avg(score),2) from score where course_id = 02;
-- 查询每科课程平均成绩
select course_id,round(avg(score),2) from score group by course_id;
-- 查询参加考试学生人数
select count(distinct stu_id) from score where score is not null and score >= 0;
select count(stu_id) from (select stu_id from score where score is not null and score >= 0 group by stu_id) t;
-- 查询每科有多少学生参加考试
select course_id,count(stu_id) from score where score is not null and score >= 0 group by course_id;
-- 查询未参加考试的学生信息
select * from student where stu_id in (
select stu.stu_id from student stu
left join score s on stu.stu_id = s.stu_id
group by stu.stu_id having count(*) != (select count(*) from course)
);
-- 查询选修至少 4 门 以上课程学生的学号
select stu_id,count(course_id) course_count from score
group by stu_id
having course_count >= 4;
-- 查询姓氏相同学生名单 并且同姓人数大于 2 的姓氏
select first_name ,count(*) first_name_count from (
select stu_id,stu_name,substr(stu_name,1,1) as first_name
from student
) ts
group by ts.first_name
having first_name_count > 1;
-- 查询每门功课的学生的平均成绩 按照平均成绩升序 平均成绩相同按照课程编号降序
select course_id, avg(score) avg_score
from score
group by course_id
order by avg_score,course_id desc;
-- 统计参加考试人数大于等于 15 的学科
select course_id,count(*) as stu_count from score group by course_id having stu_count > 15;
-- 查询学生总成绩并按照总成绩降序排序
select stu_id, sum(score) sum_score
from score
group by stu_id
order by sum_score desc;
-- 按照指定格式显示 stu_id 语文 数学 英语 选课数 平均成绩
select
s.stu_id,
sum(`if`(c.course_name='语文',score,0)) as `语文`,
sum(`if`(c.course_name='数学',score,0)) as `数学`,
sum(`if`(c.course_name='英语',score,0)) as `英语`,
count(s.course_id) as `选课数`,
avg(s.score) as `平均成绩`
from course c left join score s
on c.course_id = s.course_id
group by s.stu_id
order by `平均成绩` desc;
-- 查询一共参加了三门功课且其中一门为语文的学生id 和 姓名
select s.stu_id,stu_name from
(select t1.stu_id ,count(t1.course_id) course_count from
(select stu_id,course_id from score
where stu_id in ( select stu_id from score where course_id = "01")
) t1 group by t1.stu_id having course_count >=3
) t2 join student s on t2.stu_id = s.stu_id;
-- 分解
-- 查询该学生的姓名
select s.stu_id,stu_name from
-- 成绩表中学习科目数量 >=3 科的学生
(select t1.stu_id ,count(t1.course_id) course_count from
-- 报名了语文的学生还报名了那些学科
(select stu_id,course_id from score
where stu_id in (
-- 查询报名了语文的学生ID
select stu_id from score where course_id = "01"
)
) t1 group by t1.stu_id having course_count >=3
) t2 join student s on t2.stu_id = s.stu_id;
-- 查询两门以上的课程不及格学生的学号及其平均成绩
-- 1、先按照学生分组 过滤出成绩低于60的数量 大于1
-- 2、计算所有学生的平均成绩
-- 3、两个子查询相互join
select t1.stu_id,t2.avg_score from
(select stu_id, sum(if(score < 60, 1, 0)) as result from score group by stu_id having result > 1) t1
left join
(select stu_id,avg(score) as avg_score from score group by stu_id) t2 on t1.stu_id =t2.stu_id;
-- 查询所有学生的学号、姓名、选课数、总成绩
select
stu.stu_id,stu.stu_name,count(s.course_id) count_course ,nvl(sum(s.score),0) total_score
from student stu left join score s on stu.stu_id = s.stu_id
group by stu.stu_id, stu.stu_name order by stu.stu_id;
-- 平均成绩大于 85 的所有学生的学号、姓名、平均成绩
select
stu.stu_id,stu.stu_name ,nvl(avg(s.score),0) as `avg_score`
from student stu left join score s on stu.stu_id = s.stu_id
group by stu.stu_id, stu.stu_name having nvl(avg(s.score),0) > 85 order by stu.stu_id
-- 查询学生的选课情况:学号,姓名,课程号,课程名称
select student.stu_id,student.stu_name,c.course_id,c.course_name from student
right join score s on student.stu_id = s.stu_id
left join course c on s.course_id = c.course_id
-- 查询学生的没有选课情况:学号,姓名
select stu_id,stu_name from
(
select student.stu_id,student.stu_name, s.course_id from student
left join score s on student.stu_id = s.stu_id
left join course c on s.course_id = c.course_id
) t where course_id is null
-- 查询出每门课程的及格人数和不及格人数
select c.course_id,course_name,pass,fail
from course c join
(
select
course_id,sum(if(score >= 60,1,0)) as `pass`, sum(if(score < 60,1,0)) as `fail`
from score group by course_id
) t on c.course_id = t.course_id
-- 查询课程编号为03且课程成绩在80分以上的学生的学号和姓名及课程信息
select t1.stu_id,s.stu_name,t1.course_id,c.course_name,t1.score from
(select * from score where course_id = '03' and score > 80) t1
left join student s on s.stu_id = t1.stu_id
left join course c on t1.course_id = c.course_id
-- 查询语文成绩低于平均分数的学生是谁,教师是谁
select t3.stu_id,t3.stu_name,t3.`avg_score`,t.tea_name from
(select t2.stu_id,t2.`avg_score`,s.stu_name,t2.course_id,c.tea_id from
(select t1.stu_id,t1.course_id,t1.`avg_score` from
(select stu_id,s.course_id, avg(score) as `avg_score` from score s right join
(select course_id from course where course_name = '语文') t1 on t1.course_id = s.course_id
group by stu_id,s.course_id) t1
where t1.`avg_score` < (select avg(score) as `avg_score` from score s right join (select course_id from course where course_name = '语文') t1 on t1.course_id = s.course_id)
) t2 left join student s on t2.stu_id = s.stu_id
left join course c on t2.course_id = c.course_id
)t3 left join teacher t on t3.tea_id = t.tea_id;
-- 查询所有学生总成绩和平均成绩,
-- 且他们的总成绩低于平均成绩的有多少个人,
-- 高于平均成绩的有多少人,
-- 低于平均成绩的男生和女生分别有多少人,
-- 且他们的任课老师是谁。
-- 统计各科成绩各分数段人数:课程编号,课程名称,[100-85],[85-75],[70-60],[0-60]及所占百分比
select c.course_id, course_name, a, b, c, d from course c left join (
select course_id,
concat(round((sum(`if`(score >= 85,1,0)) / count(*)) * 100,2), '%') as a,
concat(round((sum(`if`(score between 75 and 84,1,0)) / count(*)) * 100,2), '%') as b,
concat(round((sum(`if`(score between 60 and 74,1,0)) / count(*)) * 100,2), '%') as c,
concat(round((sum(`if`(score < 60,1,0)) / count(*)) * 100,2), '%') as d
from score group by course_id
) t on t.course_id = c.course_id;
-- 查询各科成绩最高分、最低分和平均分,以如下形式显示:
-- 课程ID,课程name,最高分,最低分,平均分,中下率,中等率,优良率,优秀率
-- 及格为>=60,中等为:70-80,优良为:80-90,优秀为:>=90
select c.course_id, course_name, `最高分`,`最低分`,`平均分`,`优秀率`, `优良率`, `中等率`, `中下率`,`芸芸众生` from course c left join (
select course_id,
max(score) `最高分`,
min(score) `最低分`,
round(avg(score),2) `平均分`,
concat(round((sum(`if`(score >= 90,1,0)) / count(*)) * 100,2), '%') as `优秀率`,
concat(round((sum(`if`(score between 80 and 89,1,0)) / count(*)) * 100,2), '%') as `优良率`,
concat(round((sum(`if`(score between 70 and 79,1,0)) / count(*)) * 100,2), '%') as `中等率`,
concat(round((sum(`if`(score between 60 and 69,1,0)) / count(*)) * 100,2), '%') as `中下率`,
concat(round((sum(`if`(score < 60,1,0)) / count(*)) * 100,2), '%') as `芸芸众生`
from score group by course_id
) t on t.course_id = c.course_id;
-- 查询每门课程的男生和女生的比例是多少
select t1.course_id,t1.gender, concat(round((count_gender / count_course_student) * 100,2), '%') as proportion from
(select course_id,gender,count(*) count_gender from score s1 inner join student s2 on s1.stu_id = s2.stu_id group by course_id,gender) t1
inner join
(select course_id,count(*) count_course_student from score s1 inner join student s2 on s1.stu_id = s2.stu_id group by course_id) t2
on t2.course_id = t1.course_id;
-- 每门学科的成绩是男生比较优一些还是女生比较优一些,并且每门课程的最高分是谁。
select battle_t.course_id,male_avg_score, female_avg_score, battle, max_stu_id,min_stu_id, max_score, min_score from
(select male_t.course_id,round(male_avg_score,2) male_avg_score,round(female_avg_score,2) female_avg_score,
case
when male_avg_score > female_avg_score then '男性优秀'
when male_avg_score < female_avg_score then '女性优秀'
else '势均力敌'
end battle
from
(select course_id,avg(score) male_avg_score from score s1 inner join student s2 on s1.stu_id = s2.stu_id and gender = '男' group by course_id) male_t
inner join
(select course_id,avg(score) female_avg_score from score s1 inner join student s2 on s1.stu_id = s2.stu_id and gender = '女' group by course_id) female_t
on male_t.course_id = female_t.course_id) battle_t
inner join
(select max_t.course_id,max_t.stu_id max_stu_id,max_score,min_t.stu_id min_stu_id,min_score from
(select stu_id, s.course_id, max_score from score s
inner join
(select course_id, max(score) max_score from score group by course_id) t
on s.course_id = t.course_id and max_score = score) max_t
full join
(select stu_id, s.course_id, min_score from score s
inner join
(select course_id,min(score) min_score from score group by course_id) t
on s.course_id = t.course_id and min_score = score) min_t
on max_t.course_id = min_t.course_id) info
on battle_t.course_id = info.course_id;
-- 课程编号为"01"且课程分数小于60,按分数降序排列的学生信息
select s.stu_id, stu.stu_name, stu.birthday, stu.gender,s.score
from score s join student stu on s.stu_id = stu.stu_id
where s.score < 60 order by s.score desc;
-- 查询所有课程成绩在70分以上的学生的姓名、课程名称和分数,按分数升序
select stu.stu_name, c.course_name, s2.score
from student stu join
(select s.stu_id, sum(`if`(s.score >= 70, 0, 1)) as `is_ok` from score s group by s.stu_id having is_ok = 0) t1
on stu.stu_id = t1.stu_id left join score s2 on stu.stu_id = s2.stu_id left join course c on s2.course_id = c.course_id
order by s2.score;
-- 查询某学生不同课程的成绩相同的学生编号、课程编号、学生成绩
select s1.stu_id,collect_list(s1.course_id) as course_id,collect_set(s1.score) as score
from score s1 join score s2 on s1.stu_id = s2.stu_id
and s1.course_id != s2.course_id
and s1.score == s2.score
group by s1.stu_id;
分组排序取TopN
查询各科成绩前五名的学生
select a.course_id,a.stu_id,a.score from score a
left join score b
on a.course_id = b.course_id and a.score <= b.score
group by a.stu_id,a.course_id,a.score
having count(a.stu_id) <=5
order by a.course_id,a.score desc;
select S1.course_id,s1.stu_id,s1.score from score s1 where
(select count(*) from score s2
where s2.course_id=s1.course_id AND s2.score > s1.score
) <= 5 order by s1.course_id,s1.score desc;
row_number
row_number() over () 连续序号
over()里头的分组以及排序的执行晚于 where 、group by、order by 的执行。
select * from
(select course_id, stu_id, score,
row_number() over (partition by course_id order by score desc ) as mum
from score) t where mum <= 5;
rank
rank() over () 排名 跳跃排序 序号不是连续的
select * from
(select course_id, stu_id, score,
rank() over (partition by course_id order by score desc ) as mum
from score) t where mum <= 5;
dense_rank
dense_rank() over () 排名 连续排序
select * from
(select course_id, stu_id, score,
dense_rank() over (partition by course_id order by score desc ) as mum
from score) t where mum <= 5;