- 【P001-P017】大数据Hadoop教程-学习笔记01【大数据导论与Linux基础】【17p】
- 【P018-P037】大数据Hadoop教程-学习笔记02【Apache Hadoop、HDFS】【20p】
- 【P038-P050】大数据Hadoop教程-学习笔记03【Hadoop MapReduce与Hadoop YARN】【13p】
- 【P051-P068】大数据Hadoop教程-学习笔记04【数据仓库基础与Apache Hive入门】【18p】
- 【P069-P083】大数据Hadoop教程-学习笔记05【Apache Hive DML语句与函数使用】【15p】
- 【P084-P096】大数据Hadoop教程-学习笔记06【Hadoop生态综合案例:陌陌聊天数据分析】【13p】
01【Hive SQL DML语法之加载数据】
- Hive SQL DML语法之加载数据
- Hive SQL DML语法之查询数据
- Hive SQL Join关联查询
- Hive SQL中的函数使用
- 掌握Hive SQL Load加载数据语句
- 掌握Hive SQL Insert插入数据语句
- 掌握Hive SQL Select基础查询语句
- 掌握Hive SQL Join查询语句
- 掌握Hive SQL 常用函数的使用
P070【02-Hive SQL-DML-Load加载数据操作】
load data local 中的 local:指的不是客户端所在的本地,而是 Hive 服务器(HiveServer2)所在的本地文件系统;只要访问的是 node1 这台服务器上运行的 Hive 服务,加载数据时 local 指的就是 node1 这台 Linux 机器的本地文件系统。
# Start the Hive metastore service detached from the terminal.
# The trailing '&' is required: without it the command stays in the
# foreground and the next service cannot be started from the same shell.
nohup /export/server/apache-hive-3.1.2-bin/bin/hive --service metastore &
# Start the HiveServer2 service in the background as well.
nohup /export/server/apache-hive-3.1.2-bin/bin/hive --service hiveserver2 &
-- Beeline session: connect to the Hive service on node1, then try LOAD and INSERT.
! connect jdbc:hive2://node1:10000
use itheima;
show tables;
-- Load a file from the Hive server's local filesystem into student_local.
load data local inpath '/root/hivedata/students.txt' into table itheima.student_local;
select * from student_local;
create table t_2(id int, name string);
-- INSERT ... VALUES is supported syntactically, but it runs very slowly.
insert into table t_2 values(1, "zhangsan");
select * from t_2;
SHOW DATABASES;
USE itheima;

------------ Hive SQL DML: loading data with LOAD ---------------
-- Table student_local: target for the "load from the local filesystem" demo.
CREATE TABLE student_local (
    num  int,
    name string,
    sex  string,
    age  int,
    dept string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

-- Table student_HDFS: target for the "load from HDFS" demo.
CREATE TABLE student_HDFS (
    num  int,
    name string,
    sex  string,
    age  int,
    dept string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

-- Tip: prefer the beeline client, which shows the log output of the load.

-- Load from the local filesystem: the file lives on the local filesystem of
-- HS2 (node1); in essence this is a "hadoop fs -put" upload.
LOAD DATA LOCAL INPATH '/root/hivedata/students.txt'
INTO TABLE student_local;

-- Load from HDFS: the file lives in the HDFS root directory; in essence this
-- is a "hadoop fs -mv" move.
-- Upload the file to HDFS first: hadoop fs -put /root/hivedata/students.txt /
LOAD DATA INPATH '/students.txt'
INTO TABLE student_HDFS;
------------ Hive SQL DML: inserting data with INSERT -----------------
-- Step 1: recreate the source table student.
DROP TABLE IF EXISTS student;
CREATE TABLE student (
    num  int,
    name string,
    sex  string,
    age  int,
    dept string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

-- Step 2: fill the source table from the local file and check the rows.
LOAD DATA LOCAL INPATH '/root/hivedata/students.txt'
INTO TABLE student;

SELECT *
FROM student;

-- Step 3: create a target table that has only two columns.
CREATE TABLE student_from_insert (
    sno   int,
    sname string
);

-- Step 4: populate the target table from a query on the source table.
INSERT INTO TABLE student_from_insert
SELECT num, name
FROM student;

SELECT *
FROM student_from_insert;
02【Hive SQL DML语法之查询数据】
P072【04-Hive SQL-DML-Select查询--语法树与学习环境准备】
- 从哪里查询取决于FROM关键字后面的table_reference,这是我们写查询SQL的首先要确定的事即你查询谁?
- 表名和列名不区分大小写。
------------ Hive SQL SELECT: basic query syntax ------------
-- Recreate the practice table used by all SELECT examples.
DROP TABLE IF EXISTS t_usa_covid19;

CREATE TABLE t_usa_covid19 (
    count_date string,
    county     string,
    state      string,
    fips       int,
    cases      int,
    deaths     int
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ",";

-- Load the comma-delimited data file from the Hive server's local filesystem.
LOAD DATA LOCAL INPATH '/root/hivedata/us-covid19-counties.dat'
INTO TABLE t_usa_covid19;

SELECT *
FROM t_usa_covid19;
P073【05-Hive SQL-DML-Select查询--列表达式与distinct去重】
-- All columns.
SELECT *
FROM t_usa_covid19;

-- Selected columns only.
SELECT county, cases, deaths
FROM t_usa_covid19;

-- Selecting a constant: the result does not depend on the table's columns.
SELECT 1
FROM t_usa_covid19;

-- The FROM clause can be omitted.
SELECT current_database();

-- ALL is the default: the two statements below are equivalent.
SELECT state
FROM t_usa_covid19;

SELECT ALL state
FROM t_usa_covid19;

-- DISTINCT returns the matching rows with duplicates removed.
SELECT DISTINCT state
FROM t_usa_covid19;

-- DISTINCT over several columns de-duplicates on the combination as a whole,
-- i.e. on the (county, state) pair.
SELECT DISTINCT county, state
FROM t_usa_covid19;
P074【06-Hive SQL-DML-Select查询--Where条件过滤】
-- 1 > 2 evaluates to false.
SELECT *
FROM t_usa_covid19
WHERE 1 > 2;

-- 1 = 1 evaluates to true.
SELECT *
FROM t_usa_covid19
WHERE 1 = 1;

-- Filter on a column value.
SELECT *
FROM t_usa_covid19
WHERE state = 'California';

-- Functions may be used in WHERE: find rows whose state name is longer than
-- 10 characters.
SELECT *
FROM t_usa_covid19
WHERE length(state) > 10;
P075【07-Hive SQL-DML-Select查询--聚合操作aggregate】
-- Plain column versus an aggregate over that column.
SELECT county
FROM t_usa_covid19;

SELECT count(county)
FROM t_usa_covid19;

-- Column aliases.
SELECT county AS itcast
FROM t_usa_covid19;

SELECT count(county) AS county_cnts
FROM t_usa_covid19;

-- Count de-duplicated values only.
SELECT count(DISTINCT county) AS county_cnts
FROM t_usa_covid19;

-- Aggregates combined with a WHERE filter.
SELECT count(county)
FROM t_usa_covid19
WHERE state = "California";

SELECT sum(deaths)
FROM t_usa_covid19
WHERE state = "Texas";

-- Largest value of the cases column.
SELECT max(cases)
FROM t_usa_covid19;
P076【08-Hive SQL-DML-Select查询--Group by分组及语法限制】
select * from t_usa_covid19;
-- Group by state and count how many counties each state has.
select count(county) from t_usa_covid19 where count_date = "2021-01-28" group by state;
select state,count(county) as county_nums from t_usa_covid19 where count_date = "2021-01-28" group by state;
-- Next we also want the death count per state. The naive guess is to simply
-- add the deaths column to the select list. What actually happens?
-- NOTE: the statement below is EXPECTED TO FAIL; it is kept on one line so the
-- position in the error message matches.
select state,count(county),deaths from t_usa_covid19 where count_date = "2021-01-28" group by state;
-- It fails with org.apache.hadoop.hive.ql.parse.SemanticException:Line 1:27 Expression not in GROUP BY key 'deaths'
-- Why? Because of the GROUP BY syntax restriction.
-- Rule: every column in the select_expr of a GROUP BY query must either be a
-- grouping column or be wrapped in an aggregate function.
-- deaths is not a grouping column, so using it bare raises the error.
-- state is a grouping column, so it may appear in select_expr directly.
-- Corrected query: deaths is wrapped in the aggregate function sum().
select state,count(county),sum(deaths) from t_usa_covid19 where count_date = "2021-01-28" group by state;
P077【09-Hive SQL-DML-Select查询--Having过滤操作】
-- Attempt to filter on an aggregate inside WHERE. WHERE is applied before
-- grouping, so an aggregate function cannot be used there; this statement is
-- rejected and is shown only as the counter-example.
SELECT state, sum(deaths)
FROM t_usa_covid19
WHERE count_date = "2021-01-28" AND sum(deaths) > 10000
GROUP BY state;

-- WHERE filters rows before grouping, then GROUP BY forms the groups; once each
-- group's result is fixed, HAVING filters the groups.
SELECT state, sum(deaths)
FROM t_usa_covid19
WHERE count_date = "2021-01-28"
GROUP BY state
HAVING sum(deaths) > 10000;

-- Better: the aggregate is already computed during GROUP BY, so HAVING can
-- reference the aliased result instead of spelling the aggregate out again.
SELECT state, sum(deaths) AS cnts
FROM t_usa_covid19
WHERE count_date = "2021-01-28"
GROUP BY state
HAVING cnts > 10000;
P078【10-Hive SQL-DML-Select查询--Order by排序】
-- 7. ORDER BY
-- Return the rows sorted by confirmed cases in ascending order.
select * from t_usa_covid19;
select * from t_usa_covid19 order by cases;
-- When no direction is given, the default is asc (ascending).
select * from t_usa_covid19 order by cases asc;
-- Return the California rows (one per county) sorted by death count in
-- descending order. The sort key is deaths, matching the stated intent.
select * from t_usa_covid19 where state = "California" order by deaths desc;
P079【11-Hive SQL-DML-Select查询--Limit限制语法】
-- No limit: return every California record for 2021-01-28.
select * from t_usa_covid19 where count_date = "2021-01-28" and state ="California";
-- Return at most 5 rows.
select * from t_usa_covid19 where count_date = "2021-01-28" and state ="California" limit 5;
-- limit offset,count: skip the first 2 rows, then return 3 rows,
-- i.e. the result starts at the 3rd row and contains 3 rows in total.
select * from t_usa_covid19 where count_date = "2021-01-28" and state ="California" limit 2,3;
-- Note: the first argument (the offset) is counted from 0.
P080【12-Hive SQL-DML-Select查询--执行顺序梳理】
-- One query combining WHERE, GROUP BY, HAVING and LIMIT, used to walk through
-- the order in which the clauses are applied.
SELECT state, sum(deaths) AS cnts
FROM t_usa_covid19
WHERE count_date = "2021-01-28"
GROUP BY state
HAVING cnts > 10000
LIMIT 2;
03【Hive SQL Join关联查询】
P081【13-Hive SQL Join关联查询】
在Hive中,使用最多最重要的两种join分别是:inner join(内连接)、left join(左连接)。
employee_address.txt 数据行示例(字段依次为 id,hno,street,city):1206,78B,old city,la
------------ Hive JOIN SQL syntax ------------
-- Join practice: recreate the three tables.
DROP TABLE IF EXISTS employee_address;
DROP TABLE IF EXISTS employee_connection;
DROP TABLE IF EXISTS employee;

-- Table 1: employees.
CREATE TABLE employee (
    id     int,
    name   string,
    deg    string,
    salary int,
    dept   string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

-- Table 2: employee addresses.
CREATE TABLE employee_address (
    id     int,
    hno    string,
    street string,
    city   string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

-- Table 3: employee contact details.
CREATE TABLE employee_connection (
    id    int,
    phno  string,
    email string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

-- Load each table from its comma-delimited file on the Hive server.
LOAD DATA LOCAL INPATH '/root/hivedata/employee.txt'
INTO TABLE employee;

LOAD DATA LOCAL INPATH '/root/hivedata/employee_address.txt'
INTO TABLE employee_address;

LOAD DATA LOCAL INPATH '/root/hivedata/employee_connection.txt'
INTO TABLE employee_connection;

-- Check the loaded rows.
SELECT *
FROM employee;

SELECT *
FROM employee_address;

SELECT *
FROM employee_connection;
-- 1. INNER JOIN
SELECT e.id, e.name, e_a.city, e_a.street
FROM employee e
INNER JOIN employee_address e_a
    ON e.id = e_a.id;

-- Equivalent form: JOIN is the same as INNER JOIN.
SELECT e.id, e.name, e_a.city, e_a.street
FROM employee e
JOIN employee_address e_a
    ON e.id = e_a.id;

-- Equivalent form: implicit join notation (comma-separated tables, with the
-- join condition in WHERE).
SELECT e.id, e.name, e_a.city, e_a.street
FROM employee e, employee_address e_a
WHERE e.id = e_a.id;

-- 2. LEFT JOIN
SELECT e.id, e.name, e_conn.phno, e_conn.email
FROM employee e
LEFT JOIN employee_connection e_conn
    ON e.id = e_conn.id;

-- Equivalent form: LEFT OUTER JOIN.
SELECT e.id, e.name, e_conn.phno, e_conn.email
FROM employee e
LEFT OUTER JOIN employee_connection e_conn
    ON e.id = e_conn.id;
04【Hive SQL中的常用函数使用入门】
- 使用show functions查看当下可用的所有函数;
- 通过describe function extended funcname来查看函数的使用方式。
Hive的函数分为两大类:内置函数(Built-in Functions)、用户定义函数UDF(User-Defined Functions):
- 内置函数可分为:数值类型函数、日期类型函数、字符串类型函数、集合函数、条件函数等;
- 用户定义函数根据输入输出的行数可分为3类:UDF、UDAF、UDTF。
- 内置函数(built-in)指的是Hive开发实现好,直接可以使用的函数,也叫做内建函数。
- 官方文档地址:https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF
- 内置函数根据应用归类整体可以分为8大种类型,我们将对其中重要的,使用频率高的函数使用进行详细讲解。
- String Functions,字符串函数
- Date Functions,日期函数
- Mathematical Functions,数学函数
- Conditional Functions,条件函数
----------------- Commonly used Hive built-in functions ----------------------
-- List the available functions, then show the extended help for one of them.
SHOW FUNCTIONS;
DESCRIBE FUNCTION EXTENDED count;

------------ String Functions ------------
-- String length: length(str)
SELECT length("itcast");

-- String reversal: reverse(str)
SELECT reverse("itcast");

-- String concatenation: concat(str1, str2, ...)
SELECT concat("angela", "baby");

-- Concatenation with a separator: concat_ws(separator, [string | array(string)]+)
SELECT concat_ws('.', 'www', array('itcast', 'cn'));

-- Substring: substr(str, pos[, len]) or substring(str, pos[, len])
-- pos is a 1-based index; a negative pos counts from the end of the string.
SELECT substr("angelababy", -2);
SELECT substr("angelababy", 2, 2);

-- Split a string: split(str, regex)
SELECT split('apache hive', ' ');
-- Individual elements of the result are addressed by index.
SELECT split('apache hive', ' ')[0];
SELECT split('apache hive', ' ')[1];
----------- Date Functions -----------------
-- Current date: current_date
SELECT current_date();

-- Current UNIX timestamp: unix_timestamp
SELECT unix_timestamp();

-- Date string to UNIX timestamp: unix_timestamp
SELECT unix_timestamp("2011-12-07 13:01:03");

-- Date string in a given format to UNIX timestamp: unix_timestamp
SELECT unix_timestamp('20111207 13:01:03', 'yyyyMMdd HH:mm:ss');

-- UNIX timestamp to date string: from_unixtime
SELECT from_unixtime(1618238391);
SELECT from_unixtime(0, 'yyyy-MM-dd HH:mm:ss');

-- Date difference: datediff; expects 'yyyy-MM-dd HH:mm:ss' or 'yyyy-MM-dd'
SELECT datediff('2012-12-08', '2012-05-09');

-- Add days to a date: date_add
SELECT date_add('2012-02-28', 10);

-- Subtract days from a date: date_sub
SELECT date_sub('2012-01-1', 10);
---- Mathematical Functions -------------
-- Rounding: round returns the integer part as a double (rounds half up).
SELECT round(3.1415926);

-- Rounding to a precision: round(double a, int d) returns a double with d
-- decimal places.
SELECT round(3.1415926, 4);

-- Random number: rand returns a random number between 0 and 1; the value
-- differs on every execution.
SELECT rand();

-- Seeded random number: rand(int seed) yields a stable random sequence.
SELECT rand(3);
----- Conditional Functions ------------------
-- Sample rows that the column-based examples below operate on.
SELECT *
FROM student
LIMIT 3;

-- Conditional: if(boolean testCondition, T valueTrue, T valueFalseOrNull)
SELECT if(1 = 2, 100, 200);

SELECT if(sex = '男', 'M', 'W')
FROM student
LIMIT 3;

-- Null replacement: nvl(T value, T default_value)
SELECT nvl("allen", "itcast");
SELECT nvl(null, "itcast");

-- Conditional mapping: CASE a WHEN b THEN c [WHEN d THEN e]* [ELSE f] END
SELECT CASE 100
           WHEN 50 THEN 'tom'
           WHEN 100 THEN 'mary'
           ELSE 'tim'
       END;

SELECT CASE sex
           WHEN '男' THEN 'male'
           ELSE 'female'
       END
FROM student
LIMIT 3;