学习大数据DAY41 Hive 分区表创建

分区表

分区表应用场景

oracle 分区表种类

oracle 分区-范围分区

oracle 分区-列表分区

oracle 分区-散列分区

oracle 分区-组合分区

oracle 分区-分区表操作

hive 分区-创建分区表

hive 分区-分区表操作

hive 分区-动态分区表配置

上机练习

分区表

分区是将一个表或索引物理地分解为多个更小、更可管理的部分。

分区对应用透明，即对访问数据库的应用而言，逻辑上讲只有一个表或一个

索引（相当于应用“看到”的只是一个表或索引），但在物理上这个表或索引可

能由数十个物理分区组成。

分区表应用场景

oracle 分区表种类

oracle 分区-范围分区

CREATE TABLE ORDER_ACTIVITIES

( ORDER_ID

NUMBER(7) NOT NULL,

ORDER_DATE

DATE,

TOTAL_AMOUNT NUMBER,

CUSTOTMER_ID NUMBER(7),

PAID

CHAR(1)

)

-- 建表语句不变

PARTITION BY RANGE (ORDER_DATE) -- 范围分区创建语句关键字

range( 列名 )

(

PARTITION ORD_ACT_PART01 VALUES LESS THAN (TO_DATE('01- MAY

-2003','DD-MON-YYYY')) TABLESPACE ORD_TS01,

-- 分区名（时间值）表空间

PARTITION

ORD_ACT_PART02

VALUES

LESS

THAN

(TO_DATE('01-JUN-2003','DD-MON-YYYY')) TABLESPACE ORD_TS02,

PARTITION ORD_ACT_PART02 VALUES LESS THAN (MAXVALUE) TABLESPACE

ORD_TS03

-- 使用 maxvalue 将其他不符合上述范围的值放入其中

);

oracle 分区-列表分区

CREATE TABLE ORDER_ACTIVITIES

( PROBLEM_ID

NUMBER(7) NOT NULL PRIMARY KEY,

CUSTOMER_ID NUMBER(7) NOT NULL,

STATUS

VARCHAR2(20))

PARTITION BY LIST (STATUS)

-- 范围分区创建语句关键字 LIST( 列

名 )

(

PARTITION PROB_ACTIVE

VALUES ('ACTIVE') TABLESPACE

PROB_TS01, --

PARTITION

PROB_INACTIVE

VALUES

('INACTIVE','unknow')

TABLESPACE PROB_TS02

);

oracle 分区-散列分区

CREATE TABLE HASH_TABLE

( COL NUMBER(8),

INF VARCHAR2(100)

)

PARTITION BY HASH (COL)

-- 范围分区创建语句关键字 HASH( 列名 )

(

PARTITION PART01 TABLESPACE HASH_TS01,

PARTITION PART02 TABLESPACE HASH_TS02,

PARTITION PART03 TABLESPACE HASH_TS03

)

oracle 分区-组合分区

CREATE TABLE SALES

(

PRODUCT_ID VARCHAR2(5),

SALES_DATE DATE,

SALES_COST NUMBER(10),

STATUS VARCHAR2(20)

)

PARTITION BY RANGE(SALES_DATE) SUBPARTITION BY LIST (STATUS) (

PARTITION

VALUES

LESS

THAN(TO_DATE('2003-01-01','YYYY-MM-DD'))TABLESPACE rptfact2009

(

SUBPARTITION

P1SUB1

VALUES

('ACTIVE')

TABLESPACE

rptfact2009,

SUBPARTITION

P1SUB2

VALUES

('INACTIVE')

TABLESPACE

rptfact2009

PARTITION

VALUES

LESS

THAN

(TO_DATE('2003-03-01','YYYY-MM-DD')) TABLESPACE rptfact2009

(

SUBPARTITION

P2SUB1

VALUES

('ACTIVE')

TABLESPACE

rptfact2009,

SUBPARTITION

P2SUB2

VALUES

('INACTIVE')

TABLESPACE

rptfact2009

)

oracle 分区-分区表操作

-- 添加分区

ALTER

TABLE

SALES

ADD

PARTITION

VALUES

LESS

THAN(TO_DATE('2003-06-01','YYYY-MM-DD'));

-- 注意：以上添加的分区界限应该高于最后一个分区界限。

-- 添加了一个 P3SUB1 子分区

ALTER TABLE SALES MODIFY PARTITION P3 ADD SUBPARTITION P3SUB1

VALUES('COMPLETE');

-- 删除分区

ALTER TABLE SALES DROP PARTITION P3;

ALTER TABLE SALES DROP SUBPARTITION P4SUB1;

-- 注意：如果删除的分区是表中唯一的分区，那么此分区将不能被删除，要想

删除此分区，必须删除表。

-- 交换分区

ALTER TABLE table_name EXCHANGE PARTITION partition_name WITH

TABLE nonpartition_name;

-- 将一个分区 ( 子分区 ) 和非分区表进行数据交换， oracle 交换的方法是其实是

对逻辑存储段进行交换。使用 INCLUDEING INDEXES 子句可以同步将本地索引

也进行交换，使用 WITH VALIDATATION 子句还可以实现行数据的验证。 -- 交换分区时如果不带 UPDATE INDEXES 子句，则全局索引或全局索引基于的

分区将变为不可用。

hive 分区-创建分区表

-- 创建分区表

create table dept_partition(

deptno int, dname string, loc string

)

partitioned by (day string)

row format delimited fields terminated by '\t';

hive 分区-分区表操作

-- 分区表数据导入

load data local inpath '/opt/module/hive/datas/dept_20200401.log'

into table dept_partition partition(day='20200401');

--select 分区表插入数据

insert into table log_list_6 partition(dat='20221231') select *

from log_list_tmp

-- 多表分区插入

from student

insert overwrite table student partition(month='201707')select

id, name where month='201707'

insert overwrite table student partition(month='201706')select

id, name where month='201706';

-- 查看分区

show partitions tab_name;

-- 添加分区

alter table dept_partition add partition(day='20200404') ;

-- 添加多分区

alter

table

dept_partition

add

partition(day='20200405')

partition(day='20200406');

-- 删除分区

alter table dept_partition drop partition (day='20200406');

-- 查看分区表信息

show partitions dept_partition;

-- 查看分区表结构

desc formatted dept_partition;

-- 修改分区表

ALTER TABLE table_name PARTITION (dt='2008-08-08') SET LOCATION

"new location"; ALTER TABLE table_name PARTITION (dt='2008-08-08') RENAME TO

PARTITION (dt='20080808');

hive 分区-动态分区表配置

-- 开启动态分区 ( 默认开启 )

set hive.exec.dynamic.partition=true

-- 指定非严格模式 nonstrict 模式表示允许所有的分区字段都可以使用动态

分区

set hive.exec.dynamic.partition.mode=nonstrict

-- 在所有执行 MR 的节点上，最大一共可以创建多少个动态分区。默认 1000

set hive.exec.max.dynamic.partitions=1000

-- 在每个执行 MR 的节点上，最大可以创建多少个动态分区 ( 分区字段有多少种

设多少个 )

set hive.exec.max.dynamic.partitions.pernode=100

-- 整个 MR Job 中，最大可以创建多少个 HDFS 文件。默认 100000

set hive.exec.max.created.files=100000

-- 当有空分区生成时，是否抛出异常

set hive.error.on.empty.partition=false

-- 打开正则查询模式 `(dt|hr)?+.+`

set hive.support.quoted.identifiers=none

上机练习

1 清洗超市数据 excel 为文本文件,按照分区字段对文件进行拆分

超市数据.xlsx:

import pandas as pd
XlsxFile="D:\智云大数据\数据源\超市\超市数据.xlsx"
XlsxRead=pd.read_excel(XlsxFile)
# 选择要分组的列
group_column = '细分'
# 按照指定列分组
grouped = XlsxRead.groupby(group_column)
# 遍历每个分组，并将每个分组保存为单独的 xlsx 文件
# group_name: 分组名称
# group_df: 分组数据
for group_name, group_df in grouped:
output_file = f'D:\智云大数据\{group_name}超市数据.txt'
group_df.to_csv(output_file,
header=False,index=False,sep='\t')
print("文件拆分完成")

2 创建一个分区表将文本文件分别插入到分区中

create table if not exists supermarket_p (id string, -- 行 ID
ord_id string comment '订单 ID',
ord_date string comment '订单日期',
exch_date string comment '发货日期',
exch_type string comment '邮寄方式',
cust_id string comment '客户 ID
',
cust_name string comment '客户名称',
d_type string comment '细分',
city string comment '城市',
prov string comment '省/自治区',
country string comment'国家',
area string comment '地区',
pro_id string comment '产品 ID',
type1 string comment '类别',
type2 string comment '子类别',
pro_name string comment '产品名称',
sales float comment '销售额',
count1 int comment '数量 ',
discount float comment '折扣 ',
profit float comment '利润'
)
partitioned by (cd_type string)
row format delimited fields terminated by '\t'
lines TERMINATED by '\n'load data local inpath '/root/公司超市数据.txt' into table
supermarket_p
partition(cd_type='company');
load data local inpath '/root/消费者超市数据.txt' into table supermarket_p
partition(cd_type='consumer');
load data local inpath '/root/小型企业超市数据.txt' into table
supermarket_p partition(cd_type='enterprise');

3 创建一个按订单时间分区的分区表 (year_ string, month string)

create table if not exists supermarket_p_ord_date (
id string, -- 行 ID
ord_id string comment '订单 ID',
--ord_date string comment '订单日期',
exch_date string comment '发货日期',
exch_type string comment '邮寄方式',
cust_id string comment '客户 ID
',
cust_name string comment '客户名称',
d_type string comment '细分',
city string comment '城市',
prov string comment '省/自治区',
country string comment'国家',
area string comment '地区',
pro_id string comment '产品 ID',
type1 string comment '类别',
type2 string comment '子类别',
pro_name string comment '产品名称',sales float comment '销售额',
count1 int comment '数量 ',
discount float comment '折扣 ',
profit float comment '利润'
)
partitioned by (ord_date_month string)
row format delimited fields terminated by '\t'
lines TERMINATED by '\n'

4 将第二部的数据使用动态分区的方法导入到第三步的表中

导入：

insert into table supermarket_p_ord_date partition(ord_date_month)
select id,ord_id,date_format(ord_date,'YYYY-MM') as
ord_date_month,exch_date,exch_type,cust_id,cust_name,d_type,city,prov,
country,area,pro_id,type1,
type2,pro_name,sales,count1,discount,profit
from supermarket_p;

今天的表

需要开启的

-开启动态分区(默认开启)
set hive.exec.dynamic.partition=true
--指定非严格模式 nonstrict 模式表示允许所有的分区字段都可以使用动态分区
set hive.exec.dynamic.partition.mode=nonstrict
--在所有执行 MR 的节点上，最大一共可以创建多少个动态分区。默认 1000
set hive.exec.max.dynamic.partitions=1000
--在每个执行 MR 的节点上，最大可以创建多少个动态分区(分区字段有多少种设多少个)
set hive.exec.max.dynamic.partitions.pernode=100
--整个 MR
Job 中，最大可以创建多少个 HDFS 文件。默认 100000
set hive.exec.max.created.files=100000
--当有空分区生成时，是否抛出异常
set hive.error.on.empty.partition=false
--打开正则查询模式`(dt|hr)?+.+`
set hive.support.quoted.identifiers=none