hive调优第一部分_大数据系统

1、Explain查看执行计划

explain可以查看执行计划

-- 创建大表
create table bigtable(id bigint,t bigint,uid string,keyword string,url_rank int,click_num int,click_url string
)
row format delimited 
fields terminated by 't';

-- 创建小表
create table smalltable(id bigint,t bigint,uid string,keyword string,url_rank int,click_num int,click_url string
)
row format delimited 
fields terminated by 't';

-- 创建JOIN后表
create table jointable(
id bigint,t bigint,uid string,keyword string,
url_rank int,click_num int,click_url string
)
row format delimited 
fields terminated by 't';

使用explain

-- 1、没有走MR的HQL
explain select * from bigtable;  -- 这个HQL我们并没有走MR

hive (test)> explain select * from bigtable; 
OK
STAGE DEPENDENCIES: -- 阶段的依赖关系
  Stage-0 is a root stage -- 根阶段(也就是阶段0)

STAGE PLANS: -- 阶段计划
  Stage: Stage-0 -- 阶段0
    Fetch Operator -- Stage-0是一个抓取操作
      limit: -1 -- 说明并没有做limit限制
      Processor Tree: -- 操作树
        TableScan -- 扫描的表
          alias: bigtable -- 表bigtable
          Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
          Select Operator -- 查询列
            expressions: id (type: bigint), t (type: bigint), uid (type: string), keyword (type: string), url_rank (type: int), click_num (type: int), click_url (type: string) -- 具体的查询的列
            outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6 -- 输出列名，自起的
            Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
            ListSink

Time taken: 0.4 seconds, Fetched: 17 row(s)

-- 2、走MR的HQL
explain select click_url,count(*) ct from bigtable group by click_url;

hive (test)> explain select click_url,count(*) ct from bigtable group by click_url;
OK
STAGE DEPENDENCIES: -- 依赖的阶段
  Stage-1 is a root stage -- 阶段1，根阶段
  Stage-0 depends on stages: Stage-1 -- 阶段0，并且阶段0依赖阶段1

STAGE PLANS: -- 阶段计划
  Stage: Stage-1 -- 阶段1
    Map Reduce -- map reduce阶段
      Map Operator Tree: -- map阶段
          TableScan -- 扫描的表
            alias: bigtable -- 具体的表，bigtable
            Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE -- 表的状态
            Select Operator -- 查询操作
              expressions: click_url (type: string)  -- 查询的内容
              outputColumnNames: click_url -- 输出的列
              Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE -- 查询的状态
              Group By Operator -- 分组的操作
                aggregations: count()  -- 分组的操作时count()
                keys: click_url (type: string) -- 分组的字段
                mode: hash -- 采用的模式是hash
                outputColumnNames: _col0, _col1 -- 输出字段为两个
                Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE -- 状态
                Reduce Output Operator -- reduce的输出操作
                  key expressions: _col0 (type: string)
                  sort order: + -- 排序方式为正序
                  Map-reduce partition columns: _col0 (type: string) -- MR的分区字段，也就是第一个字段
                  Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
                  value expressions: _col1 (type: bigint)
      Reduce Operator Tree: -- reduce阶段
        Group By Operator -- 分组操作
          aggregations: count(VALUE._col0) -- 聚合操作
          keys: KEY._col0 (type: string) 
          mode: mergepartial
          outputColumnNames: _col0, _col1
          Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
          File Output Operator
            compressed: false
            Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
            table: 
                input format: org.apache.hadoop.mapred.TextInputFormat
                output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe

  Stage: Stage-0 -- 第二阶段，也是个抓取阶段
    Fetch Operator
      limit: -1 -- 没有限制抓取的行数
      Processor Tree: -- 操作树
        ListSink

Time taken: 0.163 seconds, Fetched: 48 row(s)

-- 3、查看的更加详细的信息
explain extended select * from bigtable;
explain extended select click_url,count(*) ct from bigtable group by click_url;

这种加了extended关键字的用法，在我们查看MR任务的时候基本不使用的，因为多出来的字段对我查看MR任务并没有帮助

2、hive建表优化

1、分区表

hive中的分区其实就是分目录，把一个大的数据集分割成一个个小的数据集，可以指定分区进行查询，可以很大的提高效率
分区的具体操作

分区表分为静态分区和动态分区

	-- 以下为静态分区
	-- 创建分区表
	create table dept_partiton(
  deptno int,
  dname string,
  loc string
  )
  partitioned  by (day string) -- 分区字段不能是表中已经存在的字段，而这个分区字段也称为伪类
  row format delimited fields terminated by 't';  
  
  -- 加载数据 ,这里必须要指定分区
  load data /opt/apps/data/20211112.log into table dept_partiton partition(day='20211112');
  
  -- 查询数据
  select * from dept_partition where day='20211112';
  
  select * from dept_partition where day='20211112'
  union 
  select * from dept_partition where day='20211113'
  union
  select * from dept_partition where day='20211114';
  
  -- 添加分区
  alter table dept_partiton add partition(day='20211115');
  
  alter table dept_partiton add partition(day='20211116') partition(day='20211117');
  
  -- 删除分区
  alter table dept_partition drop partition(day='20211115');
  
  alter table dept_partition drop partition(day='20211116'),partition(day='20211117');
  
  -- 查看分区表有多少分区
  show partitions dept_partition;
  
  -- 查看分区表结构
  desc formatted dept_partition;
  
  -- 二级分区
  create table dept_partition2(
  deptno int,
  dname string,
  loc string
  )
  partitioned by (day string,hour string)
  row format delimited fields terminated by 't';
  
  -- 二级分区加载数据
  load data local inpath '/opt/apps/1.log' into table dept_partition2 partition(day='20211112',hour='12');
  
  -- 二级分区查询数据
  select * from dept_partition2 where day='20211112' and hour='12';

-- 动态分区
-- 在进行动态分区的时候需要设置一些参数

-- 1、开启动态分区功能(m默认true，开启)
set hive.exec.dynamic.partition=true;

-- 2、设置非严格模式(动态分区模式，默认strict，表示必须指定至少一个分区为静态分区，nonstrist模式表示允许所有分区字段都可以使用动态分区)
set hive.exec.dynamic.partition.mode=nonstrict;

-- 3、在所有执行MR的节点上，最大一共可以创建多少个动态分区。默认1000
set hive.exec.max.dynamic.partitions=1000;

-- 4、在每个执行MR的节点上，最大可以创建多少个动态分区
-- 该参数需要根据实际的数据来设定。比如:源数据中包含了一年的数据,即day字段有365个值,那么该参数就需要设置成大于365,如果使用默认值 100，则会报错。
set hive.exec.max.dynamic.partiitons.pernode=100;

-- 5、整个MR Job中，最大可以创建多少个HDFS文件。默认100000
set hive.exec.max.created.files=100000

-- 6、当有空分区生成时，是否抛出异常。一般不需要设置，默认false
set hive.error.on.empty.partition=false


-- 创建动态分区表的时候需要有两个表，一个目标表，还有一个是原表

-- 需求，将dept表中的数据按照地区(loc字段)，插入到目标表dept_partition的相应分区中

-- 1、创建目标分区表(也就是动态分区表)
create table dept_partition_dy(id int,name string)
partition by (loc int) 
row format delimited fields terminated by 't';

-- 2、设置动态分区
set hive.exec.dynamic.partiiton.mode = nonstrict;

insert into table dept_partition_by partition(loc) select deptno,dname,loc from dept;

-- 3、查看目标分区表的分区情况
show partitions dept_partition;

2、分桶表

分区针对的是数据的存储路径，分桶针对的是数据文件

分通是对分区进行更加细粒度的划分

-- 创建一个分桶表
create table stu_buck(id int,name string)
clustered by(id) into 4 buckets
row format delimited fields terminated by 't';

-- 查看表结构
desc formatted stu_buck;

-- 导入数据到分桶表中，load方式
load data inpath '/stu.txt' into table stu_buck;

-- 查询分桶的数据
select * form stu_buck;

-- 分桶规则：
-- Hive的分桶采用对分桶字段的值进行哈希，然后除以桶的个数求余的方式决定该条记录存放在哪个桶中

-- 分桶操作需要注意的事项
   -- 1、reduce的个数设置为-1，让Job自行决定需要用多少个reduce或者将reduce的个数设置为大于等于分桶表的桶数
   -- 2、从hdfs中load数据到分桶表中，避免本地文件找不到问题
   -- 3、不要使用本地模式
   
-- insert 方式将数据导入分桶表
insert into table stu_buck select * from student_insert;

-- 抽样查询
select * from stu_buck tablesample(bucket 1 out of 4 on id);

-- 我们在使用分桶表的时候大多是进行抽样查询

3、合适的文件格式

Hive支持的存储数据的格式主要有：textfile，sequencefile，orc，parquet

行存储：textfile，sequencefile

列存储：orc，parquet

行存储的特点：如果经常用select * 适合用行存储

列存储：单个行进行查询效率高

1、textfile
	默认格式，数据不做压缩，磁盘开销大，数据解析开销大。可结合Gzip、Bzip2使用，但使用Gzip这种方式，hive不会对数据进行切分，从而无法对数据进行并行操作
	
2、Orc格式
	Orc是hive 0.11引入新的存储格式
	
3、Parquet格式
	Parquet文件是以二进制方式存储的，所有是不可以直接读取的，文件中包含该文件的数据和元数据。因此Parquet格式文件是自解析的

4、格式的压缩方式

压缩格式	hadoop 自带？	算法	文件扩展名	是否可切分	换成压缩格式后，原来的程序是否需要修改
DEFLATE	是，直接使用	DEFLATE	.deflate	否	和文本处理一样，不需要修改
Gzip	是，直接使用	DEFLATE	.gz	否	和文本处理一样，不需要修改
bzip2	是，直接使用	bzip2	.bz2	是	和文本处理一样，不需要修改
LZO	否，需要安装	LZO	.lzo	是	需要建索引，还需要指定输入格式
Snappy	否，需要安装	Snappy	.snappy	否	和文本处理一样，不需要修改

在工作中使用最多的是LZO和Snappy，因为他们都有着优秀的压缩能力和压缩效率

hive调优第一部分

大数据系统相关栏目本月热门文章