hive学习_大数据系统

hive学习

之前发过如何使用idea连接hive，连接上hive之后肯定是要去使用hive执行一些操作了，这里整理了一些HQL操作。值得注意的是，在我执行hive的时候，有时候会报错[08S01][2] Error while processing statement: FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exec.mr.MapRedTask。我看了网上的一些说法，说是mr把资源耗尽，具体啥bug我也不想深究，毕竟也不是运维，但是我亲身实践过重启一次hive就好了。

hive是一个数据仓库工具，能将结构化的文件映射成表，并提供SQL的方式供开发人员运行OLAP任务，用户写的SQL在底层会转换为map-reduce任务得到结果。


-- Create a database only if it does not already exist.
create database if not exists myhive;

-- Create a database whose data is stored at an explicit HDFS location.
create database if not exists myhive2 location '/myhive2';

-- Describe a database.
desc database myhive2;

-- List all databases.
show databases;

-- Describe a database with extended detail.
desc database extended myhive2;

-- Switch to a database; it must be selected with `use` before working
-- with the tables it contains.
use myhive;

-- Create a table.
create table if not exists stu (id int, name string);

-- Insert a single row.
insert into stu values (1, "zhangsan");

-- Insert several rows in one statement.
insert into stu values (1, "zhangsan"), (2, "lisi");

-- Select every row of the table.
select * from stu;

-- Create a text-file table whose fields are separated by a tab character
-- and whose data is stored on HDFS under /user/stu2.
create table if not exists
    stu2 (id int, name string)
    row format delimited
    fields terminated by '\t'
    stored as textfile location '/user/stu2';

-- Describe the table in detail.
desc formatted stu2;

-- Show the statement that creates the table.
show create table stu2;

-- Create an external table with tab-separated fields.
create external table if not exists
    ext_stu (s_id string, s_name string)
    row format delimited
    fields terminated by '\t';

-- Load a file from the local filesystem into the table; this is equivalent
-- to uploading the local file into the table's directory on HDFS.
load data local inpath '/home/wxwmd/1.txt' into table ext_stu;

-- Load a file that is already on HDFS into the table.
load data inpath '/tmp/1.txt' into table ext_stu;

-- Drop the table.
drop table if exists stu2;

-- Create a table partitioned by month.
create table if not exists score (s_id string, s_score int) partitioned by (month string);

-- Change the table's SerDe properties so that fields are tab-separated.
alter table score set serdeproperties ('field.delim' = '\t', 'serialization.format' = '\t');

-- Load data into the partitioned table, into partition 202201.
load data local inpath '/home/wxwmd/score.txt' into table score partition (month = '202201');

-- Load the same file again, this time into partition 202202.
load data local inpath '/home/wxwmd/score.txt' into table score partition (month = '202202');

-- List the table's partitions.
show partitions score;

-- Add new (empty) partitions.
alter table score add partition (month = '202203') partition (month = '202204');

-- Drop partitions.
alter table score drop partition (month = '202203');
alter table score drop partition (month = '202204');

-- Query a partitioned table, filtering on the partition column.
select * from score where month = '202201';

-- Select the Hive execution engine (MapReduce here).
set hive.execution.engine=mr;

-- Insert a row directly into one partition of the table.
insert into table score partition (month = '202201') values ('wll', 100);

-- Create an external table with tab-separated fields over an explicit
-- HDFS directory.
create external table if not exists
    score2 (s_id string, s_score int)
    row format delimited
    fields terminated by '\t'
    location '/score/202201';

-- Enable Hive's bucketing enforcement.
set hive.enforce.bucketing=true;

-- Set the number of reduce tasks: bucketing is carried out by the reduce
-- tasks, each of which writes its output to one file.
set mapreduce.job.reduces=3;

-- Create a table bucketed on s_id into 3 buckets.
create table if not exists bucket_stu (s_id int, s_name string)
    clustered by (s_id) into 3 buckets;

-- Populate the bucketed table from ext_stu; the bucketing column is named
-- in the trailing `cluster by`.
insert overwrite table bucket_stu
select * from ext_stu cluster by (s_id);

-- Export a Hive table to a directory on HDFS.
export table ext_stu to '/export/ext_stu';

-- Write a query result to a local directory, formatted with tab-separated
-- fields and '#'-separated collection items.
insert overwrite
    local directory '/home/wxwmd/export/ext_stu'
    row format delimited
    fields terminated by '\t'
    collection items terminated by '#'
select * from ext_stu;

-- Write a query result to a directory on HDFS (no `local` keyword).
insert overwrite
    directory '/export/score'
    row format delimited
    fields terminated by '\t'
    collection items terminated by '#'
select * from score;


-- The simplest where clause.
select * from ext_stu where s_id=1;

-- group by with an aggregate.
select s_id, avg(s_score) from score group by s_id;

-- group by, filtered afterwards with having.
select s_id, sum(s_score) as SumScore
from score
group by s_id
having SumScore > 120;

-- Join score with ext_stu.
-- NOTE(review): the join compares score.s_id with es.s_name rather than
-- es.s_id; confirm against the actual data that this is intended.
select * from score join ext_stu es on score.s_id = es.s_name;

-- Global ordering of the aggregated result with order by.
select s_id, sum(s_score) as SumScore
from score
group by s_id
order by SumScore desc;

-- Set the number of reduce tasks.
set mapreduce.job.reduces=3;
-- Show the currently configured number of reduce tasks.
set mapreduce.job.reduces;
-- sort by: local ordering (sorted within each reducer's output).
select s_id, s_score from score sort by s_score desc;

-- distribute by s_id combined with sort by s_score.
select s_id, s_score from score distribute by s_id sort by s_score;

hive学习

大数据系统相关栏目本月热门文章