#HIVE
#hive启动模式 在/opt/software/hive312/bin目录下启动
[root@single01 ~]# cd /opt/software/hive312/bin
#首先启动元数据服务
nohup hive --service metastore 1>/dev/null 2>&1 &
#1、方法一 hive客户端
hive
#2、方法二 基于metastore和hiveserver2的beeline
hdfs dfsadmin -safemode get
#启动hiveserver2服务
nohup hive --service hiveserver2 1>/dev/null 2>&1 &
#通过beeline启动hive客户端
beeline -u jdbc:hive2://localhost:10000
beeline -u jdbc:hive2://single01:10000
beeline -u jdbc:hive2://192.168.43.200:10000
#是一套根据客户需求,集合各种大数据组件工具对客户数据进行处理,管理,治理的方案
#见图C:\Users\86182\Pictures\pictures\hive_warehouse.png
给与权限
[root@single01 ~]# hdfs dfs -chown -R root:supergroup /hive312
[root@single01 ~]# hdfs dfs -chmod -R 777 /tmp
数据仓库
概念:
数据仓库(Data Warehouse,简称数仓、DW),是一个用于存储、分析、报告的数据系统。
数据仓库的目的是构建面向分析的集成化数据环境,分析结果为企业提供决策支持。
数据仓库本身并不“生产”任何数据,其数据来源于不同外部系统;
# 数据类型
#primitive_type #java
TINYINT byte
SMALLINT short
INT int*
BIGINT long*
BOOLEAN boolean
FLOAT float
DOUBLE double
DOUBLE PRECISION double
STRING String*
BINARY
TIMESTAMP date*
DECIMAL
DECIMAL(precision,scale) BigDecimal*
DATE date*
VARCHAR String
CHAR String
复杂类型:
array_type
:ARRAY
map_type
: MAP
struct_type #结构体
:STRUCT
union_type #联合类型
:UNIONTYPE
#数据表
explain select * from student
create [temporary][external] table [if not exists][dbname.]tabname
[(
colname data_type [comment col_comment],
...,
[constraint_specification]
)]
[comment table_comment] //表注释
[partitioned by (external_colname data_type [comment col_comment],...)]
//分区 表里的字段名不能拿出来分区
[clustered by (colname,...) [sorted by (colname ASC|DESC,...)] into num_buckets buckets]
//分桶(方便抽样查看数据的倾斜,对事务级处理优化节约内存资源) 表里固有的字段。 可以排序,分桶字段和排序字段可以不是同一个字段
//num_buckets 是提供一个数字给分桶用来取余数分的
[skewed by (colname,colname,...) on ((colvalue,...),(colvalue,...),...) [stored as directories]]
//表里的字段存在倾斜 字段在哪些值上 hive内部自动优化
[rowformat row_format]
row format delimited //指定行格式化分隔符
fields terminated by ',' //指定字段之间分隔符
collection items terminated by ';' //指定集合项之间分隔符
map keys terminated by ':' //指定键值之间分隔符
lines terminated by '\n' //指定行结束符
[stored as file_format | stored by 'storage.handler.classname' [with serdeproperties]] //字段关联
[location hdfs_path]
[tblproperties(key=value,...)]表属性
★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★
row_format
row format delimited //指定行格式化分隔符
fields terminated by ',' //指定字段之间分隔符
collection items terminated by ';' //指定集合项之间分隔符
map keys terminated by ':' //指定键值之间分隔符
lines terminated by '\n' //指定行结束符
file_format:
SEQUENCEFILE //二进制文件
TEXTFILE ★//默认用这个
RCFILE
ORC ★//支持压缩 snappy,lzo
PARQUET
AVRO
INPUTFORMAT input_format_classname
//数据类型 描述 实例
STRUCT 类似于java中的pojo对象,以结构 struct
化的多个属性组成一个共有的数据。
MAP 类似于java中的map,表示 map
键值对组成的元组集合。
ARRAY 类似于java中的数组,表示一组具有 array["A","B","C"]
相同类型和名称的变量集合。数组中
的对象可以通过一个从0开始的index
直接访问。
//举例
//1.有如下的json格式文件
{
"name":"roy",
"friends": ["bob","john"], //列表 Array,
"children": { //键值 Map,
"yula":6,
"sophia":17
}
"address": { //结构 Struct,
"street":"hongshan",
"city":"nanjing"
}
}
//2.转换格式写进文件
roy,bob_john,yula:6_sophia:17,hongshan_nanjing
mike,lea_jacky,rita:7_king:20,chaoyang_beijing
//3.建表
-- 3. create the table for the flattened JSON above (fields ',', collection '_', map keys ':')
-- the angle-bracket type parameters were stripped from the original notes; restored here
create table test3(
name string,
friends array<string>,
children map<string,int>,
address struct<street:string,city:string>
)
row format delimited
fields terminated by ','
collection items terminated by '_'
map keys terminated by ':';
1、//建表
-- 1. external table: data lives at the given HDFS location; DROP TABLE keeps the files
-- complex-type parameters (<...>) were stripped from the original notes; restored from the
-- queries below (stuaddress.province/city/district) and the sample data rows
create external table kb16.student_ext(
stuname string,
stuage int,
stugender string,
stuhobby array<string>,
stuscore map<string,int>,
stuaddress struct<province:string,city:string,district:string>
)
row format delimited
fields terminated by ','
collection items terminated by '|'
map keys terminated by ':'
lines terminated by '\n'
stored as textfile
location '/test/hive/student_ext' ;
2.[root@single01 ~]#vim student_ext.txt //编辑文件
henry,18,m,sing|dance|read,java:88|mysql:67,js|nj|xw
poly,16,f,sing|eat|read,java:88|mysql:76,js|nj|xz
jack,28,m,carton|dance|read,java:90|mysql:90,js|yc|jh
[root@single01 ~]# hdfs dfs -put student_ext.txt /test/hive/student_ext //上传到hdfs上
修改表结构:
-- reposition stuaddress after stuscore; the struct type parameters were stripped from the
-- original notes and are restored from the table definition
alter table student_ext change stuaddress stuaddress struct<province:string,city:string,district:string>
after stuscore;
1.1 内部表MANAGED_TABLE 外部表EXTERNAL_TABLE
create [external]table
1.2create ... [as] select
-- CTAS: new table whose schema and rows come from the query result
create table kb16.student_copy
as
select * from kb16.student_ext
#------------------------------------------------------------------------------------------
增加自增主键
mysql> alter table tags add column id int(14) primary key auto_increment;
MySQL 给查询结果增加序号列
SELECT m.name, (@a :=@a + 1) a FROM sys_menu m, (SELECT @a := 0) t1
@a 在mysql中是声明变量
:= 在mysql中是给变量赋值
t1表其实就一条a=0的数据,并列查询,给变量a递增赋值
mysql> create table act select a.*,@id:=@id+1 id from Activity a,(select @id:=0) t;
#----------------------------------------------------------------------------------------
1.3create ... like existed_table
create table kb16.student_like
like
kb16.student_ext
4、create partitioned table
-- raw ratings over CSV; skip.header.line.count=1 makes Hive ignore the file's header row
create external table kb16.user_movie_rating(
userid bigint,
movieid bigint,
rating decimal(2,1),
`timestamp` bigint
)
row format delimited
fields terminated by ','
location '/test/hive/user_movie_rating'
tblproperties ("skip.header.line.count"="1");
-- partitioned variant: dt is a partition column, not a data-file column
create external table kb16.user_movie_rating_par(
userid bigint,
movieid bigint,
rating decimal(2,1),
`timestamp` bigint
)
partitioned by(dt string)
row format delimited
fields terminated by ',';
开启动态分区(默认已开启)
0: jdbc:hive2://192.168.43.200:10000> set hive.exec.dynamic.partition;
+-----------------------------------+
| set |
+-----------------------------------+
| hive.exec.dynamic.partition=true |
将动态分区设置为非严格模式
0: jdbc:hive2://192.168.43.200:10000> set hive.exec.dynamic.partition.mode;
+------------------------------------------+
| set |
+------------------------------------------+
| hive.exec.dynamic.partition.mode=strict |
+------------------------------------------+
1 row selected (0.007 seconds)
0: jdbc:hive2://192.168.43.200:10000> set hive.exec.dynamic.partition.mode=nonstrict;
No rows affected (0.005 seconds)
0: jdbc:hive2://192.168.43.200:10000> set hive.exec.dynamic.partition.mode;
+---------------------------------------------+
| set |
+---------------------------------------------+
| hive.exec.dynamic.partition.mode=nonstrict |
+---------------------------------------------+
##查询最大日期和最小日期之间相隔多少个月
-- months between the newest and oldest rating; the Hive function is months_between
-- (plural) — month_between does not exist
select
months_between(maxdt,mindt) monthdiff
from(
select max(from_unixtime(`timestamp`)) maxdt,min(from_unixtime(`timestamp`)) mindt
from kb16.user_movie_rating
)A;
set hive.exec.max.dynamic.partitions;
=>1000
set hive.exec.max.dynamic.partitions.pernode;
=>100
#默认每节点只允许100个动态分区,按月分区超过100,需调大
set hive.exec.max.dynamic.partitions.pernode=1000;
-- load the partitioned table; dt ('yyyy-MM' from the epoch seconds) must be the last
-- select column. Hive syntax is either INSERT OVERWRITE TABLE or INSERT INTO TABLE —
-- "insert overwrite into table" is invalid
insert overwrite table kb16.user_movie_rating_par partition(dt)
select userid,movieid,rating,`timestamp`,date_format(from_unixtime(`timestamp`),'yyyy-MM') dt
from user_movie_rating;
drop table user_movie_rating;
select * from user_movie_rating limit 10
tblproperties ("skip.header.line.count"="1");//Hive导入数据时忽略文件第一行
-- date helpers: epoch seconds -> timestamp string, current epoch, month bucket
select from_unixtime( 1641521500 );
select unix_timestamp(); -- parentheses required: unix_timestamp is a function, not a keyword
select date_format(from_unixtime( 1641521500 ),'yyyy-MM');
#一年作为分区
-- yearly-partition variant: dt = 'yyyy' from the epoch timestamp
-- (the first statement was missing its terminator before the preview query)
insert overwrite table user_movie_rating_par partition(dt)
select U.*,date_format(from_unixtime(U.`timestamp`),'yyyy') from user_movie_rating U;
select U.*,date_format(from_unixtime(U.`timestamp`),'yyyy') from user_movie_rating U limit 10;
#hive 分区分桶表
-- partitioned + bucketed: 5 buckets on `timestamp`, rows sorted ascending within each bucket
create external table user_movie_rating_par_bucket(
userid bigint,
movieid bigint,
rating decimal(2,1),
`timestamp` bigint
)
partitioned by (years int)
clustered by (`timestamp`) sorted by (`timestamp` ASC) into 5 buckets
row format delimited fields terminated by ',' ;
-- NOTE(review): byte-for-byte repeat of the CREATE TABLE directly above; running it a
-- second time would fail unless the table is dropped first or IF NOT EXISTS is added
create external table user_movie_rating_par_bucket(
userid bigint,
movieid bigint,
rating decimal(2,1),
`timestamp` bigint
)
partitioned by (years int)
clustered by (`timestamp`) sorted by (`timestamp` ASC) into 5 buckets
row format delimited fields terminated by ',';
insert overwrite table user_movie_rating_par_bucket partition(years) select *,pmod(cast(date_format(from_unixtime(`timestamp`),'yyyy') as int),5) years from user_movie_rating;
#example
#*************************************************************************
-- managed (internal) table demo plus multi-row INSERT ... VALUES
create table kb16.student(
stuname string,
stuage int,
stugender string
);
insert into kb16.student(stuname,stuage,stugender) values('heney',18,'m'),('poly',18,'f'),('hen',18,'s');
0: jdbc:hive2://192.168.43.200:10000> desc formatted kb16.student;
mysql> show tables;
mysql> desc DBS; #数据库表详情
mysql> select * from DBS; #查询数据库
mysql> select * from TBLS; #查询表
mysql> select * from COLUMNS_V2; #查询字段
3、搜索语句
select * from student_ext;
-- complex-type access: array by index, map by key, struct by field name;
-- array_contains filters rows whose stuhobby array holds 'read'
select
stuname,stuage,stugender,
stuhobby[0] hobby1,stuhobby[1] hobby2,stuhobby[2] hobby3,
stuscore['java'] javascore,stuscore['mysql'] mysqlscore,
stuaddress.province province,
stuaddress.city city,
stuaddress.district district
from kb16.student_ext
where array_contains(stuhobby,'read');
+----------+---------+------------+---------+---------+---------+------------+-------------+-----------+-------+-----------+
| stuname | stuage | stugender | hobby1 | hobby2 | hobby3 | javascore | mysqlscore | province | city | district |
+----------+---------+------------+---------+---------+---------+------------+-------------+-----------+-------+-----------+
| henry | 18 | m | sing | dance | read | 88 | 67 | js | nj | xw |
| poly | 16 | f | sing | eat | read | 88 | 76 | js | nj | xz |
| jack | 28 | m | carton | dance | read | 90 | 90 | js | yc | jh |
+----------+---------+------------+---------+---------+---------+------------+-------------+-----------+-------+-----------+
-- same projection, filtered on a struct field (stuaddress.city)
select
stuname,stuage,stugender,
stuhobby[0] hobby1,stuhobby[1] hobby2,stuhobby[2] hobby3,
stuscore['java'] javascore,stuscore['mysql'] mysqlscore,
stuaddress.province province,
stuaddress.city city,
stuaddress.district district
from kb16.student_ext
where stuaddress.city='nj';
+----------+---------+------------+---------+---------+---------+------------+-------------+-----------+-------+-----------+
| stuname | stuage | stugender | hobby1 | hobby2 | hobby3 | javascore | mysqlscore | province | city | district |
+----------+---------+------------+---------+---------+---------+------------+-------------+-----------+-------+-----------+
| henry | 18 | m | sing | dance | read | 88 | 67 | js | nj | xw |
| poly | 16 | f | sing | eat | read | 88 | 76 | js | nj | xz |
+----------+---------+------------+---------+---------+---------+------------+-------------+-----------+-------+-----------+



