#HIVE
#hive启动模式 在/opt/software/hive312/bin目录下启动
[root@single01 ~]# cd /opt/software/hive312/bin
#首先启动元数据服务
nohup hive --service metastore 1>/dev/null 2>&1 &
#1、方法一 hive客户端
hive
#2、方法二 基于metastore和hiveserver2的beeline
hdfs dfsadmin -safemode get
#启动hiveserver2服务
nohup hive --service hiveserver2 1>/dev/null 2>&1 &
#通过beeline启动hive客户端
beeline -u jdbc:hive2://localhost:10000
beeline -u jdbc:hive2://single01:10000
beeline -u jdbc:hive2://192.168.43.200:10000
#是一套根据客户需求,集合各种大数据组件工具对客户数据进行处理,管理,治理的方案
#见图C:\Users\86182\Pictures\pictures\hive_warehouse.png
给与权限
[root@single01 ~]# hdfs dfs -chown -R root:supergroup /hive312
[root@single01 ~]# hdfs dfs -chmod -R 777 /tmp
数据仓库
概念:
数据仓库(Data Warehouse,简称数仓、DW),是一个用于存储、分析、报告的数据系统。
数据仓库的目的是构建面向分析的集成化数据环境,分析结果为企业提供决策支持。
数据仓库本身并不“生产”任何数据,其数据来源于不同外部系统;
# 数据类型
#primitive_type #java
TINYINT byte
SMALLINT short
INT int*
BIGINT long*
BOOLEAN boolean
FLOAT float
DOUBLE double
DOUBLE PRECISION double
STRING String*
BINARY
TIMESTAMP date*
DECIMAL
DECIMAL(precision,scale) BigDecimal*
DATE date*
VARCHAR String
CHAR String
复杂类型:
array_type
:ARRAY
map_type
: MAP
struct_type #结构体
:STRUCT
union_type #联合类型
:UNIONTYPE
#数据表
explain select * from student
create [temporary][external] table [if not exists][dbname.]tabname
[(
colname data_type [comment col_comment],
...,
[constraint_specification]
)]
[comment table_comment] //表注释
[partitioned by (external_colname data_type [comment col_comment],...)]
//分区 表里的字段名不能拿出来分区
[clustered by (colname,...) [sorted by (colname ASC|DESC,...)] into num_buckets buckets]
//分桶(方便抽样查看数据的倾斜,对事务级处理优化节约内存资源) 表里固有的字段。 可以排序,分桶字段和排序字段可以不是同一个字段
//num_buckets 是提供一个数字给分桶用来取余数分的
[skewed by (colname,colname,...) on ((colvalue,...),(colvalue,...),...) [stored as directories]]
//表里的字段存在倾斜 字段在哪些值上 hive内部自动优化
[rowformat row_format]
row format delimited //指定行格式化分隔符
fields terminated by ',' //指定字段之间分隔符
collection items terminated by ';' //指定集合项之间分隔符
map keys terminated by ':' //指定键值之间分隔符
lines terminated by '\n' //指定行结束符
[stored as file_format | stored by 'storage.handler.classname' [with serdeproperties]] //字段关联
[location hdfs_path]
[tblproperties(key=value,...)]表属性
★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★
row_format
row format delimited //指定行格式化分隔符
fields terminated by ',' //指定字段之间分隔符
collection items terminated by ';' //指定集合项之间分隔符
map keys terminated by ':' //指定键值之间分隔符
lines terminated by '\n' //指定行结束符
file_format:
SEQUENCEFILE //二进制文件
TEXTFILE ★//默认用这个
RCFILE
ORC ★//支持压缩 snappy,lzo
PARQUET
AVRO
INPUTFORMAT input_format_classname
//数据类型 描述 实例
STRUCT 类似于java中的pojo对象,以结构 struct
化的多个属性组成一个共有的数据。
MAP 类似于java中的map,表示 map
键值对组成的元组集合。
ARRAY 类似于java中的数组,表示一组具有 array["A","B","C"]
相同类型和名称的变量集合。数组中
的对象可以通过一个从0开始的index
直接访问。
//举例
//1.有如下的json格式文件
{
"name":"roy",
"friends": ["bob","john"], //列表 Array,
"children": { //键值 Map,
"yula":6,
"sophia":17
}
"address": { //结构 Struct,
"street":"hongshan",
"city":"nanjing"
}
}
//2.转换格式写进文件
roy,bob_john,yula:6_sophia:17,hongshan_nanjing
mike,lea_jacky,rita:7_king:20,chaoyang_beijing
//3.建表
-- 3. create the table for the flattened JSON above (fields ',', collection '_', map keys ':')
-- the angle-bracket type parameters were stripped from the original notes; restored here
create table test3(
name string,
friends array<string>,
children map<string,int>,
address struct<street:string,city:string>
)
row format delimited
fields terminated by ','
collection items terminated by '_'
map keys terminated by ':';
1、//建表
-- 1. external table: data lives at the given HDFS location; DROP TABLE keeps the files
-- complex-type parameters (<...>) were stripped from the original notes; restored from the
-- queries below (stuaddress.province/city/district) and the sample data rows
create external table kb16.student_ext(
stuname string,
stuage int,
stugender string,
stuhobby array<string>,
stuscore map<string,int>,
stuaddress struct<province:string,city:string,district:string>
)
row format delimited
fields terminated by ','
collection items terminated by '|'
map keys terminated by ':'
lines terminated by '\n'
stored as textfile
location '/test/hive/student_ext' ;
2.[root@single01 ~]#vim student_ext.txt //编辑文件
henry,18,m,sing|dance|read,java:88|mysql:67,js|nj|xw
poly,16,f,sing|eat|read,java:88|mysql:76,js|nj|xz
jack,28,m,carton|dance|read,java:90|mysql:90,js|yc|jh
[root@single01 ~]# hdfs dfs -put student_ext.txt /test/hive/student_ext //上传到hdfs上
修改表结构:
-- reposition stuaddress after stuscore; the struct type parameters were stripped from the
-- original notes and are restored from the table definition
alter table student_ext change stuaddress stuaddress struct<province:string,city:string,district:string>
after stuscore;
1.1 内部表MANAGED_TABLE 外部表EXTERNAL_TABLE
create [external]table
1.2create ... [as] select
-- CTAS: new table whose schema and rows come from the query result
create table kb16.student_copy
as
select * from kb16.student_ext
#------------------------------------------------------------------------------------------
增加自增主键
mysql> alter table tags add column id int(14) primary key auto_increment;
MySQL 给查询结果增加序号列
SELECT m.name, (@a :=@a + 1) a FROM sys_menu m, (SELECT @a := 0) t1
@a 在mysql中是声明变量
:= 在mysql中是给变量赋值
t1表其实就一条a=0的数据,并列查询,给变量a递增赋值
mysql> create table act select a.*,@id:=@id+1 id from Activity a,(select @id:=0) t;
#----------------------------------------------------------------------------------------
1.3create ... like existed_table
create table kb16.student_like
like
kb16.student_ext
4、create partitioned table
-- raw ratings over CSV; skip.header.line.count=1 makes Hive ignore the file's header row
create external table kb16.user_movie_rating(
userid bigint,
movieid bigint,
rating decimal(2,1),
`timestamp` bigint
)
row format delimited
fields terminated by ','
location '/test/hive/user_movie_rating'
tblproperties ("skip.header.line.count"="1");
-- partitioned variant: dt is a partition column, not a data-file column
create external table kb16.user_movie_rating_par(
userid bigint,
movieid bigint,
rating decimal(2,1),
`timestamp` bigint
)
partitioned by(dt string)
row format delimited
fields terminated by ',';
开启动态分区(默认已开启)
0: jdbc:hive2://192.168.43.200:10000> set hive.exec.dynamic.partition;
+-----------------------------------+
| set |
+-----------------------------------+
| hive.exec.dynamic.partition=true |
将动态分区设置为非严格模式
0: jdbc:hive2://192.168.43.200:10000> set hive.exec.dynamic.partition.mode;
+------------------------------------------+
| set |
+------------------------------------------+
| hive.exec.dynamic.partition.mode=strict |
+------------------------------------------+
1 row selected (0.007 seconds)
0: jdbc:hive2://192.168.43.200:10000> set hive.exec.dynamic.partition.mode=nonstrict;
No rows affected (0.005 seconds)
0: jdbc:hive2://192.168.43.200:10000> set hive.exec.dynamic.partition.mode;
+---------------------------------------------+
| set |
+---------------------------------------------+
| hive.exec.dynamic.partition.mode=nonstrict |
+---------------------------------------------+
##查询最大日期和最小日期之间相隔多少个月
-- months between the newest and oldest rating; the Hive function is months_between
-- (plural) — month_between does not exist
select
months_between(maxdt,mindt) monthdiff
from(
select max(from_unixtime(`timestamp`)) maxdt,min(from_unixtime(`timestamp`)) mindt
from kb16.user_movie_rating
)A;
set hive.exec.max.dynamic.partitions;
=>1000
set hive.exec.max.dynamic.partitions.pernode;
=>100
#默认每节点只允许100个动态分区,按月分区超过100,需调大
set hive.exec.max.dynamic.partitions.pernode=1000;
-- load the partitioned table; dt ('yyyy-MM' from the epoch seconds) must be the last
-- select column. Hive syntax is either INSERT OVERWRITE TABLE or INSERT INTO TABLE —
-- "insert overwrite into table" is invalid
insert overwrite table kb16.user_movie_rating_par partition(dt)
select userid,movieid,rating,`timestamp`,date_format(from_unixtime(`timestamp`),'yyyy-MM') dt
from user_movie_rating;
drop table user_movie_rating;
select * from user_movie_rating limit 10
tblproperties ("skip.header.line.count"="1");//Hive导入数据时忽略文件第一行
-- date helpers: epoch seconds -> timestamp string, current epoch, month bucket
select from_unixtime( 1641521500 );
select unix_timestamp(); -- parentheses required: unix_timestamp is a function, not a keyword
select date_format(from_unixtime( 1641521500 ),'yyyy-MM');
#一年作为分区
-- yearly-partition variant: dt = 'yyyy' from the epoch timestamp
-- (the first statement was missing its terminator before the preview query)
insert overwrite table user_movie_rating_par partition(dt)
select U.*,date_format(from_unixtime(U.`timestamp`),'yyyy') from user_movie_rating U;
select U.*,date_format(from_unixtime(U.`timestamp`),'yyyy') from user_movie_rating U limit 10;
#hive 分区分桶表
-- partitioned + bucketed: 5 buckets on `timestamp`, rows sorted ascending within each bucket
create external table user_movie_rating_par_bucket(
userid bigint,
movieid bigint,
rating decimal(2,1),
`timestamp` bigint
)
partitioned by (years int)
clustered by (`timestamp`) sorted by (`timestamp` ASC) into 5 buckets
row format delimited fields terminated by ',' ;
-- NOTE(review): byte-for-byte repeat of the CREATE TABLE directly above; running it a
-- second time would fail unless the table is dropped first or IF NOT EXISTS is added
create external table user_movie_rating_par_bucket(
userid bigint,
movieid bigint,
rating decimal(2,1),
`timestamp` bigint
)
partitioned by (years int)
clustered by (`timestamp`) sorted by (`timestamp` ASC) into 5 buckets
row format delimited fields terminated by ',';
insert overwrite table user_movie_rating_par_bucket partition(years) select *,pmod(cast(date_format(from_unixtime(`timestamp`),'yyyy') as int),5) years from user_movie_rating;
#example
#*************************************************************************
-- managed (internal) table demo plus multi-row INSERT ... VALUES
create table kb16.student(
stuname string,
stuage int,
stugender string
);
insert into kb16.student(stuname,stuage,stugender) values('heney',18,'m'),('poly',18,'f'),('hen',18,'s');
0: jdbc:hive2://192.168.43.200:10000> desc formatted kb16.student;
mysql> show tables;
mysql> desc DBS; #数据库表详情
mysql> select * from DBS; #查询数据库
mysql> select * from TBLS; #查询表
mysql> select * from COLUMNS_V2; #查询字段
3、搜索语句
select * from student_ext;
-- complex-type access: array by index, map by key, struct by field name;
-- array_contains filters rows whose stuhobby array holds 'read'
select
stuname,stuage,stugender,
stuhobby[0] hobby1,stuhobby[1] hobby2,stuhobby[2] hobby3,
stuscore['java'] javascore,stuscore['mysql'] mysqlscore,
stuaddress.province province,
stuaddress.city city,
stuaddress.district district
from kb16.student_ext
where array_contains(stuhobby,'read');
+----------+---------+------------+---------+---------+---------+------------+-------------+-----------+-------+-----------+
| stuname | stuage | stugender | hobby1 | hobby2 | hobby3 | javascore | mysqlscore | province | city | district |
+----------+---------+------------+---------+---------+---------+------------+-------------+-----------+-------+-----------+
| henry | 18 | m | sing | dance | read | 88 | 67 | js | nj | xw |
| poly | 16 | f | sing | eat | read | 88 | 76 | js | nj | xz |
| jack | 28 | m | carton | dance | read | 90 | 90 | js | yc | jh |
+----------+---------+------------+---------+---------+---------+------------+-------------+-----------+-------+-----------+
-- same projection, filtered on a struct field (stuaddress.city)
select
stuname,stuage,stugender,
stuhobby[0] hobby1,stuhobby[1] hobby2,stuhobby[2] hobby3,
stuscore['java'] javascore,stuscore['mysql'] mysqlscore,
stuaddress.province province,
stuaddress.city city,
stuaddress.district district
from kb16.student_ext
where stuaddress.city='nj';
+----------+---------+------------+---------+---------+---------+------------+-------------+-----------+-------+-----------+
| stuname | stuage | stugender | hobby1 | hobby2 | hobby3 | javascore | mysqlscore | province | city | district |
+----------+---------+------------+---------+---------+---------+------------+-------------+-----------+-------+-----------+
| henry | 18 | m | sing | dance | read | 88 | 67 | js | nj | xw |
| poly | 16 | f | sing | eat | read | 88 | 76 | js | nj | xz |
+----------+---------+------------+---------+---------+---------+------------+-------------+-----------+-------+-----------+



