SparkSQL操作hudi
1、登录 · 2、创建普通表 · 3、创建分区表 · 4、从现有表创建表 · 5、用查询结果创建新表(CTAS) · 6、插入数据 · 7、查询数据 · 8、修改数据 · 9、合并数据 · 10、删除数据 · 11、覆盖写入 · 12、修改数据表 · 13、hudi分区命令
# 1. Login: launch the spark-sql shell with the Hudi bundle and the two Spark
#    configs Hudi requires (Kryo serializer + Hudi SQL extension).

# Spark 3.1
spark-sql \
  --packages org.apache.hudi:hudi-spark3.1.2-bundle_2.12:0.10.1,org.apache.spark:spark-avro_2.12:3.1.2 \
  --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
  --conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'

# Spark 3.0
spark-sql \
  --packages org.apache.hudi:hudi-spark3.0.3-bundle_2.12:0.10.1,org.apache.spark:spark-avro_2.12:3.0.3 \
  --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
  --conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
-- 2. Create non-partitioned tables.
-- (Spark SQL comments use '--'; the original '#' markers are not valid here.)

-- COW table with no primary key and no precombine field.
CREATE TABLE hudi_cow_nonpcf_tbl (
    uuid  INT,
    name  STRING,
    price DOUBLE
) USING hudi;

-- Non-partitioned MOR table with a primary key and a precombine field
-- (ts is used to pick the winner when two records share the same key).
CREATE TABLE hudi_mor_tbl (
    id    INT,
    name  STRING,
    price DOUBLE,
    ts    BIGINT
) USING hudi
TBLPROPERTIES (
    type = 'mor',
    primaryKey = 'id',
    preCombineField = 'ts'
);
-- 3. Create a partitioned COW table at an explicit storage location.
CREATE TABLE hudi_cow_pt_tbl (
    id   BIGINT,
    name STRING,
    ts   BIGINT,
    dt   STRING,
    hh   STRING
) USING hudi
TBLPROPERTIES (
    type = 'cow',
    primaryKey = 'id',
    preCombineField = 'ts'
)
PARTITIONED BY (dt, hh)
LOCATION '/tmp/hudi/hudi_cow_pt_tbl';
-- 4. Create tables over existing Hudi datasets
--    (no column list: the schema is read from the data at LOCATION).

-- Non-partitioned
CREATE TABLE hudi_existing_tbl0
USING hudi
LOCATION 'file:///tmp/hudi/dataframe_hudi_nonpt_table';

-- Partitioned
CREATE TABLE hudi_existing_tbl1
USING hudi
PARTITIONED BY (dt, hh)
LOCATION 'file:///tmp/hudi/dataframe_hudi_pt_table';
-- 5. Create tables from query results (CTAS).

-- CTAS into a non-partitioned COW table.
CREATE TABLE hudi_ctas_cow_nonpcf_tbl
USING hudi
TBLPROPERTIES (primaryKey = 'id')
AS SELECT 1 AS id, 'a1' AS name, 10 AS price;

-- CTAS into a partitioned COW table (dt is the partition column).
CREATE TABLE hudi_ctas_cow_pt_tbl
USING hudi
TBLPROPERTIES (type = 'cow', primaryKey = 'id', preCombineField = 'ts')
PARTITIONED BY (dt)
AS SELECT 1 AS id, 'a1' AS name, 10 AS price, 1000 AS ts, '2021-12-01' AS dt;

-- Managed parquet source table.
-- FIX: the original location literal was unterminated ('file:///tmp/parquet_dataset
-- with no closing quote or semicolon) — the statement could never parse.
CREATE TABLE parquet_mngd
USING parquet
LOCATION 'file:///tmp/parquet_dataset';

-- NOTE(review): sections 6-9 (插入数据/查询数据/修改数据/合并数据) listed in the
-- table of contents are missing from this extract — recover them from the source.
-- 10. Delete data.

-- Syntax (FIX: original template had inconsistent keyword casing 'WHERe'):
--   DELETE FROM tableIdentifier [ WHERE BOOL_expression ]

-- Example
DELETE FROM hudi_cow_nonpcf_tbl
WHERE uuid = 1;
-- 11. Insert overwrite (replaces existing data instead of upserting).

-- Non-partitioned table: overwrites the whole table.
INSERT OVERWRITE hudi_mor_tbl
SELECT 99, 'a99', 20.0, 900;

-- Partitioned table, dynamic partitioning: dt/hh values come from the SELECT,
-- so only the partitions present in the result are overwritten.
INSERT OVERWRITE TABLE hudi_cow_pt_tbl
SELECT 10, 'a10', 1100, '2021-12-09', '10';

-- Partitioned table, static partition spec: only (dt='2021-12-09', hh='12')
-- is overwritten; the SELECT supplies the non-partition columns.
INSERT OVERWRITE hudi_cow_pt_tbl PARTITION (dt = '2021-12-09', hh = '12')
SELECT 13, 'a13', 1100;
-- 12. Alter table.
-- (Syntax templates moved into comments so the section is runnable as-is.
--  FIX: the original comments said "列簇"/column family — these operate on
--  plain columns, not column families.)

-- Rename a table:
--   ALTER TABLE oldTableName RENAME TO newTableName
ALTER TABLE hudi_cow_nonpcf_tbl RENAME TO hudi_cow_nonpcf_tbl2;

-- Add columns:
--   ALTER TABLE tableIdentifier ADD COLUMNS(colAndType (,colAndType)*)
ALTER TABLE hudi_cow_nonpcf_tbl2 ADD COLUMNS (remark STRING);

-- Change a column's type:
--   ALTER TABLE tableIdentifier CHANGE COLUMN colName colName colType
ALTER TABLE hudi_cow_nonpcf_tbl2 CHANGE COLUMN uuid uuid BIGINT;

-- Set table properties:
--   ALTER TABLE tableIdentifier SET TBLPROPERTIES (key = 'value')
ALTER TABLE hudi_cow_nonpcf_tbl2 SET TBLPROPERTIES (hoodie.keep.max.commits = '10');
-- 13. Hudi partition commands.

-- List partitions:
--   SHOW PARTITIONS tableIdentifier
SHOW PARTITIONS hudi_cow_pt_tbl;

-- Drop a partition:
--   ALTER TABLE tableIdentifier DROP PARTITION ( partition_col_name = partition_col_val [ , ... ] )
ALTER TABLE hudi_cow_pt_tbl DROP PARTITION (dt = '2021-12-09', hh = '10');



