SparkSQL操作hudi
1、登录 · 2、创建普通表 · 3、创建分区表 · 4、从现有表创建表 · 5、用查询结果创建新表(CTAS) · 6、插入数据 · 7、查询数据 · 8、修改数据 · 9、合并数据 · 10、删除数据 · 11、覆盖写入 · 12、修改数据表 · 13、hudi分区命令
# 1. Login: launch the spark-sql shell with the Hudi bundle and the two Spark
#    configs Hudi requires (Kryo serializer + Hudi SQL extension).

# Spark 3.1
spark-sql \
  --packages org.apache.hudi:hudi-spark3.1.2-bundle_2.12:0.10.1,org.apache.spark:spark-avro_2.12:3.1.2 \
  --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
  --conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'

# Spark 3.0
spark-sql \
  --packages org.apache.hudi:hudi-spark3.0.3-bundle_2.12:0.10.1,org.apache.spark:spark-avro_2.12:3.0.3 \
  --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
  --conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
-- 2. Create non-partitioned tables.
-- (Spark SQL comments use '--'; the original '#' markers are not valid here.)

-- COW table with no primary key and no precombine field.
CREATE TABLE hudi_cow_nonpcf_tbl (
    uuid  INT,
    name  STRING,
    price DOUBLE
) USING hudi;

-- Non-partitioned MOR table with a primary key and a precombine field
-- (ts is used to pick the winner when two records share the same key).
CREATE TABLE hudi_mor_tbl (
    id    INT,
    name  STRING,
    price DOUBLE,
    ts    BIGINT
) USING hudi
TBLPROPERTIES (
    type = 'mor',
    primaryKey = 'id',
    preCombineField = 'ts'
);
-- 3. Create a partitioned COW table at an explicit storage location.
CREATE TABLE hudi_cow_pt_tbl (
    id   BIGINT,
    name STRING,
    ts   BIGINT,
    dt   STRING,
    hh   STRING
) USING hudi
TBLPROPERTIES (
    type = 'cow',
    primaryKey = 'id',
    preCombineField = 'ts'
)
PARTITIONED BY (dt, hh)
LOCATION '/tmp/hudi/hudi_cow_pt_tbl';
-- 4. Create tables over existing Hudi datasets
--    (no column list: the schema is read from the data at LOCATION).

-- Non-partitioned
CREATE TABLE hudi_existing_tbl0
USING hudi
LOCATION 'file:///tmp/hudi/dataframe_hudi_nonpt_table';

-- Partitioned
CREATE TABLE hudi_existing_tbl1
USING hudi
PARTITIONED BY (dt, hh)
LOCATION 'file:///tmp/hudi/dataframe_hudi_pt_table';
-- 5. Create tables from query results (CTAS).

-- CTAS into a non-partitioned COW table.
CREATE TABLE hudi_ctas_cow_nonpcf_tbl
USING hudi
TBLPROPERTIES (primaryKey = 'id')
AS SELECT 1 AS id, 'a1' AS name, 10 AS price;

-- CTAS into a partitioned COW table (dt is the partition column).
CREATE TABLE hudi_ctas_cow_pt_tbl
USING hudi
TBLPROPERTIES (type = 'cow', primaryKey = 'id', preCombineField = 'ts')
PARTITIONED BY (dt)
AS SELECT 1 AS id, 'a1' AS name, 10 AS price, 1000 AS ts, '2021-12-01' AS dt;

-- Managed parquet source table.
-- FIX: the original location literal was unterminated ('file:///tmp/parquet_dataset
-- with no closing quote or semicolon) — the statement could never parse.
CREATE TABLE parquet_mngd
USING parquet
LOCATION 'file:///tmp/parquet_dataset';

-- NOTE(review): sections 6-9 (插入数据/查询数据/修改数据/合并数据) listed in the
-- table of contents are missing from this extract — recover them from the source.
-- 10. Delete data.

-- Syntax (FIX: original template had inconsistent keyword casing 'WHERe'):
--   DELETE FROM tableIdentifier [ WHERE BOOL_expression ]

-- Example
DELETE FROM hudi_cow_nonpcf_tbl
WHERE uuid = 1;
-- 11. Insert overwrite (replaces existing data instead of upserting).

-- Non-partitioned table: overwrites the whole table.
INSERT OVERWRITE hudi_mor_tbl
SELECT 99, 'a99', 20.0, 900;

-- Partitioned table, dynamic partitioning: dt/hh values come from the SELECT,
-- so only the partitions present in the result are overwritten.
INSERT OVERWRITE TABLE hudi_cow_pt_tbl
SELECT 10, 'a10', 1100, '2021-12-09', '10';

-- Partitioned table, static partition spec: only (dt='2021-12-09', hh='12')
-- is overwritten; the SELECT supplies the non-partition columns.
INSERT OVERWRITE hudi_cow_pt_tbl PARTITION (dt = '2021-12-09', hh = '12')
SELECT 13, 'a13', 1100;
-- 12. Alter table.
-- (Syntax templates moved into comments so the section is runnable as-is.
--  FIX: the original comments said "列簇"/column family — these operate on
--  plain columns, not column families.)

-- Rename a table:
--   ALTER TABLE oldTableName RENAME TO newTableName
ALTER TABLE hudi_cow_nonpcf_tbl RENAME TO hudi_cow_nonpcf_tbl2;

-- Add columns:
--   ALTER TABLE tableIdentifier ADD COLUMNS(colAndType (,colAndType)*)
ALTER TABLE hudi_cow_nonpcf_tbl2 ADD COLUMNS (remark STRING);

-- Change a column's type:
--   ALTER TABLE tableIdentifier CHANGE COLUMN colName colName colType
ALTER TABLE hudi_cow_nonpcf_tbl2 CHANGE COLUMN uuid uuid BIGINT;

-- Set table properties:
--   ALTER TABLE tableIdentifier SET TBLPROPERTIES (key = 'value')
ALTER TABLE hudi_cow_nonpcf_tbl2 SET TBLPROPERTIES (hoodie.keep.max.commits = '10');
-- 13. Hudi partition commands.

-- List partitions:
--   SHOW PARTITIONS tableIdentifier
SHOW PARTITIONS hudi_cow_pt_tbl;

-- Drop a partition:
--   ALTER TABLE tableIdentifier DROP PARTITION ( partition_col_name = partition_col_val [ , ... ] )
ALTER TABLE hudi_cow_pt_tbl DROP PARTITION (dt = '2021-12-09', hh = '10');



