SparkSQL Operations on Hudi
Contents
1. Login
2. Create a regular table
3. Create a partitioned table
4. Create a table from an existing table
5. Create a table from query results (CTAS)
6. Insert data
7. Query data
8. Update data
9. Merge data
10. Delete data
11. Insert overwrite
12. Alter table
13. Hudi partition commands

1. Login
#spark 3.1
spark-sql --packages org.apache.hudi:hudi-spark3.1.2-bundle_2.12:0.10.1,org.apache.spark:spark-avro_2.12:3.1.2 \
  --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
  --conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'

#spark 3.0
spark-sql --packages org.apache.hudi:hudi-spark3.0.3-bundle_2.12:0.10.1,org.apache.spark:spark-avro_2.12:3.0.3 \
  --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
  --conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
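If the shell starts without errors, the `--packages` artifacts were resolved and the Hudi session extension is active. A minimal optional first step, assuming you want to keep the demo tables in their own database (the name hudi_demo is an assumption, not part of the original walkthrough):

# optional: create and switch to a dedicated database for the examples below
create database if not exists hudi_demo;
use hudi_demo;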
2. Create a regular table

# create a non-partitioned COW table with no primaryKey or preCombineField
create table hudi_cow_nonpcf_tbl (
  uuid int,
  name string,
  price double
) using hudi;

# create a non-partitioned MOR table with a primaryKey and a preCombineField
create table hudi_mor_tbl (
  id int,
  name string,
  price double,
  ts bigint
) using hudi
tblproperties (
  type = 'mor',
  primaryKey = 'id',
  preCombineField = 'ts'
);
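A quick way to confirm what was actually created is to have Spark print the definitions back. The statements below are a verification sketch, not part of the original walkthrough:

# confirm the MOR table type, primaryKey and preCombineField were recorded
show create table hudi_mor_tbl;
desc hudi_cow_nonpcf_tbl;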
3. Create a partitioned table

# create a partitioned COW table
create table hudi_cow_pt_tbl (
  id bigint,
  name string,
  ts bigint,
  dt string,
  hh string
) using hudi
tblproperties (
  type = 'cow',
  primaryKey = 'id',
  preCombineField = 'ts'
)
partitioned by (dt, hh)
location '/tmp/hudi/hudi_cow_pt_tbl';
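To see the dt/hh layout in action, a hedged example (the row values are illustrative, not from the original) writes one record and reads it back with a partition filter so only the matching directory is scanned:

# insert a single row into the dt=2021-12-09/hh=10 partition (dynamic partition insert)
insert into hudi_cow_pt_tbl partition (dt, hh)
select 1 as id, 'a1' as name, 1000 as ts, '2021-12-09' as dt, '10' as hh;

# read it back with a partition predicate
select id, name, ts from hudi_cow_pt_tbl where dt = '2021-12-09' and hh = '10';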
4. Create a table from an existing table

# non-partitioned table
create table hudi_existing_tbl0 using hudi
location 'file:///tmp/hudi/dataframe_hudi_nonpt_table';

# partitioned table
create table hudi_existing_tbl1 using hudi
partitioned by (dt, hh)
location 'file:///tmp/hudi/dataframe_hudi_pt_table';
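These statements only register metadata: the locations are expected to already contain Hudi tables written elsewhere (for example by a DataFrame job), and the schema is inferred from the existing files. A small sanity check, assuming data is already present at those paths:

# the schema and row count should come from the existing files, not from the DDL
desc hudi_existing_tbl1;
select count(*) from hudi_existing_tbl1;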
5. Create a table from query results (CTAS)

# create a table from a query
create table hudi_ctas_cow_nonpcf_tbl
using hudi
tblproperties (primaryKey = 'id')
as
select 1 as id, 'a1' as name, 10 as price;

# create a partitioned table from a query
create table hudi_ctas_cow_pt_tbl
using hudi
tblproperties (type = 'cow', primaryKey = 'id', preCombineField = 'ts')
partitioned by (dt)
as
select 1 as id, 'a1' as name, 10 as price, 1000 as ts, '2021-12-01' as dt;

# create a managed parquet table
create table parquet_mngd using parquet location 'file:///tmp/parquet_dataset';
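The parquet table above is typically used as a CTAS source to bulk-load existing parquet data into Hudi. A hedged sketch of that load, assuming the parquet files contain id and ts columns (the target table name and location are also assumptions):

# load the parquet data into a new COW Hudi table in one statement
create table hudi_ctas_from_parquet
using hudi
location 'file:///tmp/hudi/hudi_ctas_from_parquet'
tblproperties (type = 'cow', primaryKey = 'id', preCombineField = 'ts')
as select * from parquet_mngd;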
10. Delete data

# syntax
DELETE FROM tableIdentifier [ WHERE BOOL_EXPRESSION ]

# example
delete from hudi_cow_nonpcf_tbl where uuid = 1;
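The WHERE clause is optional in the syntax, but in practice it identifies the rows to remove; on a partitioned table the predicate can also name the partition columns to limit the scan. A hedged sketch (the row values are assumptions):

# delete from the partitioned table, restricting the predicate to one partition
delete from hudi_cow_pt_tbl where id = 1 and dt = '2021-12-09' and hh = '10';

# confirm the row deleted in the example above is gone
select count(*) from hudi_cow_nonpcf_tbl where uuid = 1;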
11. Insert overwrite

# non-partitioned table
insert overwrite hudi_mor_tbl select 99, 'a99', 20.0, 900;

# partitioned table, dynamic partitions
insert overwrite table hudi_cow_pt_tbl select 10, 'a10', 1100, '2021-12-09', '10';

# partitioned table, static partition
insert overwrite hudi_cow_pt_tbl partition(dt = '2021-12-09', hh='12') select 13, 'a13', 1100;
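The three variants differ in scope: overwriting a non-partitioned table replaces all of its data, the dynamic form is expected to replace only the partitions produced by the query, and the static form replaces exactly the partition named in the PARTITION clause. A small verification sketch (not from the original):

# the static overwrite above should leave the other partitions untouched
select * from hudi_cow_pt_tbl where dt = '2021-12-09' and hh = '12';
select * from hudi_cow_pt_tbl where dt = '2021-12-09' and hh = '10';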
12. Alter table

# rename a table
ALTER TABLE oldTableName RENAME TO newTableName
# example
ALTER TABLE hudi_cow_nonpcf_tbl RENAME TO hudi_cow_nonpcf_tbl2;

# add columns
ALTER TABLE tableIdentifier ADD COLUMNS(colAndType (,colAndType)*)
# example
ALTER TABLE hudi_cow_nonpcf_tbl2 add columns(remark string);

# change a column's type
ALTER TABLE tableIdentifier CHANGE COLUMN colName colName colType
# example
ALTER TABLE hudi_cow_nonpcf_tbl2 change column uuid uuid bigint;

# set table properties
ALTER TABLE tableIdentifier SET TBLPROPERTIES (key = 'value')
# example
alter table hudi_cow_nonpcf_tbl2 set tblproperties (hoodie.keep.max.commits = '10');
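After the statements above, describing the renamed table should show the new remark column and uuid as bigint, and the table properties should include the new commit-retention setting. A quick verification sketch:

# confirm the schema changes and the new table property
desc hudi_cow_nonpcf_tbl2;
show tblproperties hudi_cow_nonpcf_tbl2;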
13. Hudi partition commands

# show partitions
SHOW PARTITIONS tableIdentifier
# example
show partitions hudi_cow_pt_tbl;

# drop a partition
ALTER TABLE tableIdentifier DROP PARTITION ( partition_col_name = partition_col_val [ , ... ] )
# example
alter table hudi_cow_pt_tbl drop partition (dt='2021-12-09', hh='10');
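Re-running SHOW PARTITIONS after the drop is the simplest way to confirm the change took effect:

# the dropped dt=2021-12-09/hh=10 partition should no longer appear in the listing
show partitions hudi_cow_pt_tbl;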