1. Setting a constant in SQL (impala-shell variable)
set var:pi_sysdate=20241114;
Variable PI_SYSDATE set to 20241114
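The variable can then be referenced in queries via ${var:...}; a minimal sketch (table and column names are assumed):
select * from ods.trade_detail where data_date = ${var:pi_sysdate};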
2. Creating internal and external tables with Impala on CDP
set default_transactional_type=none;
create external table stg.hd_aml_mac_ip_ext (
    machinedate string,
    vc_fundacco string,
    ip string
)
stored as textfile
tblproperties ('objcapabilities' = 'extread,extwrite');
create external table stg.hd_aml_mac_ip (
    machinedate string,
    vc_fundacco string,
    ip string
)
stored as parquet
tblproperties ("parquet.compression" = "snappy");
3. Hive: exporting a comma-delimited file to the local filesystem
hive -e "SELECT * FROM student" | sed 's/\t/,/g' > /tmp/student.csv
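Alternatively, Hive can write the delimited files directly without the sed step; a sketch with an assumed output directory:
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/student_csv'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
SELECT * FROM student;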
4. Hive-on-MR parameters: enabling dynamic partitioning
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=500000;
set hive.exec.max.dynamic.partitions.pernode=100000;
set mapreduce.reduce.memory.mb=4096;
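With these settings, an insert can create partitions on the fly; a minimal sketch (table names are assumed; the partition column must be last in the select list):
insert overwrite table dw.t_target partition (data_date)
select col1, col2, data_date from stg.t_source;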
5. MySQL: generating Hive DDL from the metastore
set session group_concat_max_len = 20480;
select concat_ws('',
       a.create_body_str,
       CHAR(10),
       c.tbl_comment,
       CHAR(10),
       b.partition_str,
       CHAR(10),
       a.stored_format,
       CHAR(10),
       d.compress_str,
       ';') AS create_sql
FROM (
    select t.TBL_ID,
           t.TBL_NAME,
           case when k.INPUT_FORMAT like '%.parquet%' then 'STORED AS PARQUET'
                when k.INPUT_FORMAT like '%.SequenceFile%' then 'STORED AS SEQUENCEFILE'
                when k.INPUT_FORMAT like '%.Text%' then ''
                else 'STORED AS NULL'
           end AS stored_format,
           concat_ws('',
                     'CREATE',
                     CASE t.TBL_TYPE
                          WHEN 'EXTERNAL_TABLE' THEN ' EXTERNAL'
                          ELSE '' END,
                     ' TABLE IF NOT EXISTS ${schema}.',
                     t.TBL_NAME,
                     '(',
                     CHAR(10),
                     group_concat(concat_ws('',
                                            g.COLUMN_NAME,
                                            ' ',
                                            g.TYPE_NAME,
                                            ' COMMENT ',
                                            '''',
                                            REPLACE(REPLACE(g.COMMENT, ';', ' '), '; ', ' '),
                                            '''',
                                            CHAR(10)) ORDER BY g.INTEGER_IDX separator ','),
                     ')') AS create_body_str
    from hive.TBLS t, hive.SDS k, hive.COLUMNS_V2 g, hive.DBS s
    where t.SD_ID = k.SD_ID
      and k.CD_ID = g.CD_ID
      and s.DB_ID = t.DB_ID
      and k.INPUT_FORMAT not like '%.kudu%'
      and s.NAME = 'stg'
    group by t.TBL_ID
) a
left join (
    select t.TBL_ID,
           concat_ws('', 'COMMENT ', '''', t.param_value, '''') AS tbl_comment
    from hive.TABLE_PARAMS t
    where t.param_key = 'comment'
    group by t.TBL_ID
) c
    on c.tbl_id = a.tbl_id
left join (
    select t.TBL_ID,
           concat_ws('', 'PARTITIONED BY (',
                     group_concat(concat_ws('', t.pkey_name, ' ', t.pkey_type, ' ', 'COMMENT ', '''', t.pkey_comment, '''')
                                  order by t.integer_idx separator ','),
                     ')') AS partition_str
    from hive.PARTITION_KEYS t
    group by t.TBL_ID
) b
    on b.tbl_id = a.tbl_id
left join (
    select t.TBL_ID,
           concat_ws('',
                     'TBLPROPERTIES (',
                     '''', t.PARAM_KEY, '''',
                     '=',
                     '''', t.PARAM_VALUE, ''')') as compress_str
    from hive.TABLE_PARAMS t
    where t.param_key like '%compression%'
    group by t.TBL_ID, t.param_key, t.param_value
) d
    on d.tbl_id = a.tbl_id
order by a.tbl_name;
6. Repairing table metadata
invalidate metadata ods.tablename;
msck repair table ods.tablename;
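invalidate metadata reloads the full table metadata in Impala and msck repair table re-registers partitions on the Hive side; when only new data files were added to an existing table, Impala's lighter-weight refresh is usually enough:
refresh ods.tablename;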
7. Impala: converting a timestamp to a formatted (DATE) string and back
SELECT from_timestamp(now(), 'yyyyMMdd');
SELECT to_timestamp('20230710', 'yyyyMMdd');
select from_timestamp(date_add(to_timestamp('20231201', 'yyyyMMdd'), 1), 'yyyyMMdd');
select date_add(now(), interval -1 years);
8. Using UDFs
use default;
show functions;
show create function default.genseq;
hdfs dfs -put /home/app_adm/etl/udf/udf_0608.jar /user/hive/warehouse/udf_0608.jar
hdfs dfs -chown hive:hive /user/hive/warehouse/udf_0608.jar
hdfs dfs -chmod 777 /user/hive/warehouse/udf_0608.jar
DROP FUNCTION DEFAULT.udf10(STRING, STRING);
drop function default.udf10;
create function default.clnseq as 'cn.com.businessmatrix.udf.HLSequenceCleaner' using jar 'hdfs:///user/hive/warehouse/udf_0608.jar';
create function default.genseq as 'cn.com.businessmatrix.udf.HLSequenceGenerator' using jar 'hdfs:///user/hive/warehouse/udf_0608.jar';
hdfs dfs -put -f /home/file/ylb_trade_transfer_ext_out /tmp/hive/stg/ylb_trade_transfer_ext_out
sudo -u hdfs hadoop fs -chown -R hive:supergroup /tmp/hive/stg/ylb_trade_transfer_ext
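Once registered, the function is called like any built-in; a hypothetical call (the real signature depends on the classes inside the jar):
select default.genseq('demo_arg');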
9. Impala: upserting into a Kudu table by primary key
upsert into ${var:schema_ods}.mdm_ip_cust (
    sk_invpty_of_cust,
    gp_flag
)
select t.sk_invpty_of_cust,
       0 as gp_flag
from ods.mdm_ip_cust t
where t.gp_flag is null;
10. Archiving small files with Hadoop archives (HAR)
set hive.archive.enabled=true;
set hive.archive.har.parentdir.settable=true;
set har.partfile.size=1099511627776;
ALTER TABLE A ARCHIVE PARTITION (dt='2020-12-24', hr='12');
ALTER TABLE A UNARCHIVE PARTITION (dt='2020-12-24', hr='12');
11. HBase basics
hbase shell
create 'student', 'info'
put 'student', '1001', 'info:sex', 'male'
scan 'student'
scan 'student', {STARTROW => '1001', STOPROW => '1001'}
scan 'student', {STARTROW => '1001'}
describe 'student'
put 'student', '1001', 'info:name', 'Nick'
get 'student', '1001'
get 'student', '1001', 'info:name'
count 'student'
alter 'student', {NAME => 'info', VERSIONS => 3}
get 'student', '1001', {COLUMN => 'info:name', VERSIONS => 3}
deleteall 'student', '1001'
delete 'student', '1002', 'info:sex'
truncate 'student'
disable 'student'          # a table must be disabled before it can be dropped
drop 'student'
list_namespace
create_namespace 'bigdata'
create 'bigdata:student', 'info'
disable 'bigdata:student'  # a namespace must be empty before it can be dropped
drop 'bigdata:student'
drop_namespace 'bigdata'
12. Hive data masking (Chinese characters are not masked)
select mask('不不不bbb123');
No.  Policy                      Description                                          Hive built-in function
1    Redact                      mask alphabetic chars with x, numeric chars with n   mask
2    Partial mask: show last 4   show only the last 4 characters, mask the rest       mask_show_last_n
3    Partial mask: show first 4  show only the first 4 characters, mask the rest      mask_show_first_n
4    Hash                        replace the value with its hash                      mask_hash
5    Nullify                     replace the value with NULL                          implemented by Ranger itself
6    Unmasked                    show the value unchanged                             implemented by Ranger itself
7    Date: show only year        show only the year of a date string                  mask
8    Custom                      custom policy via a user-defined Hive UDF            user-defined
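A few quick checks of the built-in functions (expected results under default options; the literal is made up):
select mask('abcd1234');              -- xxxxnnnn
select mask_show_last_n('abcd1234');  -- xxxx1234
select mask_show_first_n('abcd1234'); -- abcdnnnn
select mask_hash('abcd1234');         -- hash of the value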
13. Syntax compatibility issues from upgrading CDH5 to CDH6.3.x
1. Use STRING for input parameters.
2. '''||value||''' -> "'||value||'"
3. "" -> ""
4. Rework collect_set() usage to run under Hive on MR; run everything else into temporary tables with Impala.
14. SQL scripts must start with an explanatory header comment
[Example]
nvl(c.en_ratio, 0) as plg_rati,
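A minimal sketch of such a header block (all field values here are made up):
-- ============================================================
-- Script  : dm_plg_ratio.sql
-- Purpose : load pledge ratio (plg_rati) into the target table
-- Author  : app_adm
-- Created : 2024-11-14
-- History : date | author | change description
-- ============================================================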
15. Hive: changing a column and specifying its position
ALTER TABLE test_change CHANGE a a1 STRING AFTER b;
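The same statement with FIRST moves the column to the front (the type must be restated):
ALTER TABLE test_change CHANGE a1 a1 STRING FIRST;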
16. Sorting in Hive
ORDER BY: global sort; the job runs with a single reducer.
SORT BY: sorts only within each reducer, so the output is not globally ordered.
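SORT BY is usually paired with DISTRIBUTE BY to get per-group ordering without the single-reducer bottleneck; a sketch with assumed table and columns:
select * from emp
distribute by deptno
sort by sal desc;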
17. Sqoop: exporting Hive data to Oracle
sqoop export \
--connect jdbc:oracle:thin:@localhost:1521/testdb \
--username test \
--password 123456 \
--table t_test \
--export-dir '/apps/hive/warehouse/dbi.db/t_test/pdt=20191229' \
--columns ID,data_date,data_type,c1,c2,c3 \
--input-fields-terminated-by '\001' \
--input-lines-terminated-by '\n' \
--input-null-string '\\N' \
--input-null-non-string '\\N' > sqoop_oracle.log 2>&1 &
18. Hive: exploding a delimited column into rows (with an Oracle equivalent)
select new_fundaccount, new_bk_tradeaccount, bk_product
from stg.tt0liquidateschema_tmp01
LATERAL VIEW explode(split(fundaccount, ',')) fundaccount as new_fundaccount
LATERAL VIEW explode(split(bk_tradeaccount, ',')) bk_tradeaccount as new_bk_tradeaccount;
-- Oracle equivalent using CONNECT BY; test data:
create table tmp_dz as
select '000855' as bk_product,
       '372402834320,37345435345435,37345343434' as fundaccount,
       '982342242322342,9842423424,98345333' as tradeaccount
from dual;
insert into tmp_dz
select '000845' as bk_product,
       '37345343454' as fundaccount,
       '98345333433' as tradeaccount
from dual;
select nvl(new_fundaccount, fundaccount) as fundaccount,
       nvl(new_tradeaccount, tradeaccount) as tradeaccount,
       bk_product
from (
    SELECT REGEXP_SUBSTR(fundaccount, '[^,]+', 1, ROWNUM) as new_fundaccount,
           REGEXP_SUBSTR(tradeaccount, '[^,]+', 1, ROWNUM) as new_tradeaccount,
           t.*
    FROM tmp_dz t
    CONNECT BY ROWNUM <= regexp_count(fundaccount, ',') + 1
) t;
19. Hive: exploding multiple comma-delimited columns in lockstep (posexplode)
with temp as (
    select '1,2,3' as id, 'a,b,c' as name
    union
    select '4,5,6' as id, 'd,e,f' as name
)
select id, name, s_id, s_name
from temp
lateral view posexplode(split(id, ',')) t1 as s_id_index, s_id
lateral view posexplode(split(name, ',')) t2 as s_name_index, s_name
where s_id_index = s_name_index;
20. CDP Hive supports transactions: insert, delete, update, select
create table cdhadmin_table_hive (col1 int);
insert into table cdhadmin_table_hive values (1);
insert into table cdhadmin_table_hive values (51);
insert into table cdhadmin_table_hive values (2);
insert into table cdhadmin_table_hive values (3);
select * from cdhadmin_table_hive;
delete from cdhadmin_table_hive where col1 = 51;
select * from cdhadmin_table_hive;
update cdhadmin_table_hive set col1=300 where col1=3;
select * from cdhadmin_table_hive;
grant select on database default to user cdhadmin;