SQL应用随笔1——各环境sql函数/语法差异

前言

由于日常工作中经常需要多种sql环境切换使用，发现了不少sql语句无法通用，借此机会做下梳理总结。以下以个别实际使用场景为例，对比sql语句在spark、hive、impala、postgre/greenplum中的异同（sparksql通过zeppelin运行），greenplum是基于postgre开发的，所以代码基本与postgre一致。
此文后续亦会持续更新，若有其他场景或其他方法补充欢迎大佬评论~~。

时间戳处理/转换获取当前时间

select from_unixtime(unix_timestamp());

select now();

获取当前日期

select current_date();

select current_date;

select date_trunc('day',now());

日期增减

select date_add(current_date, 1);
-- 或
select date_sub(current_date, 1);

select date_add(date_trunc('day', now()), 1);

select current_date + interval'1day';
select current_date - interval'2week';
select current_date + interval'3month';
select current_date - interval'4year';

获取年月

---------- 当前年月

select date_format(now(), 'yyyy-MM');
-- 或
select from_unixtime(unix_timestamp(), 'yyyy-MM');

select from_unixtime(unix_timestamp(), 'yyyy-MM');

select to_char(now(), 'yyyy-mm');

文本日期处理/转换日期增减

select date_add('2021-12-01', 1); --'2021-12-02'
select date_add('2021-12-01', -1); --'2021-11-30'
-- 或
select date_sub('2021-12-01', 1); --'2021-11-30'
select date_sub('2021-12-01', -1); --'2021-12-02'

select to_date('2021-12-01', 'yyyy-mm-dd') + interval'1day';
select to_date('2021-12-01', 'yyyy-mm-dd') - interval'2week';
select to_date('2021-12-01', 'yyyy-mm-dd') + interval'3month';
select to_date('2021-12-01', 'yyyy-mm-dd') - interval'4year';

获取年月

select date_format('2021-12-01', 'yyyy-MM');

select from_unixtime(unix_timestamp('2021-12-01'), 'yyyy-MM');

select to_char(to_date('2021-12-01', 'yyyy-mm-dd'), 'yyyy-mm');

字符串处理字符串分隔

场景：从 “hello:world:!!” 中提取 “world”

select split('hello:world:!!', ':')[1];

select split_part('hello:world:!!', ':', 2);

查找字符串首次出现位置

select position('world', 'hello world !! hello world !!!');
-- 或
select instr('hello world !! hello world !!!', 'world');
-- 或
select position('world' in 'hello world !! hello world !!!');

select instr('hello world !! hello world !!!', 'world');

select position('world' in 'hello world !! hello world !!!');

-- 延展：查询表一字段是否与表二字段存在包含关系的字符串
select tb1.product_name,
	tb2.key_words
from tb1
left join tb2
	on instr(tb1.product_name,tb2.key_words) > 0  --hive/impala选这句，zeppelin(sparksql)可选
	-- on position(tb2.key_words in tb1.product_name) > 0 --PG或GP选这句，zeppelin(sparksql)可选
    -- on position(tb2.key_words,tb1.product_name) > 0  --zeppelin(sparksql)可选
where tb1.product_name = '大开领衬衣连衣裙家居通身连衣裙' --示例
	and tb2.key_words in ('衬衣', '连衣裙', '裤子') --衬衣、连衣裙匹配成功都会返回结果；裤子匹配失败返回0，会被过滤掉
;

反转字符串

select reverse('hello world !! hello world !!!'); -- '!!! dlrow olleh !! dlrow olleh'
-- 可用于匹配长字符串中，匹配多个关键字时，筛选最后一个出现的关键字；
/* product_name = '大开领衬衣连衣裙家居通身连衣裙', tb2.key_words in ('衬衣', '连衣裙', '裤子')
* 通过reverse反转字符串后，筛选 min(instr(reverse(product_name),reverse(key_words))) 的匹配结果，可得到最后出现的关键字为'连衣裙'，可用于做商品分类
* /

截取字符串

-- 获取前/后2位字符串

-- substr或substring均可，第1个参数为所截取字符串，第2个参数为起始截取位置，第3个参数为截取长度
select substr('hello world !! hello world !!!', 1, 2); --前2位'he'
select substr('hello world !! hello world !!!', -2, 2); --后2位'!!'--此句pg/gp架构不适用
-- 注意：pg与gp架构无法使用负值代表倒数第n位，而是直接从第1位的基础上继续向左偏移
-- pg/gp中：substr('hello world',-1,4)等价于substr('hello world',0,3)等价于substr('hello world',1,2)

select left('hello world !! hello world !!!', 1); --从左往右截取1位
select right('hello world !! hello world !!!', 2); --从右往左截取2位

字符串拼接

select concat('hello ', 'world'); --'hello world'
select concat('hello ', 'world', null); --[null]
-- 注意：实际使用中，若拼接的任意字段中出现空值，将导致结果为null，为避免此情况，可选进行空值转换处理
select concat(coalesce(column1,''),coalesce(column2,''),coalesce(column3,''));

-- PG、GP库中无需做空值处理
select concat('hello ', 'world'); --'hello world'
select concat('hello ', 'world', null); --'hello world'
-- 或
select 'hello '||'world'; --'hello world'

数值格式转换其他格式转文本

select cast(123456 as string);

select 123456::varchar;
-- 或
select cast(123456 as varchar);

文本转数值

select cast('123456' as int); --int/bigint/decimal(38,2)/……

select '123456'::int; --int/bigint/decimal(38,2)/……
-- 或
select cast('123456' as int); --int/bigint/decimal(38,2)/……

其他获取字段中单层或多层嵌套的json(字典)格式数据

场景：
字段值：{“day_date”: “2021-12-01”, “permissions”: [{“authority”: 123, “brand_code”: “111”, “manager_org_code”: “12345”, “scene_code”: “1”}]}
取出嵌套在json的列表的json中的"brand_code"值： “111”

-- 待tou补xia充lan

with tmp_json as(
select '{"day_date": "2021-12-01", "permissions": [{"authority": 123, "brand_code": "111", "manager_org_code": "12345", "scene_code": "1"}]}' json_1
)
select json_1,
	json_1::json->'permissions' list_1,
	json_1::json->'permissions'->0 dict_1,
	json_1::json->'permissions'->0->'brand_code' text_1,
	json_1::json->'permissions'->0->>'brand_code' text_2
from tmp_json
-- "->"提取后，字段类型依然是json，故最后一步提取字段值需要用"->>"

PG架构运行结果如下图，可以看到每一步操作后，字段的类型是不一样的；

写在最后

-- sparksql运行时需将句末分号";"去除。
-- 未完待续……

SQL应用随笔1——各环境sql函数/语法差异

大数据系统相关栏目本月热门文章