首页 hive高级函数
文章
取消

hive高级函数

对 hive 一些不常见但是很有用的函数做一个整理。这些函数有时候是解决问题的一剂良药,但是还要考虑到这些函数的效率问题。

CONCAT_WS

将多行记录合并成一行。 CONCAT_WS() 代表 Concat With Separator ,是 CONCAT() 的特殊形式。第一个参数是其它参数的分隔符,分隔符的位置放在要连接的两个字符串之间。分隔符可以是一个字符串,也可以是其它参数,如果分隔符为 NULL,则结果为 NULL 。 CONCAT_WS() 会忽略任何分隔符参数后的 NULL 值,但是不会忽略任何空字符串。

1
2
3
4
5
6
7
8
9
SELECT
    CONCAT_WS(',',
        CONCAT('username', ':', 'zhangsan'),
        CONCAT('password', ':', 'lisi')
    )
;

---------------------------------------------------------------------------
username:zhangsan,password:lisi

STR_TO_MAP

将字符串转换成字典类型。

1
2
3
4
5
6
7
8
9
10
11
SELECT
    STR_TO_MAP(
        CONCAT_WS(',',
            CONCAT('username', ':', 'zhangsan'),
            CONCAT('password', ':', 'lisi')
        )
    )
;

---------------------------------------------------------------------------
{"password":"lisi","username":"zhangsan"}

COLLECT_LIST

将某列的多行进行合并,生成一个列表。 COLLECT_LIST() 回的类型是 Array<?> 类型, ? 表示该列的类型。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
WITH user_info AS(
    SELECT 'zhangsan' AS username, 'lisi' AS password UNION ALL
    SELECT 'zhangsan' AS username, 'lisi' AS password
)

SELECT
    username,
    COLLECT_LIST(password) AS password
FROM user_info
GROUP BY
    username
;

---------------------------------------------------------------------------
username    password
zhangsan    ["lisi","lisi"]

COLLECT_SET

COLLECT_SET() 的用法与 COLLECT_LIST() 相似,不同的是 COLLECT_SET() 会对数据进行去重,COLLECT_LIST() 不会对数据进行去重。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
WITH user_info AS(
    SELECT 'zhangsan' AS username, 'lisi' AS password UNION ALL
    SELECT 'zhangsan' AS username, 'lisi' AS password
)

SELECT
    username,
    COLLECT_SET(password) AS password
FROM user_info
GROUP BY
    username
;

---------------------------------------------------------------------------
username    password
zhangsan    ["lisi"]

EXPLODE

将一个列表转换为一列(行转列)。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
WITH user_info AS(
    SELECT 'zhangsan' AS username, 'lisi' AS password UNION ALL
    SELECT 'zhangsan' AS username, 'lisi' AS password
)

SELECT
    EXPLODE(password) AS passwords
FROM(
    SELECT
        username,
        COLLECT_LIST(password) AS password
    FROM user_info
    GROUP BY
        username
)t
;

---------------------------------------------------------------------------
passwords
lisi
lisi

LATERAL VIEW

Hive 官方文档

LATERAL VIEW 用于和 SPLIT, EXPLODE 等 UDTF 一起使用,它能够将一行数据拆成多行数据,在此基础上可以对拆分后的数据进行聚合。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
WITH page_ads AS(
    SELECT 'front_page'   AS pageid, ARRAY(1, 2, 3) AS adid_list UNION ALL
    SELECT 'contact_page' AS pageid, ARRAY(4, 5, 6) AS adid_list
)

SELECT
    pageid,
    adid
FROM page_ads
LATERAL VIEW EXPLODE(adid_list) adTable AS adid
;

---------------------------------------------------------------------------
pageid          adid
front_page      1
front_page      2
front_page      3
contact_page    1
contact_page    2
contact_page    3

Multiple Lateral Views

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
WITH base AS(
    SELECT ARRAY(1, 2) AS col1, ARRAY('a', 'b', 'c') AS col2 UNION ALL
    SELECT ARRAY(3, 4) AS col1, ARRAY('d', 'e', 'f') AS col2
)

SELECT
    *
FROM base
LATERAL VIEW EXPLODE(col1) myTable1 AS myCol1
LATERAL VIEW EXPLODE(col2) myTable2 AS myCol2
;

---------------------------------------------------------------------------
col1         col2             mycol1   mycol2
["1","2"]    ["a","b","c"]    1        a
["1","2"]    ["a","b","c"]    1        b
["1","2"]    ["a","b","c"]    1        c
["1","2"]    ["a","b","c"]    2        a
["1","2"]    ["a","b","c"]    2        b
["1","2"]    ["a","b","c"]    2        c
["3","4"]    ["d","e","f"]    3        d
["3","4"]    ["d","e","f"]    3        e
["3","4"]    ["d","e","f"]    3        f
["3","4"]    ["d","e","f"]    4        d
["3","4"]    ["d","e","f"]    4        e
["3","4"]    ["d","e","f"]    4        f

Outer Lateral Views

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
WITH base AS(
    SELECT ARRAY(1, 2) AS col1, ARRAY('a', 'b', 'c') AS col2 UNION ALL
    SELECT ARRAY(3, 4) AS col1, ARRAY() AS col2
)

SELECT
    *
FROM base
LATERAL VIEW OUTER EXPLODE(col2) myTable1 AS myCol1
;

---------------------------------------------------------------------------
-- 如果没有 OUTER 则结果没有最后一行数据
col1        col2            mycol1
["1","2"]   ["a","b","c"]   a
["1","2"]   ["a","b","c"]   b
["1","2"]   ["a","b","c"]   c
["3","4"]   []              NULL

PERCENTILE

求百分位数函数,其中第一个参数是所求的目标字段(必须为 int 类型),第二个参数可以是一个百分位数,也可以是一组百分位数。第50百分位数也就是中位数。 百分位数的计算方法自行百度。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
WITH base AS(
    SELECT 0 AS col1 UNION ALL
    SELECT 1 AS col1 UNION ALL
    SELECT 2 AS col1 UNION ALL
    SELECT 3 AS col1 UNION ALL
    SELECT 4 AS col1 UNION ALL
    SELECT 5 AS col1 UNION ALL
    SELECT 6 AS col1 UNION ALL
    SELECT 7 AS col1 UNION ALL
    SELECT 8 AS col1 UNION ALL
    SELECT 9 AS col1
)

SELECT
    PERCENTILE(col1, ARRAY(0.1, 0.2, 0.5, 0.8))
FROM base
;

---------------------------------------------------------------------------
[0.9,1.8,4.5,7.2]

PERCENTILE_APPROX

求百分位数近似数的函数,其中第一个参数是所求的目标字段(必须为数值类型);第二个参数可以是一个百分位数,也可以是一组百分位数;第三个参数控制内存消耗的近似进度,数值越大精度越高,默认为 10000

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
WITH base AS(
    SELECT 0 AS col1 UNION ALL
    SELECT 1 AS col1 UNION ALL
    SELECT 2 AS col1 UNION ALL
    SELECT 3 AS col1 UNION ALL
    SELECT 4 AS col1 UNION ALL
    SELECT 5 AS col1 UNION ALL
    SELECT 6 AS col1 UNION ALL
    SELECT 7 AS col1 UNION ALL
    SELECT 8 AS col1 UNION ALL
    SELECT 9 AS col1
)

SELECT
    PERCENTILE_APPROX(col1, ARRAY(0.1, 0.2, 0.5, 0.8), 10000)
FROM base
;

---------------------------------------------------------------------------
[0.0,1.0,4.0,7.0]

REGEXP_REPLACE

正则表达式替换函数

1
2
3
4
SELECT REGEXP_REPLACE('foobar', 'oo|ar', '');

---------------------------------------------------------------------------
fb

组合用法

CONCAT_WS & COLLECT_SET

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
WITH user_info AS(
    SELECT 'zhangsan' AS username, 'lisi' AS password UNION ALL
    SELECT 'zhangsan' AS username, 'lisi' AS password
)

SELECT
    username,
    CONCAT_WS('|', COLLECT_LIST(password)) as password
FROM user_info
GROUP BY
    username
;

---------------------------------------------------------------------------
username    password
zhangsan    lisi|lisi

LATERAL VIEW & EXPLODE & SPLIT

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
WITH base AS(
    SELECT '1,2' AS col1, 'a,b,c' AS col2 UNION ALL
    SELECT '3,4' AS col1, 'd,e,f' AS col2
)

SELECT
    *
FROM base
LATERAL VIEW EXPLODE(SPLIT(col1, ',')) myTable1 AS myCol1
LATERAL VIEW EXPLODE(SPLIT(col2, ',')) myTable2 AS myCol2
;

---------------------------------------------------------------------------
col1   col2     mycol1   mycol2
1,2    a,b,c    1        a
1,2    a,b,c    1        b
1,2    a,b,c    1        c
1,2    a,b,c    2        a
1,2    a,b,c    2        b
1,2    a,b,c    2        c
3,4    d,e,f    3        d
3,4    d,e,f    3        e
3,4    d,e,f    3        f
3,4    d,e,f    4        d
3,4    d,e,f    4        e
3,4    d,e,f    4        f

其它函数

在网上搜集了一些写的不错的对于 Hive 函数讲解的博客,在这里就不一一列出这些函数了,另附上博客地址。

本文由作者按照 CC BY 4.0 进行授权