首页 hive窗口函数(一)
文章
取消

hive窗口函数(一)

Hive 中提供了越来越多的分析函数,用于完成负责的统计分析。利用这些分析函数与窗口函数结合,可以快速、方便的做一些聚合、统计等运算。

SUM

  1. 需要注意的是:结果和 ORDER BY 有关,默认为升序

  2. 如果不指定 ROWS BETWEEN,默认为从起点到当前行

  3. 如果不指定 ORDER BY,则将分组内所有值累加

  4. 关键是理解 ROWS BETWEEN 含义,也叫做 WINDOW 子句:

  • PRECEDING:往前

  • FOLLOWING:往后

  • CURRENT ROW:当前行

  • UNBOUNDED:起点

    • UNBOUNDED PRECEDING 表示从前面的起点
    • UNBOUNDED FOLLOWING:表示到后面的终点
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
WITH sum_pv AS (
    SELECT 'session1' AS sessionid,'2019-07-01' AS ctime,1 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-02' AS ctime,3 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-03' AS ctime,5 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-06' AS ctime,4 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-07' AS ctime,4 AS pv
)

SELECT
    sessionid,
    ctime,
    pv,
    -- 分组内所有行
    SUM(pv) OVER(PARTITION BY sessionid) AS pv1,
    -- 从起点到当前行(默认)
    SUM(pv) OVER(PARTITION BY sessionid ORDER BY ctime) AS pv2,
    -- 从起点到当前行,结果同 pv2
    SUM(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pv3,
    -- 当前行 + 往前3行
    SUM(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS pv4,
    -- 当前行 + 往前3行 + 往后1行
    SUM(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING) AS pv5,
    -- 当前行 + 往后所有行
    SUM(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS pv6
FROM sum_pv
ORDER BY ctime
;

--------------------------------------------------------------------------
sessionid   ctime         pv   pv1   pv2   pv3   pv4   pv5   pv6
session1    2019-07-01    1    26    1     1     1     4     26
session1    2019-07-02    3    26    4     4     4     9     25
session1    2019-07-03    5    26    9     9     9     16    22
session1    2019-07-04    7    26    16    16    16    18    17
session1    2019-07-05    2    26    18    18    17    21    10
session1    2019-07-06    4    26    22    22    18    22    8
session1    2019-07-07    4    26    26    26    17    17    4
  • pv1: 分组内(session1)所有的 pv 累加。

  • pv2: 分组内从起点到当前行的 pv 累加。例如:2号的pv2 = 1号的pv + 2号的pv。

  • pv3: 同pv2。

  • pv4: 分组内当前行 + 往前3行。例如:2号 = 1号 + 2号, 5号 = 2号 + 3号 + 4号 + 5号。

  • pv5: 分组内当前行 + 往前3行 + 往后1行。例如:5号 = 2号 + 3号 + 4号 + 5号 + 6号

  • pv6: 分组内当前行 + 往后所有行。例如:5号 = 5号 + 6号 + 7号

AVG

用法和 sum 相同

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
WITH avg_pv AS (
    SELECT 'session1' AS sessionid,'2019-07-01' AS ctime,1 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-02' AS ctime,3 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-03' AS ctime,5 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-06' AS ctime,4 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-07' AS ctime,4 AS pv
)

SELECT
    sessionid,
    ctime,
    pv,
    -- 分组内所有行
    AVG(pv) OVER(PARTITION BY sessionid) AS pv1,
    -- 从起点到当前行(默认)
    AVG(pv) OVER(PARTITION BY sessionid ORDER BY ctime) AS pv2,
    -- 从起点到当前行,结果同 pv2
    AVG(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pv3,
    -- 当前行 + 往前3行
    AVG(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS pv4,
    -- 当前行 + 往前3行 + 往后1行
    AVG(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING) AS pv5,
    -- 当前行 + 往后所有行
    AVG(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS pv6
FROM avg_pv
ORDER BY ctime
;

--------------------------------------------------------------------------
-- 由于结果小数比较多,所以手动保留了两位(四舍五入)
sessionid   ctime         pv   pv1     pv2     pv3     pv4     pv5     pv6
session1    2019-07-01    1    3.71    1.0     1.0     1.0     2.0     3.71
session1    2019-07-02    3    3.71    2.0     2.0     2.0     3.0     4.17
session1    2019-07-03    5    3.71    3.0     3.0     3.0     4.0     4.4
session1    2019-07-04    7    3.71    4.0     4.0     4.0     3.6     4.25
session1    2019-07-05    2    3.71    3.6     3.6     4.25    4.2     3.33
session1    2019-07-06    4    3.71    3.67    3.67    4.5     4.4     4.0
session1    2019-07-07    4    3.71    3.71    3.71    4.25    4.25    4.0

MIN

用法和 sum 相同

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
WITH min_pv AS (
    SELECT 'session1' AS sessionid,'2019-07-01' AS ctime,1 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-02' AS ctime,3 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-03' AS ctime,5 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-06' AS ctime,4 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-07' AS ctime,4 AS pv
)

SELECT
    sessionid,
    ctime,
    pv,
    -- 分组内所有行
    MIN(pv) OVER(PARTITION BY sessionid) AS pv1,
    -- 从起点到当前行(默认)
    MIN(pv) OVER(PARTITION BY sessionid ORDER BY ctime) AS pv2,
    -- 从起点到当前行,结果同 pv2
    MIN(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pv3,
    -- 当前行 + 往前3行
    MIN(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS pv4,
    -- 当前行 + 往前3行 + 往后1行
    MIN(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING) AS pv5,
    -- 当前行 + 往后所有行
    MIN(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS pv6
FROM min_pv
ORDER BY ctime
;

--------------------------------------------------------------------------
sessionid   ctime         pv   pv1  pv2  pv3  pv4  pv5  pv6
session1    2019-07-01    1    1    1    1    1    1    1
session1    2019-07-02    3    1    1    1    1    1    2
session1    2019-07-03    5    1    1    1    1    1    2
session1    2019-07-04    7    1    1    1    1    1    2
session1    2019-07-05    2    1    1    1    2    2    2
session1    2019-07-06    4    1    1    1    2    2    4
session1    2019-07-07    4    1    1    1    2    2    4

MAX

用法和 sum 相同

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
WITH max_pv AS (
    SELECT 'session1' AS sessionid,'2019-07-01' AS ctime,1 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-02' AS ctime,3 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-03' AS ctime,5 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-06' AS ctime,4 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-07' AS ctime,4 AS pv
)

SELECT
    sessionid,
    ctime,
    pv,
    -- 分组内所有行
    MAX(pv) OVER(PARTITION BY sessionid) AS pv1,
    -- 从起点到当前行(默认)
    MAX(pv) OVER(PARTITION BY sessionid ORDER BY ctime) AS pv2,
    -- 从起点到当前行,结果同 pv2
    MAX(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pv3,
    -- 当前行 + 往前3行
    MAX(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS pv4,
    -- 当前行 + 往前3行 + 往后1行
    MAX(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING) AS pv5,
    -- 当前行 + 往后所有行
    MAX(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS pv6
FROM max_pv
ORDER BY ctime
;

--------------------------------------------------------------------------
sessionid   ctime         pv   pv1  pv2  pv3  pv4  pv5  pv6
session1    2019-07-01    1    7    1    1    1    3    7
session1    2019-07-02    3    7    3    3    3    5    7
session1    2019-07-03    5    7    5    5    5    7    7
session1    2019-07-04    7    7    7    7    7    7    7
session1    2019-07-05    2    7    7    7    7    7    4
session1    2019-07-06    4    7    7    7    7    7    4
session1    2019-07-07    4    7    7    7    7    7    4

NTILE

  • NTILE(n),用于将分组数据按照顺序切分成n片,返回当前切片值

  • 如果切片不均匀,默认会对第一个分组、第二个分组、第三个分组等依次加一

  • NTILE() 不支持 WINDOW 子句

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
WITH ntile_pv AS (
    SELECT 'session1' AS sessionid,'2019-07-01' AS ctime,1 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-02' AS ctime,3 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-03' AS ctime,5 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-06' AS ctime,4 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-07' AS ctime,4 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01' AS ctime,4 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-02' AS ctime,5 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-03' AS ctime,6 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-06' AS ctime,3 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-07' AS ctime,6 AS pv
)

SELECT
    sessionid,
    ctime,
    pv,
    NTILE(2) OVER(PARTITION BY sessionid ORDER BY ctime) AS rn1,
    NTILE(3) OVER(PARTITION BY sessionid ORDER BY ctime) AS rn2,
    NTILE(4) OVER(PARTITION BY sessionid ORDER BY ctime) AS rn3,
    NTILE(4) OVER(ORDER BY ctime) AS rn4
FROM ntile_pv
ORDER BY sessionid,ctime
;

--------------------------------------------------------------------------
sessionid   ctime         pv   rn1  rn2  rn3  rn4
session1    2019-07-01    1    1    1    1    1
session1    2019-07-02    3    1    1    1    1
session1    2019-07-03    5    1    1    2    2
session1    2019-07-04    7    1    2    2    2
session1    2019-07-05    2    2    2    3    3
session1    2019-07-06    4    2    3    3    4
session1    2019-07-07    4    2    3    4    4
session2    2019-07-01    4    1    1    1    1
session2    2019-07-02    5    1    1    1    1
session2    2019-07-03    6    1    1    2    2
session2    2019-07-04    7    1    2    2    2
session2    2019-07-05    2    2    2    3    3
session2    2019-07-06    3    2    3    3    3
session2    2019-07-07    6    2    3    4    4

例如:统计一个session,pv 数最多的前 1/3 的天

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
WITH ntile_pv AS (
    SELECT 'session1' AS sessionid,'2019-07-01' AS ctime,1 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-02' AS ctime,3 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-03' AS ctime,5 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-06' AS ctime,4 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-07' AS ctime,4 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01' AS ctime,4 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-02' AS ctime,5 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-03' AS ctime,6 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-06' AS ctime,3 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-07' AS ctime,6 AS pv
)

SELECT
    sessionid,
    ctime,
    pv,
    NTILE(3) OVER(PARTITION BY sessionid ORDER BY pv DESC) AS rn
FROM ntile_pv
ORDER BY sessionid,ctime
;

--------------------------------------------------------------------------
-- rn = 1 的记录,就是我们想要的结果
sessionid   ctime         pv   rn
session1    2019-07-01    1    3
session1    2019-07-02    3    2
session1    2019-07-03    5    1
session1    2019-07-04    7    1
session1    2019-07-05    2    3
session1    2019-07-06    4    2
session1    2019-07-07    4    1
session2    2019-07-01    4    2
session2    2019-07-02    5    2
session2    2019-07-03    6    1
session2    2019-07-04    7    1
session2    2019-07-05    2    3
session2    2019-07-06    3    3
session2    2019-07-07    6    1

ROW_NUMBER

  • ROW_NUMBER() 从1开始,按照顺序,生成分组内记录的序列

  • ROW_NUMBER() 应用场景很多,例如:按照pv降序排列,生成分组内每天的pv名次;获取分组内排序第一的记录;获取一个session中的第一条refer等

  • ROW_NUMBER() 不支持 WINDOW 子句

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
WITH row_number_pv AS (
    SELECT 'session1' AS sessionid,'2019-07-01' AS ctime,1 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-02' AS ctime,3 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-03' AS ctime,5 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-06' AS ctime,4 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-07' AS ctime,4 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01' AS ctime,4 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-02' AS ctime,5 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-03' AS ctime,6 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-06' AS ctime,3 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-07' AS ctime,6 AS pv
)

SELECT
    sessionid,
    ctime,
    pv,
    ROW_NUMBER() OVER(PARTITION BY sessionid ORDER BY pv desc) AS rn1,
    ROW_NUMBER() OVER(ORDER BY pv desc) AS rn2
FROM row_number_pv
ORDER BY sessionid,ctime
;

--------------------------------------------------------------------------
sessionid   ctime         pv   rn1  rn2
session1    2019-07-01    1    7    14
session1    2019-07-02    3    5    11
session1    2019-07-03    5    2    5
session1    2019-07-04    7    1    2
session1    2019-07-05    2    6    13
session1    2019-07-06    4    4    9
session1    2019-07-07    4    3    7
session2    2019-07-01    4    5    8
session2    2019-07-02    5    4    6
session2    2019-07-03    6    2    4
session2    2019-07-04    7    1    1
session2    2019-07-05    2    7    12
session2    2019-07-06    3    6    10
session2    2019-07-07    6    3    3

RANK 和 DENSE_RANK

  • RANK() 生成数据项在分组中的排名,排名相等会在名次中留下空位

  • DENSE_RANK() 生成数据项在分组中的排名,排名相等会在名次中不会留下空位

  • RANK() 和 DENSE_RANK() 均不支持 WINDOW 子句

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
WITH rank_pv AS (
    SELECT 'session1' AS sessionid,'2019-07-01' AS ctime,1 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-02' AS ctime,3 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-03' AS ctime,5 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-06' AS ctime,4 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-07' AS ctime,4 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01' AS ctime,4 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-02' AS ctime,5 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-03' AS ctime,6 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-06' AS ctime,3 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-07' AS ctime,6 AS pv
)

SELECT
    sessionid,
    ctime,
    pv,
    RANK() OVER(PARTITION BY sessionid ORDER BY pv desc) AS rn1,
    DENSE_RANK() OVER(PARTITION BY sessionid ORDER BY pv desc) AS rn2,
    ROW_NUMBER() OVER(PARTITION BY sessionid ORDER BY pv DESC) AS rn3
FROM rank_pv
WHERE sessionid = 'session1'
ORDER BY sessionid,ctime
;

--------------------------------------------------------------------------
sessionid   ctime         pv   rn1  rn2  rn3
session1    2019-07-01    1    7    6    7
session1    2019-07-02    3    5    4    5
session1    2019-07-03    5    2    2    2
session1    2019-07-04    7    1    1    1
session1    2019-07-05    2    6    5    6
session1    2019-07-06    4    3    3    4
session1    2019-07-07    4    3    3    3
  • rn1: 6号和7号并列第3, 2号排第5

  • rn2: 6号和7号并列第3, 2号排第4

  • rn3: 如果两个值相等,也会生成唯一的次序,具体是按照什么规则把某一个值排在前面还没有深究

本文由作者按照 CC BY 4.0 进行授权