首页 hive窗口函数(二)
文章
取消

hive窗口函数(二)

继续补充 Hive 窗口函数。

CUME_DIST

  • CUME_DIST() 小于等于当前值的行数/分组内总行数

  • 应用场景:统计小于等于当前 pv 的 session,所占总 session 的比例

  • CUME_DIST() 不支持 WINDOW 子句

注意:这个函数的结果和排序有关,正序排序和倒序排序的结果完全相反。小于等于当前值,这个小于等于应该是相对的,正序排序是小于等于,倒序排序是大于等于。个人理解是分组内与当前值相等的最后一行的行号/分组内总行数。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
WITH cume_dist_pv AS(
    SELECT 'session1' AS sessionid,'2019-07-01' AS ctime,1 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01' AS ctime,3 AS pv UNION ALL
    SELECT 'session3' AS sessionid,'2019-07-01' AS ctime,5 AS pv UNION ALL
    SELECT 'session4' AS sessionid,'2019-07-01' AS ctime,7 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-02' AS ctime,2 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-02' AS ctime,4 AS pv UNION ALL
    SELECT 'session3' AS sessionid,'2019-07-02' AS ctime,4 AS pv UNION ALL
    SELECT 'session4' AS sessionid,'2019-07-02' AS ctime,8 AS pv
)

SELECT
    sessionid,
    ctime,
    pv,
    CUME_DIST() OVER(ORDER BY pv) AS rn1,
    CUME_DIST() OVER(PARTITION BY ctime ORDER BY pv ASC ) AS rn2,
    CUME_DIST() OVER(PARTITION BY ctime ORDER BY pv DESC) AS rn3
FROM cume_dist_pv
ORDER BY
    ctime,
    pv
;

--------------------------------------------------------------------------
sessionid   ctime         pv   rn1      rn2     rn3
session1    2019-07-01    1    0.125    0.25    1.0
session2    2019-07-01    3    0.375    0.5     0.75
session3    2019-07-01    5    0.75     0.75    0.5
session4    2019-07-01    7    0.875    1.0     0.25
session1    2019-07-02    2    0.25     0.25    1.0
session3    2019-07-02    4    0.625    0.75    0.75
session2    2019-07-02    4    0.625    0.75    0.75
session4    2019-07-02    8    1.0      1.0     0.25

PERCENT_RANK

  • PERCENT_RANK() 分组内当前行的 RANK 值 - 1 / 分组内总行数 - 1

  • PERCENT_RANK() 不支持 WINDOWS 子句

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
WITH percent_rank_pv AS(
    SELECT 'session1' AS sessionid,'2019-07-01' AS ctime,1 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01' AS ctime,3 AS pv UNION ALL
    SELECT 'session3' AS sessionid,'2019-07-01' AS ctime,5 AS pv UNION ALL
    SELECT 'session4' AS sessionid,'2019-07-01' AS ctime,7 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-02' AS ctime,2 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-02' AS ctime,4 AS pv UNION ALL
    SELECT 'session3' AS sessionid,'2019-07-02' AS ctime,4 AS pv UNION ALL
    SELECT 'session4' AS sessionid,'2019-07-02' AS ctime,6 AS pv UNION ALL
    SELECT 'session5' AS sessionid,'2019-07-02' AS ctime,8 AS pv
)

SELECT
    sessionid,
    ctime,
    pv,
    RANK() OVER(ORDER BY pv) AS rn11,           -- 分组内RANK值
    SUM(1) OVER(PARTITION BY NULL) AS rn12,     -- 分组内总行数
    PERCENT_RANK() OVER(ORDER BY pv) AS rn1,    -- 分组内PERCENT_RANK
    PERCENT_RANK() OVER(PARTITION BY sessionid ORDER BY pv) AS rn2
FROM percent_rank_pv
;

--------------------------------------------------------------------------
sessionid   ctime         pv   rn11 rn12 rn1      rn2
session1    2019-07-01    1    1    9    0.0      0.0
session1    2019-07-02    2    2    9    0.125    1.0
session2    2019-07-01    3    3    9    0.25     0.0
session2    2019-07-02    4    4    9    0.375    1.0
session3    2019-07-02    4    4    9    0.375    0.0
session3    2019-07-01    5    6    9    0.625    1.0
session4    2019-07-02    6    7    9    0.75     0.0
session4    2019-07-01    7    8    9    0.875    1.0
session5    2019-07-02    8    9    9    1.0      0.0

LAG

  • LAG(col, n, DEFAULT) 用于统计窗口内往上第 n 行值
    • 第一个参数为列名
    • 第二个参数为往上第n行(可选,默认为1)
    • 第三个参数为默认值(当往上第n行为NULL时候,取默认值,如不指定,则为NULL)
  • LAG(col, n, DEFAULT) 不支持 WINDOW 子句
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
WITH lag_url AS (
    SELECT 'session1' AS sessionid,'2019-07-01 00:05:00' AS ctime,'url_1' AS url UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-01 00:10:00' AS ctime,'url_3' AS url UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-01 00:15:00' AS ctime,'url_5' AS url UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-01 00:20:00' AS ctime,'url_7' AS url UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-01 00:25:00' AS ctime,'url_2' AS url UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-01 00:30:00' AS ctime,'url_4' AS url UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-01 00:35:00' AS ctime,'url_4' AS url UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01 00:40:00' AS ctime,'url_4' AS url UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01 00:45:00' AS ctime,'url_5' AS url UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01 00:50:00' AS ctime,'url_6' AS url UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01 00:55:00' AS ctime,'url_7' AS url UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01 01:00:00' AS ctime,'url_2' AS url UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01 00:05:00' AS ctime,'url_3' AS url UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01 00:10:00' AS ctime,'url_6' AS url
)

SELECT
    sessionid,
    ctime,
    url,
    ROW_NUMBER() OVER(PARTITION BY sessionid ORDER BY ctime) AS rn,
    LAG(ctime, 1, '1970-01-01 00:00:00') OVER(PARTITION BY sessionid ORDER BY ctime) AS last_1_time,
    LAG(ctime, 2) OVER(PARTITION BY sessionid ORDER BY ctime) AS last_2_time
FROM lag_url
ORDER BY sessionid,ctime
;

--------------------------------------------------------------------------
sessionid   ctime                  url      rn   last_1_time            last_2_time
session1    2019-07-01 00:05:00    url_1    1    1970-01-01 00:00:00    NULL
session1    2019-07-01 00:10:00    url_3    2    2019-07-01 00:05:00    NULL
session1    2019-07-01 00:15:00    url_5    3    2019-07-01 00:10:00    2019-07-01 00:05:00
session1    2019-07-01 00:20:00    url_7    4    2019-07-01 00:15:00    2019-07-01 00:10:00
session1    2019-07-01 00:25:00    url_2    5    2019-07-01 00:20:00    2019-07-01 00:15:00
session1    2019-07-01 00:30:00    url_4    6    2019-07-01 00:25:00    2019-07-01 00:20:00
session1    2019-07-01 00:35:00    url_4    7    2019-07-01 00:30:00    2019-07-01 00:25:00
session2    2019-07-01 00:05:00    url_3    1    1970-01-01 00:00:00    NULL
session2    2019-07-01 00:10:00    url_6    2    2019-07-01 00:05:00    NULL
session2    2019-07-01 00:40:00    url_4    3    2019-07-01 00:10:00    2019-07-01 00:05:00
session2    2019-07-01 00:45:00    url_5    4    2019-07-01 00:40:00    2019-07-01 00:10:00
session2    2019-07-01 00:50:00    url_6    5    2019-07-01 00:45:00    2019-07-01 00:40:00
session2    2019-07-01 00:55:00    url_7    6    2019-07-01 00:50:00    2019-07-01 00:45:00
session2    2019-07-01 01:00:00    url_2    7    2019-07-01 00:55:00    2019-07-01 00:50:00

LEAD

  • LEAD(col, n, DEFAULT) 用于统计窗口内往下第n行值,与 LAG 正好相反
    • 第一个参数为列名
    • 第二个参数为往下第n行(可选,默认为1)
    • 第三个参数为默认值(当往下第n行为NULL时候,取默认值,如不指定,则为NULL)
  • LAG(col, n, DEFAULT) 不支持 WINDOW 子句
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
WITH lead_url AS (
    SELECT 'session1' AS sessionid,'2019-07-01 00:05:00' AS ctime,'url_1' AS url UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-01 00:10:00' AS ctime,'url_3' AS url UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-01 00:15:00' AS ctime,'url_5' AS url UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-01 00:20:00' AS ctime,'url_7' AS url UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-01 00:25:00' AS ctime,'url_2' AS url UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-01 00:30:00' AS ctime,'url_4' AS url UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-01 00:35:00' AS ctime,'url_4' AS url UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01 00:40:00' AS ctime,'url_4' AS url UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01 00:45:00' AS ctime,'url_5' AS url UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01 00:50:00' AS ctime,'url_6' AS url UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01 00:55:00' AS ctime,'url_7' AS url UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01 01:00:00' AS ctime,'url_2' AS url UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01 00:05:00' AS ctime,'url_3' AS url UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01 00:10:00' AS ctime,'url_6' AS url
)

SELECT
    sessionid,
    ctime,
    url,
    ROW_NUMBER() OVER(PARTITION BY sessionid ORDER BY ctime) AS rn,
    LEAD(ctime, 1, '1970-01-01 00:00:00') OVER(PARTITION BY sessionid ORDER BY ctime) AS next_1_time,
    LEAD(ctime, 2) OVER(PARTITION BY sessionid ORDER BY ctime) AS next_2_time
FROM lead_url
ORDER BY sessionid,ctime
;

--------------------------------------------------------------------------
sessionid   ctime                  url      rn   next_1_time            next_2_time
session1    2019-07-01 00:05:00    url_1    1    2019-07-01 00:10:00    2019-07-01 00:15:00
session1    2019-07-01 00:10:00    url_3    2    2019-07-01 00:15:00    2019-07-01 00:20:00
session1    2019-07-01 00:15:00    url_5    3    2019-07-01 00:20:00    2019-07-01 00:25:00
session1    2019-07-01 00:20:00    url_7    4    2019-07-01 00:25:00    2019-07-01 00:30:00
session1    2019-07-01 00:25:00    url_2    5    2019-07-01 00:30:00    2019-07-01 00:35:00
session1    2019-07-01 00:30:00    url_4    6    2019-07-01 00:35:00    NULL
session1    2019-07-01 00:35:00    url_4    7    1970-01-01 00:00:00    NULL
session2    2019-07-01 00:05:00    url_3    1    2019-07-01 00:10:00    2019-07-01 00:40:00
session2    2019-07-01 00:10:00    url_6    2    2019-07-01 00:40:00    2019-07-01 00:45:00
session2    2019-07-01 00:40:00    url_4    3    2019-07-01 00:45:00    2019-07-01 00:50:00
session2    2019-07-01 00:45:00    url_5    4    2019-07-01 00:50:00    2019-07-01 00:55:00
session2    2019-07-01 00:50:00    url_6    5    2019-07-01 00:55:00    2019-07-01 01:00:00
session2    2019-07-01 00:55:00    url_7    6    2019-07-01 01:00:00    NULL
session2    2019-07-01 01:00:00    url_2    7    1970-01-01 00:00:00    NULL

FIRST_VALUE 和 LAST_VALUE

  • FIRST_VALUE() 取分组内排序后,截止到当前行,第一个值

  • LAST_VALUE() 取分组内排序后,截止到当前行,最后一个值

  • 如果不指定 ORDER BY,则默认按照记录在文件中的偏移量进行排序,会出现错误的结果

  • FIRST_VALUE() 和 LAST_VALUE() 均不能使用 WINDOWS 子句

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
WITH value_url AS (
    SELECT 'session1' AS sessionid,'2019-07-01 00:05:00' AS ctime,'url_1' AS url UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-01 00:10:00' AS ctime,'url_3' AS url UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-01 00:15:00' AS ctime,'url_5' AS url UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-01 00:20:00' AS ctime,'url_7' AS url UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-01 00:25:00' AS ctime,'url_2' AS url UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-01 00:30:00' AS ctime,'url_4' AS url UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-01 00:35:00' AS ctime,'url_4' AS url UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01 00:40:00' AS ctime,'url_4' AS url UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01 00:45:00' AS ctime,'url_5' AS url UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01 00:50:00' AS ctime,'url_6' AS url UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01 00:55:00' AS ctime,'url_7' AS url UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01 01:00:00' AS ctime,'url_2' AS url UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01 00:05:00' AS ctime,'url_3' AS url UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01 00:10:00' AS ctime,'url_6' AS url
)

SELECT
    sessionid,
    ctime,
    url,
    ROW_NUMBER() OVER(PARTITION BY sessionid ORDER BY ctime) AS rn,
    FIRST_VALUE(url) OVER(PARTITION BY sessionid ORDER BY ctime) AS first_value,
    LAST_VALUE(url)  OVER(PARTITION BY sessionid ORDER BY ctime) AS last_value1,
    -- 如果想要取分组内排序后最后一个值,则需要变通一下
    FIRST_VALUE(url)  OVER(PARTITION BY sessionid ORDER BY ctime desc) AS last_value2
FROM value_url
ORDER BY sessionid,ctime
;

--------------------------------------------------------------------------
sessionid   ctime                  url      rn   first_value  last_value1  last_value2
session1    2019-07-01 00:05:00    url_1    1    url_1        url_1        url_4
session1    2019-07-01 00:10:00    url_3    2    url_1        url_3        url_4
session1    2019-07-01 00:15:00    url_5    3    url_1        url_5        url_4
session1    2019-07-01 00:20:00    url_7    4    url_1        url_7        url_4
session1    2019-07-01 00:25:00    url_2    5    url_1        url_2        url_4
session1    2019-07-01 00:30:00    url_4    6    url_1        url_4        url_4
session1    2019-07-01 00:35:00    url_4    7    url_1        url_4        url_4
session2    2019-07-01 00:05:00    url_3    1    url_3        url_3        url_2
session2    2019-07-01 00:10:00    url_6    2    url_3        url_6        url_2
session2    2019-07-01 00:40:00    url_4    3    url_3        url_4        url_2
session2    2019-07-01 00:45:00    url_5    4    url_3        url_5        url_2
session2    2019-07-01 00:50:00    url_6    5    url_3        url_6        url_2
session2    2019-07-01 00:55:00    url_7    6    url_3        url_7        url_2
session2    2019-07-01 01:00:00    url_2    7    url_3        url_2        url_2

GROUPING SETS

Hive 官方文档

GROUPING SETS 在做多维度汇总的时候还是蛮有用的,可以在一个 GROUP BY 查询中,根据不同的维度组合进行聚合,等价于将不同维度的 GROUP BY 结果集进行 UNION ALL。

GROUPING__ID

GROUPING__ID 是用来区分结果数据是哪一个分组集合的。

使用 GROUPING SETS 时,非聚合列的值为 NULL,但是也会有聚合列本身就有 NULL 值的情况,这样会出现歧义。需要有方法来识别是非聚合列的 NULL 还是本身就会有 NULL。 GROUPING__ID 函数可以解决这个问题,此函数会返回对应于每个列是否存在的位向量。对于每一列,如果在该行中聚合了该列,则该列的位向量为 1 否则为 0,最后以十进制的形式展现出来。 值得注意的是 GROUPING 和 ID 之间是双下划线

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
-- 设置最大分组个数
set hive.new.job.grouping.set.cardinality=128;

WITH grouping_set_pv AS (
    SELECT 'session1' AS sessionid,'2019-07-01' AS ctime,1 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-02' AS ctime,3 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-03' AS ctime,5 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-06' AS ctime,4 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-07' AS ctime,4 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01' AS ctime,4 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-02' AS ctime,5 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-03' AS ctime,6 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-06' AS ctime,3 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-07' AS ctime,6 AS pv
)

SELECT
    sessionid,
    ctime,
    SUM(pv) AS total_pv,
    GROUPING__ID
FROM grouping_set_pv
GROUP BY
    sessionid,
    ctime
GROUPING SETS(
    (),
    sessionid,
    ctime,
    (sessionid,ctime)
)
ORDER BY GROUPING__ID
;

-- 相当于
SELECT NULL,      NULL,  SUM(pv) AS total_pv, 0 AS grouping__id FROM grouping_set_pv
UNION ALL
SELECT sessionid, NULL,  SUM(pv) AS total_pv, 1 AS grouping__id FROM grouping_set_pv GROUP BY sessionid
UNION ALL
SELECT NULL,      ctime, SUM(pv) AS total_pv, 2 AS grouping__id FROM grouping_set_pv GROUP BY ctime
UNION ALL
SELECT sessionid, ctime, SUM(pv) AS total_pv, 3 AS grouping__id FROM grouping_set_pv GROUP BY sessionid, ctime
;

--------------------------------------------------------------------------
sessionid   ctime         total_pv  grouping__id
NULL        NULL          59        0
session1    NULL          26        1
session2    NULL          33        1
NULL        2019-07-02    8         2
NULL        2019-07-01    5         2
NULL        2019-07-07    10        2
NULL        2019-07-03    11        2
NULL        2019-07-04    14        2
NULL        2019-07-05    4         2
NULL        2019-07-06    7         2
session1    2019-07-02    3         3
session1    2019-07-03    5         3
session2    2019-07-05    2         3
session2    2019-07-01    4         3
session1    2019-07-05    2         3
session2    2019-07-03    6         3
session2    2019-07-07    6         3
session1    2019-07-04    7         3
session2    2019-07-04    7         3
session2    2019-07-06    3         3
session1    2019-07-07    4         3
session2    2019-07-02    5         3
session1    2019-07-06    4         3
session1    2019-07-01    1         3

CUBE

根据 GROUP BY 的维度的所有组合进行聚合。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
WITH grouping_set_pv AS (
    SELECT 'session1' AS sessionid,'2019-07-01' AS ctime,1 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-02' AS ctime,3 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-03' AS ctime,5 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-06' AS ctime,4 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-07' AS ctime,4 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01' AS ctime,4 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-02' AS ctime,5 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-03' AS ctime,6 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-06' AS ctime,3 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-07' AS ctime,6 AS pv
)

SELECT
    sessionid,
    ctime,
    SUM(pv) AS total_pv,
    GROUPING__ID
FROM grouping_set_pv
GROUP BY
    sessionid,
    ctime
WITH CUBE
ORDER BY GROUPING__ID
;

-- 相当于
SELECT NULL,      NULL,  SUM(pv) AS total_pv, 0 AS grouping__id FROM grouping_set_pv
UNION ALL
SELECT sessionid, NULL,  SUM(pv) AS total_pv, 1 AS grouping__id FROM grouping_set_pv GROUP BY sessionid
UNION ALL
SELECT NULL,      ctime, SUM(pv) AS total_pv, 2 AS grouping__id FROM grouping_set_pv GROUP BY ctime
UNION ALL
SELECT sessionid, ctime, SUM(pv) AS total_pv, 3 AS grouping__id FROM grouping_set_pv GROUP BY sessionid, ctime
;

--------------------------------------------------------------------------
sessionid   ctime         total_pv  grouping__id
NULL        NULL          59        0
session1    NULL          26        1
session2    NULL          33        1
NULL        2019-07-02    8         2
NULL        2019-07-01    5         2
NULL        2019-07-07    10        2
NULL        2019-07-03    11        2
NULL        2019-07-04    14        2
NULL        2019-07-05    4         2
NULL        2019-07-06    7         2
session1    2019-07-02    3         3
session1    2019-07-03    5         3
session2    2019-07-05    2         3
session2    2019-07-01    4         3
session1    2019-07-05    2         3
session2    2019-07-03    6         3
session2    2019-07-07    6         3
session1    2019-07-04    7         3
session2    2019-07-04    7         3
session2    2019-07-06    3         3
session1    2019-07-07    4         3
session2    2019-07-02    5         3
session1    2019-07-06    4         3
session1    2019-07-01    1         3

ROLLUP

是 CUBE 的子集,以最左侧的维度为主,从该维度进行层级聚合。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
WITH grouping_set_pv AS (
    SELECT 'session1' AS sessionid,'2019-07-01' AS ctime,1 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-02' AS ctime,3 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-03' AS ctime,5 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-06' AS ctime,4 AS pv UNION ALL
    SELECT 'session1' AS sessionid,'2019-07-07' AS ctime,4 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-01' AS ctime,4 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-02' AS ctime,5 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-03' AS ctime,6 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-06' AS ctime,3 AS pv UNION ALL
    SELECT 'session2' AS sessionid,'2019-07-07' AS ctime,6 AS pv
)

SELECT
    sessionid,
    ctime,
    SUM(pv) AS total_pv,
    GROUPING__ID
FROM grouping_set_pv
GROUP BY
    sessionid,
    ctime
WITH ROLLUP
ORDER BY GROUPING__ID
;

-- 相当于
SELECT NULL,      NULL,  SUM(pv) AS total_pv, 0 AS grouping__id FROM grouping_set_pv
UNION ALL
SELECT sessionid, NULL,  SUM(pv) AS total_pv, 1 AS grouping__id FROM grouping_set_pv GROUP BY sessionid
UNION ALL
SELECT sessionid, ctime, SUM(pv) AS total_pv, 3 AS grouping__id FROM grouping_set_pv GROUP BY sessionid, ctime
;

--------------------------------------------------------------------------
sessionid   ctime         total_pv  grouping__id
NULL        NULL          59        0
session1    NULL          26        1
session2    NULL          33        1
session1    2019-07-05    2         3
session1    2019-07-02    3         3
session2    2019-07-04    7         3
session2    2019-07-05    2         3
session2    2019-07-02    5         3
session1    2019-07-04    7         3
session2    2019-07-03    6         3
session2    2019-07-01    4         3
session1    2019-07-07    4         3
session1    2019-07-03    5         3
session2    2019-07-06    3         3
session1    2019-07-06    4         3
session1    2019-07-01    1         3
session2    2019-07-07    6         3
本文由作者按照 CC BY 4.0 进行授权