继续补充 Hive 窗口函数。
CUME_DIST
CUME_DIST() 小于等于当前值的行数/分组内总行数
应用场景:统计小于等于当前 pv 的 session,所占总 session 的比例
CUME_DIST() 不支持 WINDOW 子句
注意:这个函数的结果和排序有关,正序排序和倒序排序的结果完全相反。小于等于当前值,这个小于等于应该是相对的,正序排序是小于等于,倒序排序是大于等于。个人理解是分组内与当前值相等的最后一行的行号/分组内总行数。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
WITH cume_dist_pv AS(
SELECT 'session1' AS sessionid,'2019-07-01' AS ctime,1 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01' AS ctime,3 AS pv UNION ALL
SELECT 'session3' AS sessionid,'2019-07-01' AS ctime,5 AS pv UNION ALL
SELECT 'session4' AS sessionid,'2019-07-01' AS ctime,7 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-02' AS ctime,2 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-02' AS ctime,4 AS pv UNION ALL
SELECT 'session3' AS sessionid,'2019-07-02' AS ctime,4 AS pv UNION ALL
SELECT 'session4' AS sessionid,'2019-07-02' AS ctime,8 AS pv
)
SELECT
sessionid,
ctime,
pv,
CUME_DIST() OVER(ORDER BY pv) AS rn1,
CUME_DIST() OVER(PARTITION BY ctime ORDER BY pv ASC ) AS rn2,
CUME_DIST() OVER(PARTITION BY ctime ORDER BY pv DESC) AS rn3
FROM cume_dist_pv
ORDER BY
ctime,
pv
;
--------------------------------------------------------------------------
sessionid ctime pv rn1 rn2 rn3
session1 2019-07-01 1 0.125 0.25 1.0
session2 2019-07-01 3 0.375 0.5 0.75
session3 2019-07-01 5 0.75 0.75 0.5
session4 2019-07-01 7 0.875 1.0 0.25
session1 2019-07-02 2 0.25 0.25 1.0
session3 2019-07-02 4 0.625 0.75 0.75
session2 2019-07-02 4 0.625 0.75 0.75
session4 2019-07-02 8 1.0 1.0 0.25
PERCENT_RANK
PERCENT_RANK() 分组内当前行的 RANK 值 - 1 / 分组内总行数 - 1
PERCENT_RANK() 不支持 WINDOWS 子句
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
WITH percent_rank_pv AS(
SELECT 'session1' AS sessionid,'2019-07-01' AS ctime,1 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01' AS ctime,3 AS pv UNION ALL
SELECT 'session3' AS sessionid,'2019-07-01' AS ctime,5 AS pv UNION ALL
SELECT 'session4' AS sessionid,'2019-07-01' AS ctime,7 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-02' AS ctime,2 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-02' AS ctime,4 AS pv UNION ALL
SELECT 'session3' AS sessionid,'2019-07-02' AS ctime,4 AS pv UNION ALL
SELECT 'session4' AS sessionid,'2019-07-02' AS ctime,6 AS pv UNION ALL
SELECT 'session5' AS sessionid,'2019-07-02' AS ctime,8 AS pv
)
SELECT
sessionid,
ctime,
pv,
RANK() OVER(ORDER BY pv) AS rn11, -- 分组内RANK值
SUM(1) OVER(PARTITION BY NULL) AS rn12, -- 分组内总行数
PERCENT_RANK() OVER(ORDER BY pv) AS rn1, -- 分组内PERCENT_RANK
PERCENT_RANK() OVER(PARTITION BY sessionid ORDER BY pv) AS rn2
FROM percent_rank_pv
;
--------------------------------------------------------------------------
sessionid ctime pv rn11 rn12 rn1 rn2
session1 2019-07-01 1 1 9 0.0 0.0
session1 2019-07-02 2 2 9 0.125 1.0
session2 2019-07-01 3 3 9 0.25 0.0
session2 2019-07-02 4 4 9 0.375 1.0
session3 2019-07-02 4 4 9 0.375 0.0
session3 2019-07-01 5 6 9 0.625 1.0
session4 2019-07-02 6 7 9 0.75 0.0
session4 2019-07-01 7 8 9 0.875 1.0
session5 2019-07-02 8 9 9 1.0 0.0
LAG
- LAG(col, n, DEFAULT) 用于统计窗口内往上第 n 行值
- 第一个参数为列名
- 第二个参数为往上第n行(可选,默认为1)
- 第三个参数为默认值(当往上第n行为NULL时候,取默认值,如不指定,则为NULL)
- LAG(col, n, DEFAULT) 不支持 WINDOW 子句
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
WITH lag_url AS (
SELECT 'session1' AS sessionid,'2019-07-01 00:05:00' AS ctime,'url_1' AS url UNION ALL
SELECT 'session1' AS sessionid,'2019-07-01 00:10:00' AS ctime,'url_3' AS url UNION ALL
SELECT 'session1' AS sessionid,'2019-07-01 00:15:00' AS ctime,'url_5' AS url UNION ALL
SELECT 'session1' AS sessionid,'2019-07-01 00:20:00' AS ctime,'url_7' AS url UNION ALL
SELECT 'session1' AS sessionid,'2019-07-01 00:25:00' AS ctime,'url_2' AS url UNION ALL
SELECT 'session1' AS sessionid,'2019-07-01 00:30:00' AS ctime,'url_4' AS url UNION ALL
SELECT 'session1' AS sessionid,'2019-07-01 00:35:00' AS ctime,'url_4' AS url UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01 00:40:00' AS ctime,'url_4' AS url UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01 00:45:00' AS ctime,'url_5' AS url UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01 00:50:00' AS ctime,'url_6' AS url UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01 00:55:00' AS ctime,'url_7' AS url UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01 01:00:00' AS ctime,'url_2' AS url UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01 00:05:00' AS ctime,'url_3' AS url UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01 00:10:00' AS ctime,'url_6' AS url
)
SELECT
sessionid,
ctime,
url,
ROW_NUMBER() OVER(PARTITION BY sessionid ORDER BY ctime) AS rn,
LAG(ctime, 1, '1970-01-01 00:00:00') OVER(PARTITION BY sessionid ORDER BY ctime) AS last_1_time,
LAG(ctime, 2) OVER(PARTITION BY sessionid ORDER BY ctime) AS last_2_time
FROM lag_url
ORDER BY sessionid,ctime
;
--------------------------------------------------------------------------
sessionid ctime url rn last_1_time last_2_time
session1 2019-07-01 00:05:00 url_1 1 1970-01-01 00:00:00 NULL
session1 2019-07-01 00:10:00 url_3 2 2019-07-01 00:05:00 NULL
session1 2019-07-01 00:15:00 url_5 3 2019-07-01 00:10:00 2019-07-01 00:05:00
session1 2019-07-01 00:20:00 url_7 4 2019-07-01 00:15:00 2019-07-01 00:10:00
session1 2019-07-01 00:25:00 url_2 5 2019-07-01 00:20:00 2019-07-01 00:15:00
session1 2019-07-01 00:30:00 url_4 6 2019-07-01 00:25:00 2019-07-01 00:20:00
session1 2019-07-01 00:35:00 url_4 7 2019-07-01 00:30:00 2019-07-01 00:25:00
session2 2019-07-01 00:05:00 url_3 1 1970-01-01 00:00:00 NULL
session2 2019-07-01 00:10:00 url_6 2 2019-07-01 00:05:00 NULL
session2 2019-07-01 00:40:00 url_4 3 2019-07-01 00:10:00 2019-07-01 00:05:00
session2 2019-07-01 00:45:00 url_5 4 2019-07-01 00:40:00 2019-07-01 00:10:00
session2 2019-07-01 00:50:00 url_6 5 2019-07-01 00:45:00 2019-07-01 00:40:00
session2 2019-07-01 00:55:00 url_7 6 2019-07-01 00:50:00 2019-07-01 00:45:00
session2 2019-07-01 01:00:00 url_2 7 2019-07-01 00:55:00 2019-07-01 00:50:00
LEAD
- LEAD(col, n, DEFAULT) 用于统计窗口内往下第n行值,与 LAG 正好相反
- 第一个参数为列名
- 第二个参数为往下第n行(可选,默认为1)
- 第三个参数为默认值(当往下第n行为NULL时候,取默认值,如不指定,则为NULL)
- LAG(col, n, DEFAULT) 不支持 WINDOW 子句
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
WITH lead_url AS (
SELECT 'session1' AS sessionid,'2019-07-01 00:05:00' AS ctime,'url_1' AS url UNION ALL
SELECT 'session1' AS sessionid,'2019-07-01 00:10:00' AS ctime,'url_3' AS url UNION ALL
SELECT 'session1' AS sessionid,'2019-07-01 00:15:00' AS ctime,'url_5' AS url UNION ALL
SELECT 'session1' AS sessionid,'2019-07-01 00:20:00' AS ctime,'url_7' AS url UNION ALL
SELECT 'session1' AS sessionid,'2019-07-01 00:25:00' AS ctime,'url_2' AS url UNION ALL
SELECT 'session1' AS sessionid,'2019-07-01 00:30:00' AS ctime,'url_4' AS url UNION ALL
SELECT 'session1' AS sessionid,'2019-07-01 00:35:00' AS ctime,'url_4' AS url UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01 00:40:00' AS ctime,'url_4' AS url UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01 00:45:00' AS ctime,'url_5' AS url UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01 00:50:00' AS ctime,'url_6' AS url UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01 00:55:00' AS ctime,'url_7' AS url UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01 01:00:00' AS ctime,'url_2' AS url UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01 00:05:00' AS ctime,'url_3' AS url UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01 00:10:00' AS ctime,'url_6' AS url
)
SELECT
sessionid,
ctime,
url,
ROW_NUMBER() OVER(PARTITION BY sessionid ORDER BY ctime) AS rn,
LEAD(ctime, 1, '1970-01-01 00:00:00') OVER(PARTITION BY sessionid ORDER BY ctime) AS next_1_time,
LEAD(ctime, 2) OVER(PARTITION BY sessionid ORDER BY ctime) AS next_2_time
FROM lead_url
ORDER BY sessionid,ctime
;
--------------------------------------------------------------------------
sessionid ctime url rn next_1_time next_2_time
session1 2019-07-01 00:05:00 url_1 1 2019-07-01 00:10:00 2019-07-01 00:15:00
session1 2019-07-01 00:10:00 url_3 2 2019-07-01 00:15:00 2019-07-01 00:20:00
session1 2019-07-01 00:15:00 url_5 3 2019-07-01 00:20:00 2019-07-01 00:25:00
session1 2019-07-01 00:20:00 url_7 4 2019-07-01 00:25:00 2019-07-01 00:30:00
session1 2019-07-01 00:25:00 url_2 5 2019-07-01 00:30:00 2019-07-01 00:35:00
session1 2019-07-01 00:30:00 url_4 6 2019-07-01 00:35:00 NULL
session1 2019-07-01 00:35:00 url_4 7 1970-01-01 00:00:00 NULL
session2 2019-07-01 00:05:00 url_3 1 2019-07-01 00:10:00 2019-07-01 00:40:00
session2 2019-07-01 00:10:00 url_6 2 2019-07-01 00:40:00 2019-07-01 00:45:00
session2 2019-07-01 00:40:00 url_4 3 2019-07-01 00:45:00 2019-07-01 00:50:00
session2 2019-07-01 00:45:00 url_5 4 2019-07-01 00:50:00 2019-07-01 00:55:00
session2 2019-07-01 00:50:00 url_6 5 2019-07-01 00:55:00 2019-07-01 01:00:00
session2 2019-07-01 00:55:00 url_7 6 2019-07-01 01:00:00 NULL
session2 2019-07-01 01:00:00 url_2 7 1970-01-01 00:00:00 NULL
FIRST_VALUE 和 LAST_VALUE
FIRST_VALUE() 取分组内排序后,
截止到当前行
,第一个值LAST_VALUE() 取分组内排序后,
截止到当前行
,最后一个值如果不指定 ORDER BY,则默认按照记录在文件中的偏移量进行排序,会出现错误的结果
FIRST_VALUE() 和 LAST_VALUE() 均不能使用 WINDOWS 子句
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
WITH value_url AS (
SELECT 'session1' AS sessionid,'2019-07-01 00:05:00' AS ctime,'url_1' AS url UNION ALL
SELECT 'session1' AS sessionid,'2019-07-01 00:10:00' AS ctime,'url_3' AS url UNION ALL
SELECT 'session1' AS sessionid,'2019-07-01 00:15:00' AS ctime,'url_5' AS url UNION ALL
SELECT 'session1' AS sessionid,'2019-07-01 00:20:00' AS ctime,'url_7' AS url UNION ALL
SELECT 'session1' AS sessionid,'2019-07-01 00:25:00' AS ctime,'url_2' AS url UNION ALL
SELECT 'session1' AS sessionid,'2019-07-01 00:30:00' AS ctime,'url_4' AS url UNION ALL
SELECT 'session1' AS sessionid,'2019-07-01 00:35:00' AS ctime,'url_4' AS url UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01 00:40:00' AS ctime,'url_4' AS url UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01 00:45:00' AS ctime,'url_5' AS url UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01 00:50:00' AS ctime,'url_6' AS url UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01 00:55:00' AS ctime,'url_7' AS url UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01 01:00:00' AS ctime,'url_2' AS url UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01 00:05:00' AS ctime,'url_3' AS url UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01 00:10:00' AS ctime,'url_6' AS url
)
SELECT
sessionid,
ctime,
url,
ROW_NUMBER() OVER(PARTITION BY sessionid ORDER BY ctime) AS rn,
FIRST_VALUE(url) OVER(PARTITION BY sessionid ORDER BY ctime) AS first_value,
LAST_VALUE(url) OVER(PARTITION BY sessionid ORDER BY ctime) AS last_value1,
-- 如果想要取分组内排序后最后一个值,则需要变通一下
FIRST_VALUE(url) OVER(PARTITION BY sessionid ORDER BY ctime desc) AS last_value2
FROM value_url
ORDER BY sessionid,ctime
;
--------------------------------------------------------------------------
sessionid ctime url rn first_value last_value1 last_value2
session1 2019-07-01 00:05:00 url_1 1 url_1 url_1 url_4
session1 2019-07-01 00:10:00 url_3 2 url_1 url_3 url_4
session1 2019-07-01 00:15:00 url_5 3 url_1 url_5 url_4
session1 2019-07-01 00:20:00 url_7 4 url_1 url_7 url_4
session1 2019-07-01 00:25:00 url_2 5 url_1 url_2 url_4
session1 2019-07-01 00:30:00 url_4 6 url_1 url_4 url_4
session1 2019-07-01 00:35:00 url_4 7 url_1 url_4 url_4
session2 2019-07-01 00:05:00 url_3 1 url_3 url_3 url_2
session2 2019-07-01 00:10:00 url_6 2 url_3 url_6 url_2
session2 2019-07-01 00:40:00 url_4 3 url_3 url_4 url_2
session2 2019-07-01 00:45:00 url_5 4 url_3 url_5 url_2
session2 2019-07-01 00:50:00 url_6 5 url_3 url_6 url_2
session2 2019-07-01 00:55:00 url_7 6 url_3 url_7 url_2
session2 2019-07-01 01:00:00 url_2 7 url_3 url_2 url_2
GROUPING SETS
GROUPING SETS 在做多维度汇总的时候还是蛮有用的,可以在一个 GROUP BY 查询中,根据不同的维度组合进行聚合,等价于将不同维度的 GROUP BY 结果集进行 UNION ALL。
GROUPING__ID
GROUPING__ID 是用来区分结果数据是哪一个分组集合的。
使用 GROUPING SETS 时,非聚合列的值为 NULL,但是也会有聚合列本身就有 NULL 值的情况,这样会出现歧义。需要有方法来识别是非聚合列的 NULL 还是本身就会有 NULL。 GROUPING__ID 函数可以解决这个问题,此函数会返回对应于每个列是否存在的位向量。对于每一列,如果在该行中聚合了该列,则该列的位向量为 1 否则为 0,最后以十进制的形式展现出来。 值得注意的是 GROUPING 和 ID 之间是双下划线
。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
-- 设置最大分组个数
set hive.new.job.grouping.set.cardinality=128;
WITH grouping_set_pv AS (
SELECT 'session1' AS sessionid,'2019-07-01' AS ctime,1 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-02' AS ctime,3 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-03' AS ctime,5 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-06' AS ctime,4 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-07' AS ctime,4 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01' AS ctime,4 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-02' AS ctime,5 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-03' AS ctime,6 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-06' AS ctime,3 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-07' AS ctime,6 AS pv
)
SELECT
sessionid,
ctime,
SUM(pv) AS total_pv,
GROUPING__ID
FROM grouping_set_pv
GROUP BY
sessionid,
ctime
GROUPING SETS(
(),
sessionid,
ctime,
(sessionid,ctime)
)
ORDER BY GROUPING__ID
;
-- 相当于
SELECT NULL, NULL, SUM(pv) AS total_pv, 0 AS grouping__id FROM grouping_set_pv
UNION ALL
SELECT sessionid, NULL, SUM(pv) AS total_pv, 1 AS grouping__id FROM grouping_set_pv GROUP BY sessionid
UNION ALL
SELECT NULL, ctime, SUM(pv) AS total_pv, 2 AS grouping__id FROM grouping_set_pv GROUP BY ctime
UNION ALL
SELECT sessionid, ctime, SUM(pv) AS total_pv, 3 AS grouping__id FROM grouping_set_pv GROUP BY sessionid, ctime
;
--------------------------------------------------------------------------
sessionid ctime total_pv grouping__id
NULL NULL 59 0
session1 NULL 26 1
session2 NULL 33 1
NULL 2019-07-02 8 2
NULL 2019-07-01 5 2
NULL 2019-07-07 10 2
NULL 2019-07-03 11 2
NULL 2019-07-04 14 2
NULL 2019-07-05 4 2
NULL 2019-07-06 7 2
session1 2019-07-02 3 3
session1 2019-07-03 5 3
session2 2019-07-05 2 3
session2 2019-07-01 4 3
session1 2019-07-05 2 3
session2 2019-07-03 6 3
session2 2019-07-07 6 3
session1 2019-07-04 7 3
session2 2019-07-04 7 3
session2 2019-07-06 3 3
session1 2019-07-07 4 3
session2 2019-07-02 5 3
session1 2019-07-06 4 3
session1 2019-07-01 1 3
CUBE
根据 GROUP BY 的维度的所有组合进行聚合。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
WITH grouping_set_pv AS (
SELECT 'session1' AS sessionid,'2019-07-01' AS ctime,1 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-02' AS ctime,3 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-03' AS ctime,5 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-06' AS ctime,4 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-07' AS ctime,4 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01' AS ctime,4 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-02' AS ctime,5 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-03' AS ctime,6 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-06' AS ctime,3 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-07' AS ctime,6 AS pv
)
SELECT
sessionid,
ctime,
SUM(pv) AS total_pv,
GROUPING__ID
FROM grouping_set_pv
GROUP BY
sessionid,
ctime
WITH CUBE
ORDER BY GROUPING__ID
;
-- 相当于
SELECT NULL, NULL, SUM(pv) AS total_pv, 0 AS grouping__id FROM grouping_set_pv
UNION ALL
SELECT sessionid, NULL, SUM(pv) AS total_pv, 1 AS grouping__id FROM grouping_set_pv GROUP BY sessionid
UNION ALL
SELECT NULL, ctime, SUM(pv) AS total_pv, 2 AS grouping__id FROM grouping_set_pv GROUP BY ctime
UNION ALL
SELECT sessionid, ctime, SUM(pv) AS total_pv, 3 AS grouping__id FROM grouping_set_pv GROUP BY sessionid, ctime
;
--------------------------------------------------------------------------
sessionid ctime total_pv grouping__id
NULL NULL 59 0
session1 NULL 26 1
session2 NULL 33 1
NULL 2019-07-02 8 2
NULL 2019-07-01 5 2
NULL 2019-07-07 10 2
NULL 2019-07-03 11 2
NULL 2019-07-04 14 2
NULL 2019-07-05 4 2
NULL 2019-07-06 7 2
session1 2019-07-02 3 3
session1 2019-07-03 5 3
session2 2019-07-05 2 3
session2 2019-07-01 4 3
session1 2019-07-05 2 3
session2 2019-07-03 6 3
session2 2019-07-07 6 3
session1 2019-07-04 7 3
session2 2019-07-04 7 3
session2 2019-07-06 3 3
session1 2019-07-07 4 3
session2 2019-07-02 5 3
session1 2019-07-06 4 3
session1 2019-07-01 1 3
ROLLUP
是 CUBE 的子集,以最左侧的维度为主,从该维度进行层级聚合。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
WITH grouping_set_pv AS (
SELECT 'session1' AS sessionid,'2019-07-01' AS ctime,1 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-02' AS ctime,3 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-03' AS ctime,5 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-06' AS ctime,4 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-07' AS ctime,4 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01' AS ctime,4 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-02' AS ctime,5 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-03' AS ctime,6 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-06' AS ctime,3 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-07' AS ctime,6 AS pv
)
SELECT
sessionid,
ctime,
SUM(pv) AS total_pv,
GROUPING__ID
FROM grouping_set_pv
GROUP BY
sessionid,
ctime
WITH ROLLUP
ORDER BY GROUPING__ID
;
-- 相当于
SELECT NULL, NULL, SUM(pv) AS total_pv, 0 AS grouping__id FROM grouping_set_pv
UNION ALL
SELECT sessionid, NULL, SUM(pv) AS total_pv, 1 AS grouping__id FROM grouping_set_pv GROUP BY sessionid
UNION ALL
SELECT sessionid, ctime, SUM(pv) AS total_pv, 3 AS grouping__id FROM grouping_set_pv GROUP BY sessionid, ctime
;
--------------------------------------------------------------------------
sessionid ctime total_pv grouping__id
NULL NULL 59 0
session1 NULL 26 1
session2 NULL 33 1
session1 2019-07-05 2 3
session1 2019-07-02 3 3
session2 2019-07-04 7 3
session2 2019-07-05 2 3
session2 2019-07-02 5 3
session1 2019-07-04 7 3
session2 2019-07-03 6 3
session2 2019-07-01 4 3
session1 2019-07-07 4 3
session1 2019-07-03 5 3
session2 2019-07-06 3 3
session1 2019-07-06 4 3
session1 2019-07-01 1 3
session2 2019-07-07 6 3