Hive 中提供了越来越多的分析函数,用于完成负责的统计分析。利用这些分析函数与窗口函数结合,可以快速、方便的做一些聚合、统计等运算。
SUM
需要注意的是:结果和 ORDER BY 有关,默认为升序
如果不指定 ROWS BETWEEN,默认为从起点到当前行
如果不指定 ORDER BY,则将分组内所有值累加
关键是理解 ROWS BETWEEN 含义,也叫做 WINDOW 子句:
PRECEDING:往前
FOLLOWING:往后
CURRENT ROW:当前行
UNBOUNDED:起点
- UNBOUNDED PRECEDING 表示从前面的起点
- UNBOUNDED FOLLOWING:表示到后面的终点
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
WITH sum_pv AS (
SELECT 'session1' AS sessionid,'2019-07-01' AS ctime,1 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-02' AS ctime,3 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-03' AS ctime,5 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-06' AS ctime,4 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-07' AS ctime,4 AS pv
)
SELECT
sessionid,
ctime,
pv,
-- 分组内所有行
SUM(pv) OVER(PARTITION BY sessionid) AS pv1,
-- 从起点到当前行(默认)
SUM(pv) OVER(PARTITION BY sessionid ORDER BY ctime) AS pv2,
-- 从起点到当前行,结果同 pv2
SUM(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pv3,
-- 当前行 + 往前3行
SUM(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS pv4,
-- 当前行 + 往前3行 + 往后1行
SUM(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING) AS pv5,
-- 当前行 + 往后所有行
SUM(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS pv6
FROM sum_pv
ORDER BY ctime
;
--------------------------------------------------------------------------
sessionid ctime pv pv1 pv2 pv3 pv4 pv5 pv6
session1 2019-07-01 1 26 1 1 1 4 26
session1 2019-07-02 3 26 4 4 4 9 25
session1 2019-07-03 5 26 9 9 9 16 22
session1 2019-07-04 7 26 16 16 16 18 17
session1 2019-07-05 2 26 18 18 17 21 10
session1 2019-07-06 4 26 22 22 18 22 8
session1 2019-07-07 4 26 26 26 17 17 4
pv1: 分组内(session1)所有的 pv 累加。
pv2: 分组内从起点到当前行的 pv 累加。例如:2号的pv2 = 1号的pv + 2号的pv。
pv3: 同pv2。
pv4: 分组内当前行 + 往前3行。例如:2号 = 1号 + 2号, 5号 = 2号 + 3号 + 4号 + 5号。
pv5: 分组内当前行 + 往前3行 + 往后1行。例如:5号 = 2号 + 3号 + 4号 + 5号 + 6号
pv6: 分组内当前行 + 往后所有行。例如:5号 = 5号 + 6号 + 7号
AVG
用法和 sum 相同
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
WITH avg_pv AS (
SELECT 'session1' AS sessionid,'2019-07-01' AS ctime,1 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-02' AS ctime,3 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-03' AS ctime,5 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-06' AS ctime,4 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-07' AS ctime,4 AS pv
)
SELECT
sessionid,
ctime,
pv,
-- 分组内所有行
AVG(pv) OVER(PARTITION BY sessionid) AS pv1,
-- 从起点到当前行(默认)
AVG(pv) OVER(PARTITION BY sessionid ORDER BY ctime) AS pv2,
-- 从起点到当前行,结果同 pv2
AVG(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pv3,
-- 当前行 + 往前3行
AVG(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS pv4,
-- 当前行 + 往前3行 + 往后1行
AVG(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING) AS pv5,
-- 当前行 + 往后所有行
AVG(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS pv6
FROM avg_pv
ORDER BY ctime
;
--------------------------------------------------------------------------
-- 由于结果小数比较多,所以手动保留了两位(四舍五入)
sessionid ctime pv pv1 pv2 pv3 pv4 pv5 pv6
session1 2019-07-01 1 3.71 1.0 1.0 1.0 2.0 3.71
session1 2019-07-02 3 3.71 2.0 2.0 2.0 3.0 4.17
session1 2019-07-03 5 3.71 3.0 3.0 3.0 4.0 4.4
session1 2019-07-04 7 3.71 4.0 4.0 4.0 3.6 4.25
session1 2019-07-05 2 3.71 3.6 3.6 4.25 4.2 3.33
session1 2019-07-06 4 3.71 3.67 3.67 4.5 4.4 4.0
session1 2019-07-07 4 3.71 3.71 3.71 4.25 4.25 4.0
MIN
用法和 sum 相同
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
WITH min_pv AS (
SELECT 'session1' AS sessionid,'2019-07-01' AS ctime,1 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-02' AS ctime,3 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-03' AS ctime,5 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-06' AS ctime,4 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-07' AS ctime,4 AS pv
)
SELECT
sessionid,
ctime,
pv,
-- 分组内所有行
MIN(pv) OVER(PARTITION BY sessionid) AS pv1,
-- 从起点到当前行(默认)
MIN(pv) OVER(PARTITION BY sessionid ORDER BY ctime) AS pv2,
-- 从起点到当前行,结果同 pv2
MIN(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pv3,
-- 当前行 + 往前3行
MIN(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS pv4,
-- 当前行 + 往前3行 + 往后1行
MIN(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING) AS pv5,
-- 当前行 + 往后所有行
MIN(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS pv6
FROM min_pv
ORDER BY ctime
;
--------------------------------------------------------------------------
sessionid ctime pv pv1 pv2 pv3 pv4 pv5 pv6
session1 2019-07-01 1 1 1 1 1 1 1
session1 2019-07-02 3 1 1 1 1 1 2
session1 2019-07-03 5 1 1 1 1 1 2
session1 2019-07-04 7 1 1 1 1 1 2
session1 2019-07-05 2 1 1 1 2 2 2
session1 2019-07-06 4 1 1 1 2 2 4
session1 2019-07-07 4 1 1 1 2 2 4
MAX
用法和 sum 相同
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
WITH max_pv AS (
SELECT 'session1' AS sessionid,'2019-07-01' AS ctime,1 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-02' AS ctime,3 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-03' AS ctime,5 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-06' AS ctime,4 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-07' AS ctime,4 AS pv
)
SELECT
sessionid,
ctime,
pv,
-- 分组内所有行
MAX(pv) OVER(PARTITION BY sessionid) AS pv1,
-- 从起点到当前行(默认)
MAX(pv) OVER(PARTITION BY sessionid ORDER BY ctime) AS pv2,
-- 从起点到当前行,结果同 pv2
MAX(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pv3,
-- 当前行 + 往前3行
MAX(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS pv4,
-- 当前行 + 往前3行 + 往后1行
MAX(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING) AS pv5,
-- 当前行 + 往后所有行
MAX(pv) OVER(PARTITION BY sessionid ORDER BY ctime ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS pv6
FROM max_pv
ORDER BY ctime
;
--------------------------------------------------------------------------
sessionid ctime pv pv1 pv2 pv3 pv4 pv5 pv6
session1 2019-07-01 1 7 1 1 1 3 7
session1 2019-07-02 3 7 3 3 3 5 7
session1 2019-07-03 5 7 5 5 5 7 7
session1 2019-07-04 7 7 7 7 7 7 7
session1 2019-07-05 2 7 7 7 7 7 4
session1 2019-07-06 4 7 7 7 7 7 4
session1 2019-07-07 4 7 7 7 7 7 4
NTILE
NTILE(n),用于将分组数据按照顺序切分成n片,返回当前切片值
如果切片不均匀,默认会对第一个分组、第二个分组、第三个分组等依次加一
NTILE() 不支持 WINDOW 子句
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
WITH ntile_pv AS (
SELECT 'session1' AS sessionid,'2019-07-01' AS ctime,1 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-02' AS ctime,3 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-03' AS ctime,5 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-06' AS ctime,4 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-07' AS ctime,4 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01' AS ctime,4 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-02' AS ctime,5 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-03' AS ctime,6 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-06' AS ctime,3 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-07' AS ctime,6 AS pv
)
SELECT
sessionid,
ctime,
pv,
NTILE(2) OVER(PARTITION BY sessionid ORDER BY ctime) AS rn1,
NTILE(3) OVER(PARTITION BY sessionid ORDER BY ctime) AS rn2,
NTILE(4) OVER(PARTITION BY sessionid ORDER BY ctime) AS rn3,
NTILE(4) OVER(ORDER BY ctime) AS rn4
FROM ntile_pv
ORDER BY sessionid,ctime
;
--------------------------------------------------------------------------
sessionid ctime pv rn1 rn2 rn3 rn4
session1 2019-07-01 1 1 1 1 1
session1 2019-07-02 3 1 1 1 1
session1 2019-07-03 5 1 1 2 2
session1 2019-07-04 7 1 2 2 2
session1 2019-07-05 2 2 2 3 3
session1 2019-07-06 4 2 3 3 4
session1 2019-07-07 4 2 3 4 4
session2 2019-07-01 4 1 1 1 1
session2 2019-07-02 5 1 1 1 1
session2 2019-07-03 6 1 1 2 2
session2 2019-07-04 7 1 2 2 2
session2 2019-07-05 2 2 2 3 3
session2 2019-07-06 3 2 3 3 3
session2 2019-07-07 6 2 3 4 4
例如:统计一个session,pv 数最多的前 1/3 的天
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
WITH ntile_pv AS (
SELECT 'session1' AS sessionid,'2019-07-01' AS ctime,1 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-02' AS ctime,3 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-03' AS ctime,5 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-06' AS ctime,4 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-07' AS ctime,4 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01' AS ctime,4 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-02' AS ctime,5 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-03' AS ctime,6 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-06' AS ctime,3 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-07' AS ctime,6 AS pv
)
SELECT
sessionid,
ctime,
pv,
NTILE(3) OVER(PARTITION BY sessionid ORDER BY pv DESC) AS rn
FROM ntile_pv
ORDER BY sessionid,ctime
;
--------------------------------------------------------------------------
-- rn = 1 的记录,就是我们想要的结果
sessionid ctime pv rn
session1 2019-07-01 1 3
session1 2019-07-02 3 2
session1 2019-07-03 5 1
session1 2019-07-04 7 1
session1 2019-07-05 2 3
session1 2019-07-06 4 2
session1 2019-07-07 4 1
session2 2019-07-01 4 2
session2 2019-07-02 5 2
session2 2019-07-03 6 1
session2 2019-07-04 7 1
session2 2019-07-05 2 3
session2 2019-07-06 3 3
session2 2019-07-07 6 1
ROW_NUMBER
ROW_NUMBER() 从1开始,按照顺序,生成分组内记录的序列
ROW_NUMBER() 应用场景很多,例如:按照pv降序排列,生成分组内每天的pv名次;获取分组内排序第一的记录;获取一个session中的第一条refer等
ROW_NUMBER() 不支持 WINDOW 子句
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
WITH row_number_pv AS (
SELECT 'session1' AS sessionid,'2019-07-01' AS ctime,1 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-02' AS ctime,3 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-03' AS ctime,5 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-06' AS ctime,4 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-07' AS ctime,4 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01' AS ctime,4 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-02' AS ctime,5 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-03' AS ctime,6 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-06' AS ctime,3 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-07' AS ctime,6 AS pv
)
SELECT
sessionid,
ctime,
pv,
ROW_NUMBER() OVER(PARTITION BY sessionid ORDER BY pv desc) AS rn1,
ROW_NUMBER() OVER(ORDER BY pv desc) AS rn2
FROM row_number_pv
ORDER BY sessionid,ctime
;
--------------------------------------------------------------------------
sessionid ctime pv rn1 rn2
session1 2019-07-01 1 7 14
session1 2019-07-02 3 5 11
session1 2019-07-03 5 2 5
session1 2019-07-04 7 1 2
session1 2019-07-05 2 6 13
session1 2019-07-06 4 4 9
session1 2019-07-07 4 3 7
session2 2019-07-01 4 5 8
session2 2019-07-02 5 4 6
session2 2019-07-03 6 2 4
session2 2019-07-04 7 1 1
session2 2019-07-05 2 7 12
session2 2019-07-06 3 6 10
session2 2019-07-07 6 3 3
RANK 和 DENSE_RANK
RANK() 生成数据项在分组中的排名,排名相等会在名次中留下空位
DENSE_RANK() 生成数据项在分组中的排名,排名相等会在名次中不会留下空位
RANK() 和 DENSE_RANK() 均不支持 WINDOW 子句
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
WITH rank_pv AS (
SELECT 'session1' AS sessionid,'2019-07-01' AS ctime,1 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-02' AS ctime,3 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-03' AS ctime,5 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-06' AS ctime,4 AS pv UNION ALL
SELECT 'session1' AS sessionid,'2019-07-07' AS ctime,4 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-01' AS ctime,4 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-02' AS ctime,5 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-03' AS ctime,6 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-04' AS ctime,7 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-05' AS ctime,2 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-06' AS ctime,3 AS pv UNION ALL
SELECT 'session2' AS sessionid,'2019-07-07' AS ctime,6 AS pv
)
SELECT
sessionid,
ctime,
pv,
RANK() OVER(PARTITION BY sessionid ORDER BY pv desc) AS rn1,
DENSE_RANK() OVER(PARTITION BY sessionid ORDER BY pv desc) AS rn2,
ROW_NUMBER() OVER(PARTITION BY sessionid ORDER BY pv DESC) AS rn3
FROM rank_pv
WHERE sessionid = 'session1'
ORDER BY sessionid,ctime
;
--------------------------------------------------------------------------
sessionid ctime pv rn1 rn2 rn3
session1 2019-07-01 1 7 6 7
session1 2019-07-02 3 5 4 5
session1 2019-07-03 5 2 2 2
session1 2019-07-04 7 1 1 1
session1 2019-07-05 2 6 5 6
session1 2019-07-06 4 3 3 4
session1 2019-07-07 4 3 3 3
rn1: 6号和7号并列第3, 2号排第5
rn2: 6号和7号并列第3, 2号排第4
rn3: 如果两个值相等,也会生成唯一的次序,具体是按照什么规则把某一个值排在前面还没有深究