-
[BigQuery] 사용자의 서비스 사용 속성별로 집계데이터 분석/SQL 2022. 5. 13. 20:22더보기
출처 : 데이터 분석을 위한 SQL 레시피
11장 사용자 전체의 특징과 경향 찾기
더보기DROP TABLE IF EXISTS ch4.mst_users;CREATE TABLE ch4.mst_users(user_id string(255), sex string(255), birth_date string(255), register_date string(255), register_device string(255), withdraw_date string(255));INSERT INTO ch4.mst_usersVALUES('U001', 'M', '1977-06-17', '2016-10-01', 'pc' , NULL ), ('U002', 'F', '1953-06-12', '2016-10-01', 'sp' , '2016-10-10'), ('U003', 'M', '1965-01-06', '2016-10-01', 'pc' , NULL ), ('U004', 'F', '1954-05-21', '2016-10-05', 'pc' , NULL ), ('U005', 'M', '1987-11-23', '2016-10-05', 'sp' , NULL ), ('U006', 'F', '1950-01-21', '2016-10-10', 'pc' , '2016-10-10'), ('U007', 'F', '1950-07-18', '2016-10-10', 'app', NULL ), ('U008', 'F', '2006-12-09', '2016-10-10', 'sp' , NULL ), ('U009', 'M', '2004-10-23', '2016-10-15', 'pc' , NULL ), ('U010', 'F', '1987-03-18', '2016-10-16', 'pc' , NULL );DROP TABLE IF EXISTS ch4.action_log;CREATE TABLE ch4.action_log(session string(255), user_id string(255), action string(255), category string(255), products string(255), amount integer, stamp string(255));INSERT INTO ch4.action_logVALUES('989004ea', 'U001', 'purchase', 'drama' , 'D001,D002', 2000, '2016-11-03 18:10:00'), ('989004ea', 'U001', 'view' , NULL , NULL , NULL, '2016-11-03 18:00:00'), ('989004ea', 'U001', 'favorite', 'drama' , 'D001' , NULL, '2016-11-03 18:00:00'), ('989004ea', 'U001', 'review' , 'drama' , 'D001' , NULL, '2016-11-03 18:00:00'), ('989004ea', 'U001', 'add_cart', 'drama' , 'D001' , NULL, '2016-11-03 18:00:00'), ('989004ea', 'U001', 'add_cart', 'drama' , 'D001' , NULL, '2016-11-03 18:00:00'), ('989004ea', 'U001', 'add_cart', 'drama' , 'D001' , NULL, '2016-11-03 18:00:00'), ('989004ea', 'U001', 'add_cart', 'drama' , 'D001' , NULL, '2016-11-03 18:00:00'), ('989004ea', 'U001', 'add_cart', 'drama' , 'D001' , NULL, '2016-11-03 18:00:00'), ('989004ea', 'U001', 'add_cart', 'drama' , 'D002' , NULL, '2016-11-03 18:01:00'), ('989004ea', 'U001', 'add_cart', 'drama' , 'D001,D002', NULL, '2016-11-03 18:02:00'), ('989004ea', 'U001', 'purchase', 'drama' , 'D001,D002', 2000, '2016-11-03 18:10:00'), ('47db0370', 'U002', 'add_cart', 'drama' , 'D001' , NULL, '2016-11-03 19:00:00'), ('47db0370', 'U002', 'purchase', 'drama' , 'D001' , 1000, '2016-11-03 20:00:00'), ('47db0370', 'U002', 'add_cart', 'drama' , 'D002' , NULL, '2016-11-03 20:30:00'), ('87b5725f', 'U001', 'add_cart', 'action', 'A004' , NULL, '2016-11-04 12:00:00'), ('87b5725f', 'U001', 'add_cart', 'action', 'A005' , NULL, '2016-11-04 12:00:00'), ('87b5725f', 'U001', 'add_cart', 'action', 'A006' , NULL, '2016-11-04 12:00:00'), ('9afaf87c', 'U002', 'purchase', 'drama' , 'D002' , 1000, '2016-11-04 13:00:00'), ('9afaf87c', 'U001', 'purchase', 'action', 'A005,A006', 1000, '2016-11-04 15:00:00');11-1 사용자 액션 수 집계하기
사용 함수 : COUNT, COUNT(DISTINCT), ROLLUP
-- 로그인 상태에 따라 액션 수 등을 따로 집계하는 쿼리 with -- 가상 테이블 생성 action_log_with_status as ( select session, user_id, action, case when coalesce(user_id, '') <> '' then 'login' else 'guest' end as login_status from ch4.action_log ) select -- action, login_status에 따른 사용자 수 집계 coalesce(action, 'all') as action, coalesce(login_status, 'all') as login_status, count(distinct session) as action_uu, count(1) as action_count from action_log_with_status group by rollup(action, login_status);
-- 회원 상태를 판별하는 쿼리 with action_log_with_status as ( select session, user_id, action, -- 각 세션에서 user_id가 있는지 확인하고 회원과 비회원 구별 case when coalesce(max(user_id) over(partition by session order by stamp rows between unbounded preceding and current row) , '') <> '' then 'member' else 'none' end as member_status, stamp from ch4.action_log ) select * from action_log_with_status;
11-2 연령별 구분 집계하기
사용 함수 : CASE, CAST
--연령별 구분의 사람 수를 계산하는 쿼리 with mst_users_with_int_birth_date as ( select *, 20170101 as int_specific_date, cast(replace(substring(birth_date, 1, 10), '-', '') as integer) as int_birth_date from ch4.mst_users ), mst_users_with_age as ( select *, floor((int_specific_date - int_birth_date)/10000) as age from mst_users_with_int_birth_date ) ,mst_users_with_category as ( select user_id, sex, age, concat( case when 20 <= age then sex else '' end ,case when age between 4 and 12 then 'C' when age between 13 and 19 then 'T' when age between 20 and 34 then '1' when age between 35 and 49 then '2' when age >= 50 then '3' end ) as category from mst_users_with_age ) select category, count(1) as user_count from mst_users_with_category group by category;
11-3 연령별 구분의 특징 추출하기
사용 함수 : JOIN, GROUP BY
-- 연령별 구분과 카테고리를 집계하는 쿼리 with mst_users_with_int_birth_date as ( select *, 20170101 as int_specific_date, cast(replace(substring(birth_date, 1, 10), '-', '') as integer) as int_birth_date from ch4.mst_users ), mst_users_with_age as ( select *, floor((int_specific_date - int_birth_date)/10000) as age from mst_users_with_int_birth_date ) ,mst_users_with_category as ( select user_id, sex, age, concat( case when 20 <= age then sex else '' end ,case when age between 4 and 12 then 'C' when age between 13 and 19 then 'T' when age between 20 and 34 then '1' when age between 35 and 49 then '2' when age >= 50 then '3' end ) as category from mst_users_with_age ) select p.category as product_category, u.category as user_category, count(*) as purchase_count from ch4.action_log as p join mst_users_with_category as u on p.user_id = u.user_id where action = 'purchase' group by p.category, u.category order by p.category, u.category;
11-4 사용자의 방문 빈도 집계하기
사용 함수 : SUM
==구성비와 구성비누계를 계산하는 쿼리 with action_log_with_dt as ( select *, substring(stamp, 1, 10) as dt from ch4.action_log ), action_day_count_per_user as ( select user_id, count(distinct dt) as action_day_count from action_log_with_dt where dt between '2016-11-01' and '2016-11-07' group by user_id ) select action_day_count, count(distinct user_id) as user_count, 100.0 * count(distinct user_id)/ sum(count(distinct user_id)) over() as composition_ratio, 100.0 * sum(count(distinct user_id)) over(order by action_day_count rows between unbounded preceding and current row) / sum(count(distinct user_id)) over() as cumulative_ratio from action_day_count_per_user group by action_day_count order by action_day_count;
11-5 벤 다이어그램으로 사용자 액션 집계하기
사용 함수 : SIGN, SUM, CASE
-- CUBE 구문을 사용하지 않고 표준 SQL 구문만으로 CUBE 같이 표현하는 쿼리 with user_action_flag as ( select user_id, sign(sum(case when action = 'purchase' then 1 else 0 end)) as has_purchase, sign(sum(case when action = 'review' then 1 else 0 end)) as has_review, sign(sum(case when action = 'favorite' then 1 else 0 end)) as has_favorite from `windy-elevator-348511.ch4.action_log` group by user_id ), action_venn_diagram as ( select has_purchase, has_review, has_favorite, count(1) as users from user_action_flag group by has_purchase, has_review, has_favorite union all select null as has_purchase, has_review, has_favorite, count(1) as users from user_action_flag group by has_review, has_favorite union all select has_purchase, null as has_review, has_favorite, count(1) as users from user_action_flag group by has_purchase, has_favorite union all select has_purchase, has_review, null as has_favorite, count(1) as users from user_action_flag group by has_purchase, has_review union all select null as has_purchase, null as has_review, has_favorite, count(1) as users from user_action_flag group by has_favorite union all select null as has_purchase, has_review, null as has_favorite, count(1) as users from user_action_flag group by has_review union all select has_purchase, null as has_review, null as has_favorite, count(1) as users from user_action_flag group by has_purchase union all select null as has_purchase, null as has_review, null as has_favorite, count(1) as users from user_action_flag ) select * from action_venn_diagram order by has_purchase, has_review, has_favorite;
--유사적으로 null을 포함한 레코드를 추가해서 CUBE 구문과 같은 결과를 얻는 쿼리 with user_action_flag as ( select user_id, sign(sum(case when action = 'purchase' then 1 else 0 end)) as has_purchase, sign(sum(case when action = 'review' then 1 else 0 end)) as has_review, sign(sum(case when action = 'favorite' then 1 else 0 end)) as has_favorite from `windy-elevator-348511.ch4.action_log` group by user_id ),action_venn_diagram as ( select mod_has_purchase as has_purchase, mod_has_review as has_review, mod_has_favorite as has_favorite, count(1) as users from user_action_flag cross join unnest(array[has_purchase,null]) as mod_has_purchase cross join unnest(array[has_review, null]) as mod_has_review cross join unnest(array[has_favorite, null]) as mod_has_favorite group by mod_has_purchase, mod_has_review, mod_has_favorite ) select * from action_venn_diagram order by has_purchase, has_review, has_favorite;
11-6 Decile 분석을 사용해 사용자를 10단계 그룸으로 나누기
사용 함수 : NTILE
-- 구매액이 많은 decile 순서로 구성비와 구성비누계를 계산하는 쿼리 with user_purchase_amount as ( select user_id, sum(amount) as purchase_amount from ch4.action_log where action = 'purchase' group by user_id ), users_with_decile as ( select user_id, purchase_amount, ntile(10) over (order by purchase_amount desc) as decile from user_purchase_amount ), decile_with_purchase_amount as ( select decile, sum(purchase_amount) as amount, avg(purchase_amount) as avg_amount, sum(sum(purchase_amount)) over(order by decile) as cumulative_amount, sum(sum(purchase_amount)) over() as total_amount from users_with_decile group by decile ) select decile, amount, avg_amount, 100.0 * amount / total_amount as total_ratio, 100.0 * cumulative_amount/total_amount as cumulative_ratio from decile_with_purchase_amount;
11-7 RFM 분석으로 사용자를 3가지 관점의 그룹으로 나누기
사용 함수 : CASE, generate_series
-- R과 F를 사용해 2차원 사용자 층의 사용자 수를 집계하는 쿼리 with purchase_log as( select user_id, amount, substring(stamp,1,10) as dt from ch4.action_log where action = 'purchase' ), user_rfm as ( select user_id, max(dt) as recent_date, date_diff(current_date, date(timestamp(max(dt))),day) as recency, count(dt) as frequency, sum(amount) as monetary from purchase_log group by user_id ), user_rfm_rank as ( select user_id, recent_date, recency, frequency, monetary, case when recency < 14 then 5 when recency < 28 then 4 when recency < 60 then 3 when recency < 90 then 2 else 1 end as r, case when 20 <= frequency then 5 when 10 <= frequency then 4 when 5 <= frequency then 3 when 2 <= frequency then 2 when 1 = frequency then 1 end as f , case when 300000 <= monetary then 5 when 100000 <= monetary then 4 when 30000 <= monetary then 3 when 5000 <= monetary then 2 else 1 end as m from user_rfm ) select concat('r_', CAST(r as string)) as r_rank, count(case when f = 5 then 1 end) as f_5, count(case when f = 4 then 1 end) as f_4, count(case when f = 3 then 1 end) as f_3, count(case when f = 2 then 1 end) as f_2, count(case when f = 1 then 1 end) as f_1 from user_rfm_rank group by r order by r_rank desc;
'데이터 분석 > SQL' 카테고리의 다른 글
[BigQuery] ROLLUP 없이 ROLLUP 구현하기 (0) 2022.05.11 [BigQuery] Table 생성 (0) 2022.05.07 [BigQuery] CROSS JOIN을 이용한 Table 열 압축, 정리 (0) 2022.05.05 [PostgreSQL] timestamp 다루기 (0) 2022.04.10 [PostgreSQL] CASE문 (0) 2022.04.09