ABOUT ME

-

Today
-
Yesterday
-
Total
-
  • [BigQuery] 사용자의 서비스 사용 속성별로 집계
    데이터 분석/SQL 2022. 5. 13. 20:22
    더보기

    출처 : 데이터 분석을 위한 SQL 레시피

    11장 사용자 전체의 특징과 경향 찾기

    더보기
    DROP TABLE IF EXISTS ch4.mst_users;
    CREATE TABLE ch4.mst_users(
        user_id         string(255)
      , sex             string(255)
      , birth_date      string(255)
      , register_date   string(255)
      , register_device string(255)
      , withdraw_date   string(255)
    );

     

    INSERT INTO ch4.mst_users
    VALUES
        ('U001''M''1977-06-17''2016-10-01''pc' , NULL        )
      , ('U002''F''1953-06-12''2016-10-01''sp' , '2016-10-10')
      , ('U003''M''1965-01-06''2016-10-01''pc' , NULL        )
      , ('U004''F''1954-05-21''2016-10-05''pc' , NULL        )
      , ('U005''M''1987-11-23''2016-10-05''sp' , NULL        )
      , ('U006''F''1950-01-21''2016-10-10''pc' , '2016-10-10')
      , ('U007''F''1950-07-18''2016-10-10''app'NULL        )
      , ('U008''F''2006-12-09''2016-10-10''sp' , NULL        )
      , ('U009''M''2004-10-23''2016-10-15''pc' , NULL        )
      , ('U010''F''1987-03-18''2016-10-16''pc' , NULL        )
    ;

     

    DROP TABLE IF EXISTS ch4.action_log;
    CREATE TABLE ch4.action_log(
        session  string(255)
      , user_id  string(255)
      , action   string(255)
      , category string(255)
      , products string(255)
      , amount   integer
      , stamp    string(255)
    );

     

    INSERT INTO ch4.action_log
    VALUES
        ('989004ea''U001''purchase''drama' , 'D001,D002'2000'2016-11-03 18:10:00')
      , ('989004ea''U001''view'    , NULL    , NULL       , NULL'2016-11-03 18:00:00')
      , ('989004ea''U001''favorite''drama' , 'D001'     , NULL'2016-11-03 18:00:00')
      , ('989004ea''U001''review'  , 'drama' , 'D001'     , NULL'2016-11-03 18:00:00')
      , ('989004ea''U001''add_cart''drama' , 'D001'     , NULL'2016-11-03 18:00:00')
      , ('989004ea''U001''add_cart''drama' , 'D001'     , NULL'2016-11-03 18:00:00')
      , ('989004ea''U001''add_cart''drama' , 'D001'     , NULL'2016-11-03 18:00:00')
      , ('989004ea''U001''add_cart''drama' , 'D001'     , NULL'2016-11-03 18:00:00')
      , ('989004ea''U001''add_cart''drama' , 'D001'     , NULL'2016-11-03 18:00:00')
      , ('989004ea''U001''add_cart''drama' , 'D002'     , NULL'2016-11-03 18:01:00')
      , ('989004ea''U001''add_cart''drama' , 'D001,D002'NULL'2016-11-03 18:02:00')
      , ('989004ea''U001''purchase''drama' , 'D001,D002'2000'2016-11-03 18:10:00')
      , ('47db0370''U002''add_cart''drama' , 'D001'     , NULL'2016-11-03 19:00:00')
      , ('47db0370''U002''purchase''drama' , 'D001'     , 1000'2016-11-03 20:00:00')
      , ('47db0370''U002''add_cart''drama' , 'D002'     , NULL'2016-11-03 20:30:00')
      , ('87b5725f''U001''add_cart''action''A004'     , NULL'2016-11-04 12:00:00')
      , ('87b5725f''U001''add_cart''action''A005'     , NULL'2016-11-04 12:00:00')
      , ('87b5725f''U001''add_cart''action''A006'     , NULL'2016-11-04 12:00:00')
      , ('9afaf87c''U002''purchase''drama' , 'D002'     , 1000'2016-11-04 13:00:00')
      , ('9afaf87c''U001''purchase''action''A005,A006'1000'2016-11-04 15:00:00')
    ;

    11-1 사용자 액션 수 집계하기

    사용 함수 : COUNT, COUNT(DISTINCT), ROLLUP

    -- 로그인 상태에 따라 액션 수 등을 따로 집계하는 쿼리
    with -- 가상 테이블 생성
    action_log_with_status as (
        select
      session,
      user_id,
      action,
      case when coalesce(user_id, '') <> '' then 'login' else 'guest' end
      as login_status
      from ch4.action_log
    )
    select -- action, login_status에 따른 사용자 수 집계
    coalesce(action, 'all') as action,
    coalesce(login_status, 'all') as login_status,
    count(distinct session) as action_uu,
    count(1) as action_count
    from action_log_with_status
    group by rollup(action, login_status);
    -- 회원 상태를 판별하는 쿼리
    with
    action_log_with_status as (
      select
      session,
      user_id,
      action,
      -- 각 세션에서 user_id가 있는지 확인하고 회원과 비회원 구별
      case
      when coalesce(max(user_id)
      over(partition by session order by stamp rows between unbounded preceding and current row)
      , '') <> ''
      then 'member'
      else 'none'
      end as member_status,
      stamp
      from ch4.action_log
    )
    select *
    from action_log_with_status;

    11-2 연령별 구분 집계하기

    사용 함수 : CASE, CAST

    --연령별 구분의 사람 수를 계산하는 쿼리
    with
    mst_users_with_int_birth_date as (
        select *,
      20170101 as int_specific_date,
      cast(replace(substring(birth_date, 1, 10), '-', '') as integer) as int_birth_date
      from ch4.mst_users
    ),
    mst_users_with_age as (
        select
      *,
      floor((int_specific_date - int_birth_date)/10000) as age
      from mst_users_with_int_birth_date
    )
    ,mst_users_with_category as (
      select
      user_id,
      sex,
      age,
      concat(
        case
        when 20 <= age then sex
        else ''
        end
        ,case
        when age between 4 and 12 then 'C'
        when age between 13 and 19 then 'T'
        when age between 20 and 34 then '1'
        when age between 35 and 49 then '2'
        when age >= 50 then '3'
        end
      ) as category
      from mst_users_with_age
    )
    select
    category,
    count(1) as user_count
    from mst_users_with_category
    group by category;

    11-3 연령별 구분의 특징 추출하기

    사용 함수 : JOIN, GROUP BY

    -- 연령별 구분과 카테고리를 집계하는 쿼리
    with
    mst_users_with_int_birth_date as (
        select *,
      20170101 as int_specific_date,
      cast(replace(substring(birth_date, 1, 10), '-', '') as integer) as int_birth_date
      from ch4.mst_users
    ),
    mst_users_with_age as (
        select
      *,
      floor((int_specific_date - int_birth_date)/10000) as age
      from mst_users_with_int_birth_date
    )
    ,mst_users_with_category as (
      select
      user_id,
      sex,
      age,
      concat(
        case
        when 20 <= age then sex
        else ''
        end
        ,case
        when age between 4 and 12 then 'C'
        when age between 13 and 19 then 'T'
        when age between 20 and 34 then '1'
        when age between 35 and 49 then '2'
        when age >= 50 then '3'
        end
      ) as category
      from mst_users_with_age
    )
    select
    p.category as product_category,
    u.category as user_category,
    count(*) as purchase_count
    from ch4.action_log as p
    join
    mst_users_with_category as u
    on p.user_id = u.user_id
    where
    action = 'purchase'
    group by p.category, u.category
    order by p.category, u.category;

    11-4 사용자의 방문 빈도 집계하기

    사용 함수 : SUM

    ==구성비와 구성비누계를 계산하는 쿼리
    with
    action_log_with_dt as (
      select *,
      substring(stamp, 1, 10) as dt
      from ch4.action_log
    ),
    action_day_count_per_user as (
      select
      user_id,
      count(distinct dt) as action_day_count
      from action_log_with_dt
      where dt between '2016-11-01' and '2016-11-07'
      group by user_id
    )
    select
    action_day_count,
    count(distinct user_id) as user_count,
    100.0 * count(distinct user_id)/ sum(count(distinct user_id)) over() as composition_ratio,
    100.0 * sum(count(distinct user_id)) over(order by action_day_count rows between unbounded preceding and current row)
    / sum(count(distinct user_id)) over() as cumulative_ratio
    from action_day_count_per_user
    group by action_day_count
    order by action_day_count;

    11-5 벤 다이어그램으로 사용자 액션 집계하기

    사용 함수 : SIGN, SUM, CASE

    -- CUBE 구문을 사용하지 않고 표준 SQL 구문만으로 CUBE 같이 표현하는 쿼리
    with
    user_action_flag as (
      select
      user_id,
      sign(sum(case when action = 'purchase' then 1 else 0 end)) as has_purchase,
      sign(sum(case when action = 'review' then 1 else 0 end)) as has_review,
      sign(sum(case when action = 'favorite' then 1 else 0 end)) as has_favorite
      from `windy-elevator-348511.ch4.action_log`
      group by user_id
    ),
    action_venn_diagram as (
      select has_purchase, has_review, has_favorite, count(1) as users
      from user_action_flag
      group by has_purchase, has_review, has_favorite
      union all 
      select null as has_purchase, has_review, has_favorite, count(1) as users
      from user_action_flag
      group by has_review, has_favorite
      union all
      select has_purchase, null as has_review, has_favorite, count(1) as users
      from user_action_flag
      group by has_purchase, has_favorite
      union all
      select has_purchase, has_review, null as has_favorite, count(1) as users
      from user_action_flag
      group by has_purchase, has_review
      union all
      select null as has_purchase, null as has_review, has_favorite, count(1) as users
      from user_action_flag
      group by has_favorite
      union all
      select null as has_purchase, has_review, null as has_favorite, count(1) as users
      from user_action_flag
      group by has_review
      union all
      select has_purchase, null as has_review, null as has_favorite, count(1) as users
      from user_action_flag
      group by has_purchase
      union all
      select
      null as has_purchase, null as has_review, null as has_favorite, count(1) as users
      from user_action_flag
    )
    
    select *
    from action_venn_diagram
    order by has_purchase, has_review, has_favorite;
    --유사적으로 null을 포함한 레코드를 추가해서 CUBE 구문과 같은 결과를 얻는 쿼리
    with
    user_action_flag as (
      select
      user_id,
      sign(sum(case when action = 'purchase' then 1 else 0 end)) as has_purchase,
      sign(sum(case when action = 'review' then 1 else 0 end)) as has_review,
      sign(sum(case when action = 'favorite' then 1 else 0 end)) as has_favorite
      from `windy-elevator-348511.ch4.action_log`
      group by user_id
    ),action_venn_diagram as (
      select
      mod_has_purchase as has_purchase,
      mod_has_review as has_review,
      mod_has_favorite as has_favorite,
      count(1) as users
      from user_action_flag
      cross join unnest(array[has_purchase,null]) as mod_has_purchase
      cross join unnest(array[has_review, null]) as mod_has_review
      cross join unnest(array[has_favorite, null]) as mod_has_favorite
      group by mod_has_purchase, mod_has_review, mod_has_favorite
    )
    select *
    from action_venn_diagram
    order by has_purchase, has_review, has_favorite;

     

    11-6 Decile 분석을 사용해 사용자를 10단계 그룸으로 나누기

    사용 함수 : NTILE 

    -- 구매액이 많은 decile 순서로 구성비와 구성비누계를 계산하는 쿼리
    with
    user_purchase_amount as (
      select
      user_id,
      sum(amount) as purchase_amount
      from
      ch4.action_log
      where action = 'purchase'
      group by user_id
    ),
    users_with_decile as (
      select
      user_id,
      purchase_amount,
      ntile(10) over (order by purchase_amount desc) as decile
      from user_purchase_amount
    ),
    decile_with_purchase_amount as (
      select
      decile,
      sum(purchase_amount) as amount,
      avg(purchase_amount) as avg_amount,
      sum(sum(purchase_amount)) over(order by decile) as cumulative_amount,
      sum(sum(purchase_amount)) over() as total_amount
      from users_with_decile
      group by decile
    )
    select
    decile,
    amount,
    avg_amount,
    100.0 * amount / total_amount as total_ratio,
    100.0 * cumulative_amount/total_amount as cumulative_ratio
    from decile_with_purchase_amount;

    11-7 RFM 분석으로 사용자를 3가지 관점의 그룹으로 나누기

    사용 함수 : CASE, generate_series 

     

    -- R과 F를 사용해 2차원 사용자 층의 사용자 수를 집계하는 쿼리
    with
    purchase_log as(
      select
      user_id,
      amount,
      substring(stamp,1,10) as dt
      from ch4.action_log
      where action = 'purchase'
    ),
    user_rfm as (
      select
      user_id,
      max(dt) as recent_date,
      date_diff(current_date, date(timestamp(max(dt))),day) as recency,
      count(dt) as frequency,
      sum(amount) as monetary
      from purchase_log
      group by user_id
    ),
    user_rfm_rank as (
      select
      user_id,
      recent_date,
      recency,
      frequency,
      monetary,
      case
      when recency < 14 then 5
      when recency < 28 then 4
      when recency < 60 then 3
      when recency < 90 then 2
      else 1
      end as r,
      case
      when 20 <= frequency then 5
      when 10 <= frequency then 4
      when 5 <= frequency then 3
      when 2 <= frequency then 2
      when 1 = frequency then 1
      end as f
      ,
      case
      when 300000 <= monetary then 5
      when 100000 <= monetary then 4
      when 30000 <= monetary then 3
      when 5000 <= monetary then 2
      else 1
      end as m
      from user_rfm
    )
    select
    concat('r_', CAST(r as string)) as r_rank,
    count(case when f = 5 then 1 end) as f_5,
    count(case when f = 4 then 1 end) as f_4,
    count(case when f = 3 then 1 end) as f_3,
    count(case when f = 2 then 1 end) as f_2,
    count(case when f = 1 then 1 end) as f_1
    from user_rfm_rank
    group by r
    order by r_rank desc;

    댓글

Designed by Tistory.