Chapter 6 Adhoc solutions

6.1 DATA QA EFFORTS

6.1.1 COUNTS

SELECT
    COUNT(*) AS NUM_ROWS,
    COUNT(DISTINCT ID) AS DISTINCT_ID_COUNT,
    COUNT(DISTINCT HHID) AS DISTINCT_HHID_COUNT,
    
    COUNT_IF(ID IS NOT NULL) AS VALID_ID_COUNT,
    COUNT_IF(HHID IS NOT NULL) AS VALID_HHID_COUNT
FROM DATA_TABLE

6.2 Balancing Weights

WITH CENSUS AS (

SELECT
    AGE_BIN, INCOME_BIN, GENDER_BIN,
    SUM(PERWT) AS TARGET,
    ROW_NUMBER() OVER(ORDER BY AGE_BIN, INCOME_BIN, GENDER_BIN) AS SEGMENTS
FROM CENSUS
GROUP BY AGE_BIN, INCOME_BIN, GENDER_BIN
),
ID_TABLE AS (
SELECT 
    DISTINCT ID,
    AGE_BIN, INCOME_BIN, GENDER_BIN,
    COUNT(DISTINCT ID) OVER()::DOUBLE PRECISION AS SAMPLE_SIZE
FROM POPULATION_TABLE
WHERE AGE_BIN IS NOT NULL
  AND INCOME_BIN IS NOT NULL
  AND GENDER_BIN IS NOT NULL
),
FINAL AS (

SELECT
    ID,
    AGE_BIN, INCOME_BIN, GENDER_BIN,
    B.SEGMENTS,
    B.TARGET*(SAMPLE_SIZE/B.TOTAL_POP) AS NORMALIZED_TARGETS,
    COUNT(*) OVER(PARTITION BY B.SEGMENTS ORDER BY NULL) AS SEGMENT_COUNTS,
    NORMALIZED_TARGETS/SEGMENT_COUNTS AS WEIGHT,
    B.TARGET_COUNTS/SEGMENT_COUNTS AS POP_WEIGHT
FROM ID_TABLE A
INNER JOIN CENSUS B
  ON A.AGE_BIN = B.AGE_BIN
  AND A.INCOME_BIN = B.INCOME_BIN 
  AND C.GENDER_BIN = B.GENDER_BIN
ORDER BY SEGMENTS
)
SELECT ID, WEIGHT, POP_WEIGHT
FROM FINAL
ORDER BY ID

6.3 Weighted Sampling

6.3.1 Solution

WITH ONE AS (
  SELECT 1 AS CHILDCARE, 0.17 AS CHILD
  UNION ALL
  SELECT 0 AS CHILDCARE, 0.83 AS CHILD
),
TWO AS (
  SELECT 1 AS SENIOR_CARE, 0.3 AS SENIOR
  UNION ALL
  SELECT 0 AS SENIOR_CARE, 0.7 AS SENIOR
),
THREE AS (
  SELECT 1 AS TUTORING, 0.22 AS TUTOR
  UNION ALL
  SELECT 0 AS TUTORING, 0.78 AS TUTOR
),
TARGET_COUNTS AS (
  SELECT 
      CHILDCARE,
      SENIOR_CARE,
      TUTORING,
      ROW_NUMBER() OVER (ORDER BY CHILDCARE, SENIOR_CARE, TUTORING) AS SEGMENTS,
      ROUND(CHILD*SENIOR*TUTOR*6000000) AS TARGET_COUNT
  FROM ONE, TWO, THREE
)
SELECT
    A.*,
    COUNT(*) OVER(PARTITION BY B.SEGMENTS ORDER BY NULL) AS ID_COUNTS,
    (B.TARGET_COUNT/BLU_COUNTS) AS WEIGHT
FROM TABLE_AUDIENCE A
INNER JOIN TARGET_COUNTS B
  ON A.CHILD_CARE=B.CHILD_CARE
  AND A.SENIOR_CARE=B.SENIOR_CARE
  AND A.TUTORING=B.TUTORING
ORDER BY SEGMENTS

6.4 Demographic Distribution

Assuming TABLE_POPULATION is distinct on ID column.

SELECT
    AGE_CATEGORY, 
    COUNT(ID) AS RAW_COUNT,
    SUM(COUNT(ID)) OVER() AS TOTAL_COUNTS,
    COUNT(ID) / SUM(COUNT(ID)) OVER() AS PCT_DIST,
    
    SUM(WEIGHT) AS WEIGHTED_COUNT,
    SUM(SUM(WEIGHT)) OVER() AS WEIGHTED_TOTAL_COUNTS,
    SUM(WEIGHT) / SUM(SUM(WEIGHT)) OVER() AS WEIGHTED_PCT_DIST
FROM TABLE_POPULATION
GROUP BY AGE_CATEGORY

6.5 Risky Projects

Identify projects that are at risk for going overbudget. A project is considered to be overbudget if the cost of all employees assigned to the project is greater than the budget of the project.

You’ll need to prorate the cost of the employees to the duration of the project. For example, if the budget for a project that takes half a year to complete is $10K, then the total half-year salary of all employees assigned to the project should not exceed $10K. Salary is defined on a yearly basis, so be careful how to calculate salaries for the projects that last less or more than one year.

Output a list of projects that are overbudget with their project name, project budget, and prorated total employee expense (rounded to the next dollar amount).

HINT: to make it simpler, consider that all years have 365 days. You don’t need to think about the leap years.

Tables: linkedin_projects, linkedin_emp_projects, linkedin_employees

6.5.1 tables

table: linkedin_projects

id–int,
title–varchar
budget–int
start_date–datetime
end_date–datetime

table: linkedin_emp_projects

emp_id–int
project_id–

table: linkedin_employees

id–int,
first_name   last_name   salary–int  

6.5.2 SOLUTION 1

DURATION YEAR 1 IS CALCULATED SEPARATELY WILL CAUSE PROBLEMS, BECAUSE IT IS NOT COSTLY TO KEEP IT ACCURATE TO DECIMAL POINTS

WITH 
PROJECT_DURATION AS (
    SELECT DISTINCT id, DATEDIFF(end_date, start_date)/365.0 AS DURATION_YR
    FROM linkedin_projects
    ORDER BY DURATION_YR DESC
),
COST AS (
    SELECT A.project_id, SUM(B.SALARY * C.DURATION_YR) AS PROJECT_COST
    FROM linkedin_emp_projects A
    INNER JOIN linkedin_employees B ON A.EMP_ID = B.ID
    INNER JOIN PROJECT_DURATION C ON A.project_id = C.ID
    GROUP BY A.project_id
)
SELECT A.TITLE, A.BUDGET, CEILING(B.PROJECT_COST) AS PROJECT_COST
FROM linkedin_projects A
INNER JOIN COST B ON A.ID = B.project_id
WHERE A.BUDGET < CEILING(B.PROJECT_COST)
ORDER BY TITLE ASC;

6.5.3 SOULTION 2

BETTER SOLUTION

SELECT 
    A.TITLE, 
    A.BUDGET, 
    CEILING(SUM(B.SALARY) * DATEDIFF(A.end_date, A.start_date) / 365.0) AS PROJECT_COST
FROM linkedin_projects A
LEFT JOIN linkedin_emp_projects C    ON A.ID = C.project_id
LEFT JOIN linkedin_employees B       ON C.EMP_ID = B.ID
GROUP BY A.ID
HAVING  A.BUDGET < PROJECT_COST;

6.6 How many users have applied to the same companies they have applied before the past year?


WITH UserApplications AS (
    SELECT 
        user_id,
        company_id,
        application_date,
        ROW_NUMBER() OVER (PARTITION BY user_id, company_id ORDER BY application_date) AS application_order
    FROM applications
    WHERE application_date >= DATE_SUB(CURDATE(), INTERVAL 1 YEAR)
)

SELECT 
    user_id,
    COUNT(DISTINCT company_id) AS companies_applied_again
FROM UserApplications
WHERE application_order > 1
GROUP BYuser_id
HAVING COUNT(DISTINCT company_id) > 0;