Chapter 6 Adhoc solutions
6.2 Balancing Weights
WITH CENSUS AS (
SELECT
AGE_BIN, INCOME_BIN, GENDER_BIN,
SUM(PERWT) AS TARGET,
ROW_NUMBER() OVER(ORDER BY AGE_BIN, INCOME_BIN, GENDER_BIN) AS SEGMENTS
FROM CENSUS
GROUP BY AGE_BIN, INCOME_BIN, GENDER_BIN
),
ID_TABLE AS (
SELECT
DISTINCT ID,
AGE_BIN, INCOME_BIN, GENDER_BIN,
COUNT(DISTINCT ID) OVER()::DOUBLE PRECISION AS SAMPLE_SIZE
FROM POPULATION_TABLE
WHERE AGE_BIN IS NOT NULL
AND INCOME_BIN IS NOT NULL
AND GENDER_BIN IS NOT NULL
),
FINAL AS (
SELECT
ID,
AGE_BIN, INCOME_BIN, GENDER_BIN,
B.SEGMENTS,
B.TARGET*(SAMPLE_SIZE/B.TOTAL_POP) AS NORMALIZED_TARGETS,
COUNT(*) OVER(PARTITION BY B.SEGMENTS ORDER BY NULL) AS SEGMENT_COUNTS,
NORMALIZED_TARGETS/SEGMENT_COUNTS AS WEIGHT,
B.TARGET_COUNTS/SEGMENT_COUNTS AS POP_WEIGHT
FROM ID_TABLE A
INNER JOIN CENSUS B
ON A.AGE_BIN = B.AGE_BIN
AND A.INCOME_BIN = B.INCOME_BIN
AND C.GENDER_BIN = B.GENDER_BIN
ORDER BY SEGMENTS
)
SELECT ID, WEIGHT, POP_WEIGHT
FROM FINAL
ORDER BY ID
6.3 Weighted Sampling
6.3.1 Solution
WITH ONE AS (
SELECT 1 AS CHILDCARE, 0.17 AS CHILD
UNION ALL
SELECT 0 AS CHILDCARE, 0.83 AS CHILD
),
TWO AS (
SELECT 1 AS SENIOR_CARE, 0.3 AS SENIOR
UNION ALL
SELECT 0 AS SENIOR_CARE, 0.7 AS SENIOR
),
THREE AS (
SELECT 1 AS TUTORING, 0.22 AS TUTOR
UNION ALL
SELECT 0 AS TUTORING, 0.78 AS TUTOR
),
TARGET_COUNTS AS (
SELECT
CHILDCARE,
SENIOR_CARE,
TUTORING,
ROW_NUMBER() OVER (ORDER BY CHILDCARE, SENIOR_CARE, TUTORING) AS SEGMENTS,
ROUND(CHILD*SENIOR*TUTOR*6000000) AS TARGET_COUNT
FROM ONE, TWO, THREE
)
SELECT
A.*,
COUNT(*) OVER(PARTITION BY B.SEGMENTS ORDER BY NULL) AS ID_COUNTS,
(B.TARGET_COUNT/BLU_COUNTS) AS WEIGHT
FROM TABLE_AUDIENCE A
INNER JOIN TARGET_COUNTS B
ON A.CHILD_CARE=B.CHILD_CARE
AND A.SENIOR_CARE=B.SENIOR_CARE
AND A.TUTORING=B.TUTORING
ORDER BY SEGMENTS
6.4 Demographic Distribution
Assuming TABLE_POPULATION is distinct on ID column.
SELECT
AGE_CATEGORY,
COUNT(ID) AS RAW_COUNT,
SUM(COUNT(ID)) OVER() AS TOTAL_COUNTS,
COUNT(ID) / SUM(COUNT(ID)) OVER() AS PCT_DIST,
SUM(WEIGHT) AS WEIGHTED_COUNT,
SUM(SUM(WEIGHT)) OVER() AS WEIGHTED_TOTAL_COUNTS,
SUM(WEIGHT) / SUM(SUM(WEIGHT)) OVER() AS WEIGHTED_PCT_DIST
FROM TABLE_POPULATION
GROUP BY AGE_CATEGORY
6.5 Risky Projects
Identify projects that are at risk for going overbudget. A project is considered to be overbudget if the cost of all employees assigned to the project is greater than the budget of the project.
You’ll need to prorate the cost of the employees to the duration of the project. For example, if the budget for a project that takes half a year to complete is $10K, then the total half-year salary of all employees assigned to the project should not exceed $10K. Salary is defined on a yearly basis, so be careful how to calculate salaries for the projects that last less or more than one year.
Output a list of projects that are overbudget with their project name, project budget, and prorated total employee expense (rounded to the next dollar amount).
HINT: to make it simpler, consider that all years have 365 days. You don’t need to think about the leap years.
Tables: linkedin_projects, linkedin_emp_projects, linkedin_employees
6.5.1 tables
table: linkedin_projects
id–int,
title–varchar
budget–int
start_date–datetime
end_date–datetime
table: linkedin_emp_projects
emp_id–int
project_id–
table: linkedin_employees
id–int,
first_name
last_name
salary–int
6.5.2 SOLUTION 1
DURATION YEAR 1 IS CALCULATED SEPARATELY WILL CAUSE PROBLEMS, BECAUSE IT IS NOT COSTLY TO KEEP IT ACCURATE TO DECIMAL POINTS
WITH
PROJECT_DURATION AS (
SELECT DISTINCT id, DATEDIFF(end_date, start_date)/365.0 AS DURATION_YR
FROM linkedin_projects
ORDER BY DURATION_YR DESC
),
COST AS (
SELECT A.project_id, SUM(B.SALARY * C.DURATION_YR) AS PROJECT_COST
FROM linkedin_emp_projects A
INNER JOIN linkedin_employees B ON A.EMP_ID = B.ID
INNER JOIN PROJECT_DURATION C ON A.project_id = C.ID
GROUP BY A.project_id
)
SELECT A.TITLE, A.BUDGET, CEILING(B.PROJECT_COST) AS PROJECT_COST
FROM linkedin_projects A
INNER JOIN COST B ON A.ID = B.project_id
WHERE A.BUDGET < CEILING(B.PROJECT_COST)
ORDER BY TITLE ASC;
6.6 How many users have applied to the same companies they have applied before the past year?
WITH UserApplications AS (
SELECT
user_id,
company_id,
application_date,
ROW_NUMBER() OVER (PARTITION BY user_id, company_id ORDER BY application_date) AS application_order
FROM applications
WHERE application_date >= DATE_SUB(CURDATE(), INTERVAL 1 YEAR)
)
SELECT
user_id,
COUNT(DISTINCT company_id) AS companies_applied_again
FROM UserApplications
WHERE application_order > 1
GROUP BYuser_id
HAVING COUNT(DISTINCT company_id) > 0;