# Load required packages (hint: you need tidycensus, tidyverse, and knitr)
# Data access: tidycensus pulls ACS tables; sf/tigris support spatial work.
library(dplyr)
library(tidycensus)
library(tidyverse)
library(sf)
library(tigris)
# Reporting: knitr/kableExtra render tables; stringr/scales clean and format values.
library(knitr)
library(stringr)
library(scales)
library(kableExtra)
# Set your Census API key
# Read the key from the environment so it is never committed to the repo;
# install = FALSE keeps it session-only.
census_api_key(Sys.getenv("CENSUS_API_KEY"), install = FALSE, overwrite = FALSE)
# Choose your state for analysis - assign it to a variable called my_state
# State abbreviation used by every get_acs() call below.
my_state <- "PA"
Assignment 1: Census Data Quality for Policy Decisions
Evaluating Data Reliability for Algorithmic Decision-Making
Assignment Overview
Scenario
You are a data analyst for the Pennsylvania Department of Human Services. The department is considering implementing an algorithmic system to identify communities that should receive priority for social service funding and outreach programs. Your supervisor has asked you to evaluate the quality and reliability of available census data to inform this decision.
Drawing on our Week 2 discussion of algorithmic bias, you need to assess not just what the data shows, but how reliable it is and what communities might be affected by data quality issues.
Learning Objectives
- Apply dplyr functions to real census data for policy analysis
- Evaluate data quality using margins of error
- Connect technical analysis to algorithmic decision-making
- Identify potential equity implications of data reliability issues
- Create professional documentation for policy stakeholders
Submission Instructions
Submit by posting your updated portfolio link on Canvas. Your assignment should be accessible at your-portfolio-url/assignments/assignment_1/
Make sure to update your _quarto.yml
navigation to include this assignment under an “Assignments” menu.
Part 1: Portfolio Integration
Create this assignment in your portfolio repository under an assignments/assignment_1/
folder structure. Update your navigation menu to include:
- text: Assignments
menu:
- href: assignments/assignment_1/your_file_name.qmd
text: "Assignment 1: Census Data Exploration"
If the text contains a special character such as a comma, wrap it in double quotation marks so that Quarto parses it as a single text value.
Setup
State Selection: I have chosen Pennsylvania for this analysis because: the scenario explicitly involves the Pennsylvania Department of Human Services, and PA’s mix of urban, suburban, and rural counties provides useful variation in ACS reliability for evaluating algorithmic decisions.
Part 2: County-Level Resource Assessment
2.1 Data Retrieval
Your Task: Use get_acs()
to retrieve county-level data for your chosen state.
Requirements: - Geography: county level - Variables: median household income (B19013_001) and total population (B01003_001)
- Year: 2022 - Survey: acs5 - Output format: wide
Hint: Remember to give your variables descriptive names using the variables = c(name = "code")
syntax.
# Write your get_acs() code here
# ACS variable codes with descriptive names (variables = c(name = "code")).
county_vars <- c(
  med_hh_income = "B19013_001",  # median household income
  total_pop = "B01003_001"       # total population
)
# Retrieve county-level estimates for the chosen state. Wide output gives
# one row per county, with E/M suffixes for estimate and margin of error.
# (A trailing comma after the last argument is invalid in an R call and
# has been removed.)
county_data <- get_acs(
  geography = "county",
  state = my_state,
  year = 2022,
  survey = "acs5",
  variables = county_vars,
  output = "wide"
)
# Clean the county names to remove state name and "County"
# Clean the county names: drop the ", Pennsylvania" suffix and the word
# "County" together with the space before it. The original removed only
# "County", leaving a trailing space (e.g. "Adams ").
county_data <- county_data %>%
  mutate(
    county_name = NAME %>%
      str_remove(",\\s*Pennsylvania$") %>%
      str_remove("\\s*County$")
  )
# Hint: use mutate() with str_remove()
# Display the first few rows
head(county_data)
# A tibble: 6 × 7
GEOID NAME med_hh_incomeE med_hh_incomeM total_popE total_popM county_name
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
1 42001 Adams C… 78975 3334 104604 NA "Adams "
2 42003 Alleghe… 72537 869 1245310 NA "Allegheny…
3 42005 Armstro… 61011 2202 65538 NA "Armstrong…
4 42007 Beaver … 67194 1531 167629 NA "Beaver "
5 42009 Bedford… 58337 2606 47613 NA "Bedford "
6 42011 Berks C… 74617 1191 428483 NA "Berks "
glimpse(county_data)
Rows: 67
Columns: 7
$ GEOID <chr> "42001", "42003", "42005", "42007", "42009", "42011", "…
$ NAME <chr> "Adams County, Pennsylvania", "Allegheny County, Pennsy…
$ med_hh_incomeE <dbl> 78975, 72537, 61011, 67194, 58337, 74617, 59386, 60650,…
$ med_hh_incomeM <dbl> 3334, 869, 2202, 1531, 2606, 1191, 2058, 2167, 1516, 21…
$ total_popE <dbl> 104604, 1245310, 65538, 167629, 47613, 428483, 122640, …
$ total_popM <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ county_name <chr> "Adams ", "Allegheny ", "Armstrong ", "Beaver ", "Bedfo…
2.2 Data Quality Assessment
Your Task: Calculate margin of error percentages and create reliability categories.
Requirements: - Calculate MOE percentage: (margin of error / estimate) * 100 - Create reliability categories: - High Confidence: MOE < 5% - Moderate Confidence: MOE 5-10%
- Low Confidence: MOE > 10% - Create a flag for unreliable estimates (MOE > 10%)
Hint: Use mutate()
with case_when()
for the categories.
# Calculate MOE percentage and reliability categories using mutate()
# Relative MOE (% of estimate) and reliability tiers. case_when() is
# evaluated top-down, so the second branch effectively means 5-10%.
county_cat <- county_data %>%
  mutate(
    moe_perc = (med_hh_incomeM / med_hh_incomeE) * 100,
    reliability = case_when(
      moe_perc < 5 ~ "High Confidence",
      moe_perc < 10 ~ "Moderate Confidence",
      TRUE ~ "Low Confidence"
    ),
    # Flag for unreliable estimates (MOE > 10%) — required by the
    # assignment spec but missing from the original chunk.
    unreliable = moe_perc > 10
  )
# Create a summary showing count of counties in each reliability category
# Count counties per reliability tier and add each tier's share.
county_cat %>%
  count(reliability, name = "count") %>%
  mutate(share = scales::percent(count / sum(count)))
# A tibble: 2 × 3
reliability count share
<chr> <int> <chr>
1 High Confidence 57 85%
2 Moderate Confidence 10 15%
# Hint: use count() and mutate() to add percentages
2.3 High Uncertainty Counties
Your Task: Identify the 5 counties with the highest MOE percentages.
Requirements: - Sort by MOE percentage (highest first) - Select the top 5 counties - Display: county name, median income, margin of error, MOE percentage, reliability category - Format as a professional table using kable()
Hint: Use arrange()
, slice()
, and select()
functions.
# Create table of top 5 counties by MOE percentage
# Five counties with the least reliable income estimates, formatted for
# display (dollar strings, one-decimal MOE percentage).
top5_uncertain <- county_cat %>%
  arrange(desc(moe_perc)) %>%
  slice(1:5) %>%
  transmute(
    county = county_name,
    median_income = paste0("$", format(round(med_hh_incomeE), big.mark = ",")),
    moe_dollars = paste0("$", format(round(med_hh_incomeM), big.mark = ",")),
    moe_percent = sprintf("%.1f%%", moe_perc),
    reliability
  )
# Format as table with kable() - include appropriate column names and caption
# Render the top-5 table with readable headers and a caption.
top5_uncertain %>%
  kable(
    col.names = c("County", "Median Income", "MOE ($)", "MOE (%)", "Reliability"),
    caption = "Top 5 Counties by Median Income MOE% (Highest First)",
    align = c("l", "r", "r", "r", "l")
  ) %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed"))
County | Median Income | MOE ($) | MOE (%) | Reliability |
---|---|---|---|---|
Forest | $46,188 | $4,612 | 10.0% | Moderate Confidence |
Sullivan | $62,910 | $5,821 | 9.3% | Moderate Confidence |
Union | $64,914 | $4,753 | 7.3% | Moderate Confidence |
Montour | $72,626 | $5,146 | 7.1% | Moderate Confidence |
Elk | $61,672 | $4,091 | 6.6% | Moderate Confidence |
Data Quality Commentary:
Across Pennsylvania, the counties with the largest household income MOE are small counties such as Forest, Sullivan, Union, Montour, and Elk, with MOE between 6% and 10%. If an algorithm ranks counties by median income without accounting for this uncertainty, these counties can be misclassified because of sampling error and may cross decision thresholds incorrectly. Because they have smaller populations, they should be treated with caution and considered for manual review or aggregation to larger geographies.
Part 3: Neighborhood-Level Analysis
3.1 Focus Area Selection
Your Task: Select 2-3 counties from your reliability analysis for detailed tract-level study.
Strategy: Choose counties that represent different reliability levels (e.g., 1 high confidence, 1 moderate, 1 low confidence) to compare how data quality varies.
# Use filter() to select 2-3 counties from your county_reliability data
# Store the selected counties in a variable called selected_counties
# Three-digit county FIPS code (county GEOID = 2-digit state + 3-digit county).
county_cat <- county_cat %>%
  mutate(county_code = substr(GEOID, 3, 5))

# Pick the first county present in each reliability tier; a tier with no
# counties (e.g. Low Confidence here) simply contributes no row.
selected_counties <- county_cat %>%
  filter(reliability %in% c("High Confidence", "Moderate Confidence", "Low Confidence")) %>%
  group_by(reliability) %>%
  slice_head(n = 1) %>%
  ungroup() %>%
  select(county_code, county_name, med_hh_incomeE, med_hh_incomeM, moe_perc, reliability)
# Display the selected counties with their key characteristics
# Display the selected counties with formatted income and MOE columns.
selected_counties %>%
  dplyr::mutate(
    `Median Income` = paste0("$", format(round(med_hh_incomeE), big.mark = ",")),
    `MOE (%)` = sprintf("%.1f%%", moe_perc)
  ) %>%
  dplyr::select(
    County = county_name, `Median Income`, `MOE (%)`, Reliability = reliability
  ) %>%
  kable(caption = "Selected Counties for Tract-Level Study") %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed"))
County | Median Income | MOE (%) | Reliability |
---|---|---|---|
Adams | $78,975 | 4.2% | High Confidence |
Cameron | $46,186 | 5.6% | Moderate Confidence |
# Show: county name, median income, MOE percentage, reliability category
Comment on the output: The two selected counties fall into the high and moderate confidence tiers, respectively. No county in the state reached the low confidence tier, which suggests that county-level income is measured with fair stability across Pennsylvania, though smaller rural counties may still sit near the threshold and deserve careful review.
3.2 Tract-Level Demographics
Your Task: Get demographic data for census tracts in your selected counties.
Requirements: - Geography: tract level - Variables: white alone (B03002_003), Black/African American (B03002_004), Hispanic/Latino (B03002_012), total population (B03002_001) - Use the same state and year as before - Output format: wide - Challenge: You’ll need county codes, not names. Look at the GEOID patterns in your county data for hints.
# Define your race/ethnicity variables with descriptive names
# Race/ethnicity variables (ACS table B03002) with descriptive names.
race_vars <- c(
  total = "B03002_001",
  white = "B03002_003",
  black = "B03002_004",
  hispanic = "B03002_012"
)
# Use get_acs() to retrieve tract-level data
# Hint: You may need to specify county codes in the county parameter
# County FIPS codes for the get_acs() county argument (codes, not names).
selected_codes <- selected_counties$county_code

# Tract-level demographics for the selected counties only, same state/year.
tract_raw <- get_acs(
  geography = "tract",
  state = my_state,
  county = selected_codes,
  variables = race_vars,
  year = 2022,
  survey = "acs5",
  output = "wide"
)
# Calculate percentage of each group using mutate()
# Create percentages for white, Black, and Hispanic populations
# Add readable tract and county name columns using str_extract() or similar
# Derive readable names and percentage columns.
# Tract GEOIDs are 11 characters: state (chars 1-2) + county (3-5) +
# tract (6-11), so the county code is substr(GEOID, 3, 5); the original
# substr(GEOID, 9, 11) grabbed part of the tract number instead.
tract_data <- tract_raw %>%
  mutate(county_code = substr(GEOID, 3, 5)) %>%
  separate(NAME, into = c("tract_name", "county", "state"),
           sep = ";\\s*", remove = FALSE, fill = "right") %>%
  mutate(
    pct_white = 100 * whiteE / totalE,
    pct_black = 100 * blackE / totalE,
    pct_hisp = 100 * hispanicE / totalE
  ) %>%
  select(-state)
3.3 Demographic Analysis
Your Task: Analyze the demographic patterns in your selected areas.
# Find the tract with the highest percentage of Hispanic/Latino residents
# Hint: use arrange() and slice() to get the top tract
# Highest-percentage Hispanic/Latino tract within each selected county.
# Columns are ordered County, Tract so they line up with the kable()
# col.names below — the original transmute order was reversed, which
# mislabelled the rendered table.
top_hispanic_tracts <- tract_data %>%
  group_by(county) %>%
  arrange(desc(pct_hisp), .by_group = TRUE) %>%
  slice(1) %>%
  ungroup() %>%
  transmute(
    County = county,
    Tract = tract_name,
    `Pct Hispanic/Latino` = pct_hisp,
    `Total Pop` = totalE
  )

top_hispanic_tracts %>%
  kable(
    caption = "Top % Hispanic/Latino Tract in Each Selected County",
    col.names = c("County", "Tract", "Pct Hispanic/Latino", "Total Pop"),
    digits = 2,  # trim the raw 6-decimal percentages for display
    align = c("l", "l", "r", "r")
  ) %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed"))
County | Tract | Pct Hispanic/Latino | Total Pop |
---|---|---|---|
Adams County | Census Tract 315.02 | 20.88 | 3908 |
Cameron County | Census Tract 9601 | 2.87 | 1988 |
# Calculate average demographics by county using group_by() and summarize()
# Show: number of tracts, average percentage for each racial/ethnic group
# Per-county tract counts and unweighted mean demographic shares.
county_avgs <- tract_data %>%
  group_by(county) %>%
  summarise(
    Tracts = n(),
    `Avg % White` = mean(pct_white, na.rm = TRUE),
    `Avg % Black` = mean(pct_black, na.rm = TRUE),
    `Avg % Hispanic/Latino` = mean(pct_hisp, na.rm = TRUE)
  ) %>%
  rename(County = county)
# Create a nicely formatted table of your results using kable()
# Render the demographics table; explicit namespaces keep the chunk
# self-documenting.
county_avgs %>%
  knitr::kable(
    caption = "Average Demographics by County (Selected Counties)",
    col.names = c("County", "Tracts", "Avg % White", "Avg % Black", "Avg % Hispanic/Latino"),
    digits = c(NA, 0, 2, 2, 2),  # two decimal places for the % columns
    align = c("l", "r", "r", "r", "r")
  ) %>%
  kableExtra::kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed"))
County | Tracts | Avg % White | Avg % Black | Avg % Hispanic/Latino |
---|---|---|---|---|
Adams County | 27 | 88.33 | 1.31 | 7.14 |
Cameron County | 2 | 93.19 | 0.04 | 2.10 |
Part 4: Comprehensive Data Quality Evaluation
4.1 MOE Analysis for Demographic Variables
Your Task: Examine margins of error for demographic variables to see if some communities have less reliable data.
Requirements: - Calculate MOE percentages for each demographic variable - Flag tracts where any demographic variable has MOE > 15% - Create summary statistics
# Calculate MOE percentages for white, Black, and Hispanic variables
# Hint: use the same formula as before (margin/estimate * 100)
# Create a flag for tracts with high MOE on any demographic variable
# Use logical operators (| for OR) in an ifelse() statement
# Relative MOE for each demographic estimate; the estimate > 0 guard
# returns NA rather than Inf/NaN when a subgroup count is zero.
# high_moe_any flags tracts where ANY variable exceeds 15% relative MOE;
# it can be NA when a comparison is NA and no other is TRUE, so later
# summaries use na.rm / explicit NA handling.
tract_moe <- tract_data %>%
  mutate(
    moe_pct_white = ifelse(whiteE > 0, 100 * whiteM / whiteE, NA_real_),
    moe_pct_black = ifelse(blackE > 0, 100 * blackM / blackE, NA_real_),
    moe_pct_hisp = ifelse(hispanicE > 0, 100 * hispanicM / hispanicE, NA_real_),
    high_moe_any = (moe_pct_white > 15) | (moe_pct_black > 15) | (moe_pct_hisp > 15)
  )
# Create summary statistics showing how many tracts have data quality issues
# Per-county counts of tracts with any high-MOE demographic variable;
# na.rm = TRUE treats undetermined (NA) flags as not-high-MOE.
summary_tract_moe <- tract_moe %>%
  group_by(county) %>%
  summarise(
    `Tracts (total)` = n(),
    `Tracts (high MOE)` = sum(high_moe_any, na.rm = TRUE),
    `Share high MOE` = sprintf("%.2f%%", 100 * `Tracts (high MOE)` / `Tracts (total)`)
  )
# Render the data-quality summary table.
summary_tract_moe %>%
  kable(
    col.names = c("County", "Tracts", "High MOE", "Share high MOE"),
    caption = "High-MOE Tracts by County",
    align = c("l", "r", "r", "r")
  ) %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed"))
County | Tracts | High MOE | Share high MOE |
---|---|---|---|
Adams County | 27 | 26 | 96.30% |
Cameron County | 2 | 2 | 100.00% |
4.2 Pattern Analysis
Your Task: Investigate whether data quality problems are randomly distributed or concentrated in certain types of communities.
# Group tracts by whether they have high MOE issues
# Calculate average characteristics for each group:
# - population size, demographic percentages
# Use group_by() and summarize() to create this comparison
# Compare tracts with vs. without high-MOE flags within each county.
# NA flags are treated as FALSE so every tract lands in exactly one group.
tract_pattern <- tract_moe %>%
  mutate(high_moe_any = coalesce(high_moe_any, FALSE)) %>%
  group_by(county, high_moe_any) %>%
  summarise(
    Tracts = n(),
    `Avg Pop` = mean(totalE, na.rm = TRUE),
    `Avg % White` = mean(pct_white, na.rm = TRUE),
    `Avg % Black` = mean(pct_black, na.rm = TRUE),
    `Avg % Hispanic/Latino` = mean(pct_hisp, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(Group = if_else(high_moe_any, "High-MOE tracts", "Other tracts")) %>%
  rename(County = county) %>%
  select(County, Group, Tracts, `Avg Pop`, `Avg % White`, `Avg % Black`, `Avg % Hispanic/Latino`)
# Create a professional table showing the patterns
# Render the with/without comparison table.
tract_pattern %>%
  kable(
    caption = "Comparing Tracts With vs. Without High MOE (Any Demographic Variable)",
    digits = c(NA, NA, 0, 0, 2, 2, 2),
    align = c("l", "l", "r", "r", "r", "r", "r")
  ) %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed"))
County | Group | Tracts | Avg Pop | Avg % White | Avg % Black | Avg % Hispanic/Latino |
---|---|---|---|---|---|---|
Adams County | Other tracts | 1 | 2416 | 98.68 | 0.00 | 0.00 |
Adams County | High-MOE tracts | 26 | 3930 | 87.93 | 1.36 | 7.42 |
Cameron County | High-MOE tracts | 2 | 2268 | 93.19 | 0.04 | 2.10 |
Pattern Analysis: In Adams County, almost all tracts are flagged as high MOE, and the tracts with high MOE also show more racial and ethnic diversity than the single tract without issues. In Cameron County, every tract is flagged as high MOE, reflecting the effect of very small population size. This pattern confirms that high MOE is concentrated both in small rural places and in tracts where subgroup counts are small, which increases the risk of unstable algorithmic classifications.
Part 5: Policy Recommendations
5.1 Analysis Integration and Professional Summary
Your Task: Write an executive summary that integrates findings from all four analyses.
Executive Summary Requirements: 1. Overall Pattern Identification: What are the systematic patterns across all your analyses? 2. Equity Assessment: Which communities face the greatest risk of algorithmic bias based on your findings? 3. Root Cause Analysis : What underlying factors drive both data quality issues and bias risk? 4. Strategic Recommendations: What should the Department implement to address these systematic issues?
Executive Summary:
Overall Pattern Identification: Most counties in Pennsylvania fall into the high confidence group, and only a handful are moderate confidence. At the county scale, ACS estimates for income are fairly stable. At the tract scale, however, many estimates exceed the reliability threshold, showing that data quality declines as geography becomes more granular.
Equity Assessment: Communities most at risk of algorithmic bias are those with smaller populations and greater diversity. These are the places where sampling variability makes the data least reliable. If an algorithm ranks tracts or counties without accounting for uncertainty, these communities could be wrongly deprioritized or overrepresented in resource allocations.
Root Cause Analysis: The reliability issues are driven by sample size and subgroup sparsity. Smaller counties and tracts have fewer responses, and race and ethnicity subgroups within them have even smaller counts, leading to higher margins of error. County level data appears stable because the sample is larger, while tract level data shows greater variation. These patterns reflect how ACS sampling works rather than errors in the data.
Strategic Recommendations: Algorithmic prioritization can move forward in high confidence counties, with standard monitoring in place. In moderate confidence counties, margins of error should be factored into scoring and human review should be required for borderline cases. At the tract level, reliability weighting, aggregation, and the use of complementary administrative data can reduce the risk of misclassification. All tracts flagged with high MOE should be subject to human oversight, and the Department should establish monitoring routines to catch disparities introduced by uncertainty.
6.3 Specific Recommendations
Your Task: Create a decision framework for algorithm implementation.
# Create a summary table using your county reliability data
# Include: county name, median income, MOE percentage, reliability category
# Add a new column with algorithm recommendations using case_when():
# - High Confidence: "Safe for algorithmic decisions"
# - Moderate Confidence: "Use with caution - monitor outcomes"
# - Low Confidence: "Requires manual review or additional data"
# Decision framework: map each reliability tier to an implementation
# recommendation. Later transmute() expressions may reference earlier
# ones, so case_when() can test the Reliability column created above.
rec_table <- county_cat %>%
  transmute(
    County = county_name,
    `Median Income` = scales::dollar(round(med_hh_incomeE)),
    `MOE (%)` = sprintf("%.2f%%", moe_perc),
    Reliability = reliability,
    Recommendation = case_when(
      Reliability == "High Confidence" ~ "Safe for algorithmic decisions",
      Reliability == "Moderate Confidence" ~ "Use with caution — monitor outcomes",
      TRUE ~ "Requires manual review or additional data"
    )
  )
# Format as a professional table with kable()
# Render the framework table with explicit headers and alignment.
rec_table %>%
  knitr::kable(
    caption = "County-Level Decision Framework for Algorithm Implementation",
    align = c("l", "r", "r", "l", "l"),
    col.names = c("County", "Median Income", "MOE (%)", "Reliability", "Recommendation")
  ) %>%
  kableExtra::kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed"))
County | Median Income | MOE (%) | Reliability | Recommendation |
---|---|---|---|---|
Adams | $78,975 | 4.22% | High Confidence | Safe for algorithmic decisions |
Allegheny | $72,537 | 1.20% | High Confidence | Safe for algorithmic decisions |
Armstrong | $61,011 | 3.61% | High Confidence | Safe for algorithmic decisions |
Beaver | $67,194 | 2.28% | High Confidence | Safe for algorithmic decisions |
Bedford | $58,337 | 4.47% | High Confidence | Safe for algorithmic decisions |
Berks | $74,617 | 1.60% | High Confidence | Safe for algorithmic decisions |
Blair | $59,386 | 3.47% | High Confidence | Safe for algorithmic decisions |
Bradford | $60,650 | 3.57% | High Confidence | Safe for algorithmic decisions |
Bucks | $107,826 | 1.41% | High Confidence | Safe for algorithmic decisions |
Butler | $82,932 | 2.61% | High Confidence | Safe for algorithmic decisions |
Cambria | $54,221 | 3.34% | High Confidence | Safe for algorithmic decisions |
Cameron | $46,186 | 5.64% | Moderate Confidence | Use with caution — monitor outcomes |
Carbon | $64,538 | 5.31% | Moderate Confidence | Use with caution — monitor outcomes |
Centre | $70,087 | 2.77% | High Confidence | Safe for algorithmic decisions |
Chester | $118,574 | 1.70% | High Confidence | Safe for algorithmic decisions |
Clarion | $58,690 | 4.37% | High Confidence | Safe for algorithmic decisions |
Clearfield | $56,982 | 2.79% | High Confidence | Safe for algorithmic decisions |
Clinton | $59,011 | 3.86% | High Confidence | Safe for algorithmic decisions |
Columbia | $59,457 | 3.76% | High Confidence | Safe for algorithmic decisions |
Crawford | $58,734 | 3.91% | High Confidence | Safe for algorithmic decisions |
Cumberland | $82,849 | 2.20% | High Confidence | Safe for algorithmic decisions |
Dauphin | $71,046 | 2.27% | High Confidence | Safe for algorithmic decisions |
Delaware | $86,390 | 1.53% | High Confidence | Safe for algorithmic decisions |
Elk | $61,672 | 6.63% | Moderate Confidence | Use with caution — monitor outcomes |
Erie | $59,396 | 2.55% | High Confidence | Safe for algorithmic decisions |
Fayette | $55,579 | 4.16% | High Confidence | Safe for algorithmic decisions |
Forest | $46,188 | 9.99% | Moderate Confidence | Use with caution — monitor outcomes |
Franklin | $71,808 | 3.00% | High Confidence | Safe for algorithmic decisions |
Fulton | $63,153 | 3.65% | High Confidence | Safe for algorithmic decisions |
Greene | $66,283 | 6.41% | Moderate Confidence | Use with caution — monitor outcomes |
Huntingdon | $61,300 | 4.72% | High Confidence | Safe for algorithmic decisions |
Indiana | $57,170 | 4.65% | High Confidence | Safe for algorithmic decisions |
Jefferson | $56,607 | 3.41% | High Confidence | Safe for algorithmic decisions |
Juniata | $61,915 | 4.79% | High Confidence | Safe for algorithmic decisions |
Lackawanna | $63,739 | 2.58% | High Confidence | Safe for algorithmic decisions |
Lancaster | $81,458 | 1.79% | High Confidence | Safe for algorithmic decisions |
Lawrence | $57,585 | 3.07% | High Confidence | Safe for algorithmic decisions |
Lebanon | $72,532 | 2.69% | High Confidence | Safe for algorithmic decisions |
Lehigh | $74,973 | 2.00% | High Confidence | Safe for algorithmic decisions |
Luzerne | $60,836 | 2.35% | High Confidence | Safe for algorithmic decisions |
Lycoming | $63,437 | 4.39% | High Confidence | Safe for algorithmic decisions |
McKean | $57,861 | 4.75% | High Confidence | Safe for algorithmic decisions |
Mercer | $57,353 | 3.63% | High Confidence | Safe for algorithmic decisions |
Mifflin | $58,012 | 3.43% | High Confidence | Safe for algorithmic decisions |
Monroe | $80,656 | 3.17% | High Confidence | Safe for algorithmic decisions |
Montgomery | $107,441 | 1.27% | High Confidence | Safe for algorithmic decisions |
Montour | $72,626 | 7.09% | Moderate Confidence | Use with caution — monitor outcomes |
Northampton | $82,201 | 1.93% | High Confidence | Safe for algorithmic decisions |
Northumberland | $55,952 | 2.67% | High Confidence | Safe for algorithmic decisions |
Perry | $76,103 | 3.17% | High Confidence | Safe for algorithmic decisions |
Philadelphia | $57,537 | 1.38% | High Confidence | Safe for algorithmic decisions |
Pike | $76,416 | 4.90% | High Confidence | Safe for algorithmic decisions |
Potter | $56,491 | 4.42% | High Confidence | Safe for algorithmic decisions |
Schuylkill | $63,574 | 2.40% | High Confidence | Safe for algorithmic decisions |
Snyder | $65,914 | 5.56% | Moderate Confidence | Use with caution — monitor outcomes |
Somerset | $57,357 | 2.78% | High Confidence | Safe for algorithmic decisions |
Sullivan | $62,910 | 9.25% | Moderate Confidence | Use with caution — monitor outcomes |
Susquehanna | $63,968 | 3.14% | High Confidence | Safe for algorithmic decisions |
Tioga | $59,707 | 3.23% | High Confidence | Safe for algorithmic decisions |
Union | $64,914 | 7.32% | Moderate Confidence | Use with caution — monitor outcomes |
Venango | $59,278 | 3.45% | High Confidence | Safe for algorithmic decisions |
Warren | $57,925 | 5.19% | Moderate Confidence | Use with caution — monitor outcomes |
Washington | $74,403 | 2.38% | High Confidence | Safe for algorithmic decisions |
Wayne | $59,240 | 4.79% | High Confidence | Safe for algorithmic decisions |
Westmoreland | $69,454 | 1.99% | High Confidence | Safe for algorithmic decisions |
Wyoming | $67,968 | 3.85% | High Confidence | Safe for algorithmic decisions |
York | $79,183 | 1.79% | High Confidence | Safe for algorithmic decisions |
Key Recommendations:
Your Task: Use your analysis results to provide specific guidance to the department.
Counties suitable for immediate algorithmic implementation: These are counties with high confidence data. Their margins of error are below 5 percent, which means the ACS estimates are stable and can be used in automated ranking and prioritization. The department can proceed with algorithmic allocation in these places with standard outcome monitoring to ensure fairness. Counties: Adams, Allegheny, Armstrong, Beaver, Bedford, Berks, Blair, Bradford, Bucks, Butler, Cambria, Centre, Chester, Clarion, Clearfield, Clinton, Columbia, Crawford, Cumberland, Dauphin, Delaware, Erie, Fayette, Franklin, Fulton, Huntingdon, Indiana, Jefferson, Juniata, Lackawanna, Lancaster, Lawrence, Lebanon, Lehigh, Luzerne, Lycoming, McKean, Mercer, Mifflin,
Monroe, Montgomery, Northampton, Northumberland, Perry, Philadelphia, Pike, Potter, Schuylkill, Somerset, Susquehanna, Tioga, Venango, Washington, Wayne, Westmoreland, Wyoming, and York. Counties requiring additional oversight: These are counties with moderate confidence data, where margins of error fall between 5 and 10 percent. Estimates here are less precise, so algorithms should be used with caution. The department should adjust scoring to account for reliability, add human review for borderline cases, and monitor allocations to verify that communities are neither under- nor over-prioritized due to sampling error. Counties: Cameron, Carbon, Elk, Forest, Greene, Montour, Snyder, Sullivan, Union, and Warren.
Counties needing alternative approaches: These are counties or tracts where margins of error exceed acceptable thresholds. At the county level this group may be empty, but at the tract level many places fall here. In such cases, alternative strategies are needed: aggregating small geographies, combining ACS with administrative data, and requiring manual validation before final decisions. This ensures communities are not misclassified due to unstable estimates.
Questions for Further Investigation
Are high MOE tracts geographically clustered and do these clusters overlap with areas of higher service need?
How stable are county and tract level reliability measures across ACS releases?
What other datasets could be combined with ACS to reduce uncertainty and strengthen targeting?
Technical Notes
Data Sources: - U.S. Census Bureau, American Community Survey 2018-2022 5-Year Estimates - Retrieved via tidycensus R package on 09/23/2025.
Reproducibility: - All analysis conducted in R version 4.4.1 - Census API key required for replication - Complete code and documentation available at: https://musa-5080-fall-2025.github.io/portfolio-setup-kavanaraju/assignments/assignment_1/assignment1_template.html
Methodology Notes: Reliability categories were defined using relative margins of error. Counties were selected to represent different reliability levels. Tracts were flagged as high MOE if any race or ethnicity variable exceeded a 15 percent margin of error.
Limitations: Margins of error are especially large at small geographic scales and for subgroup estimates, making tract level analysis more uncertain. This work uses one ACS release and does not test temporal consistency. Only a subset of variables was analyzed, and nonresponse patterns in ACS may add further bias not captured by MOE.
Submission Checklist
Before submitting your portfolio link on Canvas:
Remember: Submit your portfolio URL on Canvas, not the file itself. Your assignment should be accessible at your-portfolio-url/assignments/assignment_1/your_file_name.html