# Load required packages
library(sf)
library(tidyverse)
library(tigris)
library(tidycensus)
library(scales)
library(patchwork)
library(knitr)
# Set Census API key
census_api_key("42bf8a20a3df1def380f330cf7edad0dd5842ce6")
# Load spatial data
pa_counties <- st_read("./data/Pennsylvania_County_Boundaries.shp")
hospitals <- st_read("./data/hospitals.geojson")
census_tracts <- tracts(state = "PA", cb = TRUE)
# Check that all data loaded correctly
pa_counties
hospitals
census_tractsAssignment 2: Spatial Analysis and Visualization
Healthcare Access and Equity in Pennsylvania
Part 1: Healthcare Access for Vulnerable Populations
Research Question:
Which Pennsylvania counties have the highest proportion of vulnerable populations (elderly + low-income) living far from hospitals?
Step 1: Data Collection
There are 223 hospitals and 3445 census tracts in the datasets. The coordinate reference system for each dataset is as follows: WGS 84 / Pseudo-Mercator for county data, WGS 84 for the hospital data, and NAD83 for the census tracts.
Step 2: Get Demographic Data
# Get demographic data from ACS
tract_data <- get_acs(
geography="tract",
variables= c(
total_pop = "B01003_001",
median_income = "B19013_001"
),
state = "PA",
year = 2022,
output = "wide"
)
over_65 = c("B01001_020","B01001_021","B01001_022","B01001_023",
"B01001_024","B01001_025","B01001_044","B01001_045",
"B01001_046","B01001_047","B01001_048","B01001_049")
elderly_data = get_acs(
geography="tract",
variables =c(over_65
),
state = "PA",
year = 2022,
output = "wide"
)
over_65_E<- paste0(over_65, "E")
elderly_data<- elderly_data%>%
mutate(elderly_pop = rowSums(elderly_data[, over_65_E], na.rm = TRUE))
total_elderly <-elderly_data%>%
select(GEOID,elderly_pop)
tract_data <-left_join(tract_data, total_elderly, by="GEOID")
# Join to tract boundaries
data_with_bounds = left_join(census_tracts, tract_data, by="GEOID")
data_with_bounds
no_record= sum(is.na(tract_data$median_incomeE))
no_record
median_median= median(tract_data$median_incomeE, na.rm = TRUE)
median_medianThe ACS data being used in this analysis is from 2022. Excluding the 63 tracts with missing income data, the median income across PA census tracts was $70,188.
Step 3: Define Vulnerable Populations
# Filter for vulnerable tracts based on your criteria
data_with_bounds <- data_with_bounds%>%
mutate(
elderly_pct = elderly_pop/total_popE * 100)
vulnerable_data <- data_with_bounds%>%
filter(median_incomeE < 36000 & elderly_pct > 20)
vulnerable_data
print((nrow(vulnerable_data)/nrow(data_with_bounds))*100)The income threshold I chose to identify tracts with low median income was a income less than $36,000, approximately half of the median of all tracts. The threshold used to identify large elderly populations was a population over 20% elderly as it would indicate that more than 1/5th of the tract’s population was elderly. These thresholds were defined to serve as measurements of tract vulnerability. Based on these vulnerability thresholds, 49 tracts or 1.4% of Pennsylvania census tracts were considered vulnerable.
Step 4: Calculate Distance to Hospitals
# Transform to appropriate projected CRS
vulnerable_data <- st_transform(vulnerable_data,3365)
hospitals <- st_transform(hospitals, 3365)
# Calculate distance from each tract centroid to nearest hospital
vulnerable_distance <- vulnerable_data%>%
mutate(
distance_to_nearest= as.numeric(apply(st_distance(st_centroid(.),hospitals),
1, min)
))
vulnerable_distance
median_distance= median(vulnerable_distance$distance_to_nearest , na.rm = TRUE)
median_distance
max_distance = max(vulnerable_distance$distance_to_nearest , na.rm = TRUE)
max_distanceThe datasets’ projected coordinate reference system were transformed to 3365, the ideal projection for Pennsylvania and for measuring distance in units and not distorting degrees of longitude and latitude. The average distance to the nearest hospital for vulnerable tracts was approximately 6395 feet. The maximum distance was about 98,596 feet. Only 1 tract, out all the vulnerable tracts, was more than 15 miles from the nearest hospital.
Step 5: Identify Underserved Areas
# Create underserved variable
underserved_tracts <- vulnerable_distance%>%
filter(distance_to_nearest > 79200)
underserved_tracts
print((nrow(underserved_tracts)/nrow(vulnerable_data))*100)Only one singular tract is considered underserved by the condition of being more than 15 miles or 79,200 feet from the nearest hospital. This means that 2% of tracts identified as vulnerable are underserved. Considering that Pennsylvania is overwhelmingly rural, I am surprised that many vulnerable tracts have a hospital within 15 miles.
Step 6: Aggregate to County Level
# Spatial join tracts to counties
vulnerable_distance <- st_transform(vulnerable_distance, st_crs(pa_counties))
tracts_per_county <- vulnerable_distance %>%
st_join(pa_counties) %>%
st_drop_geometry() %>%
group_by(COUNTY_NAM)
# Aggregate statistics by county
county_vulnerability <- tracts_per_county %>%
summarise(
num_of_tracts = n(),
num_of_underserved = sum(distance_to_nearest > 79200, na.rm = TRUE),
pct_underserved = num_of_underserved/num_of_tracts *100,
avg_distance = mean(distance_to_nearest, na.rm = TRUE),
total_vulnerable = sum(total_popE, na.rm = TRUE)
)%>%
arrange(desc(avg_distance))
formated_county_vulnerability <- county_vulnerability%>%
mutate(
total_vulnerable = comma(total_vulnerable),
avg_distance = comma(avg_distance),
pct_underserved = percent(pct_underserved/100)
)
formated_county_vulnerability%>%
select(COUNTY_NAM, num_of_underserved, num_of_tracts, pct_underserved, avg_distance, total_vulnerable)%>%
kable(
col.names = c(
"County Name", "Number of Underserved Tracts",
"Total Number of Vulnerable Tracts", "Percentage Underserved", "Average Distance", "Total Vulnerable Population"),
digits=c(0,1,1,1,1))| County Name | Number of Underserved Tracts | Total Number of Vulnerable Tracts | Percentage Underserved | Average Distance | Total Vulnerable Population |
|---|---|---|---|---|---|
| CAMERON | 1 | 1 | 100% | 98,596 | 1,988 |
| NORTHUMBERLAND | 0 | 1 | 0% | 50,187 | 2,350 |
| BEAVER | 0 | 1 | 0% | 21,733 | 2,093 |
| LEHIGH | 0 | 1 | 0% | 17,985 | 3,974 |
| WESTMORELAND | 0 | 3 | 0% | 16,963 | 3,927 |
| LUZERNE | 0 | 3 | 0% | 13,026 | 7,931 |
| ALLEGHENY | 0 | 15 | 0% | 11,849 | 27,204 |
| DELAWARE | 0 | 1 | 0% | 11,338 | 2,373 |
| MIFFLIN | 0 | 1 | 0% | 9,779 | 2,418 |
| YORK | 0 | 1 | 0% | 8,109 | 1,659 |
| MERCER | 0 | 2 | 0% | 5,722 | 4,992 |
| CRAWFORD | 0 | 1 | 0% | 4,899 | 2,842 |
| WAYNE | 0 | 1 | 0% | 4,419 | 4,469 |
| FAYETTE | 0 | 1 | 0% | 3,726 | 3,205 |
| PHILADELPHIA | 0 | 11 | 0% | 3,538 | 35,060 |
| CAMBRIA | 0 | 2 | 0% | 3,223 | 3,241 |
| LAWRENCE | 0 | 1 | 0% | 2,743 | 3,893 |
| LACKAWANNA | 0 | 1 | 0% | 2,651 | 2,901 |
| BLAIR | 0 | 1 | 0% | 2,192 | 1,436 |
| ERIE | 0 | 2 | 0% | 1,238 | 3,070 |
Only 1 county, Cameron county, has 100% underserved vulnerable tracts. Allegheny county has the most vulnerable people, 27,204, living far away, about 11,849 feet, from hospitals. There are no patterns in underserved counties but there is a pattern in vulnerability as most vulnerable counties have vulnerable populations less than 5,000, indicating counties with smaller overall population.
Step 7: Create Summary Table
# Create and format priority counties table
top_10_counties <- formated_county_vulnerability %>%
slice_head(n=10)
# Format as table with kable() - include appropriate column names and caption
top_10_counties %>%
select(COUNTY_NAM, pct_underserved, avg_distance, total_vulnerable)%>%
kable(
col.names = c(
"County Name",
"Percentage of Tracts Lacking Adequate Access to Hospitals (over 15 miles to nearest)",
"Average Distance to Nearest Hospital (ft)",
"Total Population of Vulnerable Tracts (tracts with 20% over the age of 65 and a median income less than 3600)"),
caption = "Top 10 Pennslyvania Counties to Prioritize Healthcare Investments Based on Vulnerability and Access",
digits=c(0,1,1,1)
)| County Name | Percentage of Tracts Lacking Adequate Access to Hospitals (over 15 miles to nearest) | Average Distance to Nearest Hospital (ft) | Total Population of Vulnerable Tracts (tracts with 20% over the age of 65 and a median income less than 3600) |
|---|---|---|---|
| CAMERON | 100% | 98,596 | 1,988 |
| NORTHUMBERLAND | 0% | 50,187 | 2,350 |
| BEAVER | 0% | 21,733 | 2,093 |
| LEHIGH | 0% | 17,985 | 3,974 |
| WESTMORELAND | 0% | 16,963 | 3,927 |
| LUZERNE | 0% | 13,026 | 7,931 |
| ALLEGHENY | 0% | 11,849 | 27,204 |
| DELAWARE | 0% | 11,338 | 2,373 |
| MIFFLIN | 0% | 9,779 | 2,418 |
| YORK | 0% | 8,109 | 1,659 |
Due to limited data availability for underserved counties, rankings were based on the ten counties with the highest average distance.
Part 2: Comprehensive Visualization
# Create county-level access map
hospitals <- st_transform(hospitals, st_crs(pa_counties))
counties_with_demographics <- pa_counties %>%
left_join(county_vulnerability, by = "COUNTY_NAM")
ggplot(counties_with_demographics) +
geom_sf(aes(fill = pct_underserved), color = "black", size = 0.5) +
geom_sf(data = hospitals, color = "red", size = 2) +
scale_fill_viridis_c(
na.value = "white",
name = "Percentage Underserved",
labels = scales::label_number(suffix = "%"),
option = "plasma"
) +
labs(
title = "Pennsylvania Counties with Vulnerable Tracts\nand their Percentage of Underserved Vulnerable Tracts",
subtitle = "Red = Hospitals",
caption = "Source: ACS 2022"
) +
theme_void()+
theme(
plot.title = element_text(face = "bold", hjust =0))Map 2: Detailed Vulnerability Map
hospitals <- st_transform(hospitals, st_crs(pa_counties))
hospitals_with_counties <- hospitals %>%
st_join(pa_counties %>% select(COUNTY_NAM))
census_tracts <- st_transform(census_tracts, st_crs(pa_counties))
target_county <- pa_counties %>%
filter(FIPS_COUNT == underserved_tracts$COUNTYFP)
my_neighbors <- pa_counties %>%
st_filter(target_county, .predicate = st_touches)
clipped_neighbor_tracts <- st_intersection(census_tracts, my_neighbors)
clipped_target_tracts <- st_intersection(census_tracts, target_county)
neighbor_hospitals <- hospitals_with_counties %>%
filter(COUNTY_NAM %in% my_neighbors$COUNTY_NAM)
ggplot()+
geom_sf(data = clipped_neighbor_tracts, color="black")+
geom_sf(data = clipped_target_tracts, color="black")+
geom_sf(data = target_county, fill="lightgray", color="black", alpha=0.7)+
geom_sf(data = my_neighbors, fill="white", color="black", alpha=0.7)+
geom_sf(data = underserved_tracts, fill= "blue")+
geom_sf(data = neighbor_hospitals, color="red", size=2)+
labs(
title = "Surrounding Pennsylvania Counties and Tracts of the Underserved Tract",
subtitle = "Red = Hospitals, Blue = Underserved Tract",
caption = "Source: ACS 2022")+
theme_void()+
theme(
plot.title = element_text(face = "bold", hjust = 0.5))Chart: Distribution Analysis
# Create distribution visualization
ggplot(county_vulnerability, aes(x = COUNTY_NAM, y = avg_distance)) +
geom_col(fill = "blue", color = "black") +
labs(title = "Bar Plot Average Distance to Hospitals Across Vulnerable Tracts By County",
x = "County Name",
y = "Average Distance (feet)") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major = element_blank())This bar chart shows the average distance to hospitals across vulnerable tracts by Pennsylvania counties. Cameron County stands out dramatically with the highest value, suggesting that residents there may face significant challenges accessing hospital care. Most other counties have relatively low averages, indicating better proximity to healthcare facilities.
Per previous submission feedback, I removed instructional text from the final version of my assignment. Additionally, I omitted tables and outputs that were generated for my personal reference.