Assignment 2: Spatial Analysis and Visualization

Healthcare Access and Equity in Pennsylvania

Author

Angel Rutherford

Published

December 2, 2025

Part 1: Healthcare Access for Vulnerable Populations

Research Question:

Which Pennsylvania counties have the highest proportion of vulnerable populations (elderly + low-income) living far from hospitals?

Step 1: Data Collection

# Load required packages
library(sf)
library(tidyverse)
library(tigris)
library(tidycensus)
library(scales)
library(patchwork)
library(knitr)

# Set Census API key
census_api_key("42bf8a20a3df1def380f330cf7edad0dd5842ce6")

# Load spatial data
pa_counties <- st_read("./data/Pennsylvania_County_Boundaries.shp")
hospitals <- st_read("./data/hospitals.geojson")
census_tracts <- tracts(state = "PA", cb = TRUE)

# Check that all data loaded correctly
pa_counties
hospitals
census_tracts

There are 223 hospitals and 3445 census tracts in the datasets. The coordinate reference system for each dataset is as follows: WGS 84 / Pseudo-Mercator for county data, WGS 84 for the hospital data, and NAD83 for the census tracts.

Step 2: Get Demographic Data

# Get demographic data from ACS

tract_data <- get_acs(
  geography="tract",
  variables= c(
    total_pop = "B01003_001",
    median_income = "B19013_001"
  ),
  state = "PA",
  year = 2022,
  output = "wide"
)

over_65 = c("B01001_020","B01001_021","B01001_022","B01001_023",
            "B01001_024","B01001_025","B01001_044","B01001_045",
            "B01001_046","B01001_047","B01001_048","B01001_049")

elderly_data = get_acs(
  geography="tract",
  variables =c(over_65
),
  state = "PA",
  year = 2022,
  output = "wide"
)

over_65_E<- paste0(over_65, "E")

elderly_data<- elderly_data%>%
  mutate(elderly_pop = rowSums(elderly_data[, over_65_E], na.rm = TRUE))
  
total_elderly <-elderly_data%>%
  select(GEOID,elderly_pop)

tract_data <-left_join(tract_data, total_elderly, by="GEOID")

# Join to tract boundaries

data_with_bounds = left_join(census_tracts, tract_data, by="GEOID")
data_with_bounds

no_record= sum(is.na(tract_data$median_incomeE))
no_record

median_median= median(tract_data$median_incomeE, na.rm = TRUE)
median_median

The ACS data being used in this analysis is from 2022. Excluding the 63 tracts with missing income data, the median income across PA census tracts was $70,188.

Step 3: Define Vulnerable Populations

# Filter for vulnerable tracts based on your criteria
data_with_bounds <- data_with_bounds%>%
  mutate(
    elderly_pct = elderly_pop/total_popE * 100)

vulnerable_data <- data_with_bounds%>%
  filter(median_incomeE < 36000 & elderly_pct > 20) 
  

vulnerable_data

print((nrow(vulnerable_data)/nrow(data_with_bounds))*100)

The income threshold I chose to identify tracts with low median income was a income less than $36,000, approximately half of the median of all tracts. The threshold used to identify large elderly populations was a population over 20% elderly as it would indicate that more than 1/5th of the tract’s population was elderly. These thresholds were defined to serve as measurements of tract vulnerability. Based on these vulnerability thresholds, 49 tracts or 1.4% of Pennsylvania census tracts were considered vulnerable.

Step 4: Calculate Distance to Hospitals

# Transform to appropriate projected CRS
vulnerable_data <- st_transform(vulnerable_data,3365)
hospitals <- st_transform(hospitals, 3365)

# Calculate distance from each tract centroid to nearest hospital
vulnerable_distance <- vulnerable_data%>%
  mutate(
    distance_to_nearest= as.numeric(apply(st_distance(st_centroid(.),hospitals),
    1, min)
  ))

vulnerable_distance

median_distance= median(vulnerable_distance$distance_to_nearest , na.rm = TRUE)
median_distance

max_distance = max(vulnerable_distance$distance_to_nearest , na.rm = TRUE)
max_distance

The datasets’ projected coordinate reference system were transformed to 3365, the ideal projection for Pennsylvania and for measuring distance in units and not distorting degrees of longitude and latitude. The average distance to the nearest hospital for vulnerable tracts was approximately 6395 feet. The maximum distance was about 98,596 feet. Only 1 tract, out all the vulnerable tracts, was more than 15 miles from the nearest hospital.

Step 5: Identify Underserved Areas

# Create underserved variable
underserved_tracts <- vulnerable_distance%>%
  filter(distance_to_nearest > 79200)

underserved_tracts

print((nrow(underserved_tracts)/nrow(vulnerable_data))*100)

Only one singular tract is considered underserved by the condition of being more than 15 miles or 79,200 feet from the nearest hospital. This means that 2% of tracts identified as vulnerable are underserved. Considering that Pennsylvania is overwhelmingly rural, I am surprised that many vulnerable tracts have a hospital within 15 miles.

Step 6: Aggregate to County Level

# Spatial join tracts to counties
vulnerable_distance <- st_transform(vulnerable_distance, st_crs(pa_counties))

tracts_per_county <- vulnerable_distance %>% 
  st_join(pa_counties) %>%
  st_drop_geometry() %>%
  group_by(COUNTY_NAM)


# Aggregate statistics by county
county_vulnerability <- tracts_per_county %>%
  summarise(
    num_of_tracts = n(),
    num_of_underserved = sum(distance_to_nearest > 79200, na.rm = TRUE),
    pct_underserved =  num_of_underserved/num_of_tracts *100,
    avg_distance = mean(distance_to_nearest, na.rm = TRUE),
    total_vulnerable = sum(total_popE, na.rm = TRUE)
  )%>%
  arrange(desc(avg_distance))

formated_county_vulnerability <- county_vulnerability%>%
  mutate(
    total_vulnerable = comma(total_vulnerable),
    avg_distance = comma(avg_distance),
    pct_underserved = percent(pct_underserved/100)
  )

formated_county_vulnerability%>%  
  select(COUNTY_NAM, num_of_underserved, num_of_tracts, pct_underserved, avg_distance, total_vulnerable)%>%
    kable(
      col.names = c(
"County Name", "Number of Underserved Tracts",
"Total Number of Vulnerable Tracts", "Percentage Underserved", "Average Distance", "Total Vulnerable Population"),
     digits=c(0,1,1,1,1))

County Name	Number of Underserved Tracts	Total Number of Vulnerable Tracts	Percentage Underserved	Average Distance	Total Vulnerable Population
CAMERON	1	1	100%	98,596	1,988
NORTHUMBERLAND	0	1	0%	50,187	2,350
BEAVER	0	1	0%	21,733	2,093
LEHIGH	0	1	0%	17,985	3,974
WESTMORELAND	0	3	0%	16,963	3,927
LUZERNE	0	3	0%	13,026	7,931
ALLEGHENY	0	15	0%	11,849	27,204
DELAWARE	0	1	0%	11,338	2,373
MIFFLIN	0	1	0%	9,779	2,418
YORK	0	1	0%	8,109	1,659
MERCER	0	2	0%	5,722	4,992
CRAWFORD	0	1	0%	4,899	2,842
WAYNE	0	1	0%	4,419	4,469
FAYETTE	0	1	0%	3,726	3,205
PHILADELPHIA	0	11	0%	3,538	35,060
CAMBRIA	0	2	0%	3,223	3,241
LAWRENCE	0	1	0%	2,743	3,893
LACKAWANNA	0	1	0%	2,651	2,901
BLAIR	0	1	0%	2,192	1,436
ERIE	0	2	0%	1,238	3,070

Only 1 county, Cameron county, has 100% underserved vulnerable tracts. Allegheny county has the most vulnerable people, 27,204, living far away, about 11,849 feet, from hospitals. There are no patterns in underserved counties but there is a pattern in vulnerability as most vulnerable counties have vulnerable populations less than 5,000, indicating counties with smaller overall population.

Step 7: Create Summary Table

# Create and format priority counties table
top_10_counties <- formated_county_vulnerability %>%
  slice_head(n=10)

# Format as table with kable() - include appropriate column names and caption 
top_10_counties %>%
  select(COUNTY_NAM, pct_underserved, avg_distance, total_vulnerable)%>%
    kable(
      col.names = c(
"County Name", 
"Percentage of Tracts Lacking Adequate Access to Hospitals (over 15 miles to nearest)",
"Average Distance to Nearest Hospital (ft)",
"Total Population of Vulnerable Tracts (tracts with 20% over the age of 65 and a median income less than 3600)"),
      caption = "Top 10 Pennslyvania Counties to Prioritize Healthcare Investments Based on Vulnerability and Access",
     digits=c(0,1,1,1)
)

Top 10 Pennslyvania Counties to Prioritize Healthcare Investments Based on Vulnerability and Access
County Name	Percentage of Tracts Lacking Adequate Access to Hospitals (over 15 miles to nearest)	Average Distance to Nearest Hospital (ft)	Total Population of Vulnerable Tracts (tracts with 20% over the age of 65 and a median income less than 3600)
CAMERON	100%	98,596	1,988
NORTHUMBERLAND	0%	50,187	2,350
BEAVER	0%	21,733	2,093
LEHIGH	0%	17,985	3,974
WESTMORELAND	0%	16,963	3,927
LUZERNE	0%	13,026	7,931
ALLEGHENY	0%	11,849	27,204
DELAWARE	0%	11,338	2,373
MIFFLIN	0%	9,779	2,418
YORK	0%	8,109	1,659

Due to limited data availability for underserved counties, rankings were based on the ten counties with the highest average distance.

Part 2: Comprehensive Visualization

# Create county-level access map
hospitals <- st_transform(hospitals, st_crs(pa_counties))
counties_with_demographics <- pa_counties %>%
  left_join(county_vulnerability, by = "COUNTY_NAM")

ggplot(counties_with_demographics) +
  geom_sf(aes(fill = pct_underserved), color = "black", size = 0.5) +
  geom_sf(data = hospitals, color = "red", size = 2) +
  scale_fill_viridis_c(
    na.value = "white",
    name = "Percentage Underserved",
    labels = scales::label_number(suffix = "%"),
    option = "plasma"
  ) +
  labs(
    title = "Pennsylvania Counties with Vulnerable Tracts\nand their Percentage of Underserved Vulnerable Tracts",
    subtitle = "Red = Hospitals",
    caption = "Source: ACS 2022"
  ) +
  theme_void()+
  theme(
  plot.title = element_text(face = "bold", hjust =0))

Map 2: Detailed Vulnerability Map

hospitals <- st_transform(hospitals, st_crs(pa_counties))
hospitals_with_counties <- hospitals %>%
  st_join(pa_counties %>% select(COUNTY_NAM))

census_tracts <- st_transform(census_tracts, st_crs(pa_counties))

target_county <- pa_counties %>%
  filter(FIPS_COUNT == underserved_tracts$COUNTYFP)

my_neighbors <- pa_counties %>%
  st_filter(target_county, .predicate = st_touches)


clipped_neighbor_tracts <- st_intersection(census_tracts, my_neighbors)
clipped_target_tracts <- st_intersection(census_tracts, target_county)

neighbor_hospitals <- hospitals_with_counties %>%
  filter(COUNTY_NAM %in% my_neighbors$COUNTY_NAM)

ggplot()+
  geom_sf(data = clipped_neighbor_tracts, color="black")+
  geom_sf(data = clipped_target_tracts, color="black")+
  geom_sf(data = target_county, fill="lightgray", color="black", alpha=0.7)+
  geom_sf(data = my_neighbors, fill="white", color="black", alpha=0.7)+
  geom_sf(data = underserved_tracts, fill= "blue")+
  geom_sf(data = neighbor_hospitals, color="red", size=2)+
  labs(
    title = "Surrounding Pennsylvania Counties and Tracts of the Underserved Tract",
    subtitle = "Red = Hospitals, Blue = Underserved Tract",
    caption = "Source: ACS 2022")+
  theme_void()+
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5))

Chart: Distribution Analysis

# Create distribution visualization

ggplot(county_vulnerability, aes(x = COUNTY_NAM, y = avg_distance)) +
  geom_col(fill = "blue", color = "black") +
  labs(title = "Bar Plot Average Distance to Hospitals Across Vulnerable Tracts By County",
       x = "County Name", 
       y = "Average Distance (feet)") +
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
  panel.grid.major = element_blank())

This bar chart shows the average distance to hospitals across vulnerable tracts by Pennsylvania counties. Cameron County stands out dramatically with the highest value, suggesting that residents there may face significant challenges accessing hospital care. Most other counties have relatively low averages, indicating better proximity to healthcare facilities.

Per previous submission feedback, I removed instructional text from the final version of my assignment. Additionally, I omitted tables and outputs that were generated for my personal reference.