# Load required packages
library(tidyverse)
library(tidycensus)
library(janitor)
library(sf)
library(tigris)
library(scales)
library(patchwork)
library(RColorBrewer)
library(units)
library(knitr)
library(caret)
library(tidygeocoder)
library(cowplot)Data Axle - Businesses in Sharswood
URBS 4000: Urban Studies Thesis
Project Title: Mixed by Design
Part 1: Raw Data from Data Axle
Data Axle Reference Solutions (formerly ReferenceUSA and Infogroup) is a big data, analytics, and marketing services provider that delivers best in class data-driven, customer-centric technology solutions. It offers two main types of data:
- Residential Historic Data: Analyze community growth and general population differences.
- Historic Business Data: Analyze market trends, economic growth, or specific industries. Access categories including Company Name, Geocodes, SIC/NAICS codes, Census Tracts.
Steps to Download Data Axle U.S. Businesses Dataset as CSV Files
I retrieved my raw data files from Data Axle through Penn Libraries and Wharton Research and Data Services (WRDS) on October 15, 2025.
After you log in with your PennKey at this website (http://www.referenceusa.com.proxy.library.upenn.edu/Home/Home), click on U.S. Businesses or U.S. Historical Businesses and you will see the Data Axle interface shown below.
- Filter by Neighborhood (Pennsylvania > PHL > Sharswood) or Zip Code (19122) and check “Omit: ATM and Kiosk”
- Click on “View Results” in the top right-hand corner of the screen
- On the results page, select each page (25 records per page) and click on “Download”
- On the downloads page, select the CSV file format as well as the custom fields shown below and click on “Download Records” at the bottom of the screen
Record Types
Verified Records: have been verified by DataAxle staff through various compilation processes including phone validation
Unverified Records: separate database from verified records because of one of the following reasons:
- records appear within data sources but have yet to be fully verified
- partial information is contained in the record, but not all data elements are present to be considered ‘verified’ at the point of data retrieval
- addresses are unverified (e.g. a business may have moved)
Step 1: Load Raw Data as csv files
- DataAxle’s U.S. Businesses Data Dictionary
- Verified Businesses in Sharswood Neighborhood
- Unverified Businesses in Sharswood Neighborhood
- Philadelphia census tracts and neighborhood boundaries
# Load Data Axle Data
# data dictionary
datadict <- read_csv("data/Data_Dict.csv")
datadict
# verified businesses
verified <- read_csv("data/Sharswood_Verified_Biz.csv")
# merge unverified businesses due to download limits from DataAxle
unverified1 <- read_csv("data/Sharswood_Unverified_Biz_Pg1to10.csv")
unverified2 <- read_csv("data/Sharswood_Unverified_Biz_Pg11to17.csv")
unverified_combined <- bind_rows(unverified1, unverified2)
write_csv(unverified_combined, "data/Sharswood_Unverified_Biz_Combined.csv")# rename column headings
verified <- verified |> clean_names()
unverified_combined <- unverified_combined |> clean_names()
# filter out small businesses with irrelevant SIC Codes
drop_SIC <- c("Atm-Automated Teller Machines", "Schools", "Churches",
"Nonclassified Establishments", "Real Estate Management",
"Townhouses", "Mailing & Shipping Kiosks",
"Offices Of Real Estate Agents And Brokers", "Real Estate")
verified_clean <- verified |>
filter(!primary_sic_description %in% drop_SIC) |>
select(company_name, address, latitude, longitude, census_block_group,
primary_sic_code, primary_sic_description, primary_naics,
primary_naics_description, years_in_database,
location_type, location_employee_size_range,
square_footage, record_type)
unverified_clean <- unverified_combined |>
filter(!primary_sic_description %in% drop_SIC) |>
select(company_name, address, latitude, longitude, census_block_group,
primary_sic_code, primary_sic_description, primary_naics,
primary_naics_description, years_in_database,
location_type, location_employee_size_range,
square_footage, record_type)# Load spatial data
census_tracts <- tracts(state = "PA", county = "Philadelphia", year = 2020,
class = "sf", cb = TRUE, progress = FALSE)
block_grps <- block_groups(state = "PA", county = "Philadelphia", year = 2020,
class = "sf", cb = TRUE, progress = FALSE)Step 2: Get Philadelphia Demographic Data using tidycensus
# Load all available variables for ACS 5-year 2022
acs_vars_2022 <- load_variables(2022, "acs5", cache = TRUE)# Get tract-level demographic data from 2022 ACS 5-Yr Estimates for Philadelphia
phl_tract_data <- get_acs(
geography = "tract",
variables = c(
total_pop = "B01003_001",
median_income = "B19013_001",
poverty = "B17001_001",
White = "B03002_003",
Black = "B03002_004",
Hispanic = "B03002_012"
),
state = "PA",
county = "Philadelphia",
year = 2022,
survey = "acs5",
output = "wide",
geometry = TRUE
)
# Clean the county names to remove state name and "County"
phl_tract_clean <- phl_tract_data |>
separate(
NAME,
into = c("tract_name", "county_name", "state_name"),
sep = "; "
) |>
mutate(
tract_name = str_remove(tract_name, "Census Tract "),
county_name = str_remove(county_name, " County")
)
phl_tract_summary <- phl_tract_clean |>
select(GEOID, tract_name, county_name, total_popE, median_incomeE)Step 3: Make Data Axle dataset Spatial
# Convert Data Axle dataset to sf object
verified.sf <- verified_clean |>
st_as_sf(coords = c("longitude", "latitude"), crs = 4326) |> # WGS 84
st_transform(2272) # PA State Plane (in US Survey Feet)
unverified.sf <- unverified_clean |>
st_as_sf(coords = c("longitude", "latitude"), crs = 4326) |>
st_transform(2272)Join Data Axle Shapefiles to Philadelphia Census Tracts and Block Groups
[Sharswood Property Parcels]
- Data Source: https://opendataphilly.org/datasets/department-of-records-property-parcels/
- Date Downloaded: Oct 19, 2025
- Department: Department of Records
- File Type: GeoJSON
- Description: Department of Records (DOR) boundaries of real estate property parcels derived from legal recorded deed documents
# Filter Property Parcels for Sharswood
prop_parcels <- st_read("data/DOR_Parcel.geojson") |>
st_transform(st_crs(sharswood_bg_sf))
sharswood_parcels <- st_filter(prop_parcels, sharswood_bg_sf, .predicate = st_intersects)
st_write(
sharswood_parcels, "data/sharswood_parcels.geojson",
driver = "GeoJSON",
delete_dsn = TRUE,
layer_options = c("RFC7946=YES","COORDINATE_PRECISION=6","WRITE_BBOX=NO")
)sharswood_parcels <- st_read("data/sharswood_parcels.geojson")# Filter Census Tracts for Sharswood
sharswood_tract_sf <- phl_tract_summary |>
filter(tract_name == c("138", "139"))
# Filter Block Groups for Sharswood
ids <- c("421010138001","421010138002","421010139001",
"421010139002","421010139003")
sharswood_bg_sf <- block_grps |>
filter(GEOID %in% ids)# Join Shapefiles and Plot Map
sharswood_bg_sf <- st_transform(sharswood_bg_sf, st_crs(verified.sf))
ggplot() +
geom_sf(data = sharswood_parcels, fill = "gray97",
color = "gray50", linewidth = 0.3) +
geom_sf(data = verified.sf, aes(fill = primary_sic_description),
shape = 21, size = 2, alpha = 0.9) +
labs(
title = "Current Businesses in Sharswood",
subtitle = "Verified by Data Axle",
fill = "SIC Description"
) +
theme_void()
ggplot() +
geom_sf(data = sharswood_parcels, fill = "gray97",
color = "gray50", linewidth = 0.3) +
geom_sf(data = unverified.sf, shape = 21, size = 2,
fill = "black", alpha = 0.9) +
labs(
title = "Unverified Businesses in Sharswood — Data Axle",
subtitle = "PA South State Plane (ftUS)",
caption = paste0("CRS: ",
st_crs(block_grps)$input %||% st_crs(block_grps)$epsg)
) +
theme_void()Step 4: Map Businesses to Property Parcels and Affordable Housing Projects
[Affordable Housing Projects]
- Data Source: https://opendataphilly.org/datasets/affordable-housing-production/
- Date Downloaded: Oct 19, 2025
- Department: Philadelphia Division of Housing and Community Development (DHCD)
- File Type: GeoJSON
- Description: dataset includes all DHCD-funded housing projects completed since 1994 for which data is available
affordable_housing_parcels <- st_read("data/Affordable_housing.geojson") |>
st_transform(st_crs(sharswood_bg_sf))
# Spatial Filter for Affordable Housing Units in Sharswood
sharswood_affordable <- st_filter(affordable_housing_parcels, sharswood_bg_sf,
.predicate = st_within)
ggplot() +
geom_sf(data = sharswood_bg_sf, fill = "gray97",
color = "gray50", linewidth = 0.3) +
geom_sf(data = sharswood_affordable, aes(size = total_units),
fill = NA, color = "firebrick", linewidth = 0.6) +
geom_sf_text(data = sharswood_affordable, aes(label = project_name),
size = 2, check_overlap = TRUE) +
labs(title = "Affordable Housing Projects in Sharswood (1994–2024)") +
theme_void()Putting it All Together
ggplot() +
geom_sf(data = sharswood_parcels, fill = NA, color = "gray50", linewidth = 0.6) +
geom_sf(data = sharswood_affordable, fill = NA,
color = "firebrick", linewidth = 1) +
geom_sf_text(data = sharswood_affordable, aes(label = project_name),
size = 3, check_overlap = TRUE) +
geom_sf(data = verified.sf, fill = "blue", shape = 21, size = 1.5, alpha = 0.9) +
geom_sf(data = unverified.sf, fill = "darkgreen",
shape = 21, size = 1.5, alpha = 0.9) +
labs(title = "Businesses and Affordable Housing Projects in Sharswood") +
theme_void()Step 5: Merge Verified and Unverified Datasets
sharswood_curr_biz <- bind_rows(verified_clean, unverified_clean)
count(sharswood_curr_biz, primary_sic_description)
count(sharswood_curr_biz, primary_naics_description)# Make Combined Dataset Spatial
sharswood_curr_biz.sf <- sharswood_curr_biz |>
st_as_sf(coords = c("longitude", "latitude"), crs = 4326) |> # WGS 84
st_transform(2272) # PA State Plane (in US Survey Feet)ggplot() +
geom_sf(data = sharswood_parcels, fill = NA, color = "gray50", linewidth = 0.6) +
geom_sf(data = sharswood_curr_biz.sf, fill = "darkgreen",
shape = 21, size = 1.5, alpha = 0.9) +
labs(title = "Location of Current Businesses in Sharswood",
caption = "Data Source: Data Axle U.S. Businesses") +
theme_void()Part 2: Repeat Steps for 2012, 2017, and 2022
# Further Cleaning
drop_SIC_2 <- c("Apartments", "Housing Authorities",
"Office Buildings & Parks",
"Schools-Universities & Colleges Academic",
"Gardens", "Dialysis",
"City Government-Housing Programs")2012 Data Axle Business Records
# Load in 2012 Data
dataaxle2012 <- read_csv("data/2012_combined.csv") |>
clean_names()
dataaxle2012_clean <- dataaxle2012 |>
filter(!primary_sic_description %in% drop_SIC) |>
mutate(address = str_c(address, "Philadelphia", "PA 19121", sep = ", "))
# Geocode
dataaxle2012_geo <- dataaxle2012_clean |>
geocode(address, method = 'osm', lat = latitude, long = longitude)write_csv(dataaxle2012_geo, "data/Sharswood_Biz_2012_geo.csv")# Convert Data Axle dataset to sf object
dataaxle2012.sf <- read_csv("data/Sharswood_Biz_2012_geo.csv") |>
filter(!primary_sic_description %in% drop_SIC_2) |>
drop_na(longitude, latitude) |>
st_as_sf(coords = c("longitude", "latitude"), crs = 4326, remove = FALSE) |>
st_transform(2272)
dataaxle2012.sf <- dataaxle2012.sf |>
st_transform(st_crs(sharswood_bg_sf)) |>
st_crop(sharswood_bg_sf) |>
st_filter(sharswood_bg_sf, .predicate = st_within)count(dataaxle2012.sf, primary_sic_description)# Plot with Formatted Legend
p_2012 <- ggplot() +
geom_sf(data = sharswood_parcels, fill = "gray97",
color = "gray60", linewidth = 0.3) +
geom_sf(data = dataaxle2012.sf,
aes(fill = primary_sic_description),
shape = 21, size = 2, alpha = 0.9, color = "gray30") +
labs(
title = "2012 Businesses in Sharswood",
subtitle = "Verified by Data Axle",
fill = "SIC Description"
) +
theme_void() +
theme(
legend.position = "bottom",
theme(legend.title = element_blank()),
legend.text = element_text(size = 8),
legend.key.size = unit(0.5, "lines"),
# legend.box.margin = margin(t = 12), # space above legend
plot.margin = margin(12, 16, 50, 16), # extra bottom margin for legend
plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
plot.subtitle = element_text(size = 12, hjust = 0.5)
) +
guides(fill = guide_legend(title = NULL, ncol = 4, byrow = TRUE,
override.aes = list(size = 3)))
# Make the plotting area “regular size”
p_2012 + plot_layout(heights = c(1)) # (device size controls final size)ggsave("sharswood_2012.png", p_2012, width = 10, height = 7, dpi = 300)ggplot() +
geom_sf(data = sharswood_parcels, fill = NA, color = "gray50", linewidth = 0.6) +
geom_sf(data = dataaxle2012.sf, fill = "darkgreen",
shape = 21, size = 1, alpha = 0.9) +
labs(title = "2012 Verified Businesses in Sharswood",
caption = "Data Source: Data Axle U.S. Historical Businesses") +
theme_void()2017 Data Axle Business Records
# Load in 2017 Data
dataaxle2017 <- read_csv("data/2017_combined.csv") |>
clean_names()
dataaxle2017_clean <- dataaxle2017 |>
filter(!primary_sic_description %in% drop_SIC) |>
mutate(address = str_c(address, "Philadelphia", "PA 19121", sep = ", "))
# Geocode
dataaxle2017_geo <- dataaxle2017_clean |>
geocode(address, method = 'osm', lat = latitude, long = longitude)write_csv(dataaxle2017_geo, "data/Sharswood_Biz_2017_geo.csv")# Convert Data Axle dataset to sf object
dataaxle2017.sf <- read_csv("data/Sharswood_Biz_2017_geo.csv") |>
filter(!primary_sic_description %in% drop_SIC_2) |>
drop_na(longitude, latitude) |>
st_as_sf(coords = c("longitude", "latitude"), crs = 4326, remove = FALSE) |>
st_transform(2272)
# Spatial Filter
dataaxle2017.sf <- dataaxle2017.sf |>
st_transform(st_crs(sharswood_bg_sf)) |>
st_crop(sharswood_bg_sf) |>
st_filter(sharswood_bg_sf, .predicate = st_within)count(dataaxle2017.sf, primary_sic_description)# Map
ggplot() +
geom_sf(data = sharswood_parcels, fill = NA, color = "gray50", linewidth = 0.6) +
geom_sf(data = dataaxle2017.sf, fill = "darkgreen",
shape = 21, size = 1, alpha = 0.9) +
labs(title = "2017 Verified Businesses in Sharswood",
caption = "Data Source: Data Axle U.S. Historical Businesses") +
theme_void()# Plot with Formatted Legend
p_2017 <- ggplot() +
geom_sf(data = sharswood_parcels, fill = "gray97",
color = "gray60", linewidth = 0.3) +
geom_sf(data = dataaxle2017.sf,
aes(fill = primary_sic_description),
shape = 21, size = 2, alpha = 0.9, color = "gray30") +
labs(
title = "2017 Businesses in Sharswood",
subtitle = "Verified by Data Axle",
fill = "SIC Description"
) +
theme_void() +
theme(
legend.position = "bottom",
theme(legend.title = element_blank()),
legend.text = element_text(size = 8),
legend.key.size = unit(0.5, "lines"),
legend.box.margin = margin(t = 2), # space above legend
plot.margin = margin(12, 16, 50, 16), # extra bottom margin for legend
plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
plot.subtitle = element_text(size = 12, hjust = 0.5)
) +
guides(fill = guide_legend(title = NULL, ncol = 4, byrow = TRUE,
override.aes = list(size = 3)))
# Make the plotting area “regular size”
p_2017 + plot_layout(heights = c(1)) # (device size controls final size)ggsave("sharswood_2017.png", p_2017, width = 10, height = 7, dpi = 300)ggplot() +
geom_sf(data = sharswood_parcels, fill = "gray97",
color = "gray50", linewidth = 0.3) +
geom_sf(data = dataaxle2017.sf, aes(fill = primary_sic_description),
shape = 21, size = 2, alpha = 0.9) +
labs(
title = "2017 Businesses in Sharswood",
subtitle = "Verified by Data Axle",
fill = "SIC Description"
) +
theme_void()2022 Data Axle Business Records
# Load in 2022 Data
dataaxle2022 <- read_csv("data/2022_combined.csv") |>
clean_names()
dataaxle2022_clean <- dataaxle2022 |>
filter(!primary_sic_description %in% drop_SIC) |>
mutate(address = str_c(address, "Philadelphia", "PA 19121", sep = ", "))
# Geocode
dataaxle2022_geo <- dataaxle2022_clean |>
geocode(address, method = 'osm', lat = latitude, long = longitude)write_csv(dataaxle2022_geo, "data/Sharswood_Biz_2022_geo.csv")# Convert Data Axle dataset to sf object
dataaxle2022.sf <- read_csv("data/Sharswood_Biz_2022_geo.csv") |>
filter(!primary_sic_description %in% drop_SIC_2) |>
drop_na(longitude, latitude) |>
st_as_sf(coords = c("longitude", "latitude"), crs = 4326, remove = FALSE) |>
st_transform(2272)
# Spatial Filter
dataaxle2022.sf <- dataaxle2022.sf |>
st_transform(st_crs(sharswood_bg_sf)) |>
st_crop(sharswood_bg_sf) |>
st_filter(sharswood_bg_sf, .predicate = st_within)count(dataaxle2022.sf, primary_sic_description)# Map
ggplot() +
geom_sf(data = sharswood_parcels, fill = NA, color = "gray50", linewidth = 0.6) +
geom_sf(data = dataaxle2022.sf, fill = "darkgreen",
shape = 21, size = 1, alpha = 0.9) +
labs(title = "2022 Verified Businesses in Sharswood",
caption = "Data Source: Data Axle U.S. Historical Businesses") +
theme_void()# Plot with Formatted Legend
p_2022 <- ggplot() +
geom_sf(data = sharswood_parcels, fill = "gray97",
color = "gray60", linewidth = 0.3) +
geom_sf(data = dataaxle2022.sf,
aes(fill = primary_sic_description),
shape = 21, size = 2, alpha = 0.9, color = "gray30") +
labs(
title = "2022 Businesses in Sharswood",
subtitle = "Verified by Data Axle",
fill = "SIC Description"
) +
theme_void() +
theme(
legend.position = "bottom",
theme(legend.title = element_blank()),
legend.text = element_text(size = 8),
legend.key.size = unit(0.5, "lines"),
legend.box.margin = margin(t = 2), # space above legend
plot.margin = margin(12, 16, 50, 16), # extra bottom margin for legend
plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
plot.subtitle = element_text(size = 12, hjust = 0.5)
) +
guides(fill = guide_legend(title = NULL, ncol = 4, byrow = TRUE,
override.aes = list(size = 3)))
# Make the plotting area “regular size”
p_2022 + plot_layout(heights = c(1)) # (device size controls final size)ggsave("sharswood_2022.png", p_2022, width = 10, height = 7, dpi = 300)ggplot() +
geom_sf(data = sharswood_parcels, fill = "gray97",
color = "gray50", linewidth = 0.3) +
geom_sf(data = dataaxle2022.sf, aes(fill = primary_sic_description),
shape = 21, size = 2, alpha = 0.9) +
labs(
title = "2022 Businesses in Sharswood",
subtitle = "Verified by Data Axle",
fill = "SIC Description"
) +
theme_void()Current Businesses in Sharswood
# Plot with Formatted Legend
p_curr <- ggplot() +
geom_sf(data = sharswood_parcels, fill = "gray97",
color = "gray60", linewidth = 0.3) +
geom_sf(data = verified.sf,
aes(fill = primary_sic_description),
shape = 21, size = 2, alpha = 0.9, color = "gray30") +
labs(
title = "Current Businesses in Sharswood",
subtitle = "Verified by Data Axle",
fill = "SIC Description"
) +
theme_void() +
theme(
legend.position = "bottom",
theme(legend.title = element_blank()),
legend.text = element_text(size = 8),
legend.key.size = unit(0.5, "lines"),
legend.box.margin = margin(t = 2), # space above legend
plot.margin = margin(12, 16, 50, 16), # extra bottom margin for legend
plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
plot.subtitle = element_text(size = 12, hjust = 0.5)
) +
guides(fill = guide_legend(title = NULL, ncol = 4, byrow = TRUE,
override.aes = list(size = 3)))
# Make the plotting area “regular size”
p_curr + plot_layout(heights = c(1)) # (device size controls final size)ggsave("sharswood_curr.png", p_curr, width = 10, height = 7, dpi = 300)