acs_vars <- c(
"commute_score",
"pct_hispanic",
"pct_asian",
"pct_carless",
"emp_density",
"pop_density",
"pct_wfh"
)
spatial_vars <- c(
"dist_university_mi",
"dist_hospital_mi",
"violent_incidents_2014_2023"
)
model_data <- tract_features %>%
dplyr::select(GEOID, fe_bucket, ridership_2023_fall, all_of(acs_vars), all_of(spatial_vars)) %>%
drop_na(ridership_2023_fall) %>%
distinct(GEOID, .keep_all = TRUE)
folds <- vfold_cv(model_data, v = 5)
build_rhs <- function(features) {
if (length(features) > 0) paste(features, collapse = " + ") else "1"
}
calc_train_r2 <- function(features, fe = FALSE, type = "OLS") {
rhs <- build_rhs(features)
formula_str <- paste("ridership_2023_fall ~", rhs)
preds <- switch(type,
"OLS" = {
if (!fe) {
fit <- lm(as.formula(formula_str), data = model_data)
} else {
fit <- feols(as.formula(paste(formula_str, "| fe_bucket")), data = model_data)
}
predict(fit, newdata = model_data)
},
"Poisson" = {
if (!fe) {
fit <- glm(as.formula(formula_str), data = model_data, family = poisson())
predict(fit, newdata = model_data, type = "response")
} else {
fit <- fepois(as.formula(paste(formula_str, "| fe_bucket")), data = model_data)
predict(fit, newdata = model_data)
}
}
)
rsq_vec(model_data$ridership_2023_fall, preds)
}
cv_mae <- function(folds, features, fe = FALSE, type = "OLS") {
map_dbl(folds$splits, function(split) {
train <- analysis(split)
test <- assessment(split)
rhs <- build_rhs(features)
formula_str <- paste("ridership_2023_fall ~", rhs)
preds <- switch(type,
"OLS" = {
if (!fe) {
fit <- lm(as.formula(formula_str), data = train)
predict(fit, newdata = test)
} else {
fit <- feols(as.formula(paste(formula_str, "| fe_bucket")), data = train)
predict(fit, newdata = test)
}
},
"Poisson" = {
if (!fe) {
fit <- glm(as.formula(formula_str), data = train, family = poisson())
predict(fit, newdata = test, type = "response")
} else {
fit <- fepois(as.formula(paste(formula_str, "| fe_bucket")), data = train)
predict(fit, newdata = test)
}
}
)
mae_vec(test$ridership_2023_fall, preds)
}) %>%
mean()
}
specs <- tribble(
~spec_name, ~features, ~fe,
"ACS only", acs_vars, FALSE,
"ACS + Spatial", c(acs_vars, spatial_vars), FALSE,
"ACS + Spatial + FE", c(acs_vars, spatial_vars), TRUE
)
model_specs_full <- expand_grid(
model_type = c("OLS", "Poisson"),
specs
) %>%
mutate(model_label = paste(model_type, spec_name, sep = " - "))
model_performance <- model_specs_full %>%
mutate(
mae = pmap_dbl(list(features, fe, model_type), ~ cv_mae(folds, ..1, ..2, ..3)),
feature_count = map_int(features, length),
train_r2 = pmap_dbl(list(features, fe, model_type), ~ calc_train_r2(..1, ..2, ..3))
) %>%
dplyr::select(model = model_label, mae, train_r2, feature_count, model_type, spec_name)
knitr::kable(
model_performance %>%
mutate(
mae = scales::comma(mae, accuracy = 1),
train_r2 = scales::number(train_r2, accuracy = 0.01)
),
col.names = c("Model Spec", "CV MAE (riders)", "Train R²", "Feature Count", "Model Type", "Feature Set"),
format.args = list(big.mark = ","),
digits = 1
)