Build a logistic regression model for binary classification
Evaluate model performance using multiple metrics
Analyze disparate impact across demographic groups
Test different decision thresholds
Make evidence-based policy recommendations
SETUP
Code
# Load required packages
library(tidyverse) # For data manipulation and visualization
library(caret)     # For model training and confusion matrices
library(pROC)      # For ROC curves and AUC

# Set random seed for reproducibility
set.seed(2025)

# Configure visualization theme (applies to all ggplot output below)
theme_set(theme_minimal())
PART 1: DATA LOADING AND EXPLORATION
Code
## Load Georgia Department of Corrections recidivism data
# Source: National Institute of Justice Recidivism Challenge
recidivism_data <- read_csv(
  "data/NIJ_s_Recidivism_Challenge_Full_Dataset_20240407.csv"
)

# Examine data structure
glimpse(recidivism_data)

# Check outcome variable distribution (include NA counts, if any)
table(recidivism_data$Recidivism_Within_3years, useNA = "ifany")
# Generate predicted probabilities on the held-out test set
# (type = "response" returns probabilities rather than log-odds)
test_data <- test_data %>%
  mutate(
    predicted_prob = predict(logit_model, newdata = test_data, type = "response")
  )

# Examine distribution of predicted probabilities
summary(test_data$predicted_prob)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.07046 0.45871 0.63226 0.60528 0.76880 0.96374
Code
# Visualize predicted probabilities, split by the actual outcome.
# position = "identity" overlays the two histograms instead of stacking,
# so alpha is needed to see the overlap.
ggplot(
  test_data,
  aes(x = predicted_prob, fill = as.factor(recidivism))
) +
  geom_histogram(bins = 50, alpha = 0.7, position = "identity") +
  scale_fill_manual(
    values = c("steelblue", "coral"),
    labels = c("No Recidivism", "Recidivism")
  ) +
  labs(
    title = "Distribution of Predicted Probabilities",
    x = "Predicted Probability of Recidivism",
    y = "Count",
    fill = "Actual Outcome"
  )
PART 6: MODEL EVALUATION - OVERALL PERFORMANCE
6.1: Confusion Matrix at Default Threshold (0.5)
Code
# Classify using the default 0.5 probability threshold
test_data <- test_data %>%
  mutate(predicted_class_50 = ifelse(predicted_prob > 0.5, 1, 0))

# Create confusion matrix; "1" is the positive class (recidivism)
cm_50 <- confusionMatrix(
  as.factor(test_data$predicted_class_50),
  as.factor(test_data$recidivism),
  positive = "1"
)
print(cm_50)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 1294 719
1 1334 3204
Accuracy : 0.6866
95% CI : (0.6752, 0.6978)
No Information Rate : 0.5988
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.3215
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.8167
Specificity : 0.4924
Pos Pred Value : 0.7060
Neg Pred Value : 0.6428
Prevalence : 0.5988
Detection Rate : 0.4891
Detection Prevalence : 0.6927
Balanced Accuracy : 0.6546
'Positive' Class : 1
Code
# Extract key metrics
# (escape sequences restored: the flattened source had lost the backslashes
# in "\n", printing literal "n" characters)
cat("\nKey Metrics at Threshold = 0.5:\n")
Key Metrics at Threshold = 0.5:
Code
# Report sensitivity/recall; trailing "\n" restored (backslash was lost in extraction)
cat("Sensitivity (Recall):", round(cm_50$byClass["Sensitivity"], 3),
    "- Proportion of actual recidivists correctly identified\n")
Sensitivity (Recall): 0.817 - Proportion of actual recidivists correctly identified
Code
# Report specificity; trailing "\n" restored (backslash was lost in extraction)
cat("Specificity:", round(cm_50$byClass["Specificity"], 3),
    "- Proportion of non-recidivists correctly identified\n")
Specificity: 0.492 - Proportion of non-recidivists correctly identified
Code
# Report precision (positive predictive value); trailing "\n" restored
cat("Precision (PPV):", round(cm_50$byClass["Precision"], 3),
    "- Proportion of predicted recidivists who actually recidivated\n")
Precision (PPV): 0.706 - Proportion of predicted recidivists who actually recidivated