# =========================================================
# Spatial Prediction Comparison Plot
# =========================================================
if (all(c("x_coord", "y_coord") %in% names(df))) {
cat("\n", rep("=", 80), "\n", sep = "")
cat("=== Generating Spatial Prediction Comparison Plot ===\n\n")
# Prepare data
spatial_pred_data <- df %>%
filter(!is.na(x_coord) & !is.na(y_coord)) %>%
mutate(
predicted = fitted(models[["M4: +Interact+FE"]]),
residual = predicted - .data[[y_var]]
) %>%
filter(!is.na(predicted))
# If the dataset is too large, randomly sample to speed up plotting
if (nrow(spatial_pred_data) > 10000) {
set.seed(2025)
spatial_pred_data <- spatial_pred_data %>%
slice_sample(n = 10000)
cat(" Large dataset detected; randomly sampled 10,000 points for plotting\n")
}
# 1. Spatial Distribution of Actual Sale Prices
p_actual <- ggplot(spatial_pred_data, aes(x_coord, y_coord, color = .data[[y_var]])) +
geom_point(alpha = 0.6, size = 1) +
scale_color_gradient2(
low = "#053061", mid = "#F7F7F7", high = "#67001F",
midpoint = median(spatial_pred_data[[y_var]], na.rm = TRUE),
name = "Log Price"
) +
theme_minimal(base_size = 11) +
theme(
panel.grid.minor = element_blank(),
plot.title = element_text(hjust = 0.5, face = "bold", size = 13),
legend.position = "right",
plot.background = element_rect(fill = "white", color = NA),
panel.background = element_rect(fill = "white", color = NA)
) +
labs(
title = "Actual Sale Price",
x = "X Coordinate",
y = "Y Coordinate"
)
# 2. Spatial Distribution of Predicted Sale Prices
p_predicted <- ggplot(spatial_pred_data, aes(x_coord, y_coord, color = predicted)) +
geom_point(alpha = 0.6, size = 1) +
scale_color_gradient2(
low = "#053061", mid = "#F7F7F7", high = "#67001F",
midpoint = median(spatial_pred_data$predicted, na.rm = TRUE),
name = "Log Price"
) +
theme_minimal(base_size = 11) +
theme(
panel.grid.minor = element_blank(),
plot.title = element_text(hjust = 0.5, face = "bold", size = 13),
legend.position = "right",
plot.background = element_rect(fill = "white", color = NA),
panel.background = element_rect(fill = "white", color = NA)
) +
labs(
title = "Predicted Sale Price (Model 4)",
x = "X Coordinate",
y = "Y Coordinate"
)
# 3. Spatial Distribution of Residuals
p_residual_spatial <- ggplot(spatial_pred_data, aes(x_coord, y_coord, color = residual)) +
geom_point(alpha = 0.6, size = 1) +
scale_color_gradient2(
low = "#053061", mid = "#F7F7F7", high = "#67001F",
midpoint = 0,
name = "Residual"
) +
theme_minimal(base_size = 11) +
theme(
panel.grid.minor = element_blank(),
plot.title = element_text(hjust = 0.5, face = "bold", size = 13),
legend.position = "right",
plot.background = element_rect(fill = "white", color = NA),
panel.background = element_rect(fill = "white", color = NA)
) +
labs(
title = "Prediction Residuals",
subtitle = "Blue = Underpredicted | Red = Overpredicted",
x = "X Coordinate",
y = "Y Coordinate"
)
# Combine the 3 plots
p_spatial_comparison <- (p_actual | p_predicted | p_residual_spatial) +
plot_annotation(
title = "Spatial Distribution: Actual vs. Predicted Sale Prices",
subtitle = sprintf("Model 4 predictions (R² = %.4f, RMSE = %.4f)",
perf_table$R2[4], perf_table$RMSE[4]),
theme = theme(
plot.title = element_text(size = 15, face = "bold", hjust = 0.5),
plot.subtitle = element_text(size = 12, hjust = 0.5),
plot.background = element_rect(fill = "white", color = NA)
)
)
ggsave("plot/spatial_prediction_comparison.png", p_spatial_comparison,
width = 18, height = 6, dpi = 300, bg = "white")
cat(" plot/spatial_prediction_comparison.png\n")
# 4. High-Resolution Hexbin Heatmap Version
p_actual_hex <- ggplot(spatial_pred_data, aes(x_coord, y_coord, z = .data[[y_var]])) +
stat_summary_hex(bins = 40, fun = mean) +
scale_fill_gradient2(
low = "#053061", mid = "#F7F7F7", high = "#67001F",
midpoint = median(spatial_pred_data[[y_var]], na.rm = TRUE),
name = "Avg Log Price"
) +
theme_minimal(base_size = 11) +
theme(
panel.grid.minor = element_blank(),
plot.title = element_text(hjust = 0.5, face = "bold", size = 13),
legend.position = "right",
plot.background = element_rect(fill = "white", color = NA)
) +
labs(title = "Actual (Hexbin Average)", x = "X Coordinate", y = "Y Coordinate")
p_predicted_hex <- ggplot(spatial_pred_data, aes(x_coord, y_coord, z = predicted)) +
stat_summary_hex(bins = 40, fun = mean) +
scale_fill_gradient2(
low = "#053061", mid = "#F7F7F7", high = "#67001F",
midpoint = median(spatial_pred_data$predicted, na.rm = TRUE),
name = "Avg Log Price"
) +
theme_minimal(base_size = 11) +
theme(
panel.grid.minor = element_blank(),
plot.title = element_text(hjust = 0.5, face = "bold", size = 13),
legend.position = "right",
plot.background = element_rect(fill = "white", color = NA)
) +
labs(title = "Predicted (Hexbin Average)", x = "X Coordinate", y = "Y Coordinate")
p_residual_hex <- ggplot(spatial_pred_data, aes(x_coord, y_coord, z = residual)) +
stat_summary_hex(bins = 40, fun = mean) +
scale_fill_gradient2(
low = "#053061", mid = "#F7F7F7", high = "#67001F",
midpoint = 0,
name = "Avg Residual"
) +
theme_minimal(base_size = 11) +
theme(
panel.grid.minor = element_blank(),
plot.title = element_text(hjust = 0.5, face = "bold", size = 13),
legend.position = "right",
plot.background = element_rect(fill = "white", color = NA)
) +
labs(title = "Residuals (Hexbin Average)", x = "X Coordinate", y = "Y Coordinate")
p_spatial_hexbin <- (p_actual_hex | p_predicted_hex | p_residual_hex) +
plot_annotation(
title = "Spatial Distribution (Hexbin Aggregation): Actual vs. Predicted",
subtitle = "Hexagonal bins show average values in each area",
theme = theme(
plot.title = element_text(size = 15, face = "bold", hjust = 0.5),
plot.subtitle = element_text(size = 12, hjust = 0.5),
plot.background = element_rect(fill = "white", color = NA)
)
)
ggsave("plot/spatial_prediction_hexbin.png", p_spatial_hexbin,
width = 18, height = 6, dpi = 300, bg = "white")
cat(" ✓ plot/spatial_prediction_hexbin.png\n")
# 5. Spatial Clustering of Residuals
# Calculate spatial statistics of residuals
residual_stats <- spatial_pred_data %>%
summarise(
mean_residual = mean(residual, na.rm = TRUE),
sd_residual = sd(residual, na.rm = TRUE),
min_residual = min(residual, na.rm = TRUE),
max_residual = max(residual, na.rm = TRUE)
)
cat("\n Spatial Residual Statistics:\n")
cat(sprintf(" Mean: %.4f\n", residual_stats$mean_residual))
cat(sprintf(" Standard Deviation: %.4f\n", residual_stats$sd_residual))
cat(sprintf(" Range: [%.4f, %.4f]\n",
residual_stats$min_residual, residual_stats$max_residual))
# Identify high-residual areas
high_residual_areas <- spatial_pred_data %>%
filter(abs(residual) > 2 * residual_stats$sd_residual) %>%
mutate(residual_type = ifelse(residual > 0, "Overpredicted", "Underpredicted"))
if (nrow(high_residual_areas) > 0) {
cat(sprintf("\n ️ Detected %d high-residual points (|residual| > 2σ):\n",
nrow(high_residual_areas)))
cat(sprintf(" Overpredicted: %d\n", sum(high_residual_areas$residual_type == "Overpredicted")))
cat(sprintf(" Underpredicted: %d\n", sum(high_residual_areas$residual_type == "Underpredicted")))
# Save high-residual point data
write_csv(high_residual_areas %>%
dplyr::select(x_coord, y_coord, !!sym(y_var), predicted, residual, residual_type),
"file/high_residual_locations.csv")
cat(" file/high_residual_locations.csv\n")
}
}