# Find project root and include ensure_packages.jl
project_root = let
current = pwd()
while !isfile(joinpath(current, "Project.toml")) && !isfile(joinpath(current, "_quarto.yml"))
parent = dirname(current)
parent == current && break
current = parent
end
current
end
include(joinpath(project_root, "scripts", "ensure_packages.jl"))
@auto_using Random Distributions GLM DataFrames CairoMakie
Random.seed!(42)
# Simulate data with missing quadratic term (model misspecification)
n = 200
X = rand(Uniform(-2, 2), n)
Y = 1.0 .+ 0.5 .* X .+ 0.3 .* X.^2 .+ rand(Normal(0, 0.2), n) # True model has X²
df = DataFrame(X = X, Y = Y)
# Fit misspecified model (missing X² term)
model = lm(@formula(Y ~ X), df)
Y_pred = predict(model, df)
residuals = Y .- Y_pred
# Check residuals
residual_mean = mean(residuals)
residual_std = std(residuals)
# Check autocorrelation (for time series, here we check against X)
# In time series, check lag-1 autocorrelation
# Here, we check correlation with X to detect missing terms
residual_X_corr = cor(residuals, X)
# Visualise
let
fig = Figure(size = (1200, 400))
ax1 = Axis(fig[1, 1], title = "Residuals vs X", xlabel = "X", ylabel = "Residuals")
ax2 = Axis(fig[1, 2], title = "Residuals vs Predicted", xlabel = "Predicted Y", ylabel = "Residuals")
ax3 = Axis(fig[1, 3], title = "Residual Distribution", xlabel = "Residuals", ylabel = "Density")
scatter!(ax1, X, residuals, color = :blue, alpha = 0.5)
hlines!(ax1, [0], color = :red, linestyle = :dash, linewidth = 2)
scatter!(ax2, Y_pred, residuals, color = :blue, alpha = 0.5)
hlines!(ax2, [0], color = :red, linestyle = :dash, linewidth = 2)
hist!(ax3, residuals, bins=20, normalization=:pdf, color=(:blue, 0.5))
# Overlay normal distribution
x_norm = range(minimum(residuals), maximum(residuals), length=100)
y_norm = pdf.(Normal(residual_mean, residual_std), x_norm)
lines!(ax3, x_norm, y_norm, color=:red, linewidth=2, label="Normal")
axislegend(ax3)
fig # Only this gets displayed
end
println("Residual analysis:")
println(" Mean: ", round(residual_mean, digits=4), " (should be ≈ 0)")
println(" Std: ", round(residual_std, digits=4))
println(" Correlation with X: ", round(residual_X_corr, digits=4), " (should be ≈ 0)")
println("\nInterpretation:")
if abs(residual_X_corr) > 0.1
println(" ⚠️ Residuals correlated with X → missing term (e.g., X²)")
else
println(" ✓ Residuals appear independent of X")
end