# GLM.jl Integration

FormulaCompiler.jl seamlessly integrates with GLM.jl to provide zero-allocation model matrix evaluation for both linear and generalized linear models.

## Supported Models

FormulaCompiler.jl works with all GLM.jl model types:

- Linear models (`lm`)
- Generalized linear models (`glm`)
- All distribution families and link functions
- Custom contrasts and standardized predictors

## Basic Usage

### Linear Models
```julia
using GLM, FormulaCompiler, DataFrames, Tables

# Create sample data
df = DataFrame(
    y = randn(1000),
    x1 = randn(1000),
    x2 = randn(1000),
    group = rand(["A", "B", "C"], 1000)
)

# Fit linear model
model = lm(@formula(y ~ x1 + x2 * group), df)

# Compile for zero-allocation evaluation
data = Tables.columntable(df)
compiled = compile_formula(model, data)
row_vec = Vector{Float64}(undef, length(compiled))

# Zero-allocation evaluation
compiled(row_vec, data, 1)  # ~50ns, 0 allocations
```
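The compiled evaluator is reusable across rows, so a whole model matrix can be built with one buffer. A minimal sketch using only the `compiled(row_vec, data, i)` call shown above (the matrix `X` is introduced here for illustration):

```julia
# Fill a preallocated matrix row by row with a single reusable buffer
n = Tables.rowcount(data)
X = Matrix{Float64}(undef, n, length(compiled))
for i in 1:n
    compiled(row_vec, data, i)  # zero-allocation per-row evaluation
    X[i, :] .= row_vec          # copy the evaluated row into the matrix
end
```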
### Generalized Linear Models

```julia
using CategoricalArrays
# Binary outcome data
df_binary = DataFrame(
    success = rand(Bool, 1000),
    x = randn(1000),
    treatment = categorical(rand(["control", "treatment"], 1000)),
    age = rand(20:80, 1000)
)

# Logistic regression
logit_model = glm(
    @formula(success ~ x + treatment + age),
    df_binary,
    Binomial(),
    LogitLink()
)

# Compile and use (convert to a column table once and reuse it)
data_binary = Tables.columntable(df_binary)
compiled_logit = compile_formula(logit_model, data_binary)
row_vec = Vector{Float64}(undef, length(compiled_logit))
compiled_logit(row_vec, data_binary, 1)
```
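To get a prediction on the response scale from a compiled row, combine it with the fitted coefficients and the inverse link. A minimal sketch, assuming GLM.jl's exported `linkinv` and the objects defined above:

```julia
using LinearAlgebra  # for dot

compiled_logit(row_vec, data_binary, 1)
η = dot(coef(logit_model), row_vec)  # linear predictor for row 1
p = linkinv(LogitLink(), η)          # predicted probability for row 1
```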
## Distribution Families

FormulaCompiler.jl works with all GLM.jl distribution families:
### Gaussian (Normal) - Identity Link

```julia
normal_model = glm(@formula(y ~ x1 + x2), df, Normal(), IdentityLink())
compiled_normal = compile_formula(normal_model, data)
```

### Binomial - Logit, Probit, and Cloglog Links

```julia
# Logistic regression
logit_model = glm(@formula(success ~ x + age), df_binary, Binomial(), LogitLink())
# Probit regression
probit_model = glm(@formula(success ~ x + age), df_binary, Binomial(), ProbitLink())
# Complementary log-log
cloglog_model = glm(@formula(success ~ x + age), df_binary, Binomial(), CloglogLink())
```

### Poisson - Log Link

```julia
# Count data
using Distributions  # Poisson(2) below is used to simulate counts

df_count = DataFrame(
    count = rand(Poisson(2), 1000),
    x = randn(1000),
    exposure = rand(0.5:0.1:2.0, 1000)
)

poisson_model = glm(@formula(count ~ x + log(exposure)), df_count, Poisson(), LogLink())
compiled_poisson = compile_formula(poisson_model, Tables.columntable(df_count))
```

### Gamma - Inverse Link

```julia
# Positive continuous data
df_gamma = DataFrame(
    response = rand(Gamma(2, 3), 1000),
    x = randn(1000),
    factor = rand(["low", "high"], 1000)
)

gamma_model = glm(@formula(response ~ x + factor), df_gamma, Gamma(), InverseLink())
compiled_gamma = compile_formula(gamma_model, Tables.columntable(df_gamma))
```

## Advanced GLM Features
### Custom Contrasts

```julia
using StatsModels
# Define custom contrasts
contrasts_dict = Dict(
    :treatment => DummyCoding(base="control"),
    :group => EffectsCoding(),
    :region => HelmertCoding()
)

# Fit model with custom contrasts
df_contrasts = DataFrame(
    y = randn(1000),
    x = randn(1000),
    treatment = categorical(rand(["control", "low", "high"], 1000)),
    group = categorical(rand(["A", "B", "C", "D"], 1000)),
    region = categorical(rand(["North", "South", "East", "West"], 1000))
)

model_contrasts = lm(
    @formula(y ~ x + treatment + group + region),
    df_contrasts,
    contrasts = contrasts_dict
)

compiled_contrasts = compile_formula(model_contrasts, Tables.columntable(df_contrasts))
```

### Weights and Offsets

```julia
# Weighted regression
weights = rand(0.5:0.1:2.0, 1000)
weighted_model = lm(@formula(y ~ x1 + x2), df, wts=weights)

# GLM with offset: GLM.jl takes the offset as a keyword argument,
# not as an offset() term inside the formula
df_offset = DataFrame(
    y = rand(Poisson(2), 1000),
    x = randn(1000),
    offset_var = log.(rand(0.5:0.1:2.0, 1000))
)

offset_model = glm(
    @formula(y ~ x),
    df_offset,
    Poisson(),
    LogLink();
    offset = df_offset.offset_var
)

# FormulaCompiler handles both; the offset enters the fit but is
# not a column of the compiled design matrix
compiled_weighted = compile_formula(weighted_model, Tables.columntable(df))
compiled_offset = compile_formula(offset_model, Tables.columntable(df_offset))
```

## Performance Comparisons
### Benchmark Against `modelmatrix()`

```julia
using BenchmarkTools
# Setup
df = DataFrame(
    y = randn(1000),
    x1 = randn(1000),
    x2 = randn(1000),
    group = categorical(rand(["A", "B", "C"], 1000))
)
model = lm(@formula(y ~ x1 * x2 + group + log(abs(x1) + 1)), df)
data = Tables.columntable(df)

# Traditional approach
function traditional_single_row(model, row_idx)
    mm = modelmatrix(model)
    return mm[row_idx, :]
end

# FormulaCompiler approach
compiled = compile_formula(model, data)
row_vec = Vector{Float64}(undef, length(compiled))

function fc_single_row(compiled, data, row_vec, row_idx)
    compiled(row_vec, data, row_idx)
    return row_vec
end

# Benchmark comparison (display() forces the results to print in a script)
println("Traditional approach:")
display(@benchmark traditional_single_row($model, 1))
println("\nFormulaCompiler approach:")
display(@benchmark fc_single_row($compiled, $data, $row_vec, 1))

# Expected results:
# Traditional: ~10μs, 1 allocation
# FormulaCompiler: ~50ns, 0 allocations (~200x speedup)
```

### Large Model Performance

```julia
# Create a large, complex model
function create_large_model(n_obs=10000)
    df = DataFrame(
        y = randn(n_obs),
        x1 = randn(n_obs),
        x2 = randn(n_obs),
        x3 = randn(n_obs),
        x4 = randn(n_obs),
        group1 = categorical(rand(1:10, n_obs)),
        group2 = categorical(rand(1:5, n_obs)),
        group3 = categorical(rand(1:3, n_obs))
    )

    # Complex formula with interactions
    formula = @formula(y ~ (x1 + x2 + x3 + x4) * (group1 + group2) +
                           log(abs(x1) + 1) * group3 +
                           sqrt(abs(x2) + 1) +
                           x3^2 +
                           (x4 > 0))

    model = lm(formula, df)
    return model, df
end

# Test performance on large model
large_model, large_df = create_large_model(10000)
large_data = Tables.columntable(large_df)
large_compiled = compile_formula(large_model, large_data)

println("Large model performance:")
println("Model matrix size: ", size(modelmatrix(large_model)))
println("Compilation time:")
@time compile_formula(large_model, large_data)

println("Single row evaluation:")
row_vec = Vector{Float64}(undef, length(large_compiled))
@benchmark $large_compiled($row_vec, $large_data, 1)
```

## Real-world Applications
### Marginal Effects Calculation

This example uses the compiled evaluator to compute finite-difference derivatives of each model matrix column with respect to one predictor:

```julia
function calculate_marginal_effects(model, data, variable_col, delta=0.01)
    compiled = compile_formula(model, data)
    n_rows = Tables.rowcount(data)

    # Get original variable values
    original_values = data[variable_col]

    # Create a scenario with perturbed values
    perturbed_values = original_values .+ delta
    perturbed_data = (; data..., variable_col => perturbed_values)

    row_vec_original = Vector{Float64}(undef, length(compiled))
    row_vec_perturbed = Vector{Float64}(undef, length(compiled))
    marginal_effects = Matrix{Float64}(undef, n_rows, length(compiled))

    for i in 1:n_rows
        # Model matrix row at original values
        compiled(row_vec_original, data, i)
        # Model matrix row at perturbed values
        compiled(row_vec_perturbed, perturbed_data, i)
        # Finite-difference derivative of each design matrix column
        marginal_effects[i, :] .= (row_vec_perturbed .- row_vec_original) ./ delta
    end

    return marginal_effects
end

# Example usage
marginal_fx = calculate_marginal_effects(model, data, :x1)
```
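Note that `calculate_marginal_effects` differentiates the design matrix columns, not the model predictions. Multiplying by the fitted coefficients gives the per-row marginal effect on the linear predictor, as in this short follow-up sketch:

```julia
# Per-row marginal effect of :x1 on the linear predictor η = Xβ
effects_on_eta = marginal_fx * coef(model)
```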
### Bootstrap Confidence Intervals

```julia
function bootstrap_glm_coefficients(model, data, n_bootstrap=1000)
    compiled = compile_formula(model, data)
    n_obs = Tables.rowcount(data)
    n_coefs = length(compiled)

    # Get the original response variable from the fitted model
    y = response(model)

    bootstrap_coefs = Matrix{Float64}(undef, n_bootstrap, n_coefs)
    row_vec = Vector{Float64}(undef, n_coefs)

    for boot in 1:n_bootstrap
        # Bootstrap sample
        sample_idx = rand(1:n_obs, n_obs)

        # Create design matrix for bootstrap sample
        X_boot = Matrix{Float64}(undef, n_obs, n_coefs)
        y_boot = Vector{Float64}(undef, n_obs)
        for (i, idx) in enumerate(sample_idx)
            compiled(row_vec, data, idx)
            X_boot[i, :] .= row_vec
            y_boot[i] = y[idx]
        end

        # Refit by OLS for simplicity; a full GLM bootstrap would call glm() here
        bootstrap_coefs[boot, :] = X_boot \ y_boot
    end

    return bootstrap_coefs
end
```
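A sketch of turning the bootstrap draws into percentile confidence intervals, using the function above and the standard library's `quantile`:

```julia
using Statistics  # for quantile

boot_coefs = bootstrap_glm_coefficients(model, data, 1000)
# 95% percentile interval for each coefficient
cis = [quantile(col, [0.025, 0.975]) for col in eachcol(boot_coefs)]
```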
### Prediction Intervals

```julia
using Distributions   # for TDist and quantile
using LinearAlgebra   # for dot

function prediction_intervals(model, data, confidence_level=0.95)
    compiled = compile_formula(model, data)
    n_obs = Tables.rowcount(data)
    row_vec = Vector{Float64}(undef, length(compiled))

    # Get model coefficients and residual variance
    coefs = coef(model)
    σ² = deviance(model) / dof_residual(model)

    # Critical value
    α = 1 - confidence_level
    t_crit = quantile(TDist(dof_residual(model)), 1 - α/2)

    predictions = Vector{Float64}(undef, n_obs)
    lower_bounds = Vector{Float64}(undef, n_obs)
    upper_bounds = Vector{Float64}(undef, n_obs)

    for i in 1:n_obs
        compiled(row_vec, data, i)

        # Point prediction
        pred = dot(coefs, row_vec)
        predictions[i] = pred

        # Prediction standard error
        # Exact SE = sqrt(σ² * (1 + x'(X'X)⁻¹x)); simplified for demonstration
        se = sqrt(σ² * (1 + sum(row_vec.^2) / n_obs))

        # Confidence bounds
        margin = t_crit * se
        lower_bounds[i] = pred - margin
        upper_bounds[i] = pred + margin
    end

    return (predictions = predictions, lower = lower_bounds, upper = upper_bounds)
end
```
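Example usage, keeping in mind that the simplified standard error above is illustrative rather than exact:

```julia
intervals = prediction_intervals(model, data)
intervals.lower[1], intervals.predictions[1], intervals.upper[1]
```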
## Integration Best Practices

### Model Validation

```julia
function validate_glm_integration(model, data)
    compiled = compile_formula(model, data)

    # Compare first few rows with modelmatrix
    mm = modelmatrix(model)
    row_vec = Vector{Float64}(undef, length(compiled))

    for i in 1:min(10, size(mm, 1))
        compiled(row_vec, data, i)
        original_row = mm[i, :]
        if !isapprox(row_vec, original_row, rtol=1e-12)
            @warn "Mismatch in row $i"
            println("Original: ", original_row)
            println("Compiled: ", row_vec)
            return false
        end
    end

    println("✓ FormulaCompiler matches GLM.jl modelmatrix for all tested rows")
    return true
end

# Validate integration
validate_glm_integration(model, data)
```

### Memory Usage Comparison

```julia
function compare_memory_usage(model, data)
    # Traditional approach (note: modelmatrix may return the matrix cached at fit time)
    traditional_memory = @allocated modelmatrix(model)

    # FormulaCompiler approach
    compilation_memory = @allocated compile_formula(model, data)
    compiled = compile_formula(model, data)
    row_vec = Vector{Float64}(undef, length(compiled))
    evaluation_memory = @allocated compiled(row_vec, data, 1)

    println("Memory Usage Comparison:")
    println("Traditional modelmatrix(): ", traditional_memory, " bytes")
    println("FormulaCompiler compilation: ", compilation_memory, " bytes")
    println("FormulaCompiler evaluation: ", evaluation_memory, " bytes")

    if evaluation_memory == 0
        println("✓ Zero-allocation evaluation achieved")
    else
        @warn "Non-zero allocation detected in evaluation"
    end
end

compare_memory_usage(model, data)
```

## Troubleshooting
### Common Issues

- Type instability: Ensure all variables have consistent types
- Missing values: Handle `missing` values before compilation (see the sketch below)
- Categorical levels: Ensure categorical variables have the same levels in test data (see the sketch below)
- Formula complexity: Very complex formulas may have longer compilation times
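A minimal sketch of the first two fixes, using DataFrames' `dropmissing` and CategoricalArrays' `levels!` (column names are taken from the examples above):

```julia
using DataFrames, CategoricalArrays

# Drop rows with missing values before compiling
df_clean = dropmissing(df)

# Align categorical levels with those seen at fit time
levels!(df_clean.group, ["A", "B", "C"])
```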
### Debugging Tools

```julia
# Check compilation success
function debug_compilation(model, data)
    try
        compiled = compile_formula(model, data)
        println("✓ Compilation successful")
        println("Formula length: ", length(compiled))
        return compiled
    catch e
        @error "Compilation failed" exception = e
        return nothing
    end
end

# Performance diagnostics
function diagnose_performance(model, data)
    compiled = compile_formula(model, data)
    row_vec = Vector{Float64}(undef, length(compiled))

    # Check allocation (warm up first so compilation is not measured)
    compiled(row_vec, data, 1)
    alloc = @allocated compiled(row_vec, data, 1)
    if alloc > 0
        @warn "Non-zero allocation detected: $alloc bytes"
    end

    # Check timing; parenthesize @elapsed so the ns conversion is not timed
    time_ns = (@elapsed compiled(row_vec, data, 1)) * 1e9
    if time_ns > 1000  # > 1μs
        @warn "Evaluation slower than expected: $(round(time_ns))ns"
    end

    println("Performance: $(round(time_ns))ns, $(alloc) bytes")
end
```