Examples

Real-world examples demonstrating FormulaCompiler.jl in action.

Monte Carlo Simulation

High-performance Monte Carlo simulation using zero-allocation evaluation:

using FormulaCompiler, GLM, DataFrames, Tables, Random
using BenchmarkTools, Statistics, CategoricalArrays, LinearAlgebra

function monte_carlo_simulation(n_sims=1_000_000, n_obs=1000)
    # Generate base dataset
    Random.seed!(123)
    df = DataFrame(
        y = randn(n_obs),
        x = randn(n_obs),
        z = abs.(randn(n_obs)) .+ 0.1,
        treatment = rand(Bool, n_obs),
        group = categorical(rand(["A", "B", "C"], n_obs))
    )
    
    # Fit model
    model = lm(@formula(y ~ x * treatment + log(z) + group), df)
    data = Tables.columntable(df)
    
    # Compile for zero-allocation evaluation
    compiled = compile_formula(model, data)
    row_vec = Vector{Float64}(undef, length(compiled))
    
    # Pre-allocate results
    results = Vector{Float64}(undef, n_sims)
    
    println("Running $n_sims Monte Carlo simulations...")
    @time begin
        for sim in 1:n_sims
            # Random row selection
            row_idx = rand(1:n_obs)
            
            # Zero-allocation evaluation
            compiled(row_vec, data, row_idx)
            
            # Calculate linear predictor
            results[sim] = dot(coef(model), row_vec)
        end
    end
    
    return results
end

# Run simulation
mc_results = monte_carlo_simulation(1_000_000, 1000)

println("Monte Carlo Results:")
println("Mean: $(round(mean(mc_results), digits=4))")
println("Std:  $(round(std(mc_results), digits=4))")
println("Min:  $(round(minimum(mc_results), digits=4))")  
println("Max:  $(round(maximum(mc_results), digits=4))")

Bootstrap Confidence Intervals

Efficient bootstrap resampling for coefficient confidence intervals:

using Random, StatsBase  # StatsBase provides percentile

function bootstrap_confidence_intervals(model, data, n_bootstrap=1000, confidence=0.95)
    compiled = compile_formula(model, data)
    n_obs = Tables.rowcount(data)
    n_coefs = length(compiled)
    
    # Get response variable from the fitted model
    y = response(model)
    
    # Pre-allocate
    bootstrap_coefs = Matrix{Float64}(undef, n_bootstrap, n_coefs)
    row_vec = Vector{Float64}(undef, n_coefs)
    X_bootstrap = Matrix{Float64}(undef, n_obs, n_coefs)
    y_bootstrap = Vector{Float64}(undef, n_obs)
    
    println("Computing $n_bootstrap bootstrap samples...")
    @time begin
        for b in 1:n_bootstrap
            # Generate bootstrap sample indices
            sample_indices = rand(1:n_obs, n_obs)
            
            # Build design matrix for bootstrap sample (each row evaluation allocates nothing)
            for (i, idx) in enumerate(sample_indices)
                compiled(row_vec, data, idx)
                X_bootstrap[i, :] .= row_vec
                y_bootstrap[i] = y[idx]
            end
            
            # Compute bootstrap coefficients
            bootstrap_coefs[b, :] = X_bootstrap \ y_bootstrap
        end
    end
    
    # Calculate confidence intervals
    α = 1 - confidence
    lower_percentile = 100 * (α/2)
    upper_percentile = 100 * (1 - α/2)
    
    coef_names = coefnames(model)
    original_coefs = coef(model)
    
    println("\n$(Int(confidence*100))% Bootstrap Confidence Intervals:")
    println("─"^60)
    
    for (i, name) in enumerate(coef_names)
        boot_coefs_i = bootstrap_coefs[:, i]
        lower = percentile(boot_coefs_i, lower_percentile)
        upper = percentile(boot_coefs_i, upper_percentile)
        
        println("$name:")
        println("  Original: $(round(original_coefs[i], digits=4))")
        println("  Bootstrap: [$(round(lower, digits=4)), $(round(upper, digits=4))]")
        println("  Bootstrap SE: $(round(std(boot_coefs_i), digits=4))")
    end
    
    return bootstrap_coefs
end

# Example usage
df = DataFrame(
    y = randn(500),
    x = randn(500),
    treatment = rand(Bool, 500),
    age = rand(20:80, 500)
)

model = lm(@formula(y ~ x * treatment + age), df)
data = Tables.columntable(df)

boot_coefs = bootstrap_confidence_intervals(model, data, 1000, 0.95)
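
As a sanity check, the bootstrap intervals can be compared with the analytic Wald intervals that GLM reports for the same fit; large discrepancies usually indicate too few bootstrap samples or model misspecification:

# Analytic 95% confidence intervals for comparison with the bootstrap output
confint(model)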

Policy Impact Analysis

Comprehensive policy scenario analysis using the scenario system:

function policy_impact_analysis()
    # Simulate policy-relevant dataset
    Random.seed!(456)
    n_individuals = 10000
    
    df = DataFrame(
        # Outcome: earnings (thousands)
        earnings = max.(0, randn(n_individuals) * 15 .+ 45),
        
        # Demographics
        age = rand(22:65, n_individuals),
        education_years = rand(10:18, n_individuals),
        experience = rand(0:30, n_individuals),
        
        # Geographic
        region = categorical(rand(["North", "South", "East", "West"], n_individuals)),
        urban = rand(Bool, n_individuals),
        
        # Current programs
        job_training = rand(Bool, n_individuals),
        healthcare_access = rand(Bool, n_individuals)
    )
    
    # Fit earnings model
    model = lm(@formula(earnings ~ age + education_years + experience + 
                                 region + urban + job_training * healthcare_access), df)
    
    data = Tables.columntable(df)
    compiled = compile_formula(model, data)
    
    # Define policy scenarios
    scenarios = Dict(
        "status_quo" => create_scenario("status_quo", data),
        
        "universal_training" => create_scenario("universal_training", data;
            job_training = true
        ),
        
        "universal_healthcare" => create_scenario("universal_healthcare", data;
            healthcare_access = true  
        ),
        
        "combined_programs" => create_scenario("combined", data;
            job_training = true,
            healthcare_access = true
        ),
        
        "education_boost" => create_scenario("education_boost", data;
            education_years = mean(df.education_years) + 2  # +2 years education
        ),
        
        "comprehensive_policy" => create_scenario("comprehensive", data;
            job_training = true,
            healthcare_access = true, 
            education_years = mean(df.education_years) + 1,
            urban = true  # Urbanization investment
        )
    )
    
    # Evaluate policy impacts
    results = Dict{String, NamedTuple}()
    row_vec = Vector{Float64}(undef, length(compiled))
    
    println("Policy Impact Analysis Results:")
    println("="^50)
    
    for (policy_name, scenario) in scenarios
        predictions = Vector{Float64}(undef, n_individuals)
        
        # Calculate predictions for all individuals under this policy
        for i in 1:n_individuals
            compiled(row_vec, scenario.data, i)
            predictions[i] = dot(coef(model), row_vec)
        end
        
        # Calculate impacts vs status quo
        if policy_name != "status_quo"
            status_quo_preds = Vector{Float64}(undef, n_individuals)
            for i in 1:n_individuals
                compiled(row_vec, scenarios["status_quo"].data, i)
                status_quo_preds[i] = dot(coef(model), row_vec)
            end
            
            individual_impacts = predictions .- status_quo_preds
            
            results[policy_name] = (
                mean_earnings = mean(predictions),
                mean_impact = mean(individual_impacts),
                median_impact = median(individual_impacts),
                impact_std = std(individual_impacts),
                percent_helped = 100 * mean(individual_impacts .> 0),
                total_cost = sum(max.(0, individual_impacts)) * 1000  # Total positive impact in dollars (impacts are in thousands)
            )
        else
            results[policy_name] = (
                mean_earnings = mean(predictions),
                mean_impact = 0.0,
                median_impact = 0.0,
                impact_std = 0.0,
                percent_helped = 0.0,
                total_cost = 0.0
            )
        end
    end
    
    # Display results
    
    for (policy, stats) in results
        if policy == "status_quo"
            println("$policy (baseline):")
            println("  Mean earnings: \$$(round(Int, stats.mean_earnings))k")
        else
            println("\n$policy:")
            println("  Mean earnings: \$$(round(Int, stats.mean_earnings))k")
            println("  Mean impact: \$$(round(Int, stats.mean_impact))k per person")
            println("  Median impact: \$$(round(Int, stats.median_impact))k per person") 
            println("  % individuals helped: $(round(stats.percent_helped, digits=1))%")
            println("  Total economic impact: \$$(round(Int, stats.total_cost/1000))M")
            
            # Cost-effectiveness (simplified)
            if policy == "universal_training"
                cost_per_person = 5000  # $5k per person for training
            elseif policy == "universal_healthcare"
                cost_per_person = 8000  # $8k per person for healthcare
            elseif policy == "combined_programs"  
                cost_per_person = 12000  # $12k for both
            elseif policy == "education_boost"
                cost_per_person = 15000  # $15k for education
            else
                cost_per_person = 20000  # $20k comprehensive
            end
            
            total_program_cost = cost_per_person * n_individuals  # In dollars
            net_benefit = stats.total_cost - total_program_cost
            roi = (net_benefit / total_program_cost) * 100
            
            println("  Program cost: \$$(round(Int, total_program_cost / 1_000_000))M")
            println("  Net benefit: \$$(round(Int, net_benefit / 1_000_000))M")
            println("  ROI: $(round(roi, digits=1))%")
        end
    end
    
    return results, scenarios
end

# Run the analysis
policy_results, policy_scenarios = policy_impact_analysis()
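
Because policy_impact_analysis returns a Dict mapping each policy name to a NamedTuple of summary statistics, downstream comparisons are one-liners. A usage sketch ranking the non-baseline policies by mean per-person impact:

ranked = sort(
    [p for p in pairs(policy_results) if p.first != "status_quo"];
    by = p -> p.second.mean_impact,
    rev = true
)
for (name, stats) in ranked
    println(rpad(name, 22), " mean impact: \$", round(stats.mean_impact, digits=2), "k per person")
end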

Marginal Effects Calculation

Efficient numerical derivatives for marginal effects:

function marginal_effects_analysis(model, data, variables=nothing; delta=0.01)
    compiled = compile_formula(model, data)
    n_obs = Tables.rowcount(data)
    n_coefs = length(compiled)
    
    # Default to all continuous variables
    if variables === nothing
        variables = [:age, :experience, :education_years]  # Adjust as needed
    end
    
    results = Dict{Symbol, Matrix{Float64}}()
    row_vec_orig = Vector{Float64}(undef, n_coefs)
    row_vec_pert = Vector{Float64}(undef, n_coefs)
    
    for var in variables
        println("Computing marginal effects for $var...")
        
        # Get original values
        original_values = data[var]
        perturbed_values = original_values .+ delta
        perturbed_data = (; data..., var => perturbed_values)
        
        marginal_effects = Matrix{Float64}(undef, n_obs, n_coefs)
        
        @time begin
            for i in 1:n_obs
                # Original prediction
                compiled(row_vec_orig, data, i)
                
                # Perturbed prediction  
                compiled(row_vec_pert, perturbed_data, i)
                
                # Finite-difference derivative of each model matrix column
                marginal_effects[i, :] .= (row_vec_pert .- row_vec_orig) ./ delta
            end
        end
        
        results[var] = marginal_effects
    end
    
    # Summarize results
    coef_names = coefnames(model)
    println("\nMarginal Effects Summary:")
    println("="^40)
    
    for var in variables
        println("\nVariable: $var")
        me_matrix = results[var]
        
        for (j, coef_name) in enumerate(coef_names)
            me_col = me_matrix[:, j]
            println("  $coef_name:")
            println("    Mean ME: $(round(mean(me_col), digits=6))")
            println("    Std ME:  $(round(std(me_col), digits=6))")
            println("    Range:   [$(round(minimum(me_col), digits=6)), $(round(maximum(me_col), digits=6))]")
        end
    end
    
    return results
end

# Example: marginal effects for the bootstrap example's model (columns x, treatment, age)
marginal_results = marginal_effects_analysis(model, data, [:x, :age])
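
Each entry of marginal_results is an n_obs × n_coefs matrix of model matrix column derivatives. Multiplying by the coefficient vector collapses it to the marginal effect on the linear predictor, and averaging over observations gives an approximate average marginal effect (a sketch using the objects created above):

# Approximate average marginal effect of x on the linear predictor
dX_dx = marginal_results[:x]          # per-observation column derivatives
ame_x = mean(dX_dx * coef(model))     # collapse to linear-predictor effects, then average
println("Average marginal effect of x: ", round(ame_x, digits=4))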

High-Frequency Trading Model

Real-time prediction serving with microsecond latency requirements:

using ShiftedArrays: lag, lead  # lag/lead used for the lagged feature columns

function high_frequency_trading_example()
    # Simulate high-frequency financial data
    Random.seed!(789)
    n_ticks = 100_000
    
    # Generate realistic financial time series
    returns = cumsum(randn(n_ticks) * 0.001)  # Random walk returns
    
    df = DataFrame(
        # Price features
        return_1min = returns,
        return_5min = lag(returns, 5),
        return_15min = lag(returns, 15),
        
        # Volume features  
        volume = abs.(randn(n_ticks)) .+ 1,
        volume_ratio = rand(0.5:0.01:2.0, n_ticks),
        
        # Market microstructure
        spread = abs.(randn(n_ticks)) * 0.01 .+ 0.001,
        market_impact = rand(0.001:0.0001:0.01, n_ticks),
        
        # Time features
        hour = repeat(9:16, inner=div(n_ticks, 8))[1:n_ticks],
        minute = repeat(0:59, outer=cld(n_ticks, 60))[1:n_ticks],
        
        # Target: next minute return
        next_return = lead(returns, 1)
    )
    
    # Drop rows with missings introduced by the lags/leads and tighten column eltypes
    df = disallowmissing!(df[16:(end-1), :])
    
    # Fit high-frequency prediction model
    model = lm(@formula(next_return ~ return_1min + return_5min + return_15min + 
                                    log(volume) + volume_ratio + spread + 
                                    market_impact + hour), df)
    
    data = Tables.columntable(df)
    compiled = compile_formula(model, data)
    
    # Simulate real-time prediction serving
    println("High-Frequency Trading Model Performance:")
    println("Model coefficients: ", length(compiled))
    
    # Pre-allocate for zero-allocation serving
    row_vec = Vector{Float64}(undef, length(compiled))
    n_predictions = 10_000
    
    # Benchmark prediction latency and record which tick each prediction used
    prediction_times = Vector{Float64}(undef, n_predictions)
    predictions = Vector{Float64}(undef, n_predictions)
    tick_indices = Vector{Int}(undef, n_predictions)
    
    println("Serving $n_predictions real-time predictions...")
    
    for i in 1:n_predictions
        tick_idx = rand(1:nrow(df))
        tick_indices[i] = tick_idx
        
        start_time = time_ns()
        compiled(row_vec, data, tick_idx)
        prediction = dot(coef(model), row_vec)
        end_time = time_ns()
        
        prediction_times[i] = (end_time - start_time) / 1000  # Convert to microseconds
        predictions[i] = prediction
    end
    
    # Latency analysis
    println("\nLatency Analysis:")
    println("Mean latency: $(round(mean(prediction_times), digits=2)) μs")
    println("Median latency: $(round(median(prediction_times), digits=2)) μs") 
    println("95th percentile: $(round(quantile(prediction_times, 0.95), digits=2)) μs")
    println("99th percentile: $(round(quantile(prediction_times, 0.99), digits=2)) μs")
    println("Max latency: $(round(maximum(prediction_times), digits=2)) μs")
    
    # Trading performance metrics: compare each prediction with the realized return at the same tick
    actual_returns = data.next_return[tick_indices]
    
    # Simple trading strategy: long if predicted return > 0, short otherwise
    positions = sign.(predictions)
    strategy_returns = positions .* actual_returns
    
    println("\nTrading Strategy Performance:")
    println("Total predictions: $n_predictions")
    println("Accuracy (direction): $(round(100 * mean(sign.(predictions) .== sign.(actual_returns)), digits=1))%")
    println("Mean strategy return: $(round(mean(strategy_returns) * 10000, digits=2)) bps")
    println("Strategy Sharpe ratio: $(round(mean(strategy_returns) / std(strategy_returns), digits=3))")
    println("Max drawdown: $(round(minimum(cumsum(strategy_returns)) * 100, digits=2))%")
    
    return (latencies = prediction_times, predictions = predictions, returns = strategy_returns)
end

# Run high-frequency trading example
hft_results = high_frequency_trading_example()

Medical Research: Clinical Trial Simulation

Simulating clinical trial outcomes with patient heterogeneity:

function clinical_trial_simulation()
    # Simulate diverse patient population
    Random.seed!(101112)
    n_patients = 5000
    
    df = DataFrame(
        # Patient demographics
        age = rand(18:85, n_patients),
        sex = categorical(rand(["Male", "Female"], n_patients)),
        bmi = max.(15, randn(n_patients) * 5 .+ 25),
        
        # Baseline health
        baseline_severity = rand(1:10, n_patients),
        comorbidities = rand(0:5, n_patients),
        
        # Treatment assignment (randomized)
        treatment = categorical(rand(["Placebo", "Low_Dose", "High_Dose"], n_patients)),
        
        # Compliance (realistic patterns)
        compliance_rate = min.(1.0, max.(0.0, randn(n_patients) * 0.2 .+ 0.8))
    )
    
    # Outcome: improvement score (0-100), added after construction so it can reference the other columns
    df.improvement = max.(0, min.(100,
        randn(n_patients) * 15 .+
        (df.treatment .== "High_Dose") * 20 .+
        (df.treatment .== "Low_Dose") * 10 .-
        df.baseline_severity * 2 .+
        df.compliance_rate * 15
    ))
    
    # Clinical model
    model = lm(@formula(improvement ~ age + sex + bmi + baseline_severity + 
                                    comorbidities + treatment * compliance_rate), df)
    
    data = Tables.columntable(df)
    compiled = compile_formula(model, data)
    
    # Define clinical scenarios
    scenarios = Dict(
        "real_world" => create_scenario("real_world", data),
        
        "perfect_compliance" => create_scenario("perfect_compliance", data;
            compliance_rate = 1.0
        ),
        
        "elderly_subgroup" => create_scenario("elderly", data;
            age = 70,  # All patients age 70
            compliance_rate = 0.9
        ),
        
        "high_risk_patients" => create_scenario("high_risk", data;
            baseline_severity = 8,
            comorbidities = 3,
            bmi = 30
        ),
        
        "optimal_candidates" => create_scenario("optimal", data;
            age = 45,
            baseline_severity = 5,
            comorbidities = 1,
            bmi = 23,
            compliance_rate = 1.0
        )
    )
    
    # Analyze treatment effects across scenarios
    println("Clinical Trial Scenario Analysis:")
    println("="^50)
    
    row_vec = Vector{Float64}(undef, length(compiled))
    scenario_results = Dict{String, Dict{String, Float64}}()
    
    for (scenario_name, scenario) in scenarios
        println("\nScenario: $scenario_name")
        
        # Calculate outcomes by treatment group
        treatment_groups = ["Placebo", "Low_Dose", "High_Dose"]
        group_results = Dict{String, Float64}()
        
        for treatment_group in treatment_groups
            # Create scenario with specific treatment
            treatment_scenario = create_scenario("temp", scenario.data;
                treatment = treatment_group
            )
            
            # Predict outcomes for all patients under this treatment
            outcomes = Vector{Float64}(undef, n_patients)
            for i in 1:n_patients
                compiled(row_vec, treatment_scenario.data, i)
                outcomes[i] = dot(coef(model), row_vec)
            end
            
            group_results[treatment_group] = mean(outcomes)
        end
        scenario_results[scenario_name] = group_results
        
        # Calculate treatment effects
        placebo_effect = group_results["Placebo"]
        low_dose_effect = group_results["Low_Dose"] - placebo_effect
        high_dose_effect = group_results["High_Dose"] - placebo_effect
        dose_response = high_dose_effect - low_dose_effect
        
        println("  Placebo response: $(round(placebo_effect, digits=1))")
        println("  Low dose effect: $(round(low_dose_effect, digits=1)) (vs placebo)")
        println("  High dose effect: $(round(high_dose_effect, digits=1)) (vs placebo)")
        println("  Dose response: $(round(dose_response, digits=1))")
        
        # Calculate number needed to treat (simplified)
        if high_dose_effect > 0
            nnt = round(Int, 100 / high_dose_effect)  # Assume 100-point scale
            println("  Number needed to treat: $nnt")
        end
    end
    
    return scenarios, scenario_results
end

# Run clinical trial analysis
clinical_scenarios, clinical_results = clinical_trial_simulation()
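
The returned per-scenario treatment means support direct contrasts after the run. For example, the gap in predicted high-dose response between the optimal-candidate and real-world scenarios (a usage sketch based on the dictionary returned above):

# Compare predicted high-dose response across scenarios
optimal_high = clinical_results["optimal_candidates"]["High_Dose"]
real_world_high = clinical_results["real_world"]["High_Dose"]
println("High-dose response gap, optimal vs real-world: ", round(optimal_high - real_world_high, digits=1), " points")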

These examples demonstrate FormulaCompiler.jl's versatility across:

  • High-performance computing: Monte Carlo simulations with millions of evaluations
  • Statistical inference: Bootstrap confidence intervals with zero-allocation resampling
  • Policy analysis: Complex scenario modeling for decision support
  • Real-time systems: Microsecond-latency prediction serving
  • Medical research: Clinical trial simulation and subgroup analysis

Each example leverages FormulaCompiler.jl's core strengths: zero-allocation performance, flexible scenario system, and seamless integration with the Julia statistical ecosystem.