Scenario Analysis
FormulaCompiler.jl's scenario system enables efficient "what-if" analysis and counterfactual modeling with minimal memory overhead.
Overview
The scenario system allows you to:
- Override specific variables while keeping others unchanged
- Create policy scenarios for analysis
- Generate scenario grids for comprehensive analysis
- Perform efficient counterfactual analysis
- Handle missing data patterns
All with minimal memory allocation using the OverrideVector system.
Scenario Workflow
Basic Scenario Creation
Simple Overrides
using FormulaCompiler, DataFrames, Tables, GLM
# Setup data
df = DataFrame(
y = randn(1000),
x = randn(1000),
treatment = rand(Bool, 1000),
age = rand(18:80, 1000),
income = rand(20000:100000, 1000)
)
data = Tables.columntable(df)
model = lm(@formula(y ~ x * treatment + age + log(income)), df)
compiled = compile_formula(model, data)
# Create scenarios
baseline = create_scenario("baseline", data)
# Treatment effect scenario
treatment_scenario = create_scenario("treatment_on", data;
treatment = true
)
# Policy scenario: everyone gets average income
policy_scenario = create_scenario("income_equality", data;
income = mean(df.income)
)
# Complex scenario with multiple overrides
intervention = create_scenario("intervention", data;
treatment = true,
age = 30, # Set everyone to age 30
income = 50000 # Set everyone to $50k income
)Evaluating Scenarios
row_vec = Vector{Float64}(undef, length(compiled))
# Compare scenarios for individual 100
individual = 100
compiled(row_vec, baseline.data, individual)
baseline_prediction = copy(row_vec)
compiled(row_vec, treatment_scenario.data, individual)
treatment_prediction = copy(row_vec)
compiled(row_vec, intervention.data, individual)
intervention_prediction = copy(row_vec)
# Calculate treatment effects
treatment_effect = treatment_prediction .- baseline_prediction
intervention_effect = intervention_prediction .- baseline_predictionScenario Collections
Creating Scenario Grids
Generate all combinations of scenario parameters:
# Create a 2×3×2 grid of scenarios
policy_grid = create_scenario_grid("policy_analysis", data, Dict(
:treatment => [false, true],
:income => [30000, 50000, 80000],
:region => ["urban", "rural"]
))
println("Created $(length(policy_grid)) scenarios") # 12 scenarios
# Evaluate all scenarios for a specific individual
individual = 1
results = Matrix{Float64}(undef, length(policy_grid), length(compiled))
for (i, scenario) in enumerate(policy_grid)
compiled(view(results, i, :), scenario.data, individual)
end
# Results matrix: each row is one scenario combinationNamed Scenario Collections
# Create multiple related scenarios
scenarios = ScenarioCollection("treatment_analysis")
add_scenario!(scenarios, "control", data; treatment = false)
add_scenario!(scenarios, "low_dose", data; treatment = true, dose = 50)
add_scenario!(scenarios, "high_dose", data; treatment = true, dose = 100)
add_scenario!(scenarios, "placebo", data; treatment = false, placebo = true)
# Evaluate all scenarios
results = evaluate_scenarios(compiled, scenarios, individual)
# Access by name
control_result = results["control"]
treatment_result = results["high_dose"]Advanced Scenario Patterns
Dynamic Scenario Modification
scenario = create_scenario("dynamic", data; x = 1.0)
# Modify scenarios iteratively
for new_treatment_value in [false, true]
set_override!(scenario, :treatment, new_treatment_value)
for new_age in [25, 35, 45, 55, 65]
set_override!(scenario, :age, new_age)
# Evaluate current scenario
compiled(row_vec, scenario.data, individual)
println("Treatment: $new_treatment_value, Age: $new_age, Prediction: $(row_vec[1])")
end
end
# Bulk updates
update_scenario!(scenario;
treatment = true,
age = 40,
income = 60000,
new_policy = true
)
# Remove specific overrides
remove_override!(scenario, :new_policy)Conditional Scenarios
Create scenarios with conditional logic:
function create_conditional_scenario(name, data, condition_func)
scenario = create_scenario(name, data)
# Apply conditions based on original data
if condition_func === :high_earners
# Override high earners (top 25%) to maximum income
high_income_threshold = quantile(df.income, 0.75)
max_income = maximum(df.income)
# This would need custom implementation to be truly conditional
# For now, we can approximate with population statistics
set_override!(scenario, :income, max_income)
end
return scenario
endStochastic Scenarios
Generate random scenario variations:
using Random
function create_random_scenarios(base_data, n_scenarios, seed=123)
Random.seed!(seed)
scenarios = ScenarioCollection("random_scenarios")
for i in 1:n_scenarios
scenario_name = "random_$i"
# Random treatment assignment
treatment = rand(Bool)
# Random age from realistic distribution
age = rand(25:65)
# Random income with some correlation to age
base_income = 30000 + age * 800 + rand(Normal(0, 10000))
income = max(20000, base_income)
add_scenario!(scenarios, scenario_name, base_data;
treatment = treatment,
age = age,
income = income
)
end
return scenarios
end
# Generate 100 random scenarios
random_scenarios = create_random_scenarios(data, 100)
# Evaluate all random scenarios
random_results = evaluate_scenarios(compiled, random_scenarios, individual)Policy Analysis Applications
Treatment Effect Analysis
# Create treatment vs. control scenarios
control = create_scenario("control", data; treatment = false)
treated = create_scenario("treated", data; treatment = true)
# Calculate individual treatment effects
n_individuals = 100
treatment_effects = Vector{Float64}(n_individuals)
for i in 1:n_individuals
compiled(row_vec, control.data, i)
control_pred = row_vec[1] # Assuming outcome is first term
compiled(row_vec, treated.data, i)
treated_pred = row_vec[1]
treatment_effects[i] = treated_pred - control_pred
end
# Analyze distribution of treatment effects
using Statistics
println("Mean treatment effect: $(mean(treatment_effects))")
println("Std treatment effect: $(std(treatment_effects))")Income Redistribution Analysis
# Create scenarios with different income distributions
income_scenarios = Dict(
"current" => create_scenario("current", data),
"universal_basic" => create_scenario("ubi", data; income = 40000),
"progressive" => create_scenario("progressive", data;
income = quantile(df.income, 0.5) # Everyone gets median income
),
"equality" => create_scenario("equality", data;
income = mean(df.income) # Everyone gets mean income
)
)
# Evaluate welfare outcomes under different policies
welfare_results = Dict{String, Vector{Float64}}()
for (policy_name, scenario) in income_scenarios
policy_outcomes = Vector{Float64}(nrow(df))
for i in 1:nrow(df)
compiled(row_vec, scenario.data, i)
policy_outcomes[i] = row_vec[1] # Or some welfare function
end
welfare_results[policy_name] = policy_outcomes
end
# Compare distributions
for (policy, outcomes) in welfare_results
println("Policy: $policy")
println(" Mean outcome: $(mean(outcomes))")
println(" Std outcome: $(std(outcomes))")
println(" Gini coefficient: $(gini_coefficient(outcomes))")
endSensitivity Analysis
# Analyze sensitivity to key parameters
function sensitivity_analysis(compiled, data, base_individual, param_ranges)
base_scenario = create_scenario("base", data)
results = Dict{Symbol, Vector{Tuple{Float64, Float64}}}()
for (param, range) in param_ranges
param_results = Vector{Tuple{Float64, Float64}}()
for value in range
# Create scenario with this parameter value
set_override!(base_scenario, param, value)
# Evaluate
row_vec = Vector{Float64}(undef, length(compiled))
compiled(row_vec, base_scenario.data, base_individual)
push!(param_results, (value, row_vec[1]))
end
results[param] = param_results
remove_override!(base_scenario, param) # Reset for next parameter
end
return results
end
# Run sensitivity analysis
sensitivity_ranges = Dict(
:age => 20:5:80,
:income => 20000:10000:100000,
:x => -3:0.5:3
)
sensitivity_results = sensitivity_analysis(compiled, data, 1, sensitivity_ranges)
# Plot or analyze results
for (param, results) in sensitivity_results
values = [r[1] for r in results]
outcomes = [r[2] for r in results]
println("Parameter: $param")
println(" Range: $(minimum(values)) to $(maximum(values))")
println(" Outcome range: $(minimum(outcomes)) to $(maximum(outcomes))")
endMemory Efficiency
Understanding OverrideVector
# Compare memory usage
n_obs = 1_000_000
# Traditional approach: full vector
full_vector = fill(42.0, n_obs)
println("Full vector size: $(sizeof(full_vector)) bytes")
# OverrideVector approach
override_vector = OverrideVector(42.0, n_obs)
println("Override vector size: $(sizeof(override_vector)) bytes")
# Memory savings
savings_ratio = sizeof(full_vector) / sizeof(override_vector)
println("Memory savings: $(round(savings_ratio))x")
# Both provide same interface
@assert full_vector[500_000] == override_vector[500_000]
@assert length(full_vector) == length(override_vector)Efficient Scenario Storage
# Multiple scenarios with shared base data
shared_data = Tables.columntable(df)
# Each scenario only stores its overrides
scenario1 = create_scenario("s1", shared_data; treatment = true)
scenario2 = create_scenario("s2", shared_data; income = 50000)
scenario3 = create_scenario("s3", shared_data; age = 30, treatment = false)
# Total memory is much less than 3 full copies of data
total_scenario_memory = sizeof(scenario1) + sizeof(scenario2) + sizeof(scenario3)
full_data_memory = 3 * sum(sizeof(col) for col in shared_data)
println("Scenario memory: $total_scenario_memory bytes")
println("Full copy memory: $full_data_memory bytes")
println("Savings: $(round(full_data_memory / total_scenario_memory))x")Best Practices
Scenario Design
- Use descriptive names for scenarios
- Group related scenarios in collections
- Cache compiled formulas when evaluating many scenarios
- Pre-allocate output vectors for repeated evaluations
Performance Tips
- Create scenarios once and reuse them
- Use batch evaluation for multiple individuals
- Consider using views for large result matrices
- Profile memory usage in complex scenario analyses
Statistical Considerations
- Always compare to appropriate baselines
- Account for uncertainty in treatment effect estimates
- Consider interaction effects in scenario design