Appendix II: Python
This appendix collects Python equivalents for nearly all of the R commands shown in this book. The same code is reproduced, with full output, on the marginaleffects.com website. The small number of features that are not yet available in the Python version of marginaleffects are listed in the roadmap at the end of this appendix.
Note that, by default, marginaleffects commands return data frames in Polars format. These objects are easy to convert to Pandas, NumPy, or other formats using the appropriate .to_*() method. For example, get_dataset() returns a Polars data frame that we can convert to Pandas with .to_pandas(). Recall that Python uses 0-based indexing, whereas R uses 1-based indexing.
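For instance, here is a minimal sketch of these conversions, using the thornton dataset that is also loaded below:
from marginaleffects import get_dataset
dat = get_dataset("thornton")  # Polars data frame by default
dat.to_pandas()                # convert to a Pandas data frame
dat.to_numpy()                 # convert to a NumPy array
dat[0, "outcome"]              # first row: index 0 in Python, index 1 in R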
import polars as pl
import numpy as np
from marginaleffects import *
from plotnine import *
from scipy.stats import norm
from statsmodels.formula.api import logit, ols
dat = get_dataset("thornton")
dat = dat.to_pandas()
dat.head()
1. Who is this book for?
Data
dat = get_dataset("Titanic", "Stat2Data")
dat[:6, ["Name", "Survived", "Age"]]
get_dataset(search = "Titanic")
get_dataset("Titanic", "Stat2Data", docs = True)3. Conceptual framework
Predictors
rng = np.random.default_rng(seed=48103)
N = 10
dat = pl.DataFrame({
"Num": rng.normal(loc = 0, scale = 1, size = N),
"Bin": rng.binomial(n = 1, p = 0.5, size = N),
"Cat": rng.choice(["A","B","C"], size = N)
})
Empirical grid
dat
Interesting grid
datagrid(Bin=[0,1], newdata = dat)
datagrid(
Num=[dat["Num"].max(), dat["Num"].min()],
Bin=dat["Bin"].mean(),
Cat=dat["Cat"].unique(),
newdata = dat)
Representative grid
datagrid(grid_type = "mean_or_mode", newdata = dat)Balanced grid
datagrid(grid_type = "balanced", newdata = dat)Counterfactual grid
g = datagrid(
Bin=[0,1],
grid_type = "counterfactual",
newdata = dat
)
g.shape
g.filter(pl.col("rowidcf")<3)4. Hypothesis and equivalence tests
dat = get_dataset("thornton")
dat.head(6)
mod = ols("outcome ~ agecat - 1",
data=dat.to_pandas()).fit()
mod.params
dat.group_by("agecat").agg(pl.col("outcome").mean())Null hypothesis
Choice of null hypothesis
hypotheses(mod, hypothesis = 0.5)
# Manual equivalent: z statistic for the null hypothesis that b = 0.5
b = mod.params.iloc[0]
se = np.sqrt(np.diag(mod.cov_params()))[0]
z = (b - 0.5) / se
2 * norm.cdf(-np.abs(z))  # two-sided p value
Linear and non-linear hypothesis tests
hypotheses(mod, hypothesis = "b2 - b0 = 0")
hypotheses(mod, hypothesis = "b2 / b0 = 1")
import numpy as np
hypotheses(mod, hypothesis = "b1**2 * np.exp(b0) = 0")
hypotheses(mod, hypothesis = "b0 - (b1 * b2) = 2")
hypotheses(mod, hypothesis = "difference ~ reference")
hypotheses(mod, hypothesis = "ratio ~ sequential")Equivalence
hypotheses(mod, hypothesis = "b2 - b1 = 0")
hypotheses(mod, hypothesis = "b2 - b1 = 0",
equivalence = [-0.05, 0.05])["p_value_equiv"]5. Predictions
Quantity
dat = get_dataset("thornton")
dat = dat.drop_nulls(subset = ["incentive"])
mod = logit("outcome ~ incentive + agecat",
data=dat.to_pandas()).fit()
# Compute predicted probabilities by hand from the coefficients:
# b[0] is the intercept, b[1] and b[2] the agecat dummies, b[3] incentive
b = mod.params
linpred_treatment_younger = b.iloc[0] + b.iloc[3] * 1 + \
b.iloc[1] * 1 + b.iloc[2] * 0
linpred_control_older = b.iloc[0] + b.iloc[3] * 0 + \
b.iloc[1] * 0 + b.iloc[2] * 1
# Map the linear predictors onto the probability scale
logistic = lambda x: 1 / (1 + np.exp(-x))
logistic(linpred_treatment_younger)
logistic(linpred_control_older)
grid = pl.DataFrame({
"agecat": ["18 to 35", ">35"],
"incentive": [1, 0]
})
predictions(mod, newdata = grid)
Predictors
mod = logit("outcome ~ incentive + agecat + distance",
data=dat.to_pandas()).fit()
Empirical grid
p = predictions(mod)
p.shape
p.columns
p[:4, "estimate"]Interesting grid
datagrid(agecat = "18 to 35", incentive = [0,1], model = mod)
predictions(mod,
newdata=datagrid(agecat = "18 to 35", incentive = [0,1]))
predictions(mod,
newdata=datagrid(
distance = 2,
agecat = dat["agecat"].unique(),
incentive = dat["incentive"].max()))
Representative grid
predictions(mod, newdata="mean")Balanced grid
predictions(mod,
newdata = datagrid(
agecat = dat["agecat"].unique(),
incentive = dat["incentive"].unique(),
distance = dat["distance"].mean()))
predictions(mod, newdata = "balanced") Counterfactual grid
p = predictions(mod, variables = {"incentive": [0, 1]})
p.shape
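# Pair each unit's two counterfactual predictions: incentive = 0 vs. incentive = 1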
p = pl.DataFrame({
"Control": p.filter(pl.col("incentive") == 0)["estimate"],
"Treatment": p.filter(pl.col("incentive") == 1)["estimate"]
})
plot = (
ggplot(p, aes(x="Control", y="Treatment")) +
geom_point() +
geom_abline(
intercept=0, slope=1, linetype="--", color="grey") +
labs(
x="Pr(outcome=1) when incentive = 0",
y="Pr(outcome=1) when incentive = 1") +
xlim(0, 1) +
ylim(0, 1) +
theme_minimal()
)
plot.show()
Aggregation
p = predictions(mod)
p["estimate"].mean()
avg_predictions(mod)
avg_predictions(mod, by="agecat")
avg_predictions(mod, newdata="balanced", by="agecat")
dat["incentive"].value_counts()
avg_predictions(mod, by="incentive")
avg_predictions(mod,
variables={"incentive": [0, 1]},
by="incentive")Test
Null hypothesis tests
p = avg_predictions(mod, by="agecat")
p["estimate"][2]-p["estimate"][1]
avg_predictions(mod, by="agecat", hypothesis = "b2 - b1 = 0")
avg_predictions(mod, by="agecat",
hypothesis = "difference ~ sequential")
avg_predictions(mod, by="agecat",
hypothesis = "difference ~ reference")
avg_predictions(mod, by=["incentive","agecat"])
avg_predictions(mod, by=["incentive","agecat"],
hypothesis = "difference ~ sequential | incentive")Equivalence tests
avg_predictions(mod,
by="agecat",
hypothesis = "b2 - b0 = 0",
equivalence = [-0.1, 0.1])
Visualization
Unit predictions
p = predictions(mod)
p = p.with_columns(pl.col("incentive").cast(pl.Utf8))
plot = (
ggplot(p, aes(x="estimate", fill="incentive")) +
geom_histogram() +
labs(x="Pr(outcome=1) when incentive = 0"))
plot = (
ggplot(p, aes(x="estimate", colour="incentive")) +
geom_line(stat='ecdf'))
Marginal predictions
avg_predictions(mod, by="incentive")
plot_predictions(mod, by="incentive").show()
plot_predictions(mod, by=["incentive", "agecat"]).show()
plot_predictions(mod, by="incentive",
newdata = "balanced", draw = False)Conditional predictions
plot_predictions(mod, condition="distance").show()
plot_predictions(mod, condition=["distance","incentive"]).show()
plot_predictions(mod,
condition=["distance","incentive", "agecat"]
).show()
plot_predictions(mod,
condition={"distance" : None, "agecat" : ">35", "incentive" : 0 }
).show()
Customization
plot_predictions(mod, by="incentive", draw = False)6. Counterfactual comparisons
dat = get_dataset("thornton")
mod = logit("outcome ~ incentive * (agecat + distance)",
data=dat.to_pandas()).fit()
mod.summary()
First steps: risk difference with a binary treatment
grid = pl.DataFrame({
"distance": 2,
"agecat": ["18 to 35"],
"incentive": 1
})
g_treatment = grid.with_columns(pl.lit(1).alias("incentive"))
g_control = grid.with_columns(pl.lit(0).alias("incentive"))
p_treatment = mod.predict(g_treatment.to_pandas())
p_control = mod.predict(g_control.to_pandas())
p_treatment - p_control  # risk difference computed by hand
comparisons(mod, variables = "incentive", newdata = grid)
Comparison functions
comparisons(mod,
variables = "incentive",
comparison = "ratio",
hypothesis = 1,
newdata = grid)
comparisons(mod,
variables = "incentive",
comparison = "lift",
hypothesis = 1,
newdata = grid)
Predictors
Focal variables.
comparisons(mod,
variables="incentive",
newdata=grid)
comparisons(mod,
variables={"incentive" : [1, 0]},
newdata=grid)
comparisons(mod,
variables="agecat",
newdata=grid)
comparisons(mod,
variables={"agecat" : ["18 to 35", ">35"]},
newdata=grid)
# Increase of 5 units
comparisons(mod, variables={"distance": 5}, newdata=grid)
# Increase of 1 standard deviation
comparisons(mod, variables={"distance": "sd"}, newdata=grid)
# Change between specific values
comparisons(mod, variables={"distance" : [0, 3]}, newdata=grid)
# Change across the interquartile range
comparisons(mod, variables={"distance" : "iqr"}, newdata=grid)
# Change across the full range
comparisons(mod, variables={"distance": "minmax"}, newdata=grid)Cross-comparisons
comparisons(mod,
variables = ["incentive", "distance"],
cross = True,
newdata = grid)
Adjustment variables
comparisons(mod, variables = "incentive")
cmp = comparisons(mod)
cmp.shape
comparisons(mod,
variables = "incentive", newdata = datagrid(
agecat = dat["agecat"].unique(),
distance = dat["distance"].mean()))
comparisons(mod, variables = "incentive", newdata = "mean")
comparisons(mod, variables = "incentive", newdata = "balanced")Aggregation
avg_comparisons(mod, variables = "incentive")
cmp = comparisons(mod, variables = "incentive")
cmp["estimate"].mean()
avg_comparisons(mod, variables = "incentive", by = "agecat")
avg_comparisons(mod,
variables = "incentive",
newdata = dat.filter(pl.col("incentive") == 1))penguins = get_dataset("penguins", "palmerpenguins") \
.drop_nulls(subset = "species")
penguins \
.select("species", "body_mass_g", "flipper_length_mm") \
.group_by("species").mean()
fit = ols(
"flipper_length_mm ~ body_mass_g * species",
data = penguins.to_pandas()).fit()
avg_predictions(fit, by = "species")
avg_predictions(fit, variables = "species", by = "species")
avg_comparisons(fit, variables = {"species": "sequential"})Test
avg_comparisons(mod, variables = "incentive", by = "agecat")
avg_comparisons(mod, variables = "incentive", by = "agecat",
hypothesis = "b0 - b2 = 0")Visualization
avg_comparisons(mod,
variables = "incentive",
by = "agecat")
plot_comparisons(mod,
variables = "incentive",
by = "agecat").show()
plot_comparisons(mod,
variables = "incentive",
condition = "distance").show()
plot_comparisons(mod,
variables = "incentive",
condition = ["distance", "agecat"]).show()7. Slopes
import polars as pl
import numpy as np
from marginaleffects import *
from plotnine import *
from statsmodels.formula.api import logit, ols
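# Simulate a large sample from a logistic model:
# Pr(Y = 1) = 1 / (1 + exp(-(-1 + 0.5 * X)))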
np.random.seed(48103)
N = int(1e6)
X = np.random.normal(loc=0, scale=2, size=N)
p = 1 / (1 + np.exp(-(-1 + 0.5 * X)))
Y = np.random.binomial(n=1, p=p)
dat = pl.DataFrame({"Y": Y, "X": X})
mod = logit("Y ~ X", data=dat.to_pandas()).fit()
mod.params
slopes(
mod,
variables="X",
newdata=datagrid(X = [-5, 0, 10]))
Predictors
dat = get_dataset("thornton")
mod = logit(
"outcome ~ incentive * distance * I(distance**2)",
data = dat.to_pandas()).fit()
slopes(
mod,
variables = "distance",
newdata = datagrid(incentive = 1, distance = 1))
slopes(
mod,
variables = "distance",
newdata = "mean")
slopes(mod, variables = "distance")Aggregation
avg_slopes(mod, variables = "distance")
avg_slopes(mod, variables = "distance", by = "incentive")Test
avg_slopes(mod, variables = "distance", by = "incentive")
avg_slopes(mod,
variables = "distance",
by = "incentive",
hypothesis = "b0 - b1 = 0")Visualization
plot_predictions(mod, condition = "distance").show()
plot_slopes(mod, variables = "distance",
condition = "distance").show()
plot_slopes(mod,
variables = "distance",
condition = ["distance", "incentive"]).show()
plot_slopes(mod,
variables = "distance",
by = "incentive").show()8. Causal inference with G-computation
Treatment effects: ATE, ATT, ATU
dat = get_dataset("lottery")
dat = dat.filter((pl.col('win_big') == 1) | (pl.col('win') == 0))
mod = ols(
"earnings_post_avg ~ win_big * ("
"tickets + man + work + age + education + college + year + "
"earnings_pre_1 + earnings_pre_2 + earnings_pre_3)",
data=dat.to_pandas()).fit()
mod.summary()
# Counterfactual data sets: everyone untreated (d0), everyone treated (d1)
d0 = dat.with_columns(pl.lit(0).alias('win_big'))
d1 = dat.with_columns(pl.lit(1).alias('win_big'))
p0 = predictions(mod, newdata = d0)
p1 = predictions(mod, newdata = d1)
dat[5, 'win_big']
p0[5, 'estimate']
p1[5, 'estimate']
p0.select('estimate').mean()
p1.select('estimate').mean()
avg_predictions(mod, variables = "win_big", by = "win_big")avg_comparisons(mod, variables = "win_big")
avg_comparisons(mod, variables = "win_big",
newdata=dat.filter(pl.col('win_big') == 1))
avg_comparisons(mod, variables = "win_big",
newdata=dat.filter(pl.col('win_big') == 0))
Conditional treatment effects: CATE
avg_comparisons(mod, variables="win_big", by="work")9. Experiments
Regression adjustment
dat = get_dataset("thornton")
mod = ols("outcome ~ incentive", data = dat.to_pandas()).fit()
mod.params
avg_comparisons(mod, variables = "incentive", vcov = "HC2")mod = ols("outcome ~ incentive * (age + distance + hiv2004)",
data=dat.to_pandas()).fit()
mod.params
avg_comparisons(mod, variables = "incentive", vcov = "HC2")Factorial experiments
dat = get_dataset("factorial_01")
mod = ols("Y ~ Ta + Tb + Ta:Tb", data = dat.to_pandas()).fit()
mod.params
plot_predictions(mod, by = ["Ta", "Tb"]).show()
avg_comparisons(mod, variables = "Ta",
newdata = dat.filter(pl.col("Tb") == 0))
avg_comparisons(mod, variables=["Ta", "Tb"], cross=True)
avg_comparisons(mod, variables = "Ta", by = "Tb")
avg_comparisons(mod, variables = "Ta", by = "Tb",
hypothesis = "b1 - b0 = 0")10. Interactions and polynomials
Multiplicative interactions
Categorical-by-categorical
dat = get_dataset("interaction_01").to_pandas()
mod = logit("Y ~ X * M", data=dat).fit()
avg_predictions(mod, by=["X", "M"])
plot_predictions(mod, by=["M", "X"]).show()
avg_comparisons(mod, variables = "X")
avg_comparisons(mod, variables = "X", by = "M")
avg_comparisons(mod,
variables = "X",
by = "M",
hypothesis = "b2 - b0 = 0")Categorical-by-continuous
dat = get_dataset("interaction_02")
mod = logit("Y ~ X * M", data=dat.to_pandas()).fit()
mod.summary()
fivenum = lambda x: np.quantile(x, [0, 0.25, 0.5, 0.75, 1])  # five-number summary
range = lambda x: np.quantile(x, [0, 1])  # min and max; shadows the built-in range
p = predictions(mod, newdata=datagrid(X = [0, 1], M = fivenum))
plot_predictions(mod, condition = ["M", "X"]).show()
avg_comparisons(mod, variables = "X")
comparisons(mod, variables = "X", newdata=datagrid(M = range))
comparisons(mod, variables = "X",
newdata=datagrid(M = range),
hypothesis = "b1 - b0 = 0")Continuous-by-continuous
dat = get_dataset("interaction_03")
mod = logit("Y ~ X * M", data=dat).fit()
predictions(mod, newdata = datagrid(
X = [-2, 2],
M = [-1, 0, 1]))
plot_predictions(mod, condition=["X", "M"]).show()
avg_slopes(mod, variables = "X")
slopes(mod, variables = "X", newdata = datagrid(M = fivenum))
plot_slopes(mod, variables = "X", condition = "M").show()
slopes(mod, variables = "X", newdata=datagrid(M = range))
slopes(mod, variables = "X", newdata=datagrid(M = range),
hypothesis = "b1 - b0 = 0")Multiple interactions
dat = get_dataset("interaction_04")
mod = logit("Y ~ X * M1 * M2", data = dat).fit()
mod.summary()
plot_predictions(mod, by = ["X", "M1", "M2"]).show()
avg_predictions(mod, newdata = datagrid(X = 0, M1 = 0, M2 = 0))
avg_comparisons(mod, variables = "X")
avg_comparisons(mod, variables = "X", by = "M1",
hypothesis = "b1 - b0 = 0")
avg_comparisons(mod,
variables = "X", by = ["M2", "M1"])
avg_comparisons(mod,
hypothesis = "b1 - b0 = 0",
variables = "X", by = ["M2", "M1"])
avg_comparisons(mod,
hypothesis = "b3 - b2 = 0",
variables = "X", by = ["M2", "M1"])
avg_comparisons(mod,
hypothesis = "(b1 - b0) - (b3 - b2) = 0",
variables = "X", by = ["M2", "M1"])Polynomial regression
dat = get_dataset("polynomial_01").to_pandas()
mod_linear = ols("Y ~ X", data = dat).fit()
plot_predictions(mod_linear, condition="X", points=0.05).show()
mod_cubic = ols("Y ~ X + I(X**2) + I(X**3)", data = dat).fit()
plot_predictions(mod_cubic, condition="X", points=0.05).show()
slopes(mod_cubic, variables="X", newdata=datagrid(X=[-2, 0, 2]))dat = get_dataset("polynomial_02").to_pandas()
mod_cubic = ols("Y ~ X + I(X**2) + I(X**3)", data = dat).fit()
plot_predictions(mod_cubic, condition="X", points = 0.1).show()
mod_cubic_interaction = ols("Y ~ M * (X + I(X**2) + I(X**3))",
data=dat).fit()
plot_predictions(mod_cubic_interaction,
condition=["X", "M"], points=0.1).show()
fivenum = lambda x: np.quantile(x, [0, 0.25, 0.5, 0.75, 1])
slopes(mod_cubic_interaction, variables="X",
newdata=datagrid(M=[0, 1], X=fivenum))
13. Machine learning
from marginaleffects import *
import numpy as np
import polars.selectors as cs
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (OneHotEncoder,
FunctionTransformer)
from sklearn.compose import make_column_transformer
from xgboost import XGBRegressor
airbnb = get_dataset("airbnb")
airbnb[:5, :6]
train, test = train_test_split(airbnb, train_size = 3/4)
Predictions
# Function to convert data frames into X and y matrices
def df_to_xy(data):
    y = data.select(cs.by_name("price", require_all=False))
    X = data.select(~cs.by_name("price", require_all=False))
    return y, X
# Convert categorical variables to dummies
catvar = airbnb.select(~cs.numeric()).columns
preprocessor = make_column_transformer(
(OneHotEncoder(), catvar),
remainder=FunctionTransformer(lambda x: x.to_numpy()),)
# Scikit-Learn pipeline: pre-process and fit
pipeline = make_pipeline(preprocessor, XGBRegressor())
# Run the pipeline
mod = fit_sklearn(
df_to_xy,
data=train,
engine=pipeline,)
avg_predictions(mod, by = "unit_type", newdata = test)
plot_predictions(mod, by = ["bedrooms", "unit_type"]).show()
airbnb_subset = train.sample(10000)
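# Counterfactual grid: duplicate the sample once for every
# combination of the observed bedrooms and unit_type values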
grid = datagrid(
bedrooms = np.unique,
unit_type = np.unique,
newdata = airbnb_subset,
grid_type = "counterfactual")
plot_predictions(mod,
by = ["bedrooms", "unit_type"],
newdata = grid).show()
Counterfactual comparisons
avg_comparisons(mod, variables = {"bedrooms": 2})
avg_comparisons(mod,
variables = ["bedrooms", "Wireless Internet"],
cross = True)
Roadmap
At the time of writing (April 2025), the following features were not available in the Python version of marginaleffects. Please check marginaleffects.com for developments.
- Bayesian models.
- Mixed effects models.
- Categorical outcome models.
- Functions in the comparison argument of comparisons().
- Rug plots in plot_*() functions.
- Bootstrap, simulation-based inference, and conformal prediction.
- Multiple comparison correction.
- Clustered standard errors.
- Robust standard errors and link scale predictions for logistic regression models with Statsmodels.