15 Python

This chapter collects Python code equivalents to nearly all of the R commands shown in this book. The same code is reproduced, with full output, on the marginaleffects.com website. The small number of features that are not yet available in the Python version of marginaleffects are listed on the roadmap in Section 15.1.

Note that, by default, marginaleffects commands return data frames in the Polars format. These objects are easy to convert to Pandas, NumPy, or other formats using an appropriate .to_*() method. For example, calling the get_dataset() function returns a Polars data frame that we can convert to Pandas with .to_pandas(). Recall that Python uses 0-based indexing, whereas R uses 1-based indexing.
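To illustrate these points, here is a minimal sketch on a hypothetical toy data frame (not one of the book's datasets):

import polars as pl

toy = pl.DataFrame({"x": [1.0, 2.0, 3.0]})  # hypothetical toy data
toy_pandas = toy.to_pandas()  # Polars -> Pandas data frame
toy_numpy = toy.to_numpy()    # Polars -> NumPy array
toy["x"][0]                   # first element: index 0 in Python, index 1 in R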
import polars as pl
import numpy as np
from marginaleffects import *
from plotnine import *
from scipy.stats import norm
from statsmodels.formula.api import logit, ols
= get_dataset("thornton")
dat = dat.to_pandas()
dat dat.head()
1. Who is this book for?
Section 1.4
= get_dataset("Titanic", "Stat2Data")
dat 6, ["Name", "Survived", "Age"]]
dat[:= "Titanic")
get_dataset(search "Titanic", "Stat2Data", docs = True) get_dataset(
3. Conceptual framework
Section 3.2
rng = np.random.default_rng(seed=48103)
N = 10
dat = pl.DataFrame({
    "Num": rng.normal(loc = 0, scale = 1, size = N),
    "Bin": rng.binomial(n = 1, p = 0.5, size = N),
    "Cat": rng.choice(["A", "B", "C"], size = N)
})
Empirical grid
dat
Interesting grid
datagrid(Bin = [0, 1], newdata = dat)

datagrid(
    Num = [dat["Num"].max(), dat["Num"].min()],
    Bin = dat["Bin"].mean(),
    Cat = dat["Cat"].unique(),
    newdata = dat)
Representative grid
= "mean_or_mode", newdata = dat) datagrid(grid_type
Balanced grid
= "balanced", newdata = dat) datagrid(grid_type
Counterfactual grid
g = datagrid(
    Bin = [0, 1],
    grid_type = "counterfactual",
    newdata = dat
)
g.shape
g.filter(pl.col("rowidcf") < 3)
4. Hypothesis and equivalence tests
= get_dataset("thornton")
dat 6)
dat.head(= ols("outcome ~ agecat - 1",
mod =dat.to_pandas()).fit()
data
mod.params"agecat").agg(pl.col("outcome").mean()) dat.group_by(
Section 4.1
Choice of null hypothesis
hypotheses(mod, hypothesis = 0.5)

b = mod.params.iloc[0]
se = np.sqrt(np.diag(mod.cov_params()))[0]
z = (b - 0.5) / se
norm.cdf(-np.abs(z)) * 2
Linear and non-linear hypothesis tests
= "b2 - b0 = 0")
hypotheses(mod, hypothesis = "b2 / b0 = 1")
hypotheses(mod, hypothesis
import numpy as np
= "b1**2 * np.exp(b0) = 0")
hypotheses(mod, hypothesis = "b0 - (b1 * b2) = 2")
hypotheses(mod, hypothesis
= "difference ~ reference")
hypotheses(mod, hypothesis = "ratio ~ sequential") hypotheses(mod, hypothesis
Section 4.2
= "b2 - b1 = 0")
hypotheses(mod, hypothesis = "b2 - b1 = 0",
hypotheses(mod, hypothesis = [-0.05, 0.05])["p_value_equiv"] equivalence
5. Predictions
Section 5.1
= get_dataset("thornton")
dat = dat.drop_nulls(subset = ["incentive"])
dat = logit("outcome ~ incentive + agecat",
mod =dat.to_pandas()).fit()
data= mod.params
b
= b.iloc[0] + b.iloc[3] * 1 + \
linpred_treatment_younger 1] * 1 + b.iloc[2] * 0
b.iloc[= b.iloc[0] + b.iloc[3] * 0 + \
linpred_control_older 1] * 0 + b.iloc[2] * 1
b.iloc[
= lambda x: 1 / (1 + np.exp(-x))
logistic
logistic(linpred_treatment_younger)
logistic(linpred_control_older)
= pl.DataFrame({
grid "agecat": ["18 to 35", ">35"],
"incentive": [1, 0]
})
= grid) predictions(mod, newdata
Section 5.2
= logit("outcome ~ incentive + agecat + distance",
mod =dat.to_pandas()).fit() data
Empirical grid
p = predictions(mod)
p.shape
p.columns
p[:4, "estimate"]
Interesting grid
= "18 to 35", incentive = [0,1], model = mod)
datagrid(agecat
predictions(mod,=datagrid(agecat = "18 to 35", incentive = [0,1]))
newdata
predictions(mod,=datagrid(
newdata= 2,
distance = dat["agecat"].unique(),
agecat = dat["incentive"].max())) incentive
Representative grid
="mean") predictions(mod, newdata
Balanced grid
predictions(mod,
    newdata = datagrid(
        agecat = dat["agecat"].unique(),
        incentive = dat["incentive"].unique(),
        distance = dat["distance"].mean()))
predictions(mod, newdata = "balanced")
Counterfactual grid
= predictions(mod, variables = {"incentive": [0, 1]})
p
p.shape= pl.DataFrame({
p "Control": p.filter(pl.col("incentive") == 0)["estimate"],
"Treatment": p.filter(pl.col("incentive") == 1)["estimate"]
})= (
plot ="Control", y="Treatment")) +
ggplot(p, aes(x+
geom_point()
geom_abline(=0, slope=1, linetype="--", color="grey") +
intercept
labs(="Pr(outcome=1) when incentive = 0",
x="Pr(outcome=1) when incentive = 1") +
y0, 1) +
xlim(0, 1) +
ylim(
theme_minimal()
) plot.show()
Section 5.3
p = predictions(mod)
p["estimate"].mean()

avg_predictions(mod)
avg_predictions(mod, by = "agecat")
avg_predictions(mod, newdata = "balanced", by = "agecat")

dat["incentive"].value_counts()
avg_predictions(mod, by = "incentive")
avg_predictions(mod,
    variables = {"incentive": [0, 1]},
    by = "incentive")
Section 5.5
Null hypothesis tests
= avg_predictions(mod, by="agecat")
p "estimate"][2]-p["estimate"][1]
p[="agecat", hypothesis = "b2 - b1 = 0")
avg_predictions(mod, by="agecat",
avg_predictions(mod, by= "difference ~ sequential")
hypothesis ="agecat",
avg_predictions(mod, by= "difference ~ reference")
hypothesis =["incentive","agecat"])
avg_predictions(mod, by=["incentive","agecat"],
avg_predictions(mod, by= "difference ~ sequential | incentive") hypothesis
Equivalence tests
avg_predictions(mod,
    by = "agecat",
    hypothesis = "b2 - b0 = 0",
    equivalence = [-0.1, 0.1])
Section 5.6
Unit predictions
p = predictions(mod)
p = p.with_columns(pl.col("incentive").cast(pl.Utf8))
plot = (
    ggplot(p, aes(x="estimate", fill="incentive")) +
    geom_histogram() +
    labs(x="Pr(outcome=1) when incentive = 0"))
plot = (
    ggplot(p, aes(x="estimate", colour="incentive")) +
    geom_line(stat='ecdf'))
Marginal predictions
="incentive")
avg_predictions(mod, by="incentive").show()
plot_predictions(mod, by=["incentive", "agecat"]).show()
plot_predictions(mod, by="incentive",
plot_predictions(mod, by= "balanced", draw = False) newdata
Conditional predictions
="distance").show()
plot_predictions(mod, condition=["distance","incentive"]).show()
plot_predictions(mod, condition
plot_predictions(mod,=["distance","incentive", "agecat"]
condition
).show()
plot_predictions(mod,={"distance" : None, "agecat" : ">35", "incentive" : 0 }
condition ).show()
Customization
="incentive", draw = False) plot_predictions(mod, by
6. Counterfactual comparisons
= get_dataset("thornton")
dat = logit("outcome ~ incentive * (agecat + distance)",
mod =dat.to_pandas()).fit()
data mod.summary()
Section 6.1.1
grid = pl.DataFrame({
    "distance": 2,
    "agecat": ["18 to 35"],
    "incentive": 1
})
g_treatment = grid.with_columns(pl.lit(1).alias("incentive"))
g_control = grid.with_columns(pl.lit(0).alias("incentive"))
p_treatment = mod.predict(g_treatment.to_pandas())
p_control = mod.predict(g_control.to_pandas())
p_treatment - p_control

comparisons(mod, variables = "incentive", newdata = grid)
Section 6.1.2
comparisons(mod,
    variables = "incentive",
    comparison = "ratio",
    hypothesis = 1,
    newdata = grid)
comparisons(mod,
    variables = "incentive",
    comparison = "lift",
    hypothesis = 1,
    newdata = grid)
Section 6.2
Focal variables.
comparisons(mod,
    variables = "incentive",
    newdata = grid)

comparisons(mod,
    variables = {"incentive": [1, 0]},
    newdata = grid)

comparisons(mod,
    variables = "agecat",
    newdata = grid)

comparisons(mod,
    variables = {"agecat": ["18 to 35", ">35"]},
    newdata = grid)
# Increase of 5 units
={"distance": 5}, newdata=grid)
comparisons(mod, variables# Increase of 1 standard deviation
={"distance": "sd"}, newdata=grid)
comparisons(mod, variables# Change between specific values
={"distance" : [0, 3]}, newdata=grid)
comparisons(mod, variables# Change across the interquartile range
={"distance" : "iqr"}, newdata=grid)
comparisons(mod, variables# Change across the full range
={"distance": "minmax"}, newdata=grid) comparisons(mod, variables
Cross-comparisons
comparisons(mod,
    variables = ["incentive", "distance"],
    cross = True,
    newdata = grid)
Adjustment variables
= "incentive")
comparisons(mod, variables cmp = comparisons(mod)
cmp.shape
comparisons(mod,= "incentive", newdata = datagrid(
variables = dat["agecat"].unique(),
agecat = dat["distance"].mean()))
distance = "incentive", newdata = "mean")
comparisons(mod, variables = "incentive", newdata = "balanced") comparisons(mod, variables
Section 6.3
= "incentive")
avg_comparisons(mod, variables cmp = comparisons(mod, variables = "incentive")
cmp["estimate"].mean()
= "incentive", by = "agecat")
avg_comparisons(mod, variables
avg_comparisons(mod,= "incentive",
variables = dat.filter(pl.col("incentive") == 1)) newdata
= get_dataset("penguins", "palmerpenguins") \
penguins = "species")
.drop_nulls(subset \
penguins "species", "body_mass_g", "flipper_length_mm") \
.select("species").mean()
.group_by(= ols(
fit "flipper_length_mm ~ body_mass_g * species",
= penguins.to_pandas()).fit()
data = "species")
avg_predictions(fit, by = "species", by = "species")
avg_predictions(fit, variables = {"species": "sequential"}) avg_comparisons(fit, variables
Section 6.5
= "incentive", by = "agecat")
avg_comparisons(mod, variables = "incentive", by = "agecat",
avg_comparisons(mod, variables = "b0 - b2 = 0") hypothesis
Section 6.6
avg_comparisons(mod,
    variables = "incentive",
    by = "agecat")

plot_comparisons(mod,
    variables = "incentive",
    by = "agecat").show()

plot_comparisons(mod,
    variables = "incentive",
    condition = "distance").show()

plot_comparisons(mod,
    variables = "incentive",
    condition = ["distance", "agecat"]).show()
7. Slopes
import polars as pl
import numpy as np
from marginaleffects import *
from plotnine import *
from statsmodels.formula.api import logit, ols
np.random.seed(48103)
N = int(1e6)
X = np.random.normal(loc=0, scale=2, size=N)
p = 1 / (1 + np.exp(-(-1 + 0.5 * X)))
Y = np.random.binomial(n=1, p=p)
dat = pl.DataFrame({"Y": Y, "X": X})
mod = logit("Y ~ X", data = dat.to_pandas()).fit()
mod.params

slopes(
    mod,
    variables = "X",
    newdata = datagrid(X = [-5, 0, 10]))
Section 7.2
= get_dataset("thornton")
dat = logit(
mod "outcome ~ incentive * distance * I(distance**2)",
= dat.to_pandas()).fit()
data
slopes(
mod,= "distance",
variables = datagrid(incentive = 1, distance = 1))
newdata
slopes(
mod,= "distance",
variables = "mean")
newdata = "distance") slopes(mod, variables
Section 7.3
= "distance")
avg_slopes(mod, variables = "distance", by = "incentive") avg_slopes(mod, variables
Section 7.5
= "distance", by = "incentive")
avg_slopes(mod, variables
avg_slopes(mod,= "distance",
variables = "incentive",
by = "b0 - b1 = 0") hypothesis
Section 7.6
= "distance").show()
plot_predictions(mod, condition = "distance",
plot_slopes(mod, variables = "distance").show()
condition
plot_slopes(mod,= "distance",
variables = ["distance", "incentive"]).show()
condition
plot_slopes(mod,= "distance",
variables = "incentive").show() by
8. Causal inference with G-computation
Section 8.1
= get_dataset("lottery")
dat = dat.filter((pl.col('win_big') == 1) | (pl.col('win') == 0))
dat = smf.ols("earnings_post_avg ~ win_big * (
mod tickets + man + work + age + education + college + year +
+ earnings_pre_2 + earnings_pre_3)",
earnings_pre_1 data=dat).fit()
mod.summary()
= dat.with_columns(pl.lit(0).alias('win_big'))
d0 = dat.with_columns(pl.lit(1).alias('win_big'))
d1 = predictions(mod, newdata = d0)
p0 = predictions(mod, newdata = d1)
p1 5, 'win_big']
dat[5, 'estimate']
p0[5, 'estimate'] p1[
'estimate').mean()
p0.select('estimate').mean()
p1.select(= "win_big", by = "win_big") avg_predictions(mod, variables
= "win_big")
avg_comparisons(mod, variables = "win_big",
avg_comparisons(mod, variables =dat.filter(pl.col('win_big') == 1))
newdata= "win_big",
avg_comparisons(mod, variables =dat.filter(pl.col('win_big') == 0)) newdata
Section 8.2
="win_big", by="work") avg_comparisons(mod, variables
9. Experiments
Section 9.1
= get_dataset("thornton")
dat = ols("outcome ~ incentive", data = dat.to_pandas()).fit()
mod
mod.params= "incentive", vcov = "HC2") avg_comparisons(mod, variables
= ols("outcome ~ incentive * (age + distance + hiv2004)",
mod =dat.to_pandas()).fit()
data
mod.params= "incentive", vcov = "HC2") avg_comparisons(mod, variables
Section 9.2
= get_dataset("factorial_01")
dat = ols("Y ~ Ta + Tb + Ta:Tb", data = dat.to_pandas()).fit()
mod
mod.params= ["Ta", "Tb"]).show()
plot_predictions(mod, by = "Ta",
avg_comparisons(mod, variables = dat.filter(pl.col("Tb") == 0))
newdata =["Ta", "Tb"], cross=True)
avg_comparisons(mod, variables= "Ta", by = "Tb")
avg_comparisons(mod, variables = "Ta", by = "Tb",
avg_comparisons(mod, variables = "b1 - b0 = 0") hypothesis
10. Interactions and polynomials
Section 10.1
Categorical-by-categorical
= get_dataset("interaction_01").to_pandas()
dat = logit("Y ~ X * M", data=dat).fit()
mod =["X", "M"])
avg_predictions(mod, by=["M", "X"]).show()
plot_predictions(mod, by= "X")
avg_comparisons(mod, variables = "X", by = "M")
avg_comparisons(mod, variables
avg_comparisons(mod,= "X",
variables = "M",
by = "b2 - b0 = 0") hypothesis
Categorical-by-continuous
= get_dataset("interaction_02")
dat = logit("Y ~ X * M", data=dat.to_pandas()).fit()
mod
mod.summary()= lambda x: np.quantile(x, [0, 0.25, 0.5, 0.75, 1])
fivenum range = lambda x: np.quantile(x, [0, 1])
= predictions(mod, newdata=datagrid(X = [0, 1], M = fivenum))
p = ["M", "X"]).show()
plot_predictions(mod, condition = "X")
avg_comparisons(mod, variables = "X", newdata=datagrid(M = range))
comparisons(mod, variables = "X",
comparisons(mod, variables =datagrid(M = range),
newdata= "b1 - b0 = 0") hypothesis
Continuous-by-continuous
= get_dataset("interaction_03")
dat = logit("Y ~ X * M", data=dat).fit()
mod = datagrid(
predictions(mod, newdata = [-2, 2],
X = [-1, 0, 1]))
M =["X", "M"]).show()
plot_predictions(mod, condition= "X")
avg_slopes(mod, variables = "X", newdata = datagrid(M = fivenum))
slopes(mod, variables = "X", condition = "M").show()
plot_slopes(mod, variables = "X", newdata=datagrid(M = range))
slopes(mod, variables = "X", newdata=datagrid(M = range),
slopes(mod, variables = "b1 - b0 = 0") hypothesis
Multiple interactions
= get_dataset("interaction_04")
dat = logit("Y ~ X * M1 * M2", data = dat).fit()
mod
mod.summary()= ["X", "M1", "M2"]).show()
plot_predictions(mod, by = datagrid(X = 0, M1 = 0, M2 = 0))
avg_predictions(mod, newdata = "X")
avg_comparisons(mod, variables = "X", by = "M1",
avg_comparisons(mod, variables = "b1 - b0 = 0")
hypothesis
avg_comparisons(mod,= "X", by = ["M2", "M1"])
variables
avg_comparisons(mod,= "b1 - b0 = 0",
hypothesis = "X", by = ["M2", "M1"])
variables
avg_comparisons(mod,= "b3 - b2 = 0",
hypothesis = "X", by = ["M2", "M1"])
variables
avg_comparisons(mod,= "(b1 - b0) - (b3 - b2) = 0",
hypothesis = "X", by = ["M2", "M1"]) variables
Section 10.2
= get_dataset("polynomial_01").to_pandas()
dat = ols("Y ~ X", data = dat).fit()
mod_linear ="X", points=0.05).show()
plot_predictions(mod_linear, condition= ols("Y ~ X + I(X**2) + I(X**3)", data = dat).fit()
mod_cubic ="X", points=0.05).show()
plot_predictions(mod_cubic, condition="X", newdata=datagrid(X=[-2, 0, 2])) slopes(mod_cubic, variables
= get_dataset("polynomial_02").to_pandas()
dat = ols("Y ~ X + I(X**2) + I(X**3)", data = dat).fit()
mod_cubic ="X", points = 0.1).show()
plot_predictions(mod_cubic, condition= ols("Y ~ M * (X + I(X**2) + I(X**3))",
mod_cubic_interaction =dat).fit()
data
plot_predictions(mod_cubic_interaction, =["X", "M"], points=0.1).show()
condition= lambda x: np.quantile(x, [0, 0.25, 0.5, 0.75, 1])
fivenum ="X",
slopes(mod_cubic_interaction, variables=datagrid(M=[0, 1], X=fivenum)) newdata
13. Machine learning
from marginaleffects import *
import numpy as np
import polars.selectors as cs
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import make_column_transformer
from xgboost import XGBRegressor
= get_dataset("airbnb")
airbnb
5, :6]
airbnb[:
= train_test_split(airbnb, train_size = 3/4) train, test
Section 13.2
# Function to convert data frames into X and y matrices
def df_to_xy(data):
    y = data.select(cs.by_name("price", require_all=False))
    X = data.select(~cs.by_name("price", require_all=False))
    return y, X

# Convert categorical variables to dummies
catvar = airbnb.select(~cs.numeric()).columns
preprocessor = make_column_transformer(
    (OneHotEncoder(), catvar),
    remainder=FunctionTransformer(lambda x: x.to_numpy()),
)

# Scikit-Learn pipeline: pre-process and fit
pipeline = make_pipeline(preprocessor, XGBRegressor())

# Run the pipeline
mod = fit_sklearn(
    df_to_xy,
    data = train,
    engine = pipeline,
)

avg_predictions(mod, by = "unit_type", newdata = test)

plot_predictions(mod, by = ["bedrooms", "unit_type"]).show()

airbnb_subset = train.sample(10000)

grid = datagrid(
    bedrooms = np.unique,
    unit_type = np.unique,
    newdata = airbnb_subset,
    grid_type = "counterfactual")

plot_predictions(mod,
    by = ["bedrooms", "unit_type"],
    newdata = grid).show()
Section 13.3
= {"bedrooms": 2})
avg_comparisons(mod, variables
avg_comparisons(mod,= ["bedrooms", "Wireless Internet"],
variables = True) cross
15.1 Roadmap
At the time of writing (April 2025), the following features were not available in the Python version of marginaleffects. Please check marginaleffects.com for developments.
- Bayesian models.
- Mixed effects models.
- Categorical outcome models.
- Functions in the comparison argument of comparisons().
- Rug plots in plot_*() functions.
- Bootstrap, simulation-based inference, and conformal prediction.
- Multiple comparison correction.
- Clustered standard errors.
- Robust standard errors and link scale predictions for logistic regression models with Statsmodels.