9. Prediction and Bayesian inference¶
Bayesian inference has three steps:
- We have a model (e.g., normal or linear growth) of how the data are generated and the data themselves. We combine them to figure out which parameter values are most likely to have generated the data. End up with a posterior distribution of parameter values - values that could reasonably have generated the data and their likelihoods.
- From the posterior distribution, we can make predictions about future data and their uncertainties.
- Incorporate prior information to make better predictions.
9.1 Propagating uncertainty in inference using posterior simulations¶
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import arviz as az
import pymc as pm
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.robust.scale import mad
/opt/anaconda3/envs/ros_pymc/lib/python3.12/site-packages/arviz/__init__.py:50: FutureWarning: ArviZ is undergoing a major refactor to improve flexibility and extensibility while maintaining a user-friendly interface. Some upcoming changes may be backward incompatible. For details and migration guidance, visit: https://python.arviz.org/en/latest/user_guide/migration_guide.html warn(
def fit_and_plot_lm(data, predictors, outcome, add_constant=True, show_plot=True, scatter_kws=None, line_kws=None):
    """
    Fit a linear model using statsmodels, print summary, plot, and show formula.

    Args:
        data: pandas DataFrame
        predictors: list of predictor column names (str)
        outcome: outcome column name (str)
        add_constant: whether to add intercept (default True)
        show_plot: whether to plot (default True; drawn only for a single predictor)
        scatter_kws: dict, kwargs for scatterplot
        line_kws: dict, kwargs for regression line
    """
    X = data[predictors].copy()
    if add_constant:
        # prepend=False keeps the constant as the last column of the design matrix
        X = sm.add_constant(X, prepend=False)
    y = data[outcome]
    model = sm.OLS(y, X)
    results = model.fit()
    print(results.summary())

    # Build the fitted-formula string once (the original built it twice when
    # add_constant was True, discarding the first version).
    params = results.params
    terms = " + ".join([f"{params[name]:.2f}*{name}" for name in predictors])
    if add_constant:
        formula = f"{outcome} = {params['const']:.2f} + " + terms
    else:
        formula = f"{outcome} = " + terms
    print("Formula:", formula)

    # Residual std dev and its approximate standard error: se(sigma) ≈ sigma / sqrt(2*df)
    sigma = np.sqrt(results.mse_resid)
    sigma_se = sigma / np.sqrt(2 * results.df_resid)
    print(f"Residual std dev (σ): {sigma:.2f} ± {sigma_se:.2f}")

    # Median absolute deviation of residuals: robust spread measure
    print("MAD of residuals:", round(mad(results.resid), 2))

    # Scatter + fitted line, only meaningful for simple (one-predictor) regression
    if show_plot and len(predictors) == 1:
        x_name = predictors[0]
        ax = sns.scatterplot(data=data, x=x_name, y=outcome, **(scatter_kws or {}))
        x_vals = np.linspace(data[x_name].min(), data[x_name].max(), 100)
        y_vals = params[x_name] * x_vals
        if add_constant:
            y_vals = y_vals + params['const']
        ax.plot(x_vals, y_vals, color='red', **(line_kws or {}))
        ax.set_title('Linear Regression Fit')
        plt.show()
def fit_and_plot_bayes(data, predictor, outcome,
                       intercept_mu=0, intercept_sigma=50,
                       slope_mu=0, slope_sigma=50,
                       sigma_sigma=50,
                       samples=2000, tune=1000, hdi_prob=0.95,
                       show_trace=True, show_forest=True,
                       show_posterior=True, show_regression=True,
                       n_regression_lines=100):
    """
    Fit a Bayesian simple linear regression with PyMC and optionally plot diagnostics.

    The model is: y ~ Normal(intercept + slope * x, sigma), with Normal priors on
    intercept and slope and a HalfNormal prior on sigma.

    Args:
        data: pandas DataFrame
        predictor: str, name of the predictor column
        outcome: str, name of the outcome column
        intercept_mu, intercept_sigma: prior mean and std for intercept ~ Normal
        slope_mu, slope_sigma: prior mean and std for slope ~ Normal
        sigma_sigma: prior std for residual noise ~ HalfNormal
        samples: number of posterior draws
        tune: number of tuning steps
        hdi_prob: HDI probability for summaries and plots
        show_trace: plot trace and posterior density per parameter
        show_forest: plot forest (posterior means + HDI)
        show_posterior: plot posterior densities
        show_regression: plot data with posterior regression lines
        n_regression_lines: number of posterior draws to overlay on regression plot

    Returns:
        trace: PyMC InferenceData object
    """
    x_obs = data[predictor].values
    y_obs = data[outcome].values

    # Define the model and draw posterior samples with NUTS.
    with pm.Model() as model:
        intercept = pm.Normal("intercept", mu=intercept_mu, sigma=intercept_sigma)
        slope = pm.Normal("slope", mu=slope_mu, sigma=slope_sigma)
        sigma = pm.HalfNormal("sigma", sigma=sigma_sigma)
        mu = intercept + slope * x_obs
        pm.Normal("y", mu=mu, sigma=sigma, observed=y_obs)
        trace = pm.sample(samples, tune=tune)

    print(pm.summary(trace, hdi_prob=hdi_prob))

    if show_trace:
        az.plot_trace(trace)
        plt.tight_layout()
        plt.show()

    if show_forest:
        az.plot_forest(trace, hdi_prob=hdi_prob)
        plt.show()

    if show_posterior:
        az.plot_posterior(trace, hdi_prob=hdi_prob)
        plt.show()

    if show_regression:
        # Overlay a random subset of posterior regression lines on the data,
        # plus the posterior-mean line in red.
        draws = trace.posterior
        a_draws = draws["intercept"].values.flatten()
        b_draws = draws["slope"].values.flatten()
        plt.scatter(x_obs, y_obs)
        grid = np.linspace(x_obs.min(), x_obs.max(), 100)
        chosen = np.random.choice(len(a_draws), n_regression_lines, replace=False)
        for j in chosen:
            plt.plot(grid, a_draws[j] + b_draws[j] * grid, alpha=0.05, color="gray")
        plt.plot(grid, a_draws.mean() + b_draws.mean() * grid, color="red")
        plt.xlabel(predictor)
        plt.ylabel(outcome)
        plt.title("Bayesian regression with posterior uncertainty")
        plt.show()

    return trace
# Import hibbs.dat from /ros_data (whitespace-separated columns).
# Use a raw string for the regex separator: the plain '\s+' literal triggered
# "SyntaxWarning: invalid escape sequence '\s'".
hibbs = pd.read_csv('../ros_data/hibbs.dat', sep=r'\s+')

# Display the first few rows of the dataset
display(hibbs.head(10))

# Classical OLS fit of vote share on economic growth
fit_and_plot_lm(data=hibbs, predictors=['growth'], outcome='vote',
                add_constant=True, show_plot=True, scatter_kws=None, line_kws=None)
# Bayesian linear regression with PyMC.
# Same model that fit_and_plot_bayes wraps: weakly informative Normal priors on
# intercept and slope, HalfNormal prior on the residual standard deviation.
x = hibbs['growth'].values
y = hibbs['vote'].values
with pm.Model() as model:  # Create a new PyMC model context to define the Bayesian model
    intercept = pm.Normal("intercept", mu=0, sigma=50)  # Normal prior for the intercept (mean 0, std 50)
    slope = pm.Normal("slope", mu=0, sigma=50)  # Normal prior for the slope (mean 0, std 50)
    sigma = pm.HalfNormal("sigma", sigma=50)  # HalfNormal prior for the noise std (constrained positive)
    mu = intercept + slope * x  # Expected value of y as a linear function of x
    likelihood = pm.Normal("y", mu=mu, sigma=sigma, observed=y)  # Likelihood: observed y ~ Normal(mu, sigma)
    trace = pm.sample(2000, tune=1000)  # Run MCMC: 2000 posterior draws per chain after 1000 tuning steps
print(pm.summary(trace, hdi_prob=0.95))  # Print posterior summary with 95% HDI (highest density interval)
# Plot the posterior distributions and trace
az.plot_trace(trace)
plt.tight_layout()
plt.show()
# Forest plot: shows posterior means and HDI intervals
az.plot_forest(trace, hdi_prob=0.95)
# Posterior plot: shows density of each parameter
az.plot_posterior(trace, hdi_prob=0.95)
plt.show()
# Plot regression line with posterior uncertainty
posterior = trace.posterior
a_samples = posterior["intercept"].values.flatten()  # flatten (chain, draw) -> 1-D array of draws
b_samples = posterior["slope"].values.flatten()
plt.scatter(x, y)
x_grid = np.linspace(x.min(), x.max(), 100)
# Overlay 100 posterior regression lines to visualize uncertainty in the fitted line
for i in np.random.choice(len(a_samples), 100,
                          replace=False):
    plt.plot(x_grid, a_samples[i] + b_samples[i] *
             x_grid, alpha=0.05, color="gray")
# Posterior-mean regression line in red
plt.plot(x_grid, a_samples.mean() +
         b_samples.mean() * x_grid, color="red")
plt.xlabel("growth")
plt.ylabel("vote")
plt.title("Bayesian regression with posterior uncertainty")
plt.show()
<>:3: SyntaxWarning: invalid escape sequence '\s' <>:3: SyntaxWarning: invalid escape sequence '\s' /var/folders/6m/c2f6bq0j2fbclrsw82pn99380000gn/T/ipykernel_44536/254143673.py:3: SyntaxWarning: invalid escape sequence '\s' sep='\s+')
| year | growth | vote | inc_party_candidate | other_candidate | |
|---|---|---|---|---|---|
| 0 | 1952 | 2.40 | 44.60 | Stevenson | Eisenhower |
| 1 | 1956 | 2.89 | 57.76 | Eisenhower | Stevenson |
| 2 | 1960 | 0.85 | 49.91 | Nixon | Kennedy |
| 3 | 1964 | 4.21 | 61.34 | Johnson | Goldwater |
| 4 | 1968 | 3.02 | 49.60 | Humphrey | Nixon |
| 5 | 1972 | 3.62 | 61.79 | Nixon | McGovern |
| 6 | 1976 | 1.08 | 48.95 | Ford | Carter |
| 7 | 1980 | -0.39 | 44.70 | Carter | Reagan |
| 8 | 1984 | 3.86 | 59.17 | Reagan | Mondale |
| 9 | 1988 | 2.27 | 53.94 | Bush, Sr. | Dukakis |
OLS Regression Results
==============================================================================
Dep. Variable: vote R-squared: 0.580
Model: OLS Adj. R-squared: 0.550
Method: Least Squares F-statistic: 19.32
Date: Tue, 31 Mar 2026 Prob (F-statistic): 0.000610
Time: 07:55:45 Log-Likelihood: -42.839
No. Observations: 16 AIC: 89.68
Df Residuals: 14 BIC: 91.22
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
growth 3.0605 0.696 4.396 0.001 1.567 4.554
const 46.2476 1.622 28.514 0.000 42.769 49.726
==============================================================================
Omnibus: 5.392 Durbin-Watson: 2.379
Prob(Omnibus): 0.067 Jarque-Bera (JB): 2.828
Skew: -0.961 Prob(JB): 0.243
Kurtosis: 3.738 Cond. No. 4.54
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Formula: vote = 46.25 + 3.06*growth
Residual std dev (σ): 3.76 ± 0.71
MAD of residuals: 2.27
Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [intercept, slope, sigma]
/opt/anaconda3/envs/ros_pymc/lib/python3.12/site-packages/rich/live.py:260: UserWarning: install "ipywidgets" for
Jupyter support
warnings.warn('install "ipywidgets" for Jupyter support')
Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 1 seconds.
mean sd hdi_2.5% hdi_97.5% mcse_mean mcse_sd ess_bulk \
intercept 46.233 1.858 42.250 49.720 0.033 0.028 3150.0
slope 3.069 0.791 1.550 4.695 0.014 0.013 3114.0
sigma 4.138 0.930 2.624 5.969 0.017 0.024 2686.0
ess_tail r_hat
intercept 3200.0 1.0
slope 3351.0 1.0
sigma 2134.0 1.0
# Using the matrix of posterior simulations to express uncertainty about a
# parameter estimate or a function of parameter estimates.
a = posterior["intercept"].values.flatten()  # 1-D array of posterior intercept draws
b = posterior["slope"].values.flatten()  # 1-D array of posterior slope draws
print("First 5 samples of intercept (a):", a[:5])
print("Length of a samples:", len(a))
print("First 5 samples of slope (b):", b[:5])
print("Length of b samples:", len(b))
# Compute the ratio of intercept to slope for each posterior sample — an example
# of propagating posterior uncertainty through an arbitrary function of parameters
z = a / b
print(f"median: {np.median(z):.2f}, MAD:{mad(z):.2f}")
First 5 samples of intercept (a): [46.33711389 46.33711389 46.33711389 46.33711389 45.40322975] Length of a samples: 8000 First 5 samples of slope (b): [3.00528213 3.00528213 3.00528213 3.00528213 3.47101783] Length of b samples: 8000 median: 15.08, MAD:4.07
9.2 Prediction and uncertainty: predict, posterior_linpred, and posterior_predict¶
After fitting a regression $y = a + b x + \epsilon$ to data, we can use it to predict a new data point or set of data points with predictors $x_{new}$. We can do this in three ways:
- Point prediction, $\hat{y} = \hat{a} + \hat{b} x_{new}$, where $\hat{a}$ and $\hat{b}$ are the mean of the posterior distribution of $a$ and $b$. This is a single value prediction. This ignores the uncertainty in the parameter estimates and the variability in the data.
- Linear prediction with uncertainty, $\hat{y} = a + b x_{new}$, where $a$ and $b$ are drawn from the posterior distribution. This gives us a distribution of predicted values that incorporates the uncertainty in the parameter estimates but not the variability in the data.
- The predictive distribution for a new observation, $\hat{y} = a + b x_{new} + \epsilon$, where $a$ and $b$ are drawn from the posterior distribution and $\epsilon$ is drawn from the distribution of the residuals. This gives us a distribution of predicted values that incorporates both the uncertainty in the parameter estimates and the variability in the data.
Example: blood pressure y is predicted from dose x of a drug.
Point prediction is a single best guess at a given dose, no uncertainty. E.g., at dose x, the average blood pressure is 120 mmHg. Linear prediction is the same, but with uncertainty. E.g., at dose x, the average blood pressure is between 110 and 130 mmHg. Predictive distribution is predicting a specific person's blood pressure at a given dose. E.g., at dose x, a person's blood pressure is between 100 and 140 mmHg.
With lots of data the uncertainty in the linear prediction will be small, but the uncertainty in the predictive distribution will still be large because of the variability in the data. With little data, both uncertainties will be large.
Point prediction¶
# --- Point prediction ---
# Single best guess for the incumbent vote share at 2.0% growth, using the
# posterior means of intercept and slope as point estimates. Ignores both
# parameter uncertainty and residual variability.
new_growth = 2.0  # hypothetical new value of economic growth
a_hat = np.mean(a)  # posterior mean of the intercept
b_hat = np.mean(b)  # posterior mean of the slope
y_point_pred = b_hat * new_growth + a_hat  # point prediction on the fitted line
print(f"Point prediction: {a_hat:.1f} + {b_hat:.1f} * {new_growth} = {y_point_pred:.1f}")
Point prediction: 46.2 + 3.1 * 2.0 = 52.4
Linear predictor with uncertainty¶
Uses the full posterior distribution of $a$ and $b$ to compute $\hat{y} = a + b x_{new}$ for each posterior draw. This gives a distribution of predicted values that captures uncertainty in the estimated regression line, but not individual-level variability.
Linear prediction with uncertainty¶
# Linear prediction with uncertainty: y_hat = a + b * x_new for every posterior draw.
# Captures uncertainty in the fitted line but NOT individual-level variability
# (no residual noise is added here).
a = posterior["intercept"].values.flatten()
b = posterior["slope"].values.flatten()
y_pred_samples = a + b * new_growth  # one predicted y per posterior draw of (a, b)
# Sanity checks on the prediction draws
print("First 5 predicted y samples:", y_pred_samples[:5])
print("Length of predicted y samples:", len(y_pred_samples))
print(f"Prediction with uncertainty: median={np.median(y_pred_samples):.1f}, MAD={mad(y_pred_samples):.1f}")
# Histogram of the linear-predictor draws.
plt.hist(y_pred_samples, bins=30, density=True, alpha=0.7)
plt.xlabel("Predicted vote share")
plt.ylabel("Density")
# Title fixed: this is the distribution of the linear predictor, not the full
# posterior predictive distribution (which would add residual noise).
plt.title("Linear predictor distribution for new growth=2.0%")
plt.show()
First 5 predicted y samples: [52.34767815 52.34767815 52.34767815 52.34767815 52.34526541] Length of predicted y samples: 8000 Prediction with uncertainty: median=52.4, MAD=1.0
Predictive distribution for a new observation using posterior_predict¶
# --- Predictive distribution for a new observation (posterior_predict) ---
# Adds residual noise (sigma) to the linear predictor to account for individual-level variability.
# This is the full predictive distribution: y_pred = a + b * x_new + epsilon
sigma_samples = trace.posterior["sigma"].values.flatten()  # Posterior samples of residual std dev
n_sims = len(a)  # Number of posterior draws
y_pred = a + b * new_growth + np.random.normal(0, sigma_samples, n_sims)  # One noise draw ~ N(0, sigma_i) per posterior draw i
# Summarize the predictive distribution
y_pred_median = np.median(y_pred)  # Median of predictive distribution
y_pred_mad = mad(y_pred)  # MAD: robust measure of spread
# Win probability: the fraction of simulated elections where the incumbent gets > 50% of the vote.
# y_pred > 50 produces a boolean array; np.mean() treats True as 1 and False as 0,
# so the mean IS the proportion (e.g., 5600/8000 = 0.70). This works because each
# simulation already incorporates both parameter uncertainty AND residual variability,
# so the proportion directly estimates the probability.
win_prob = np.mean(y_pred > 50)
print(f"Predicted incumbent vote share: {y_pred_median:.1f}, with s.e. {y_pred_mad:.1f}")
print(f"Pr(incumbent win) = {win_prob:.2f}")
# Plot the predictive distribution
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(y_pred, bins=50, density=True, alpha=0.7, color='steelblue', edgecolor='white')
ax.axvline(50, color='black', linestyle='--', label='50% threshold')  # Win/lose boundary
ax.axvline(y_pred_median, color='red', linestyle='--', label=f'Median: {y_pred_median:.1f}')
ax.set_xlabel('Predicted vote share (%)')
ax.set_ylabel('Density')
ax.set_title(f'Predictive distribution at growth = {new_growth}%')
ax.legend()
plt.show()
Predicted incumbent vote share: 52.4, with s.e. 4.1 Pr(incumbent win) = 0.72
Prediction given a range of input values¶
# Create a grid of growth values from -2.0 to 4.0 in steps of 0.5
new_grid = np.arange(-2.0, 4.5, 0.5)
# Point predictions: single best guess (posterior mean) for each growth value
y_point_pred_grid = a.mean() + b.mean() * new_grid
# Linear predictor with uncertainty: posterior draws of a + b*x for each growth value.
# Broadcasting (n_sims, 1) against (1, n_grid) gives an (n_sims x n_grid) matrix.
y_linpred_grid = a[:, None] + b[:, None] * new_grid[None, :]
# Full predictive distribution: adds residual noise (one sigma draw per row,
# broadcast across the grid) to each linear-predictor draw
y_pred_grid = y_linpred_grid + np.random.normal(0, sigma_samples[:, None], y_linpred_grid.shape)
print("First 5 point predictions for grid:", y_point_pred_grid[:5])
print("Shape of linear predictor grid:", y_linpred_grid.shape)
print("Shape of full predictive grid:", y_pred_grid.shape)
# Summary table: note the MAD of the full predictive distribution stays large
# everywhere, while the linear-predictor MAD narrows near the center of the data
display(pd.DataFrame({
    'growth': new_grid,
    'point_pred': y_point_pred_grid,
    'linpred_median': np.median(y_linpred_grid, axis=0),
    'linpred_mad': mad(y_linpred_grid, axis=0),
    'pred_median': np.median(y_pred_grid, axis=0),
    'pred_mad': mad(y_pred_grid, axis=0)
}))
First 5 point predictions for grid: [40.09547055 41.62978762 43.16410469 44.69842176 46.23273884] Shape of linear predictor grid: (8000, 13) Shape of full predictive grid: (8000, 13)
| growth | point_pred | linpred_median | linpred_mad | pred_median | pred_mad | |
|---|---|---|---|---|---|---|
| 0 | -2.0 | 40.095471 | 40.098814 | 3.143626 | 40.151361 | 5.153935 |
| 1 | -1.5 | 41.629788 | 41.645276 | 2.750526 | 41.702306 | 4.955122 |
| 2 | -1.0 | 43.164105 | 43.179480 | 2.394542 | 43.215047 | 4.705621 |
| 3 | -0.5 | 44.698422 | 44.704079 | 2.084559 | 44.723523 | 4.466381 |
| 4 | 0.0 | 46.232739 | 46.242357 | 1.764202 | 46.313202 | 4.382266 |
| 5 | 0.5 | 47.767056 | 47.790697 | 1.476641 | 47.768615 | 4.302465 |
| 6 | 1.0 | 49.301373 | 49.305630 | 1.231637 | 49.326583 | 4.205838 |
| 7 | 1.5 | 50.835690 | 50.843929 | 1.047895 | 50.843407 | 4.129710 |
| 8 | 2.0 | 52.370007 | 52.371053 | 1.000356 | 52.466981 | 4.161060 |
| 9 | 2.5 | 53.904324 | 53.915128 | 1.102440 | 53.860832 | 4.264261 |
| 10 | 3.0 | 55.438641 | 55.445861 | 1.274693 | 55.359260 | 4.121965 |
| 11 | 3.5 | 56.972958 | 56.972529 | 1.515940 | 56.920709 | 4.172369 |
| 12 | 4.0 | 58.507275 | 58.512106 | 1.799375 | 58.560900 | 4.438528 |
Propagating uncertainty¶
If we have a best estimate of 2% growth, but uncertainty with a normal distribution with standard deviation of 0.3%
# Propagating uncertainty in the input: growth itself is uncertain, modeled as
# N(2, 0.3), so draw one simulated growth value per posterior draw.
x_new = np.random.normal(2, 0.3, size=n_sims)
y_pred = a + b * x_new + np.random.normal(0, sigma_samples, n_sims)  # add residual noise ~ N(0, sigma) per draw
# Summarize the predictive distribution
y_pred_median = np.median(y_pred)  # median of predictive distribution
y_pred_mad = mad(y_pred)  # MAD: robust measure of spread
# Win probability: fraction of simulated elections where the incumbent gets > 50%.
# Each simulation incorporates parameter uncertainty, residual variability, AND
# uncertainty about the growth value itself, so the proportion of True values
# (np.mean of the boolean array) directly estimates the probability.
win_prob = np.mean(y_pred > 50)
print(f"Predicted incumbent vote share: {y_pred_median:.1f}, with s.e. {y_pred_mad:.1f}")
print(f"Pr(incumbent win) = {win_prob:.2f}")
# Plot the predictive distribution
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(y_pred, bins=50, density=True, alpha=0.7, color='steelblue', edgecolor='white')
ax.axvline(50, color='black', linestyle='--', label='50% threshold')  # win/lose boundary
ax.axvline(y_pred_median, color='red', linestyle='--', label=f'Median: {y_pred_median:.1f}')
ax.set_xlabel('Predicted vote share (%)')
ax.set_ylabel('Density')
# Title fixed: growth is drawn from N(2, 0.3) here, not fixed at new_growth
ax.set_title('Predictive distribution with growth ~ N(2, 0.3)%')
ax.legend()
plt.show()
Predicted incumbent vote share: 52.4, with s.e. 4.2 Pr(incumbent win) = 0.71
Simulating uncertainty for the linear predictor and new observations¶
# Load the earnings data (the previous skiprows=0 was a no-op and is removed)
earnings = pd.read_csv('../ros_data/earnings.csv')
earnings['earn_k'] = earnings['earn'] / 1000
# Center height at 66 inches so the intercept is the expected weight of a
# 66-inch-tall person rather than a (meaningless) 0-inch-tall person
earnings['c_height'] = earnings['height'] - 66
display(earnings.head())

# Drop rows with missing height or weight before fitting, and verify
earnings_clean = earnings.dropna(subset=['c_height', 'weight'])
print(earnings_clean[['c_height', 'weight']].info())
print(earnings_clean[['c_height', 'weight']].isna().sum())

# Classical OLS fit of weight on centered height
fit_and_plot_lm(earnings_clean, predictors=['c_height'], outcome='weight',
                add_constant=True, show_plot=True, scatter_kws=None, line_kws=None)

# Bayesian fit of the same model; trace_2 is used later for predictions
trace_2 = fit_and_plot_bayes(data=earnings_clean, predictor='c_height', outcome='weight',
                             intercept_mu=0, intercept_sigma=50,
                             slope_mu=0, slope_sigma=50,
                             sigma_sigma=50,
                             samples=2000, tune=1000, hdi_prob=0.95,
                             show_trace=True, show_forest=True,
                             show_posterior=True, show_regression=True,
                             n_regression_lines=100)
| height | weight | male | earn | earnk | ethnicity | education | mother_education | father_education | walk | exercise | smokenow | tense | angry | age | earn_k | c_height | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 74 | 210.0 | 1 | 50000.0 | 50.0 | White | 16.0 | 16.0 | 16.0 | 3 | 3 | 2.0 | 0.0 | 0.0 | 45 | 50.0 | 8 |
| 1 | 66 | 125.0 | 0 | 60000.0 | 60.0 | White | 16.0 | 16.0 | 16.0 | 6 | 5 | 1.0 | 0.0 | 0.0 | 58 | 60.0 | 0 |
| 2 | 64 | 126.0 | 0 | 30000.0 | 30.0 | White | 16.0 | 16.0 | 16.0 | 8 | 1 | 2.0 | 1.0 | 1.0 | 29 | 30.0 | -2 |
| 3 | 65 | 200.0 | 0 | 25000.0 | 25.0 | White | 17.0 | 17.0 | NaN | 8 | 1 | 2.0 | 0.0 | 0.0 | 57 | 25.0 | -1 |
| 4 | 63 | 110.0 | 0 | 50000.0 | 50.0 | Other | 16.0 | 16.0 | 16.0 | 5 | 6 | 2.0 | 0.0 | 0.0 | 91 | 50.0 | -3 |
<class 'pandas.DataFrame'>
Index: 1789 entries, 0 to 1815
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 c_height 1789 non-null int64
1 weight 1789 non-null float64
dtypes: float64(1), int64(1)
memory usage: 41.9 KB
None
c_height 0
weight 0
dtype: int64
OLS Regression Results
==============================================================================
Dep. Variable: weight R-squared: 0.301
Model: OLS Adj. R-squared: 0.300
Method: Least Squares F-statistic: 769.1
Date: Tue, 31 Mar 2026 Prob (F-statistic): 4.35e-141
Time: 07:55:48 Log-Likelihood: -8558.6
No. Observations: 1789 AIC: 1.712e+04
Df Residuals: 1787 BIC: 1.713e+04
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
c_height 4.9494 0.178 27.733 0.000 4.599 5.299
const 153.3726 0.693 221.435 0.000 152.014 154.731
==============================================================================
Omnibus: 343.790 Durbin-Watson: 1.912
Prob(Omnibus): 0.000 Jarque-Bera (JB): 707.829
Skew: 1.116 Prob(JB): 1.98e-154
Kurtosis: 5.124 Cond. No. 3.93
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Formula: weight = 153.37 + 4.95*c_height
Residual std dev (σ): 28.95 ± 0.48
MAD of residuals: 22.76
Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [intercept, slope, sigma]
/opt/anaconda3/envs/ros_pymc/lib/python3.12/site-packages/rich/live.py:260: UserWarning: install "ipywidgets" for
Jupyter support
warnings.warn('install "ipywidgets" for Jupyter support')
Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 1 seconds.
mean sd hdi_2.5% hdi_97.5% mcse_mean mcse_sd ess_bulk \
intercept 153.354 0.692 152.001 154.728 0.006 0.008 11467.0
slope 4.951 0.176 4.615 5.297 0.002 0.002 11689.0
sigma 28.968 0.479 28.061 29.934 0.005 0.005 10952.0
ess_tail r_hat
intercept 6285.0 1.0
slope 6413.0 1.0
sigma 6257.0 1.0
Inferences for the slope and error standard deviation are the same, but the intercept is different because of centering. The slope is the same because it is the same relationship between height and weight, but the intercept is different because it is now the expected weight for a person who is 66 inches tall, rather than a person who is 0 inches tall.
intercept 153.333 0.694 152.039 154.739 0.007 0.008 10740.0
slope 4.953 0.182 4.604 5.305 0.002 0.002 11080.0
sigma 28.971 0.484 28.017 29.894 0.005 0.005 10878.0
So from formula $ weight = 153.333 + 4.953 * (height - 66) + \epsilon$, we can see that for the average person who is 66 inches tall, the expected weight is 153.333 pounds with an uncertainty of 0.694 pounds. But for a specific person who is 66 inches tall, the expected weight is between 153.333 - 28.971 and 153.333 + 28.971 pounds, which is a much wider range because it incorporates the variability in the data.
When applying a fitted regression to a new data point $x_{new}$, we can make inferences about the linear predictor $\hat{y} = a + b x_{new}$, or about the predicted value $y_{new} = a + b x_{new} + \epsilon$. For example, let's predict the weight of a person who is 70 inches tall, so c_height = 70 - 66 = 4 inches.
Linear predictor = $a + 4 \times b$ Predicted value = $a + 4 \times b + \epsilon$
Sometimes we are interested in predicting the average weight and sometimes we are interested in predicting the weight of a specific person. They have different uncertainties because the former only incorporates uncertainty in the parameter estimates, while the latter also incorporates variability in the data.
For a linear model, the two types of predictions have the same mean, but different uncertainty. This is because as the sample size increases, the uncertainty in the parameter estimates decreases, but the variability in the data does not change. So with a large sample size, the uncertainty in the linear predictor will be small, but the uncertainty in the predictive distribution will still be large because of the variability in the data. With a small sample size, both uncertainties will be large.
# Extract posterior samples from the earnings model (fit_2 equivalent)
posterior_2 = trace_2.posterior
a = posterior_2["intercept"].values.flatten()  # Posterior samples of intercept
b = posterior_2["slope"].values.flatten()  # Posterior samples of slope
sigma_samples = posterior_2["sigma"].values.flatten()  # Posterior samples of residual std dev
# New data: predict weight for a person 70 inches tall (c_height = 70 - 66 = 4)
new_c_height = 4.0
# Point prediction: single best guess using posterior means
y_point_pred_2 = a.mean() + b.mean() * new_c_height
print(f"Point prediction at c_height={new_c_height}: {y_point_pred_2:.1f}")
# Linear predictor with uncertainty: posterior draws of a + 4.0*b.
# Captures uncertainty in the regression line but not individual variability.
y_linpred_2 = a + b * new_c_height
print(f"Linear predictor: median={np.median(y_linpred_2):.1f}, MAD={mad(y_linpred_2):.1f}")
# Posterior predictive distribution: a + 4.0*b + epsilon.
# np.random.normal(0, sigma_samples) broadcasts to sigma_samples.shape, i.e. one
# noise draw per posterior sigma draw, so both parameter uncertainty and
# individual-level variability are captured.
y_postpred_2 = a + b * new_c_height + np.random.normal(0, sigma_samples)
print(f"Posterior predictive: median={np.median(y_postpred_2):.1f}, MAD={mad(y_postpred_2):.1f}")
Point prediction at c_height=4.0: 173.2 Linear predictor: median=173.2, MAD=0.9 Posterior predictive: median=173.2, MAD=29.1
9.3 Prior information and Bayesian synthesis¶
Classical statistics produces summaries of a single dataset, but Bayesian methods combine the data with prior information to produce a posterior distribution that reflects both. This allows us to make better predictions by incorporating prior knowledge.
Expressing data and prior information on the same scale¶
Bayesian inference for the normal distribution, with $\theta$ being the continuous variable of interest. The goal is to combine a prior estimate $\hat{\theta}_{prior}$ with prior standard error $se_{prior}$ and a data estimate $\hat{\theta}_{data}$ with data standard error $se_{data}$. The resulting Bayesian estimate $\hat{\theta}_{Bayes}$ with standard error $se_{Bayes}$ is given by:
$$\hat{\theta}_{Bayes} = \frac{\hat{\theta}_{prior} / se_{prior}^2 + \hat{\theta}_{data} / se_{data}^2}{1 / se_{prior}^2 + 1 / se_{data}^2}$$
$$se_{Bayes} = \frac{1}{\sqrt{1 / se_{prior}^2 + 1 / se_{data}^2}}$$
This formula shows that the estimate is a compromise between the prior and the data. It is a weighted average of the prior and data estimates, where the weights are inversely proportional to their variances (squared standard errors). The more precise (lower standard error) estimate gets more influence on the final estimate. The total precision of the final posterior estimate is the sum of the precisions of the prior and data estimates — combining information always makes us more precise.
If both standard errors are the same, then the Bayesian estimate is just the average of the prior and data estimates - it is the mid point.
Variance is uncertainty and inverse variance (1/variance) is precision. Precision after combining = precision of prior + precision of data. So instead of averaging uncertainties we are adding certainties.
Bayesian information aggregation¶
Example:
Election coming up, fitted model using economic and political conditions gives forecast that Democratic candidate will receive 52.4% of two party vote with uncertainty of 4.1%. Notion above, $\hat{\theta}_{prior} = 0.524$ and $se_{prior} = 0.041$. Then we get a poll that shows 190 of 400 likely voters support the Democratic candidate, so $\hat{\theta}_{data} = 190/400 = 0.475$ and $se_{data} = \sqrt{0.475 \times (1 - 0.475) / 400} = 0.025$. Assumes nonresponse bias is negligible and poll is representative of the electorate.
We combine prior and data, assuming uncertainties are independent. Prior distribution from forecast based on previous elections and current conditions - forecast uncertainty coming from nonzero residuals in model. Data from survey of voters - uncertainty coming from sampling variability. Both sources of uncertainty are independent.
# Prior: forecast model estimate
theta_prior = 0.524  # prior mean (52.4% Democratic vote share)
se_prior = 0.041  # prior standard error (uncertainty of 4.1%)
# Data: poll estimate
theta_data = 190 / 400  # poll proportion (0.475)
se_data = np.sqrt(theta_data * (1 - theta_data) / 400)  # standard error of the poll proportion
# Bayes estimate: precision-weighted average of prior and data, assuming both
# are normally distributed and independent
theta_post = (theta_prior / se_prior**2 + theta_data / se_data**2) / (1 / se_prior**2 + 1 / se_data**2)
se_post = np.sqrt(1 / (1 / se_prior**2 + 1 / se_data**2))  # posterior standard error (precisions add)
print(f"Posterior mean (Bayes estimate): {theta_post:.3f}, with s.e. {se_post:.3f}")
# Grid of theta values to evaluate the densities
theta_grid = np.linspace(0.35, 0.65, 500)  # range of vote share values
# Compute normal densities for prior, data (likelihood), and posterior
from scipy.stats import norm
prior_density = norm.pdf(theta_grid, theta_prior, se_prior)
data_density = norm.pdf(theta_grid, theta_data, se_data)
bayes_density = norm.pdf(theta_grid, theta_post, se_post)
# Plot all three distributions
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(theta_grid, prior_density, label=f'Prior: N({theta_prior}, {se_prior})')
ax.plot(theta_grid, data_density, label=f'Data: N({theta_data:.3f}, {se_data:.3f})')
ax.plot(theta_grid, bayes_density, label=f'Posterior: N({theta_post:.3f}, {se_post:.3f})')
ax.set_xlabel('θ (Democratic vote share)')
ax.set_ylabel('Density')
# Title fixed: the posterior curve is plotted as well as prior and data
ax.set_title('Prior, data, and posterior distributions')
ax.axvline(0.5, color='black', linestyle='--', alpha=0.3, label='50% threshold')  # reference line
ax.legend()
plt.show()
Posterior mean (Bayes estimate): 0.488, with s.e. 0.021
Different ways of assigning prior distributions and performing Bayesian calculations¶
- Multiple ways to do same analysis. Comparing groups can be done with simple formula or regression - same answer. Similar with Bayesian inference - can do it with formulas or software.
- Bayesian inference works with any uncertain quantity.
- Prior changes depending on situation.
9.4 Example of Bayesian inference: beauty and sex ratio¶
Prior information can help us refine estimates from noisy data.
A survey of 3,000 Americans observed a correlation between the attractiveness of parents and the sex of their children. Adults were coded into five attractiveness categories; 56% of children of the most attractive parents were female, compared with 48% of children of other parents. The observed difference of 8 percentage points had a standard error of 3 percentage points, which is more than 2 standard errors away from zero, so it is statistically significant at the 5% level.
Prior information¶
It is well known that the human sex ratio varies within a small range. In the US, 48.7% of births are girls among whites and 49.2% among blacks. Similar differences of 0.5% or less are found when comparing by birth order, maternal age, or season. It therefore seems hard to believe that the difference between more and less attractive parents could be more than 0.5%.
Prior estimate and standard error¶
$\theta$ is our variable of interest, with prior mean 0% and SD 0.25%. A mean of 0% means we have no reason to believe that more attractive parents are more likely to have girls. An SD of 0.25% indicates we find it highly implausible that the true difference is higher than 0.5% or lower than -0.5%.
Data estimate and standard error¶
Survey gives $\hat{\theta_{data}} = 0.06$ with standard error $SE_{data} = 0.03$. The key takeaway: the prior (SD=0.25%) is much more precise than the data (SE=3%), so the posterior will be pulled almost entirely toward the prior mean of 0%. The noisy survey data barely moves the estimate — prior information dominates here.
Bayes estimate¶
$\hat{\theta_{bayes}} = 0.0004$ with standard error $SE_{bayes} = 0.0025$ (0.25%) - this indicates that the data has almost no influence on the estimate, and the posterior is essentially the same as the prior. The data is too noisy to provide any meaningful information.
Understanding the result¶
Sample of 3,000 seems good, but provides no information, why?
- Estimated proportion from simple random sample of 3,000 has standard error of $\sqrt{p(1-p)/n} = \sqrt{0.5 \times 0.5 / 3000} = 0.009$. This tells us that a survey of 3,000 people lets us estimate a proportion to within about 0.9 percentage points.
- Now consider comparison of two groups of 1,500 people each. The standard error of the difference in proportions is $\sqrt{p(1-p)/n_1 + p(1-p)/n_2} = \sqrt{0.5 \times 0.5 / 1500 + 0.5 \times 0.5 / 1500} = 0.018$. This tells us that a survey of 3,000 people split into two groups of 1,500 each lets us estimate the difference in proportions to within about 1.8 percentage points.
- The actual standard error of the difference in proportions is 3.3% because groups were uneven - only about 10% of respondents were 'very attractive', so the effective sample size for that group is only about 300 people, which gives a standard error of $\sqrt{p(1-p)/n} = \sqrt{0.5 \times 0.5 / 300} = 0.029$.
- Estimating a comparison to an accuracy of 2-3% is normally considered good, but in this case it is not good enough to detect a difference of 0.5% or less, which is what we would expect based on prior information. So the data is too noisy to provide any meaningful information about the relationship between attractiveness and sex ratio.
import pyreadr
from scipy.stats import norm

# Read the sex-ratio data set exported from R (.rda file)
result = pyreadr.read_r('/Users/cairanvanrooyen/Desktop/personal/website/alfolio-project/assets/ros_data/sexratio.rda')
print("Keys in the R data file:", result.keys())  # inspect which data frames the file contains
df = next(iter(result.values()))  # take the first (and only) data frame
# Give the columns convenient names
df.columns = ['attractiveness', 'percentage_female']
display(df.head())
# Classical least-squares fit for comparison with the Bayesian analysis below
fit_and_plot_lm(data=df, predictors=['attractiveness'], outcome='percentage_female', add_constant=True, show_plot=True, scatter_kws=None, line_kws=None)
# --- Bayesian conjugate normal update ---
# Prior: effect assumed essentially nil; values beyond ±0.5% are implausible
theta_prior = 0.0
se_prior = 0.0025  # prior SD of 0.25%, so ±2 SDs ≈ ±0.5%
# Data: survey estimate of the attractiveness difference
theta_data = 0.06  # observed 6-point difference
se_data = 0.03  # 3-point standard error
# Precision-weighted combination (normal-normal conjugacy):
# precisions add, posterior mean is the precision-weighted average.
prior_precision = 1 / se_prior**2
data_precision = 1 / se_data**2
theta_post = (theta_prior * prior_precision + theta_data * data_precision) / (prior_precision + data_precision)
se_post = np.sqrt(1 / (prior_precision + data_precision))
print(f"Posterior mean: {theta_post:.4f} ({theta_post*100:.2f}%), SE: {se_post:.4f} ({se_post*100:.2f}%)")
# Plot prior, data likelihood, and posterior on a common grid
theta_grid = np.linspace(-0.02, 0.10, 500)
fig, ax = plt.subplots(figsize=(8, 4))
# (mean, sd, legend label) for each of the three curves
curves = [
    (theta_prior, se_prior, f'Prior: N({theta_prior}, {se_prior})'),
    (theta_data, se_data, f'Data: N({theta_data}, {se_data})'),
    (theta_post, se_post, f'Posterior: N({theta_post:.4f}, {se_post:.4f})'),
]
for center, spread, label in curves:
    ax.plot(theta_grid, norm.pdf(theta_grid, center, spread), label=label)
ax.axvline(0, color='black', linestyle='--', alpha=0.3)  # zero-effect reference
ax.set_xlabel('θ (difference in proportion female)')
ax.set_ylabel('Density')
ax.set_title('Beauty and sex ratio: Bayesian update')
ax.legend()
plt.show()
Keys in the R data file: odict_keys(['sexratio'])
| attractiveness | percentage_female | |
|---|---|---|
| 0 | -2.0 | 50.0 |
| 1 | -1.0 | 44.0 |
| 2 | 0.0 | 50.0 |
| 3 | 1.0 | 47.0 |
| 4 | 2.0 | 56.0 |
OLS Regression Results
==============================================================================
Dep. Variable: percentage_female R-squared: 0.284
Model: OLS Adj. R-squared: 0.045
Method: Least Squares F-statistic: 1.190
Date: Tue, 31 Mar 2026 Prob (F-statistic): 0.355
Time: 07:55:50 Log-Likelihood: -13.166
No. Observations: 5 AIC: 30.33
Df Residuals: 3 BIC: 29.55
Df Model: 1
Covariance Type: nonrobust
==================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------
attractiveness 1.5000 1.375 1.091 0.355 -2.875 5.875
const 49.4000 1.944 25.409 0.000 43.213 55.587
==============================================================================
Omnibus: nan Durbin-Watson: 2.698
Prob(Omnibus): nan Jarque-Bera (JB): 0.658
Skew: -0.132 Prob(JB): 0.720
Kurtosis: 1.242 Cond. No. 1.41
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Formula: percentage_female = 49.40 + 1.50*attractiveness
Residual std dev (σ): 4.35 ± 1.77
MAD of residuals: 4.45
/opt/anaconda3/envs/ros_pymc/lib/python3.12/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 5 samples were given.
warn("omni_normtest is not valid with less than 8 observations; %i "
Posterior mean: 0.0004 (0.04%), SE: 0.0025 (0.25%)
9.5 Uniform, weakly informative, and informative priors in regression¶
When data are less informative, prior is more important.
In Bayesian inference the likelihood is multiplied by the prior to get the posterior.
This section looks at uniform, weakly informative, and informative priors in regression.
Uniform priors¶
Flat, uniform are uninformative priors.
# Bayesian linear regression with PyMC using flat (uninformative) priors
x = hibbs['growth'].values
y = hibbs['vote'].values

with pm.Model() as flat_model:
    # Improper flat priors: the posterior is driven entirely by the likelihood
    intercept = pm.Flat("intercept")
    slope = pm.Flat("slope")
    sigma = pm.HalfFlat("sigma")  # flat over the positive reals for the noise scale
    mu = intercept + slope * x  # linear predictor
    y_obs = pm.Normal("y", mu=mu, sigma=sigma, observed=y)  # Gaussian likelihood
    idata = pm.sample(2000, tune=1000)  # NUTS: 2000 draws per chain after 1000 tuning steps

# Posterior summary with 95% highest-density intervals
print(pm.summary(idata, hdi_prob=0.95))

# Trace plot: sampling paths and marginal densities per parameter
az.plot_trace(idata)
plt.tight_layout()
plt.show()

# Forest plot (posterior means + HDIs) and marginal posterior densities
az.plot_forest(idata, hdi_prob=0.95)
az.plot_posterior(idata, hdi_prob=0.95)
plt.show()

# Joint posterior of (intercept, slope): scatter, KDE heatmap, and 3-D view
draws = idata.posterior
a_samples = draws["intercept"].values.flatten()
b_samples = draws["slope"].values.flatten()
sns.scatterplot(x=a_samples, y=b_samples, size=0.5, alpha=0.1, color='blue')
sns.kdeplot(x=a_samples, y=b_samples, fill=True, cmap='Reds', alpha=0.5)
from scipy.stats import gaussian_kde
stacked = np.vstack([a_samples, b_samples])
density = gaussian_kde(stacked)(stacked)  # KDE evaluated at each draw
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(a_samples, b_samples, density, c=density, cmap='Reds', alpha=0.5)
ax.set_xlabel('Intercept')
ax.set_ylabel('Slope')
ax.set_zlabel('Density')
ax.set_title('Joint posterior distribution of intercept and slope')
plt.show()
Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [intercept, slope, sigma]
/opt/anaconda3/envs/ros_pymc/lib/python3.12/site-packages/rich/live.py:260: UserWarning: install "ipywidgets" for
Jupyter support
warnings.warn('install "ipywidgets" for Jupyter support')
Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 1 seconds.
mean sd hdi_2.5% hdi_97.5% mcse_mean mcse_sd ess_bulk \
intercept 46.212 1.807 42.608 49.718 0.033 0.029 3084.0
slope 3.069 0.773 1.607 4.641 0.014 0.012 3026.0
sigma 4.154 0.903 2.647 5.929 0.015 0.017 3596.0
ess_tail r_hat
intercept 2865.0 1.0
slope 3024.0 1.0
sigma 3420.0 1.0
Default prior distribution¶
Next step beyond a flat is a weakly informative prior.
Common conventional prior choices in PyMC:
| Parameter type | Typical prior | Notes |
|---|---|---|
| Unbounded real (intercept, slope) | pm.Normal(0, σ) |
σ scaled to data |
| Positive (std dev, variance) | pm.HalfNormal(σ) or pm.HalfCauchy(β) |
HalfCauchy has heavier tails |
| Positive (precision, rate) | pm.Gamma(α, β) or pm.Exponential(λ) |
|
| Bounded [0,1] (probabilities) | pm.Beta(α, β) |
Beta(1,1) = uniform on [0,1] |
| Categorical weights | pm.Dirichlet(a) |
a = [1,1,...,1] = uniform |
| Correlation matrices | pm.LKJCorr(η) |
η=1 = uniform over correlations |
# Data-scale summaries used to construct rstanarm-style default priors:
# sd of the predictor, and sd and mean of the outcome.
hibbs_growth_sd = hibbs['growth'].std()
hibbs_vote_sd = hibbs['vote'].std()
hibbs_vote_mean = hibbs['vote'].mean()
print(f"Standard deviation of growth: {hibbs_growth_sd:.2f}")
print(f"Standard deviation of vote: {hibbs_vote_sd:.2f}")
print(f"Mean vote: {hibbs_vote_mean:.2f}")
# Bayesian linear regression with weakly informative, data-scaled priors
# (the rstanarm-style defaults computed above)
with pm.Model() as default_model:
    # Intercept centred on the mean vote share, generously scaled
    intercept = pm.Normal("intercept", mu=hibbs_vote_mean, sigma=2.5*hibbs_vote_sd)
    # Slope centred at zero, scaled by the ratio of outcome to predictor spread
    slope = pm.Normal("slope", mu=0, sigma=2.5*(hibbs_vote_sd/hibbs_growth_sd))
    # Exponential prior on the noise scale with mean equal to sd(vote)
    sigma = pm.Exponential("sigma", lam=1/hibbs_vote_sd)
    mu = intercept + slope * x  # linear predictor
    y_obs = pm.Normal("y", mu=mu, sigma=sigma, observed=y)  # Gaussian likelihood
    idata = pm.sample(2000, tune=1000)  # 2000 draws per chain after 1000 tuning steps

# Posterior summary with 95% highest-density intervals
print(pm.summary(idata, hdi_prob=0.95))

# Trace plot: sampling paths and marginal densities per parameter
az.plot_trace(idata)
plt.tight_layout()
plt.show()

# Forest plot (posterior means + HDIs) and marginal posterior densities
az.plot_forest(idata, hdi_prob=0.95)
az.plot_posterior(idata, hdi_prob=0.95)
plt.show()

# Joint posterior scatter of intercept vs slope
draws = idata.posterior
a_samples = draws["intercept"].values.flatten()
b_samples = draws["slope"].values.flatten()
sns.scatterplot(x=a_samples, y=b_samples, size=0.5, alpha=0.1, color='blue')
Initializing NUTS using jitter+adapt_diag...
Standard deviation of growth: 1.40 Standard deviation of vote: 5.61 Mean vote: 52.05
Multiprocess sampling (4 chains in 4 jobs) NUTS: [intercept, slope, sigma]
/opt/anaconda3/envs/ros_pymc/lib/python3.12/site-packages/rich/live.py:260: UserWarning: install "ipywidgets" for
Jupyter support
warnings.warn('install "ipywidgets" for Jupyter support')
Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 1 seconds.
mean sd hdi_2.5% hdi_97.5% mcse_mean mcse_sd ess_bulk \
intercept 46.374 1.718 42.879 49.682 0.028 0.023 3711.0
slope 3.010 0.735 1.510 4.428 0.012 0.010 3671.0
sigma 4.013 0.819 2.667 5.676 0.013 0.015 3944.0
ess_tail r_hat
intercept 4250.0 1.0
slope 4135.0 1.0
sigma 4127.0 1.0
<Axes: >
Weakly informative prior distribution based on subject-matter knowledge¶
We should include informative priors when we have strong prior information.
In the economics vs politics example:
The intercept is the expected vote share for the Democratic candidate when economic growth is zero. We know that the Democratic candidate will get some votes even with zero growth, we might assume this is close to 50%, so we can set a weakly informative prior with a mean of 50% and a standard deviation of 10%. This allows for some uncertainty but still reflects our belief that the Democratic candidate will get around 50% of the vote with zero growth.
We expect the coefficient for economic growth to be positive, but we don't know this for certain. Economic growth historically has been 0-4%. Could a difference of 1% in growth lead to a difference in vote share of 10 percentage points? Maybe, but it seems unlikely. We could set a weakly informative prior with a mean of 5 and a standard deviation of 5, which implies that we think a difference of 1% in growth is likely to lead to a difference in vote share of around 5 percentage points, but it could be as low as 0 or as high as 10 percentage points. This allows for a wide range of possible values but still reflects our belief that the effect is likely to be small.
# Weakly informative priors from subject-matter knowledge:
# intercept ~ N(50, 10) (vote share near 50% at zero growth),
# slope ~ N(5, 5) (effect of 1% growth likely ~5 points, plausibly 0-10).
subject_matter_priors = dict(
    intercept_mu=50, intercept_sigma=10,
    slope_mu=5, slope_sigma=5,
    sigma_sigma=50,
)
fit_and_plot_bayes(data=hibbs, predictor='growth', outcome='vote',
                   samples=2000, tune=1000, hdi_prob=0.95,
                   show_trace=True, show_forest=True,
                   show_posterior=True, show_regression=True,
                   n_regression_lines=100, **subject_matter_priors)
Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [intercept, slope, sigma]
/opt/anaconda3/envs/ros_pymc/lib/python3.12/site-packages/rich/live.py:260: UserWarning: install "ipywidgets" for
Jupyter support
warnings.warn('install "ipywidgets" for Jupyter support')
Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 1 seconds.
mean sd hdi_2.5% hdi_97.5% mcse_mean mcse_sd ess_bulk \
intercept 46.275 1.748 42.747 49.683 0.031 0.026 3353.0
slope 3.068 0.753 1.626 4.622 0.013 0.011 3402.0
sigma 4.126 0.878 2.648 5.849 0.015 0.016 3804.0
ess_tail r_hat
intercept 3294.0 1.0
slope 3434.0 1.0
sigma 4203.0 1.0
-
<xarray.Dataset> Size: 208kB Dimensions: (chain: 4, draw: 2000) Coordinates: * chain (chain) int64 32B 0 1 2 3 * draw (draw) int64 16kB 0 1 2 3 4 5 6 ... 1994 1995 1996 1997 1998 1999 Data variables: intercept (chain, draw) float64 64kB 46.08 44.7 44.01 ... 45.08 44.91 47.64 slope (chain, draw) float64 64kB 2.782 3.444 4.179 ... 3.383 2.735 sigma (chain, draw) float64 64kB 4.39 3.445 4.124 ... 3.604 3.508 4.057 Attributes: created_at: 2026-03-31T07:31:17.009084+00:00 arviz_version: 0.23.4 inference_library: pymc inference_library_version: 5.28.2 sampling_time: 0.7429487705230713 tuning_steps: 1000 -
<xarray.Dataset> Size: 1MB Dimensions: (chain: 4, draw: 2000) Coordinates: * chain (chain) int64 32B 0 1 2 3 * draw (draw) int64 16kB 0 1 2 3 4 ... 1996 1997 1998 1999 Data variables: (12/18) lp (chain, draw) float64 64kB -52.37 -52.32 ... -52.14 perf_counter_start (chain, draw) float64 64kB 9.238e+04 ... 9.238e+04 step_size_bar (chain, draw) float64 64kB 0.5232 0.5232 ... 0.5192 diverging (chain, draw) bool 8kB False False ... False False smallest_eigval (chain, draw) float64 64kB nan nan nan ... nan nan acceptance_rate (chain, draw) float64 64kB 0.8993 0.8425 ... 0.9943 ... ... largest_eigval (chain, draw) float64 64kB nan nan nan ... nan nan divergences (chain, draw) int64 64kB 0 0 0 0 0 0 ... 0 0 0 0 0 0 energy (chain, draw) float64 64kB 53.82 54.64 ... 52.33 max_energy_error (chain, draw) float64 64kB 0.5248 0.4235 ... -0.1207 n_steps (chain, draw) float64 64kB 7.0 5.0 7.0 ... 3.0 7.0 step_size (chain, draw) float64 64kB 0.5359 0.5359 ... 0.6447 Attributes: created_at: 2026-03-31T07:31:17.024314+00:00 arviz_version: 0.23.4 inference_library: pymc inference_library_version: 5.28.2 sampling_time: 0.7429487705230713 tuning_steps: 1000 -
<xarray.Dataset> Size: 256B Dimensions: (y_dim_0: 16) Coordinates: * y_dim_0 (y_dim_0) int64 128B 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 Data variables: y (y_dim_0) float64 128B 44.6 57.76 49.91 61.34 ... 51.24 46.32 52.0 Attributes: created_at: 2026-03-31T07:31:17.027368+00:00 arviz_version: 0.23.4 inference_library: pymc inference_library_version: 5.28.2
Example where an informative prior makes a difference: Beauty and sex ratio¶
In the election forecasting model, an informative prior doesn't change much. With noisy data, however, priors can be important for keeping estimates reasonable.
# Reload the sex-ratio data, then fit classical OLS and a Bayesian model
# with data-scaled (rstanarm-default) priors for comparison.
result = pyreadr.read_r('/Users/cairanvanrooyen/Desktop/personal/website/alfolio-project/assets/ros_data/sexratio.rda')
print("Keys in the R data file:", result.keys())  # inspect which data frames are available
df = next(iter(result.values()))  # take the first (and only) data frame
# Give the columns convenient names
df.columns = ['attractiveness', 'percentage_female']
display(df.head())
# Classical least-squares fit for reference
fit_and_plot_lm(data=df, predictors=['attractiveness'], outcome='percentage_female', add_constant=True, show_plot=True, scatter_kws=None, line_kws=None)
# Data scales used for the rstanarm-style default priors
sd_x = df['attractiveness'].std()
sd_y = df['percentage_female'].std()
mean_y = df['percentage_female'].mean()
fit_and_plot_bayes(data=df, predictor='attractiveness', outcome='percentage_female',
                   intercept_mu=mean_y, intercept_sigma=2.5 * sd_y,
                   slope_mu=0, slope_sigma=2.5 * sd_y / sd_x,
                   sigma_sigma=sd_y,
                   samples=2000, tune=1000, hdi_prob=0.95,
                   show_trace=True, show_forest=True,
                   show_posterior=True, show_regression=True,
                   n_regression_lines=100)
Keys in the R data file: odict_keys(['sexratio'])
| attractiveness | percentage_female | |
|---|---|---|
| 0 | -2.0 | 50.0 |
| 1 | -1.0 | 44.0 |
| 2 | 0.0 | 50.0 |
| 3 | 1.0 | 47.0 |
| 4 | 2.0 | 56.0 |
OLS Regression Results
==============================================================================
Dep. Variable: percentage_female R-squared: 0.284
Model: OLS Adj. R-squared: 0.045
Method: Least Squares F-statistic: 1.190
Date: Tue, 31 Mar 2026 Prob (F-statistic): 0.355
Time: 09:01:03 Log-Likelihood: -13.166
No. Observations: 5 AIC: 30.33
Df Residuals: 3 BIC: 29.55
Df Model: 1
Covariance Type: nonrobust
==================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------
attractiveness 1.5000 1.375 1.091 0.355 -2.875 5.875
const 49.4000 1.944 25.409 0.000 43.213 55.587
==============================================================================
Omnibus: nan Durbin-Watson: 2.698
Prob(Omnibus): nan Jarque-Bera (JB): 0.658
Skew: -0.132 Prob(JB): 0.720
Kurtosis: 1.242 Cond. No. 1.41
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Formula: percentage_female = 49.40 + 1.50*attractiveness
Residual std dev (σ): 4.35 ± 1.77
MAD of residuals: 4.45
/opt/anaconda3/envs/ros_pymc/lib/python3.12/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 5 samples were given.
warn("omni_normtest is not valid with less than 8 observations; %i "
Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [intercept, slope, sigma]
/opt/anaconda3/envs/ros_pymc/lib/python3.12/site-packages/rich/live.py:260: UserWarning: install "ipywidgets" for
Jupyter support
warnings.warn('install "ipywidgets" for Jupyter support')
Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 1 seconds. There were 2 divergences after tuning. Increase `target_accept` or reparameterize.
mean sd hdi_2.5% hdi_97.5% mcse_mean mcse_sd ess_bulk \
intercept 49.365 2.165 44.768 53.504 0.030 0.034 5536.0
slope 1.435 1.560 -1.670 4.630 0.022 0.025 4951.0
sigma 4.820 1.660 2.244 8.182 0.027 0.023 3598.0
ess_tail r_hat
intercept 4486.0 1.0
slope 3993.0 1.0
sigma 3952.0 1.0
-
<xarray.Dataset> Size: 208kB Dimensions: (chain: 4, draw: 2000) Coordinates: * chain (chain) int64 32B 0 1 2 3 * draw (draw) int64 16kB 0 1 2 3 4 5 6 ... 1994 1995 1996 1997 1998 1999 Data variables: intercept (chain, draw) float64 64kB 47.97 51.51 51.8 ... 47.69 49.62 47.21 slope (chain, draw) float64 64kB 1.076 1.565 1.742 ... -0.1376 3.067 sigma (chain, draw) float64 64kB 5.653 2.603 3.669 ... 4.913 4.039 Attributes: created_at: 2026-03-31T08:01:08.701918+00:00 arviz_version: 0.23.4 inference_library: pymc inference_library_version: 5.28.2 sampling_time: 0.6219639778137207 tuning_steps: 1000 -
<xarray.Dataset> Size: 1MB Dimensions: (chain: 4, draw: 2000) Coordinates: * chain (chain) int64 32B 0 1 2 3 * draw (draw) int64 16kB 0 1 2 3 4 ... 1996 1997 1998 1999 Data variables: (12/18) lp (chain, draw) float64 64kB -21.34 -22.38 ... -21.85 perf_counter_start (chain, draw) float64 64kB 9.418e+04 ... 9.418e+04 step_size_bar (chain, draw) float64 64kB 0.7545 0.7545 ... 0.6779 diverging (chain, draw) bool 8kB False False ... False False smallest_eigval (chain, draw) float64 64kB nan nan nan ... nan nan acceptance_rate (chain, draw) float64 64kB 0.7784 0.8737 ... 0.8034 ... ... largest_eigval (chain, draw) float64 64kB nan nan nan ... nan nan divergences (chain, draw) int64 64kB 0 0 0 0 0 0 ... 0 0 0 0 0 0 energy (chain, draw) float64 64kB 25.34 22.89 ... 23.61 max_energy_error (chain, draw) float64 64kB 1.093 1.095 ... 0.8911 n_steps (chain, draw) float64 64kB 3.0 7.0 3.0 ... 7.0 3.0 step_size (chain, draw) float64 64kB 0.5326 0.5326 ... 0.6067 Attributes: created_at: 2026-03-31T08:01:08.714905+00:00 arviz_version: 0.23.4 inference_library: pymc inference_library_version: 5.28.2 sampling_time: 0.6219639778137207 tuning_steps: 1000 -
<xarray.Dataset> Size: 80B Dimensions: (y_dim_0: 5) Coordinates: * y_dim_0 (y_dim_0) int64 40B 0 1 2 3 4 Data variables: y (y_dim_0) float64 40B 50.0 44.0 50.0 47.0 56.0 Attributes: created_at: 2026-03-31T08:01:08.718221+00:00 arviz_version: 0.23.4 inference_library: pymc inference_library_version: 5.28.2
# Informative priors from the sex-ratio literature: the percentage of girls
# is expected to be around 48.8% with SD 0.5%, and any attractiveness effect
# (slope) near 0 with SD 0.2 — i.e. plausibly within about ±0.4 points
# (2 SDs), a tight range consistent with the known small variation in the
# human sex ratio.
fit_and_plot_bayes(data=df, predictor='attractiveness', outcome='percentage_female',
                   intercept_mu=48.8, intercept_sigma=0.5,
                   slope_mu=0, slope_sigma=0.2,
                   sigma_sigma=sd_y,
                   samples=2000, tune=1000, hdi_prob=0.95,
                   show_trace=True, show_forest=True,
                   show_posterior=True, show_regression=True,
                   n_regression_lines=100)
Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [intercept, slope, sigma]
/opt/anaconda3/envs/ros_pymc/lib/python3.12/site-packages/rich/live.py:260: UserWarning: install "ipywidgets" for
Jupyter support
warnings.warn('install "ipywidgets" for Jupyter support')
Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 1 seconds. There was 1 divergence after tuning. Increase `target_accept` or reparameterize.
mean sd hdi_2.5% hdi_97.5% mcse_mean mcse_sd ess_bulk \
intercept 48.842 0.487 47.933 49.849 0.005 0.005 10927.0
slope 0.034 0.199 -0.364 0.419 0.002 0.002 11023.0
sigma 4.511 1.400 2.283 7.316 0.015 0.020 10800.0
ess_tail r_hat
intercept 5891.0 1.0
slope 5935.0 1.0
sigma 6029.0 1.0
-
<xarray.Dataset> Size: 208kB Dimensions: (chain: 4, draw: 2000) Coordinates: * chain (chain) int64 32B 0 1 2 3 * draw (draw) int64 16kB 0 1 2 3 4 5 6 ... 1994 1995 1996 1997 1998 1999 Data variables: intercept (chain, draw) float64 64kB 48.68 49.31 49.66 ... 47.84 48.11 slope (chain, draw) float64 64kB -0.1048 0.06646 ... -0.226 -0.1092 sigma (chain, draw) float64 64kB 6.337 3.746 2.613 ... 3.621 3.526 Attributes: created_at: 2026-03-31T08:13:40.384570+00:00 arviz_version: 0.23.4 inference_library: pymc inference_library_version: 5.28.2 sampling_time: 0.5975902080535889 tuning_steps: 1000 -
<xarray.Dataset> Size: 1MB Dimensions: (chain: 4, draw: 2000) Coordinates: * chain (chain) int64 32B 0 1 2 3 * draw (draw) int64 16kB 0 1 2 3 4 ... 1996 1997 1998 1999 Data variables: (12/18) lp (chain, draw) float64 64kB -15.47 -14.82 ... -15.96 perf_counter_start (chain, draw) float64 64kB 9.493e+04 ... 9.493e+04 step_size_bar (chain, draw) float64 64kB 1.088 1.088 ... 0.9643 diverging (chain, draw) bool 8kB False False ... False False smallest_eigval (chain, draw) float64 64kB nan nan nan ... nan nan acceptance_rate (chain, draw) float64 64kB 0.9975 0.9905 ... 1.0 ... ... largest_eigval (chain, draw) float64 64kB nan nan nan ... nan nan divergences (chain, draw) int64 64kB 0 0 0 0 0 0 ... 0 0 0 0 0 0 energy (chain, draw) float64 64kB 16.63 15.74 ... 20.14 17.4 max_energy_error (chain, draw) float64 64kB -0.2683 -0.145 ... -0.3297 n_steps (chain, draw) float64 64kB 3.0 7.0 3.0 ... 3.0 1.0 step_size (chain, draw) float64 64kB 1.117 1.117 ... 1.077 Attributes: created_at: 2026-03-31T08:13:40.396874+00:00 arviz_version: 0.23.4 inference_library: pymc inference_library_version: 5.28.2 sampling_time: 0.5975902080535889 tuning_steps: 1000 -
<xarray.Dataset> Size: 80B Dimensions: (y_dim_0: 5) Coordinates: * y_dim_0 (y_dim_0) int64 40B 0 1 2 3 4 Data variables: y (y_dim_0) float64 40B 50.0 44.0 50.0 47.0 56.0 Attributes: created_at: 2026-03-31T08:13:40.400973+00:00 arviz_version: 0.23.4 inference_library: pymc inference_library_version: 5.28.2