10. Linear regression with multiple predictors¶
When moving from a simple model $y = a + bx + \epsilon$ to a multiple regression model $y = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \dots + \beta_p x_p + \epsilon$, several new questions arise:
- Which predictors should be included in the model?
- How do we interpret the coefficients $\beta_1, \beta_2, \dots, \beta_p$?
- How do we handle interactions between predictors?
- How do we construct new predictors (e.g., polynomial terms, transformations)?
10.1 Adding predictors to a model¶
Regression coefficients are more complicated to interpret with multiple predictors because each one is partly contingent on the other variables in the model. A coefficient is usually read as the expected change in the response for a one-unit change in the predictor of interest, while holding the other predictors constant.
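This "holding other predictors constant" idea can be seen numerically. A minimal synthetic sketch (not the kidiq data used below): when two predictors are correlated, the slope on one predictor depends on whether the other is in the model.

```python
import numpy as np

# Synthetic illustration: two correlated predictors.
rng = np.random.default_rng(0)
n = 1000
x1 = rng.normal(size=n)
x2 = 0.8 * x1 + rng.normal(scale=0.6, size=n)   # x2 is correlated with x1
y = 1.0 + 2.0 * x1 + 3.0 * x2 + rng.normal(size=n)

def ols(X, y):
    """Least-squares coefficients for a design matrix with an intercept column."""
    return np.linalg.lstsq(np.column_stack([np.ones(len(y)), *X]), y, rcond=None)[0]

b_simple = ols([x1], y)       # y ~ x1 alone
b_multi = ols([x1, x2], y)    # y ~ x1 + x2

# Alone, the slope on x1 absorbs part of x2's effect (~4.4 here);
# jointly, it recovers ~2, the "holding x2 constant" comparison.
print("slope on x1, simple model:", round(b_simple[1], 2))
print("slope on x1, joint model: ", round(b_multi[1], 2))
```

The same mechanism is at work in the kidiq models below: the coefficient on `mom_hs` shrinks once `mom_iq` enters the model, because the two predictors are correlated.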
Starting with a binary predictor¶
Model children's test scores given an indicator of whether the mother graduated from high school (0 = no, 1 = yes).
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import arviz as az
import pymc as pm
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.robust.scale import mad
def fit_and_plot_lm(data, predictors, outcome, add_constant=True, show_plot=True,
                    scatter_kws=None, line_kws=None):
    """
    Fit a linear model using statsmodels, print summary, plot, and show formula.

    Args:
        data: pandas DataFrame
        predictors: list of predictor column names (str)
        outcome: outcome column name (str)
        add_constant: whether to add intercept (default True)
        show_plot: whether to plot (default True)
        scatter_kws: dict, kwargs for scatterplot
        line_kws: dict, kwargs for regression line
    """
    X = data[predictors].copy()
    if add_constant:
        X = sm.add_constant(X, prepend=False)
    y = data[outcome]
    model = sm.OLS(y, X)
    results = model.fit()
    print(results.summary())

    # Print formula
    params = results.params
    terms = " + ".join(f"{params[name]:.2f}*{name}" for name in predictors)
    if add_constant:
        formula = f"{outcome} = {params['const']:.2f} + {terms}"
    else:
        formula = f"{outcome} = {terms}"
    print("Formula:", formula)

    # Print residual standard deviation and its approximate uncertainty
    sigma = np.sqrt(results.mse_resid)
    sigma_se = sigma / np.sqrt(2 * results.df_resid)
    print(f"Residual std dev (σ): {sigma:.2f} ± {sigma_se:.2f}")

    # Print median absolute deviation of residuals
    print("MAD of residuals:", round(mad(results.resid), 2))

    # Plot if only one predictor
    if show_plot and len(predictors) == 1:
        x_name = predictors[0]
        ax = sns.scatterplot(data=data, x=x_name, y=outcome, **(scatter_kws or {}))
        x_vals = np.linspace(data[x_name].min(), data[x_name].max(), 100)
        y_vals = params['const'] + params[x_name] * x_vals if add_constant else params[x_name] * x_vals
        ax.plot(x_vals, y_vals, color='red', **(line_kws or {}))
        ax.set_title('Linear Regression Fit')
        plt.show()
def fit_and_plot_bayes(data, *args,
                       intercept_mu=0, intercept_sigma=50,
                       slope_mu=0, slope_sigma=50,
                       sigma_sigma=50,
                       samples=2000, tune=1000, hdi_prob=0.95,
                       show_trace=True, show_forest=True,
                       show_posterior=True, show_regression=True,
                       n_regression_lines=100):
    """
    Fit a Bayesian linear regression using PyMC and optionally plot diagnostics.
    Supports single or multiple predictors.

    Args:
        data: pandas DataFrame
        *args: predictor column name(s) followed by the outcome column name (last arg)
        intercept_mu, intercept_sigma: prior mean and std for intercept ~ Normal
        slope_mu, slope_sigma: prior mean and std for each slope ~ Normal
        sigma_sigma: prior std for residual noise ~ HalfNormal
        samples: number of posterior draws
        tune: number of tuning steps
        hdi_prob: HDI probability for summaries and plots
        show_trace: plot trace and posterior density per parameter
        show_forest: plot forest (posterior means + HDI)
        show_posterior: plot posterior densities
        show_regression: plot data with posterior regression lines
        n_regression_lines: number of posterior draws to overlay on regression plot

    Returns:
        trace: PyMC InferenceData object
    """
    predictors = list(args[:-1])
    outcome = args[-1]
    y = data[outcome].values

    with pm.Model() as model:
        intercept = pm.Normal("intercept", mu=intercept_mu, sigma=intercept_sigma)
        slopes = []
        mu = intercept
        for pred in predictors:
            s = pm.Normal(f"slope_{pred}", mu=slope_mu, sigma=slope_sigma)
            slopes.append(s)
            mu = mu + s * data[pred].values
        sigma = pm.HalfNormal("sigma", sigma=sigma_sigma)
        likelihood = pm.Normal("y", mu=mu, sigma=sigma, observed=y)
        trace = pm.sample(samples, tune=tune)

    summary = pm.summary(trace, hdi_prob=hdi_prob)
    print(summary)

    # Print regression formula using posterior means
    posterior = trace.posterior
    intercept_mean = posterior["intercept"].values.flatten().mean()
    formula = f"{outcome} = {intercept_mean:.2f}"
    for pred in predictors:
        slope_mean = posterior[f"slope_{pred}"].values.flatten().mean()
        formula += f" + {slope_mean:.2f}*{pred}"
    print(f"\nRegression formula: {formula}")

    if show_trace:
        az.plot_trace(trace)
        plt.tight_layout()
        plt.show()
    if show_forest:
        az.plot_forest(trace, hdi_prob=hdi_prob)
        plt.show()
    if show_posterior:
        az.plot_posterior(trace, hdi_prob=hdi_prob)
        plt.show()
    if show_regression:
        a_samples = posterior["intercept"].values.flatten()
        slope_samples = {pred: posterior[f"slope_{pred}"].values.flatten() for pred in predictors}
        idx = np.random.choice(len(a_samples), n_regression_lines, replace=False)
        fig, axes = plt.subplots(1, len(predictors), figsize=(6 * len(predictors), 5))
        if len(predictors) == 1:
            axes = [axes]
        for ax, pred in zip(axes, predictors):
            x = data[pred].values
            ax.scatter(x, y, alpha=0.5)
            x_grid = np.linspace(x.min(), x.max(), 100)
            # For each posterior draw, compute the line for this predictor,
            # holding the other predictors at their means
            other_contribution = np.zeros(len(a_samples))
            for other_pred in predictors:
                if other_pred != pred:
                    other_contribution += slope_samples[other_pred] * data[other_pred].mean()
            for i in idx:
                y_line = a_samples[i] + other_contribution[i] + slope_samples[pred][i] * x_grid
                ax.plot(x_grid, y_line, alpha=0.05, color="gray")
            # Mean regression line
            mean_other = sum(slope_samples[op].mean() * data[op].mean() for op in predictors if op != pred)
            y_mean = a_samples.mean() + mean_other + slope_samples[pred].mean() * x_grid
            ax.plot(x_grid, y_mean, color="red")
            ax.set_xlabel(pred)
            ax.set_ylabel(outcome)
            ax.set_title(f"{outcome} vs {pred} (others at mean)")
        plt.tight_layout()
        plt.show()

    return trace
kidiq = pd.read_csv('../ros_data/kidiq.csv', skiprows=0)
display(kidiq.head())
fit_and_plot_lm(kidiq, ['mom_hs'], 'kid_score', add_constant=True, show_plot=True, scatter_kws=None, line_kws=None)
fit_and_plot_bayes(kidiq, 'mom_hs', 'kid_score',
intercept_mu=0, intercept_sigma=50,
slope_mu=0, slope_sigma=50,
sigma_sigma=50,
samples=2000, tune=1000, hdi_prob=0.95,
show_trace=True, show_forest=False,
show_posterior=False, show_regression=True,
n_regression_lines=100)
fit_and_plot_bayes(kidiq, 'mom_iq', 'kid_score',
intercept_mu=0, intercept_sigma=50,
slope_mu=0, slope_sigma=50,
sigma_sigma=50,
samples=2000, tune=1000, hdi_prob=0.95,
show_trace=True, show_forest=False,
show_posterior=False, show_regression=True,
n_regression_lines=100)
| | kid_score | mom_hs | mom_iq | mom_work | mom_age |
|---|---|---|---|---|---|
| 0 | 65 | 1 | 121.117529 | 4 | 27 |
| 1 | 98 | 1 | 89.361882 | 4 | 25 |
| 2 | 85 | 1 | 115.443165 | 4 | 27 |
| 3 | 83 | 1 | 99.449639 | 3 | 25 |
| 4 | 115 | 1 | 92.745710 | 4 | 27 |
OLS Regression Results
==============================================================================
Dep. Variable: kid_score R-squared: 0.056
Model: OLS Adj. R-squared: 0.054
Method: Least Squares F-statistic: 25.69
Date: Thu, 02 Apr 2026 Prob (F-statistic): 5.96e-07
Time: 07:54:29 Log-Likelihood: -1911.8
No. Observations: 434 AIC: 3828.
Df Residuals: 432 BIC: 3836.
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
mom_hs 11.7713 2.322 5.069 0.000 7.207 16.336
const 77.5484 2.059 37.670 0.000 73.502 81.595
==============================================================================
Omnibus: 11.077 Durbin-Watson: 1.464
Prob(Omnibus): 0.004 Jarque-Bera (JB): 11.316
Skew: -0.373 Prob(JB): 0.00349
Kurtosis: 2.738 Cond. No. 4.11
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Formula: kid_score = 77.55 + 11.77*mom_hs
Residual std dev (σ): 19.85 ± 0.68
MAD of residuals: 19.27
Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [intercept, slope_mom_hs, sigma]
/opt/anaconda3/envs/ros_pymc/lib/python3.12/site-packages/rich/live.py:260: UserWarning: install "ipywidgets" for
Jupyter support
warnings.warn('install "ipywidgets" for Jupyter support')
Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 1 seconds.
mean sd hdi_2.5% hdi_97.5% mcse_mean mcse_sd \
intercept 77.437 2.067 73.272 81.386 0.036 0.025
slope_mom_hs 11.872 2.323 7.420 16.505 0.039 0.028
sigma 19.918 0.689 18.633 21.319 0.011 0.008
ess_bulk ess_tail r_hat
intercept 3368.0 3999.0 1.0
slope_mom_hs 3463.0 4342.0 1.0
sigma 4309.0 4029.0 1.0
Regression formula: kid_score = 77.44 + 11.87*mom_hs
Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [intercept, slope_mom_iq, sigma]
Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 2 seconds.
mean sd hdi_2.5% hdi_97.5% mcse_mean mcse_sd \
intercept 25.447 5.870 13.940 36.734 0.113 0.094
slope_mom_iq 0.614 0.058 0.501 0.725 0.001 0.001
sigma 18.311 0.618 17.103 19.495 0.010 0.008
ess_bulk ess_tail r_hat
intercept 2690.0 2720.0 1.0
slope_mom_iq 2699.0 2804.0 1.0
sigma 4030.0 3707.0 1.0
Regression formula: kid_score = 25.45 + 0.61*mom_iq
kid_score = 77.55 + 11.77*mom_hs
The intercept, 77.5, is the average score for children whose mothers did not graduate from high school. The slope of 11.77 means that children whose mothers graduated from high school are expected to score 11.77 points higher than those whose mothers did not.
A single continuous predictor¶
kid_score = 25.31 + 0.62*mom_iq
The intercept, 25.31, is the predicted score for a child whose mother has an IQ of 0, which is not meaningful on its own. The slope of 0.62 means that for each one-point increase in the mother's IQ, the child's score is expected to increase by 0.62 points.
Include both predictors in the model¶
fit_and_plot_bayes(kidiq, 'mom_hs', 'mom_iq', 'kid_score',
intercept_mu=0, intercept_sigma=50,
slope_mu=0, slope_sigma=50,
sigma_sigma=50,
samples=2000, tune=1000, hdi_prob=0.95,
show_trace=True, show_forest=False,
show_posterior=False, show_regression=True,
n_regression_lines=100)
Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [intercept, slope_mom_hs, slope_mom_iq, sigma]
Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 3 seconds.
mean sd hdi_2.5% hdi_97.5% mcse_mean mcse_sd \
intercept 25.477 5.942 12.910 36.462 0.096 0.080
slope_mom_hs 5.957 2.242 1.493 10.293 0.031 0.029
slope_mom_iq 0.566 0.061 0.447 0.690 0.001 0.001
sigma 18.181 0.611 17.039 19.407 0.009 0.008
ess_bulk ess_tail r_hat
intercept 3856.0 3739.0 1.0
slope_mom_hs 5325.0 3939.0 1.0
slope_mom_iq 3773.0 3815.0 1.0
sigma 4799.0 4201.0 1.0
Regression formula: kid_score = 25.48 + 5.96*mom_hs + 0.57*mom_iq
# plot mother iq vs kid score colored by whether mother graduated high school and with regression lines for each group
sns.lmplot(data=kidiq, x='mom_iq', y='kid_score', hue='mom_hs', ci=None)
plt.title('Kid Score vs Mom IQ by High School Graduation')
plt.show()
# lm for mothers who graduated high school - filter data to only include those with mom_hs == 1
kidiq_graduated = kidiq[kidiq['mom_hs'] == 1]
kidiq_not_graduated = kidiq[kidiq['mom_hs'] == 0]
fit_and_plot_lm(kidiq_graduated, ['mom_iq'], 'kid_score', add_constant=True, show_plot=True, scatter_kws=None, line_kws=None)
fit_and_plot_lm(kidiq_not_graduated, ['mom_iq'], 'kid_score', add_constant=True, show_plot=True, scatter_kws=None, line_kws=None)
OLS Regression Results
==============================================================================
Dep. Variable: kid_score R-squared: 0.143
Model: OLS Adj. R-squared: 0.140
Method: Least Squares F-statistic: 56.42
Date: Thu, 02 Apr 2026 Prob (F-statistic): 5.24e-13
Time: 07:54:38 Log-Likelihood: -1462.0
No. Observations: 341 AIC: 2928.
Df Residuals: 339 BIC: 2936.
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
mom_iq 0.4846 0.065 7.511 0.000 0.358 0.612
const 39.7862 6.663 5.971 0.000 26.679 52.893
==============================================================================
Omnibus: 5.765 Durbin-Watson: 1.612
Prob(Omnibus): 0.056 Jarque-Bera (JB): 5.908
Skew: -0.314 Prob(JB): 0.0521
Kurtosis: 2.851 Cond. No. 720.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Formula: kid_score = 39.79 + 0.48*mom_iq
Residual std dev (σ): 17.66 ± 0.68
MAD of residuals: 15.55
OLS Regression Results
==============================================================================
Dep. Variable: kid_score R-squared: 0.294
Model: OLS Adj. R-squared: 0.286
Method: Least Squares F-statistic: 37.87
Date: Thu, 02 Apr 2026 Prob (F-statistic): 2.00e-08
Time: 07:54:38 Log-Likelihood: -405.14
No. Observations: 93 AIC: 814.3
Df Residuals: 91 BIC: 819.3
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
mom_iq 0.9689 0.157 6.154 0.000 0.656 1.282
const -11.4820 14.601 -0.786 0.434 -40.485 17.521
==============================================================================
Omnibus: 2.584 Durbin-Watson: 1.924
Prob(Omnibus): 0.275 Jarque-Bera (JB): 2.360
Skew: -0.389 Prob(JB): 0.307
Kurtosis: 2.950 Cond. No. 685.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Formula: kid_score = -11.48 + 0.97*mom_iq
Residual std dev (σ): 19.07 ± 1.41
MAD of residuals: 18.01
Understanding the fitted model¶
kid_score = 25.43 + 5.96*mom_hs + 0.57*mom_iq + error
Intercept = when the mother didn't complete high school and has an IQ of 0, the expected score is 25.43. Not meaningful on its own, since no one has an IQ of 0.
Coefficient for mom_hs = 5.96 means comparing children whose mothers have the same IQ, the children whose mothers graduated from high school are expected to score 5.96 points higher than those whose mothers did not graduate.
Coefficient for mom_iq = 0.57 means comparing children whose mothers have the same high school graduation status, for each unit increase in mother's IQ, the child's score is expected to increase by 0.57 points.
We can also look at the separate regressions for moms who did and did not graduate from high school to see how the relationship between mom_iq and kid_score differs by mom_hs status.
- mom_hs = 0: kid_score = -11.48 + 0.97*mom_iq
- mom_hs = 1: kid_score = 39.79 + 0.48*mom_iq
When should we look for interactions?¶
Interactions can be important. Typically we look for them when predictors have large main effects, that is, large coefficients when not interacted.
For example, smoking is strongly associated with cancer, and it is crucial to adjust for other factors such as radon exposure. Those who smoke and are exposed to radon may have a much higher risk of cancer than those who only smoke or are only exposed to radon: an interaction between smoking and radon exposure.
We can fit models separately for smokers and non-smokers to see if the relationship between radon exposure and cancer risk differs by smoking status. We can also include an interaction term in a single model to formally test for the presence of an interaction between smoking and radon exposure.
Interpreting regression coefficients in the presence of interactions¶
Models with interactions are easier to interpret when we center the predictors, typically about their means.
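As a sketch of centering plus an interaction term, here is a statsmodels fit on synthetic data whose coefficients roughly mimic the two separate kidiq fits above (hs effect 51 ≈ 39.79 − (−11.48); interaction −0.48 ≈ 0.48 − 0.97). In the real notebook the kidiq DataFrame would be used instead of this generated one.

```python
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

# Synthetic stand-in for the kidiq data
rng = np.random.default_rng(1)
n = 400
mom_hs = rng.integers(0, 2, size=n)
mom_iq = rng.normal(100, 15, size=n)
kid_score = (-11 + 51 * mom_hs + 0.97 * mom_iq
             - 0.48 * mom_hs * mom_iq + rng.normal(0, 18, size=n))
df = pd.DataFrame({'kid_score': kid_score, 'mom_hs': mom_hs, 'mom_iq': mom_iq})

# Center predictors about their means: the main effects now describe the
# average case rather than the meaningless mom_iq = 0 case.
df['c_hs'] = df['mom_hs'] - df['mom_hs'].mean()
df['c_iq'] = df['mom_iq'] - df['mom_iq'].mean()

# 'c_hs * c_iq' expands to c_hs + c_iq + c_hs:c_iq (main effects + interaction)
fit = smf.ols('kid_score ~ c_hs * c_iq', data=df).fit()
print(fit.params.round(2))
```

Note that centering changes the intercept and main effects but leaves the interaction coefficient itself unchanged; the fitted `c_hs:c_iq` term should recover roughly the −0.48 slope difference seen in the two separate regressions.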
10.2 Interpreting regression coefficients¶
Dummy variables are used to represent categorical predictors in regression models.
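A minimal sketch of how dummy (indicator) variables are created with pandas, using a made-up ethnicity column; the same pattern is applied to the earnings data below:

```python
import pandas as pd

# Hypothetical categorical column; get_dummies creates one 0/1 column per level
df = pd.DataFrame({'ethnicity': ['White', 'Black', 'Hispanic', 'White', 'Other']})
dummies = pd.get_dummies(df['ethnicity'], prefix='eth', dtype=int)
print(dummies.columns.tolist())
# → ['eth_Black', 'eth_Hispanic', 'eth_Other', 'eth_White']

# Drop one level to serve as the reference category; its effect is absorbed
# by the intercept, and each remaining coefficient is a difference from it
dummies = dummies.drop(columns='eth_Black')
```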
earnings = pd.read_csv('../ros_data/earnings.csv', skiprows=0)
# earnings['earn_k'] = earnings['earn'] / 1000
# earnings['c_height'] = earnings['height'] - 66 # Center height around 66 inches for better interpretability of the intercept
display(earnings.head())
earnings_clean = earnings.dropna(subset=['height', 'weight'])
# Predict weight in pounds from height
# fit_and_plot_lm(earnings_clean, ['height'], 'weight', add_constant=True, show_plot=True, scatter_kws=None, line_kws=None)
earnings_model = fit_and_plot_bayes(earnings_clean, 'height', 'weight',
intercept_mu=0, intercept_sigma=50,
slope_mu=0, slope_sigma=50,
sigma_sigma=50,
samples=2000, tune=1000, hdi_prob=0.95,
show_trace=True, show_forest=False,
show_posterior=False, show_regression=True,
n_regression_lines=100)
print(earnings_model)
| | height | weight | male | earn | earnk | ethnicity | education | mother_education | father_education | walk | exercise | smokenow | tense | angry | age |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 74 | 210.0 | 1 | 50000.0 | 50.0 | White | 16.0 | 16.0 | 16.0 | 3 | 3 | 2.0 | 0.0 | 0.0 | 45 |
| 1 | 66 | 125.0 | 0 | 60000.0 | 60.0 | White | 16.0 | 16.0 | 16.0 | 6 | 5 | 1.0 | 0.0 | 0.0 | 58 |
| 2 | 64 | 126.0 | 0 | 30000.0 | 30.0 | White | 16.0 | 16.0 | 16.0 | 8 | 1 | 2.0 | 1.0 | 1.0 | 29 |
| 3 | 65 | 200.0 | 0 | 25000.0 | 25.0 | White | 17.0 | 17.0 | NaN | 8 | 1 | 2.0 | 0.0 | 0.0 | 57 |
| 4 | 63 | 110.0 | 0 | 50000.0 | 50.0 | Other | 16.0 | 16.0 | 16.0 | 5 | 6 | 2.0 | 0.0 | 0.0 | 91 |
Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [intercept, slope_height, sigma]
Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 5 seconds.
mean sd hdi_2.5% hdi_97.5% mcse_mean mcse_sd \
intercept -163.790 11.428 -185.933 -141.570 0.231 0.170
slope_height 4.807 0.171 4.470 5.135 0.003 0.003
sigma 28.978 0.487 28.075 29.950 0.008 0.008
ess_bulk ess_tail r_hat
intercept 2447.0 2858.0 1.0
slope_height 2461.0 2942.0 1.0
sigma 3339.0 3024.0 1.0
Regression formula: weight = -163.79 + 4.81*height
Inference data with groups: > posterior > sample_stats > observed_data
# Posterior prediction for a new observation (PyMC equivalent of R's posterior_predict)
# Predict weight for a person who is 66 inches tall
new_height = 66
posterior = earnings_model.posterior
intercept_samples = posterior["intercept"].values.flatten() # all posterior draws for intercept (e.g. 4000 samples)
slope_samples = posterior["slope_height"].values.flatten() # all posterior draws for slope
sigma_samples = posterior["sigma"].values.flatten() # all posterior draws for residual std dev
# Point prediction for each posterior draw: E[weight | height=66]
mu_samples = intercept_samples + slope_samples * new_height # linear predictor evaluated at new_height for each draw
# Full posterior predictive: sample a new observation from Normal(mu, sigma) for each draw
# This adds residual noise on top of the mean prediction, capturing both:
# 1. Parameter uncertainty (from the spread of intercept/slope draws)
# 2. Individual-level variation (from sigma)
# This is what makes it a *prediction* interval rather than just a *confidence* interval
pred_samples = np.random.normal(mu_samples, sigma_samples)
print(f"Predicted weight for height={new_height} inches:")
print(f" Mean: {pred_samples.mean():.1f} lbs")
print(f" Median: {np.median(pred_samples):.1f} lbs")
print(f" 50% interval: [{np.percentile(pred_samples, 25):.1f}, {np.percentile(pred_samples, 75):.1f}]")
print(f" 95% interval: [{np.percentile(pred_samples, 2.5):.1f}, {np.percentile(pred_samples, 97.5):.1f}]")
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(pred_samples, bins=50, density=True, alpha=0.7)
ax.axvline(pred_samples.mean(), color='red', linestyle='--', label=f'Mean: {pred_samples.mean():.1f}')
ax.set_xlabel('Predicted weight (lbs)')
ax.set_ylabel('Density')
ax.set_title(f'Posterior predictive distribution for height = {new_height} inches')
ax.legend()
plt.show()
Predicted weight for height=66 inches:
  Mean: 154.5 lbs
  Median: 154.5 lbs
  50% interval: [134.7, 174.6]
  95% interval: [96.6, 211.3]
earnings['c_height'] = earnings['height'] - 66 # Center height around 66 inches for better interpretability of the intercept
display(earnings.head())
earnings_clean = earnings.dropna(subset=['c_height', 'weight', 'male'])
# Predict weight in pounds from height and male
earnings_model = fit_and_plot_bayes(earnings_clean, 'male', 'c_height', 'weight',
intercept_mu=0, intercept_sigma=50,
slope_mu=0, slope_sigma=50,
sigma_sigma=50,
samples=2000, tune=1000, hdi_prob=0.95,
show_trace=True, show_forest=False,
show_posterior=False, show_regression=True,
n_regression_lines=100)
| | height | weight | male | earn | earnk | ethnicity | education | mother_education | father_education | walk | exercise | smokenow | tense | angry | age | c_height |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 74 | 210.0 | 1 | 50000.0 | 50.0 | White | 16.0 | 16.0 | 16.0 | 3 | 3 | 2.0 | 0.0 | 0.0 | 45 | 8 |
| 1 | 66 | 125.0 | 0 | 60000.0 | 60.0 | White | 16.0 | 16.0 | 16.0 | 6 | 5 | 1.0 | 0.0 | 0.0 | 58 | 0 |
| 2 | 64 | 126.0 | 0 | 30000.0 | 30.0 | White | 16.0 | 16.0 | 16.0 | 8 | 1 | 2.0 | 1.0 | 1.0 | 29 | -2 |
| 3 | 65 | 200.0 | 0 | 25000.0 | 25.0 | White | 17.0 | 17.0 | NaN | 8 | 1 | 2.0 | 0.0 | 0.0 | 57 | -1 |
| 4 | 63 | 110.0 | 0 | 50000.0 | 50.0 | Other | 16.0 | 16.0 | 16.0 | 5 | 6 | 2.0 | 0.0 | 0.0 | 91 | -3 |
Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [intercept, slope_male, slope_c_height, sigma]
Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 1 seconds.
mean sd hdi_2.5% hdi_97.5% mcse_mean mcse_sd \
intercept 149.499 0.931 147.664 151.292 0.013 0.009
slope_male 11.881 1.962 8.199 15.786 0.029 0.022
slope_c_height 3.883 0.248 3.431 4.394 0.004 0.003
sigma 28.696 0.473 27.790 29.653 0.006 0.006
ess_bulk ess_tail r_hat
intercept 5292.0 5856.0 1.0
slope_male 4571.0 5203.0 1.0
slope_c_height 4731.0 5896.0 1.0
sigma 5858.0 5555.0 1.0
Regression formula: weight = 149.50 + 11.88*male + 3.88*c_height
The coefficient of roughly 12 on `male` tells us that, comparing a man and a woman of the same height, the man is expected to weigh about 12 pounds more on average.
# Predict weight of a 70 inch woman:
# Posterior prediction for a new observation (PyMC equivalent of R's posterior_predict)
# Predict weight for a woman (male = 0) who is 70 inches tall
new_height = 70
posterior = earnings_model.posterior
intercept_samples = posterior["intercept"].values.flatten() # all posterior draws for intercept (e.g. 4000 samples)
slope_height_samples = posterior["slope_c_height"].values.flatten() # all posterior draws for slope
slope_male_samples = posterior["slope_male"].values.flatten() # all posterior draws for slope
sigma_samples = posterior["sigma"].values.flatten() # all posterior draws for residual std dev
# Point prediction for each posterior draw: E[weight | c_height = 70 - 66, male = 0]
mu_samples = intercept_samples + slope_height_samples * (new_height - 66) + slope_male_samples * 0
# Full posterior predictive: sample a new observation from Normal(mu, sigma) for each draw
pred_samples = np.random.normal(mu_samples, sigma_samples)
print(f"Predicted weight for height={new_height} inches:")
print(f" Mean: {pred_samples.mean():.1f} lbs")
print(f" Median: {np.median(pred_samples):.1f} lbs")
print(f" 50% interval: [{np.percentile(pred_samples, 25):.1f}, {np.percentile(pred_samples, 75):.1f}]")
print(f" 95% interval: [{np.percentile(pred_samples, 2.5):.1f}, {np.percentile(pred_samples, 97.5):.1f}]")
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(pred_samples, bins=50, density=True, alpha=0.7)
ax.axvline(pred_samples.mean(), color='red', linestyle='--', label=f'Mean: {pred_samples.mean():.1f}')
ax.set_xlabel('Predicted weight (lbs)')
ax.set_ylabel('Density')
ax.set_title(f'Posterior predictive distribution for a woman, height = {new_height} inches')
ax.legend()
plt.show()
Predicted weight for height=70 inches:
  Mean: 164.6 lbs
  Median: 164.7 lbs
  50% interval: [145.3, 184.0]
  95% interval: [107.6, 221.2]
Using indicator variables for multiple levels of a categorical predictor¶
Add ethnicity
# Create dummy variables for ethnicity
# Drop any existing eth_ columns first to avoid duplicates on re-run
earnings_clean = earnings_clean[[c for c in earnings_clean.columns if not c.startswith('eth_')]]
eth_dummies = pd.get_dummies(earnings_clean['ethnicity'], prefix='eth', dtype=int)
eth_dummies = eth_dummies.drop(columns='eth_Black') # Black is reference group
earnings_clean = pd.concat([earnings_clean, eth_dummies], axis=1)
print(eth_dummies.columns.tolist())
with pm.Model() as earnings_eth_model:
    intercept = pm.Normal("intercept", mu=0, sigma=50)
    male = pm.Normal("male", mu=0, sigma=50)
    c_height = pm.Normal("c_height", mu=0, sigma=50)
    eth_Hispanic = pm.Normal("eth_Hispanic", mu=0, sigma=50)
    eth_Other = pm.Normal("eth_Other", mu=0, sigma=50)
    eth_White = pm.Normal("eth_White", mu=0, sigma=50)
    sigma = pm.HalfNormal("sigma", sigma=50)

    # intercept = expected weight for a Black female of average height
    # each eth_ coefficient = difference from the Black reference group
    mu = (intercept
          + male * earnings_clean['male'].values
          + c_height * earnings_clean['c_height'].values
          + eth_Hispanic * earnings_clean['eth_Hispanic'].values
          + eth_Other * earnings_clean['eth_Other'].values
          + eth_White * earnings_clean['eth_White'].values)
    y = pm.Normal("y", mu=mu, sigma=sigma, observed=earnings_clean['weight'].values)
    trace_eth = pm.sample(2000, tune=1000)

print(pm.summary(trace_eth, hdi_prob=0.95))
az.plot_trace(trace_eth)
plt.tight_layout()
plt.show()
Initializing NUTS using jitter+adapt_diag...
['eth_Hispanic', 'eth_Other', 'eth_White']
Multiprocess sampling (4 chains in 4 jobs) NUTS: [intercept, male, c_height, eth_Hispanic, eth_Other, eth_White, sigma]
Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 3 seconds.
mean sd hdi_2.5% hdi_97.5% mcse_mean mcse_sd \
intercept 153.991 2.253 149.426 158.296 0.035 0.027
male 12.173 2.004 8.175 16.079 0.028 0.022
c_height 3.845 0.254 3.354 4.354 0.004 0.003
eth_Hispanic -5.830 3.529 -12.911 0.919 0.047 0.036
eth_Other -11.851 5.186 -21.855 -1.608 0.069 0.055
eth_White -4.864 2.281 -9.438 -0.381 0.035 0.026
sigma 28.644 0.473 27.715 29.552 0.006 0.005
ess_bulk ess_tail r_hat
intercept 4101.0 4227.0 1.0
male 5147.0 5286.0 1.0
c_height 5274.0 5557.0 1.0
eth_Hispanic 5525.0 5765.0 1.0
eth_Other 5699.0 5281.0 1.0
eth_White 4231.0 4547.0 1.0
sigma 6828.0 5222.0 1.0
Comparing a Black person and a Hispanic person of the same sex and height, the Hispanic person is expected to weigh about 5.83 pounds less on average.
Dummy variables act like switches that turn on or off the effect of a particular category. The coefficients for the dummy variables represent the difference in the response variable between the category represented by the dummy variable and the reference category, holding all else constant.
The reference is always the case where all dummy variables are 0. Here the reference category is Black, since the eth_Black column was dropped.
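As a quick check on how the dummy coefficients combine, a small sketch using the posterior means copied from the summary above (the reference group gets no eth_ coefficient, so its prediction is just intercept + male):

```python
# Posterior means copied from the summary above (rounded)
coef = {'intercept': 153.99, 'male': 12.17,
        'eth_Hispanic': -5.83, 'eth_Other': -11.85, 'eth_White': -4.86}

# Expected weight for an average-height (c_height = 0) man of each ethnicity
preds = {}
for eth in ['Black', 'Hispanic', 'Other', 'White']:
    # Black is the reference group, so its eth_ coefficient is implicitly 0
    preds[eth] = coef['intercept'] + coef['male'] + coef.get(f'eth_{eth}', 0.0)
    print(f"{eth:9s} {preds[eth]:6.1f} lbs")
```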
Using an index variable to access a group-level predictor¶
Sometimes we have predictors that are measured at a group level rather than an individual level. For example, we have data on 1000 students from 20 schools, and we want to include a predictor that is the average income of parents in each school. We can create an index variable that assigns a unique number to each school, and then use this index variable to merge the group-level predictor (average income) with the individual-level data on students. In simple terms this means that we allocate the same average income value to all students in the same school, and then include this variable in our regression model to see how it affects student outcomes.
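The merge described above can be sketched in pandas with a hypothetical students/schools example (all names and values here are made up): map each student's school index into a group-level table so every student in the same school receives the same school-level value.

```python
import pandas as pd

# Individual-level data: one row per student, with a school index
students = pd.DataFrame({'student_id': [1, 2, 3, 4],
                         'school': ['A', 'A', 'B', 'C'],
                         'test_score': [72, 85, 90, 64]})

# Group-level predictor: one value per school (average parent income, in $1000s)
school_income = pd.Series({'A': 55.0, 'B': 72.0, 'C': 41.0},
                          name='avg_parent_income_k')

# Index into the group-level table: students 1 and 2 (both school A)
# get the same value, which can then enter the regression as a predictor
students['avg_parent_income_k'] = students['school'].map(school_income)
print(students)
```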