7. Linear regression with a single predictor¶
Regression is for predicting an outcome $y$ from predictors $x_1, x_2, \ldots, x_p$.
Here we look at predicting a single outcome $y$ from a single predictor $x$ by fitting the line $y = a + bx + \epsilon$ to data $(x_i, y_i),\ i = 1, \ldots, n$.
7.1 Example: predicting presidential vote share from the economy¶
Fitting a linear model to the data¶
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from statsmodels.robust.scale import mad
def fit_and_plot_lm(data, predictors, outcome, add_constant=True, show_plot=True, scatter_kws=None, line_kws=None):
    """
    Fit a linear model using statsmodels, print summary, plot, and show formula.

    Args:
        data: pandas DataFrame
        predictors: list of predictor column names (str)
        outcome: outcome column name (str)
        add_constant: whether to add intercept (default True)
        show_plot: whether to plot (default True)
        scatter_kws: dict, kwargs for scatterplot
        line_kws: dict, kwargs for regression line
    """
    X = data[predictors].copy()
    if add_constant:
        X = sm.add_constant(X, prepend=False)
    y = data[outcome]
    model = sm.OLS(y, X)
    results = model.fit()
    print(results.summary())
    # Print formula
    params = results.params
    formula = f"{outcome} = " + " + ".join([f"{params[name]:.2f}*{name}" for name in predictors])
    if add_constant:
        formula = f"{outcome} = {params['const']:.2f} + " + " + ".join([f"{params[name]:.2f}*{name}" for name in predictors])
    print("Formula:", formula)
    # Print residual standard deviation and its uncertainty
    sigma = np.sqrt(results.mse_resid)
    sigma_se = sigma / np.sqrt(2 * results.df_resid)
    print(f"Residual std dev (σ): {sigma:.2f} ± {sigma_se:.2f}")
    # Print median absolute deviation of residuals
    print("MAD of residuals:", round(mad(results.resid), 2))
    # Plot if only one predictor
    if show_plot and len(predictors) == 1:
        x_name = predictors[0]
        ax = sns.scatterplot(data=data, x=x_name, y=outcome, **(scatter_kws or {}))
        x_vals = np.linspace(data[x_name].min(), data[x_name].max(), 100)
        y_vals = params['const'] + params[x_name] * x_vals if add_constant else params[x_name] * x_vals
        ax.plot(x_vals, y_vals, color='red', **(line_kws or {}))
        ax.set_title('Linear Regression Fit')
        plt.show()
# Import hibbs.dat from /ros_data
hibbs = pd.read_csv('../ros_data/hibbs.dat', sep=r'\s+')  # raw string avoids the '\s' SyntaxWarning
# Display the first few rows of the dataset
display(hibbs.head(10))
fit_and_plot_lm(data=hibbs, predictors=['growth'], outcome='vote', add_constant=True, show_plot=True, scatter_kws=None, line_kws=None)
| | year | growth | vote | inc_party_candidate | other_candidate |
|---|---|---|---|---|---|
| 0 | 1952 | 2.40 | 44.60 | Stevenson | Eisenhower |
| 1 | 1956 | 2.89 | 57.76 | Eisenhower | Stevenson |
| 2 | 1960 | 0.85 | 49.91 | Nixon | Kennedy |
| 3 | 1964 | 4.21 | 61.34 | Johnson | Goldwater |
| 4 | 1968 | 3.02 | 49.60 | Humphrey | Nixon |
| 5 | 1972 | 3.62 | 61.79 | Nixon | McGovern |
| 6 | 1976 | 1.08 | 48.95 | Ford | Carter |
| 7 | 1980 | -0.39 | 44.70 | Carter | Reagan |
| 8 | 1984 | 3.86 | 59.17 | Reagan | Mondale |
| 9 | 1988 | 2.27 | 53.94 | Bush, Sr. | Dukakis |
OLS Regression Results
==============================================================================
Dep. Variable: vote R-squared: 0.580
Model: OLS Adj. R-squared: 0.550
Method: Least Squares F-statistic: 19.32
Date: Fri, 13 Mar 2026 Prob (F-statistic): 0.000610
Time: 07:54:23 Log-Likelihood: -42.839
No. Observations: 16 AIC: 89.68
Df Residuals: 14 BIC: 91.22
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
growth 3.0605 0.696 4.396 0.001 1.567 4.554
const 46.2476 1.622 28.514 0.000 42.769 49.726
==============================================================================
Omnibus: 5.392 Durbin-Watson: 2.379
Prob(Omnibus): 0.067 Jarque-Bera (JB): 2.828
Skew: -0.961 Prob(JB): 0.243
Kurtosis: 3.738 Cond. No. 4.54
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Formula: vote = 46.25 + 3.06*growth
Residual std dev (σ): 3.76 ± 0.71
MAD of residuals: 2.27
Understanding the fitted model¶
The model is $y = a + bx + \epsilon$, where $a$ is the intercept and $b$ is the slope. We can also fit a model without an intercept, $y = bx + \epsilon$; that model is forced through the origin (0, 0), so the fitted line gives $y = 0$ when $x = 0$.
The fitted model is $y = 46.25 + 3.06x$, which is $vote = 46.25 + 3.06 \times growth$.
- At x = 0 (zero growth), the incumbent party's predicted vote share is 46.25%; this interpretation of the intercept is loose, since it just describes where the fitted line happens to cross x = 0.
- For every 1 percentage point increase in growth, the incumbent party's predicted vote share increases by 3.06 percentage points.
- The standard errors of the coefficients are small. We don't normally care much about the standard error of the intercept, but the standard error of the slope is 0.7, which is small compared to the slope of 3.06, and the 95% confidence interval for the slope, 1.57 to 4.55, is well above 0.
- The estimated residual standard deviation is 3.76 ± 0.71. That is, the linear model predicts the outcome to within about 3.76 percentage points of the true vote share, on average; roughly 68% of actual vote shares fall within 3.76 percentage points of the prediction.
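The 68% figure is just the probability that a normally distributed draw lands within one standard deviation of its mean, which we can check with scipy:

```python
from scipy.stats import norm

# Fraction of a normal distribution within ±1 standard deviation of its mean
within_one_sd = norm.cdf(1) - norm.cdf(-1)
print(round(within_one_sd, 3))  # → 0.683
```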
Using the model to predict¶
We use the model to predict Democrat Hillary Clinton's vote share in the 2016 election. The growth rate leading up to 2016 was about 2%, so the predicted vote share for Clinton is $46.25 + 3.06 \times 2 = 52.37\%$.
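The point prediction is simple arithmetic with the fitted coefficients (values taken from the regression summary above):

```python
a_hat, b_hat = 46.25, 3.06   # fitted intercept and slope from the summary above
growth_2016 = 2.0            # approximate 2016 economic growth, in percent
prediction = a_hat + b_hat * growth_2016
print(round(prediction, 2))  # → 52.37
```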
This does not by itself tell us how likely Clinton is to win, but we can use the model to calculate the probability that her actual vote share exceeds 50%. Assuming the errors are normally distributed with mean 0 and standard deviation about 3.9, we compute this probability from the cumulative distribution function (CDF) of the normal distribution.
The result is approximately 0.72: according to the model, there is about a 72% chance that Clinton wins the election, based on the 2016 economic growth rate.
If we want to predict the winner, why predict vote share? Why not use economic growth to predict directly whether the incumbent party will win or lose? Consider:
- Evenly divided elections. When the predicted vote share is close to 50%, a predicted winner is not meaningful; here we want to predict the vote share itself.
- Competitive elections. Here it is better to predict the vote margin and the probability of winning than to make a deterministic win/lose call.
- Landslides. There is nothing impressive about predicting the winner of a landslide, but predicting the vote share accurately is still challenging.
from scipy.stats import norm
result = 1 - norm.cdf(50, 52.3, 3.9)
print(result)
0.7223187166432111
7.2 Checking the model-fitting procedure using fake-data simulation¶
Step 1: Creating the pretend world¶
a = 46.3
b = 3.0
sigma = 3.9
x = hibbs['growth']
n = len(x)
Step 2: Simulating fake data¶
y = a + b * x + np.random.normal(0, sigma, size=n)
fake_data = pd.DataFrame({'growth': x, 'vote': y})
display(fake_data.head(10))
| | growth | vote |
|---|---|---|
| 0 | 2.40 | 52.751575 |
| 1 | 2.89 | 56.822922 |
| 2 | 0.85 | 51.535142 |
| 3 | 4.21 | 56.793039 |
| 4 | 3.02 | 61.165274 |
| 5 | 3.62 | 53.599406 |
| 6 | 1.08 | 48.266235 |
| 7 | -0.39 | 41.699206 |
| 8 | 3.86 | 62.825888 |
| 9 | 2.27 | 54.687790 |
Step 3: Fitting the model and comparing the results to the truth¶
# Refit
X = sm.add_constant(fake_data[['growth']])
results = sm.OLS(fake_data['vote'], X).fit()
b_hat = results.params['growth']
print(f"Estimated slope (b_hat): {b_hat:.2f}")
b_se = results.bse['growth']
print(f"Standard error of slope (b_se): {b_se:.2f}")
cover_68 = abs(b - b_hat) < b_se
cover_95 = abs(b - b_hat) < 2 * b_se
print(f"68% coverage: {cover_68}")
print(f"95% coverage: {cover_95}")
Estimated slope (b_hat): 3.78
Standard error of slope (b_se): 0.61
68% coverage: False
95% coverage: True
fit_and_plot_lm(data=fake_data, predictors=['growth'], outcome='vote', add_constant=True, show_plot=True, scatter_kws=None, line_kws=None)
OLS Regression Results
==============================================================================
Dep. Variable: vote R-squared: 0.731
Model: OLS Adj. R-squared: 0.712
Method: Least Squares F-statistic: 38.14
Date: Fri, 13 Mar 2026 Prob (F-statistic): 2.41e-05
Time: 07:54:23 Log-Likelihood: -40.761
No. Observations: 16 AIC: 85.52
Df Residuals: 14 BIC: 87.07
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
growth 3.7761 0.611 6.175 0.000 2.465 5.088
const 45.2152 1.424 31.744 0.000 42.160 48.270
==============================================================================
Omnibus: 1.899 Durbin-Watson: 2.880
Prob(Omnibus): 0.387 Jarque-Bera (JB): 1.144
Skew: -0.346 Prob(JB): 0.565
Kurtosis: 1.888 Cond. No. 4.54
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Formula: vote = 45.22 + 3.78*growth
Residual std dev (σ): 3.30 ± 0.62
MAD of residuals: 3.74
Step 4: Embedding the simulation in a loop¶
The CI worked once, but will it work repeatedly? We can repeat the simulation many times to see how often the confidence interval for the slope contains the true value b = 3.0.
Run model 1000 times.
n_simulations = 1000
cover_68_count = 0
cover_95_count = 0
for i in range(n_simulations):
y = a + b * x + np.random.normal(0, sigma, size=n)
fake_data = pd.DataFrame({'growth': x, 'vote': y})
X = sm.add_constant(fake_data[['growth']])
results = sm.OLS(fake_data['vote'], X).fit()
b_hat = results.params['growth']
b_se = results.bse['growth']
if abs(b - b_hat) < b_se:
cover_68_count += 1
if abs(b - b_hat) < 2 * b_se:
cover_95_count += 1
print(f"68% coverage: {cover_68_count / n_simulations:.2f}")
print(f"95% coverage: {cover_95_count / n_simulations:.2f}")
68% coverage: 0.67
95% coverage: 0.93
7.3 Formulating comparisons as regression models¶
Simple averages and comparisons can be interpreted as special cases of linear regression.
To express a comparison as a regression we need an indicator variable: a predictor that equals 1 or 0, e.g., male = 1, female = 0.
Estimating the mean is the same as regressing on a constant term¶
Simulate 20 observations from a population with mean 2 and standard deviation 5
n_0 = 20
population_mean_0 = 2
population_sd_0 = 5
data_0 = np.random.normal(population_mean_0, population_sd_0, size=n_0)
sample_mean_0 = np.mean(data_0)
sample_sd_0 = np.std(data_0, ddof=1)
sample_standard_error_0 = sample_sd_0 / np.sqrt(n_0)
print(f"Data: {data_0}")
print(f"Sample mean: {sample_mean_0:.2f}")
print(f"Sample standard deviation: {sample_sd_0:.2f}")
print(f"Sample standard error: {sample_standard_error_0:.2f}")
X_const_0 = np.ones((len(data_0), 1))
results_0 = sm.OLS(data_0, X_const_0).fit()
print(results_0.summary())
sns.scatterplot(x=data_0, y=np.ones(len(data_0)))
Data: [-5.49164234 1.47496622 5.88262182 -1.74030413 6.79782363 -5.62631569
-0.99651605 -1.9485498 3.54093597 10.60755627 3.45778513 3.37706077
0.14066584 -0.46305676 5.76016099 -0.89073903 -1.09044826 -4.447712
-1.53498923 -2.06835602]
Sample mean: 0.74
Sample standard deviation: 4.30
Sample standard error: 0.96
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: -0.000
Model: OLS Adj. R-squared: -0.000
Method: Least Squares F-statistic: nan
Date: Fri, 13 Mar 2026 Prob (F-statistic): nan
Time: 07:54:23 Log-Likelihood: -57.057
No. Observations: 20 AIC: 116.1
Df Residuals: 19 BIC: 117.1
Df Model: 0
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 0.7370 0.962 0.766 0.453 -1.277 2.751
==============================================================================
Omnibus: 1.368 Durbin-Watson: 1.764
Prob(Omnibus): 0.505 Jarque-Bera (JB): 1.040
Skew: 0.531 Prob(JB): 0.594
Kurtosis: 2.651 Cond. No. 1.00
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Estimating a difference is the same as regressing on an indicator variable¶
n_1 = 30
population_mean_1 = 8
population_sd_1 = 5
data_1 = np.random.normal(population_mean_1, population_sd_1, size=n_1)
sample_mean_1 = np.mean(data_1)
sample_sd_1 = np.std(data_1, ddof=1)
sample_standard_error_1 = sample_sd_1 / np.sqrt(n_1)
print(f"Data: {data_1}")
print(f"Sample mean: {sample_mean_1:.2f}")
print(f"Sample standard deviation: {sample_sd_1:.2f}")
print(f"Sample standard error: {sample_standard_error_1:.2f}")
diff_mean = sample_mean_1 - sample_mean_0
se = np.sqrt(sample_standard_error_0**2 + sample_standard_error_1**2)
print(f"Difference in means: {diff_mean:.2f}")
print(f"Standard error of difference: {se:.2f}")
Data: [10.78496536  6.262307   15.701071    6.22162615 13.51133295  8.08222322
  4.56121395  6.46001725  3.40373035  6.89014161 13.15550496 19.4313046
  9.24910624  3.56272558  9.74161906 10.5732374   3.12882254  7.01951856
  1.63567549 19.4612974   7.06603197  9.37069769  9.53425243 -5.64943585
  9.46204123 10.77952011  6.78315336 15.8473238   0.20609541 13.7543584 ]
Sample mean: 8.53
Sample standard deviation: 5.49
Sample standard error: 1.00
Difference in means: 7.80
Standard error of difference: 1.39
# Alternatively combine into a single vector and create an indicator variable
n = n_0 + n_1
y = np.concatenate([data_0, data_1])
x = np.concatenate([np.zeros(n_0), np.ones(n_1)])
fake = pd.DataFrame({'x': x, 'y': y})
X = sm.add_constant(fake[['x']])
fit = sm.OLS(fake['y'], X).fit()
print(fit.summary())
sns.scatterplot(data=fake, x='x', y='y')
plt.plot([0, 1], [fit.params['const'], fit.params['const'] + fit.params['x']], color='red')
plt.title('Linear Regression Fit')
plt.show()
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.373
Model: OLS Adj. R-squared: 0.360
Method: Least Squares F-statistic: 28.54
Date: Fri, 13 Mar 2026 Prob (F-statistic): 2.49e-06
Time: 07:54:23 Log-Likelihood: -150.95
No. Observations: 50 AIC: 305.9
Df Residuals: 48 BIC: 309.7
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 0.7370 1.130 0.652 0.517 -1.536 3.010
x 7.7960 1.459 5.342 0.000 4.862 10.730
==============================================================================
Omnibus: 1.010 Durbin-Watson: 2.269
Prob(Omnibus): 0.604 Jarque-Bera (JB): 0.343
Skew: 0.042 Prob(JB): 0.842
Kurtosis: 3.397 Cond. No. 2.92
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
The estimated slope is 7.80, which is exactly the difference in means between the two groups. The standard error of the slope is 1.46, similar to the 1.39 standard error of the difference in means computed directly above.
This can be seen in the plot, where the red line goes through the means of the two groups, and the slope of the line is the difference in means between the two groups.