6. Background on regression modeling¶
The methods in this book serve two main purposes: prediction and comparison.
We use regression to predict an outcome (or the distribution of an outcome) from a set of predictors/inputs. We can then compare predictions for different inputs to compare groups or estimate causal effects.
6.1 Regression models¶
The simplest regression is linear with a single predictor:
$y = a + b x + \epsilon$
where $a$ is the intercept, $b$ is the slope (together, the coefficients or parameters), and $\epsilon$ is the error term.
This simple model can be extended in various ways:
- Multiple predictors: $y = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + ... + \beta_k x_k + \epsilon$ in vector form: $y = X \beta + \epsilon$
- Nonlinear relationships: $y = a + b x^2 + \epsilon$ or $y = a + b \log(x) + \epsilon$
- Non-additive models: $y = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \beta_3 x_1 x_2 + \epsilon$ which has an interaction between $x_1$ and $x_2$.
- Generalized linear models, which extend linear regression to other types of outcomes (discrete outcomes, e.g., binary or count data).
- Nonparametric regression, which does not assume a specific functional form and allows for essentially arbitrary relationships between predictors and outcome.
- Multilevel/hierarchical models, in which coefficients vary by group or cluster. Useful for data with nested structure (e.g., students within schools, patients within hospitals).
- Measurement error models, where predictors and outcomes are measured with error.
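A sketch of how a few of these extensions look in statsmodels formula syntax; the data, variable names, and coefficient values here are purely illustrative, not from the book:

```python
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

# Illustrative fake data with two predictors and an interaction
rng = np.random.default_rng(0)
n = 200
df = pd.DataFrame({'x1': rng.normal(size=n), 'x2': rng.normal(size=n)})
df['y'] = 1 + 2 * df['x1'] - df['x2'] + 0.5 * df['x1'] * df['x2'] + rng.normal(0, 1, n)

# Multiple predictors: y = b0 + b1*x1 + b2*x2
m_additive = smf.ols('y ~ x1 + x2', data=df).fit()

# Non-additive model: 'x1 * x2' expands to x1 + x2 + x1:x2 (interaction)
m_interact = smf.ols('y ~ x1 * x2', data=df).fit()

# Nonlinear transformation of a predictor: y = a + b*x1^2
m_nonlin = smf.ols('y ~ I(x1 ** 2)', data=df).fit()

print(m_interact.params)
```

The interaction fit recovers coefficients near the simulated values (2, -1, 0.5), while the additive fit has no term for the product.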
6.2 Fitting a simple regression to fake data¶
Create fake data with relationship $y = 0.2 + 0.3 x + \epsilon$ where $\epsilon \sim N(0, 0.5)$. Errors are normally distributed with mean 0 and standard deviation 0.5.
Fitting a regression and displaying the results¶
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.robust.scale import mad

def fit_and_plot_lm(data, predictors, outcome, add_constant=True, show_plot=True, scatter_kws=None, line_kws=None):
    """
    Fit a linear model using statsmodels, print summary, plot, and show formula.
    Args:
        data: pandas DataFrame
        predictors: list of predictor column names (str)
        outcome: outcome column name (str)
        add_constant: whether to add intercept (default True)
        show_plot: whether to plot (default True)
        scatter_kws: dict, kwargs for scatterplot
        line_kws: dict, kwargs for regression line
    """
    X = data[predictors].copy()
    if add_constant:
        X = sm.add_constant(X, prepend=False)
    y = data[outcome]
    model = sm.OLS(y, X)
    results = model.fit()
    print(results.summary())
    # Print fitted formula with rounded coefficients
    params = results.params
    terms = " + ".join([f"{params[name]:.2f}*{name}" for name in predictors])
    if add_constant:
        formula = f"{outcome} = {params['const']:.2f} + " + terms
    else:
        formula = f"{outcome} = " + terms
    print("Formula:", formula)
    # Print residual standard deviation and its approximate standard error
    sigma = np.sqrt(results.mse_resid)
    sigma_se = sigma / np.sqrt(2 * results.df_resid)
    print(f"Residual std dev (σ): {sigma:.2f} ± {sigma_se:.2f}")
    # Print median absolute deviation of residuals
    print("MAD of residuals:", round(mad(results.resid), 2))
    # Plot data and fitted line if there is only one predictor
    if show_plot and len(predictors) == 1:
        x_name = predictors[0]
        ax = sns.scatterplot(data=data, x=x_name, y=outcome, **(scatter_kws or {}))
        x_vals = np.linspace(data[x_name].min(), data[x_name].max(), 100)
        y_vals = params[x_name] * x_vals
        if add_constant:
            y_vals = y_vals + params['const']
        ax.plot(x_vals, y_vals, color='red', **(line_kws or {}))
        ax.set_title('Linear Regression Fit')
        plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
# x: integers 1 through 20
x = np.arange(1, 21)
n = len(x)
a = 0.2
b = 0.3
sigma = 0.5
y = a + b * x + np.random.normal(0, sigma, n)
data = pd.DataFrame({'x': x, 'y': y})
display(data.head())
sns.scatterplot(x='x', y='y', data=data)
plt.xlabel('x')
plt.ylabel('y')
plt.title('Scatter plot of x and y')
plt.show()
fit_and_plot_lm(data, predictors=['x'], outcome='y')
|   | x | y |
|---|---|---|
| 0 | 1 | -0.110005 |
| 1 | 2 | 0.047969 |
| 2 | 3 | 1.255265 |
| 3 | 4 | 1.533453 |
| 4 | 5 | 2.216840 |
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.906
Model: OLS Adj. R-squared: 0.901
Method: Least Squares F-statistic: 173.1
Date: Wed, 11 Mar 2026 Prob (F-statistic): 1.13e-10
Time: 07:55:36 Log-Likelihood: -15.161
No. Observations: 20 AIC: 34.32
Df Residuals: 18 BIC: 36.31
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
x 0.2777 0.021 13.156 0.000 0.233 0.322
const 0.3336 0.253 1.319 0.204 -0.198 0.865
==============================================================================
Omnibus: 1.195 Durbin-Watson: 1.706
Prob(Omnibus): 0.550 Jarque-Bera (JB): 0.850
Skew: -0.138 Prob(JB): 0.654
Kurtosis: 2.028 Cond. No. 25.0
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Formula: y = 0.33 + 0.28*x
Residual std dev (σ): 0.54 ± 0.09
MAD of residuals: 0.59
Comparing estimates to assumed parameter values¶
The true intercept $a$ was 0.2; the estimate was 0.33 with standard error 0.25, so the estimate fell well within one standard error of the true value. Roughly, we can expect the difference between estimate and true value to be within 1 standard error about 68% of the time, within 2 standard errors about 95% of the time, and within 3 standard errors about 99.7% of the time.
This is just a single simulation and this will vary across simulations. If we repeated this process many times, we would expect the estimates to be distributed around the true values.
6.3 Interpret coefficients as comparisons, not effects¶
Regression coefficients are commonly called "effects", but they are not necessarily causal effects.
earnings = pd.read_csv('../ros_data/earnings.csv')
earnings['earn_k'] = earnings['earn'] / 1000
display(earnings.head())
fit_and_plot_lm(earnings, predictors=['height', 'male'], outcome='earn_k')
|   | height | weight | male | earn | earnk | ethnicity | education | mother_education | father_education | walk | exercise | smokenow | tense | angry | age | earn_k |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 74 | 210.0 | 1 | 50000.0 | 50.0 | White | 16.0 | 16.0 | 16.0 | 3 | 3 | 2.0 | 0.0 | 0.0 | 45 | 50.0 |
| 1 | 66 | 125.0 | 0 | 60000.0 | 60.0 | White | 16.0 | 16.0 | 16.0 | 6 | 5 | 1.0 | 0.0 | 0.0 | 58 | 60.0 |
| 2 | 64 | 126.0 | 0 | 30000.0 | 30.0 | White | 16.0 | 16.0 | 16.0 | 8 | 1 | 2.0 | 1.0 | 1.0 | 29 | 30.0 |
| 3 | 65 | 200.0 | 0 | 25000.0 | 25.0 | White | 17.0 | 17.0 | NaN | 8 | 1 | 2.0 | 0.0 | 0.0 | 57 | 25.0 |
| 4 | 63 | 110.0 | 0 | 50000.0 | 50.0 | Other | 16.0 | 16.0 | 16.0 | 5 | 6 | 2.0 | 0.0 | 0.0 | 91 | 50.0 |
OLS Regression Results
==============================================================================
Dep. Variable: earn_k R-squared: 0.100
Model: OLS Adj. R-squared: 0.099
Method: Least Squares F-statistic: 100.3
Date: Wed, 11 Mar 2026 Prob (F-statistic): 4.88e-42
Time: 07:55:36 Log-Likelihood: -8137.7
No. Observations: 1816 AIC: 1.628e+04
Df Residuals: 1813 BIC: 1.630e+04
Df Model: 2
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
height 0.6470 0.185 3.493 0.000 0.284 1.010
male 10.6327 1.468 7.241 0.000 7.753 13.512
const -25.8722 11.962 -2.163 0.031 -49.332 -2.412
==============================================================================
Omnibus: 1902.421 Durbin-Watson: 1.895
Prob(Omnibus): 0.000 Jarque-Bera (JB): 249218.549
Skew: 4.821 Prob(JB): 0.00
Kurtosis: 59.575 Cond. No. 1.59e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.59e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
Formula: earn_k = -25.87 + 0.65*height + 10.63*male
Residual std dev (σ): 21.39 ± 0.36
MAD of residuals: 14.27
The residual standard deviation indicates that earnings will be within about $21,400 of the predicted value for roughly 68% of people, within $42,800 for about 95%, and within $64,200 for about 99.7%.
$R^2$ is 0.1, meaning that 10% of the variance in earnings is explained by height and sex. This makes sense, as earnings vary a lot and many factors that influence earnings are not in the model.
It is generally inappropriate to label these coefficients as "effects". An effect is the change associated with a treatment or intervention: to speak of the effect of height on earnings implies that if we could intervene and change someone's height, their earnings would change. That is not necessarily the case. The coefficient for height is instead a between-person comparison, not a within-person change: it compares the earnings of different people who happen to have different heights, and does not imply that changing an individual's height would change their earnings.
The safest interpretation is that the coefficient for height is the difference in earnings associated with a one-unit difference in height: "Under the fitted model, comparing two people of the same sex who differ by one inch in height, we would expect the taller person to earn about $650 more on average." This is a comparison, not an effect. Similarly, "comparing two people of the same height but different sex, we would expect the man to earn about $10,600 more than the woman on average."
Under certain conditions such inferences can be causal, but this requires strong assumptions. Start by describing coefficients in predictive or descriptive terms.
Regression is a mathematical tool for making predictions. Coefficients can sometimes be interpreted as effects, but they can always be interpreted as average comparisons.
6.4 Historical origins of regression¶
Daughters’ heights “regressing” to the mean¶
In the model $\text{child} = 34.48 + 0.47 \times \text{parent}$, the intercept 34.48 is the predicted height of a child whose parent is 0 inches tall. This is not meaningful in any real-world context; it is a mathematical artifact of the linear model. The slope 0.47 means that for each additional inch of parent's height, the child's predicted height increases by 0.47 inches on average. Because the slope is less than 1, the child's height "regresses" toward the mean relative to the parent's: very tall parents have children who are shorter than them on average, and very short parents have children who are taller than them on average.
The intercept is not useful as presented; centering the predictor (as in the second fit below) makes it interpretable as the predicted height for a parent of average height.
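Plugging heights into the fitted line quoted above makes the pull toward the mean visible; the coefficients are the rounded values from the fit, used here purely for illustration:

```python
def predict_child(parent_height, a=34.48, b=0.47):
    """Predicted child height from the fitted line child = a + b * parent."""
    return a + b * parent_height

tall, short = 70.0, 58.0
# A very tall parent's child is predicted to be shorter than the parent
print(predict_child(tall))
# A very short parent's child is predicted to be taller than the parent
print(predict_child(short))
```

Both predictions fall between the parent's height and the overall mean, which is exactly what a slope below 1 produces.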
# import PearsonLee.txt; first row is the header
pearson_lee = pd.read_csv('../ros_data/PearsonLee.txt', sep=r'\s+')
display(pearson_lee.head())
# new df with all columns where par = Mother and chl = Daughter
mother_daughter = pearson_lee[(pearson_lee['par'] == 'Mother') & (pearson_lee['chl'] == 'Daughter')]
display(mother_daughter.head())
# sns.scatterplot(x='parent', y='child', data=mother_daughter)
fit_and_plot_lm(mother_daughter, predictors=['parent'], outcome='child')
mother_daughter_centered = mother_daughter.copy()
mother_daughter_centered['parent_c'] = mother_daughter_centered['parent'] - mother_daughter_centered['parent'].mean()
display(mother_daughter_centered.head())
fit_and_plot_lm(mother_daughter_centered, predictors=['parent_c'], outcome='child')
heights = pd.read_csv('../ros_data/heights.txt', sep=r'\s+')
display(heights.head())
fit_and_plot_lm(heights, predictors=['mother_height'], outcome='daughter_height')
|   | child | parent | frequency | gp | par | chl |
|---|---|---|---|---|---|---|
| 1 | 59.5 | 62.5 | 0.5 | fs | Father | Son |
| 2 | 59.5 | 63.5 | 0.5 | fs | Father | Son |
| 3 | 59.5 | 64.5 | 1.0 | fs | Father | Son |
| 4 | 60.5 | 62.5 | 0.5 | fs | Father | Son |
| 5 | 60.5 | 66.5 | 1.0 | fs | Father | Son |
|   | child | parent | frequency | gp | par | chl |
|---|---|---|---|---|---|---|
| 180 | 52.5 | 59.5 | 0.50 | md | Mother | Daughter |
| 181 | 53.5 | 59.5 | 0.50 | md | Mother | Daughter |
| 182 | 55.5 | 59.5 | 1.00 | md | Mother | Daughter |
| 183 | 56.5 | 58.5 | 1.00 | md | Mother | Daughter |
| 184 | 56.5 | 59.5 | 0.25 | md | Mother | Daughter |
OLS Regression Results
==============================================================================
Dep. Variable: child R-squared: 0.218
Model: OLS Adj. R-squared: 0.214
Method: Least Squares F-statistic: 51.16
Date: Wed, 11 Mar 2026 Prob (F-statistic): 1.97e-11
Time: 08:12:12 Log-Likelihood: -509.89
No. Observations: 185 AIC: 1024.
Df Residuals: 183 BIC: 1030.
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
parent 0.4664 0.065 7.153 0.000 0.338 0.595
const 34.4827 4.094 8.422 0.000 26.404 42.561
==============================================================================
Omnibus: 9.513 Durbin-Watson: 0.254
Prob(Omnibus): 0.009 Jarque-Bera (JB): 4.926
Skew: -0.179 Prob(JB): 0.0852
Kurtosis: 2.285 Cond. No. 913.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Formula: child = 34.48 + 0.47*parent
Residual std dev (σ): 3.83 ± 0.20
MAD of residuals: 4.35
|   | child | parent | frequency | gp | par | chl | parent_c |
|---|---|---|---|---|---|---|---|
| 180 | 52.5 | 59.5 | 0.50 | md | Mother | Daughter | -3.145946 |
| 181 | 53.5 | 59.5 | 0.50 | md | Mother | Daughter | -3.145946 |
| 182 | 55.5 | 59.5 | 1.00 | md | Mother | Daughter | -3.145946 |
| 183 | 56.5 | 58.5 | 1.00 | md | Mother | Daughter | -4.145946 |
| 184 | 56.5 | 59.5 | 0.25 | md | Mother | Daughter | -3.145946 |
OLS Regression Results
==============================================================================
Dep. Variable: child R-squared: 0.218
Model: OLS Adj. R-squared: 0.214
Method: Least Squares F-statistic: 51.16
Date: Wed, 11 Mar 2026 Prob (F-statistic): 1.97e-11
Time: 08:12:12 Log-Likelihood: -509.89
No. Observations: 185 AIC: 1024.
Df Residuals: 183 BIC: 1030.
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
parent_c 0.4664 0.065 7.153 0.000 0.338 0.595
const 63.7000 0.282 226.262 0.000 63.145 64.255
==============================================================================
Omnibus: 9.513 Durbin-Watson: 0.254
Prob(Omnibus): 0.009 Jarque-Bera (JB): 4.926
Skew: -0.179 Prob(JB): 0.0852
Kurtosis: 2.285 Cond. No. 4.32
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Formula: child = 63.70 + 0.47*parent_c
Residual std dev (σ): 3.83 ± 0.20
MAD of residuals: 4.35
|   | daughter_height | mother_height |
|---|---|---|
| 0 | 52.5 | 59.5 |
| 1 | 52.5 | 59.5 |
| 2 | 53.5 | 59.5 |
| 3 | 53.5 | 59.5 |
| 4 | 55.5 | 59.5 |
OLS Regression Results
==============================================================================
Dep. Variable: daughter_height R-squared: 0.252
Model: OLS Adj. R-squared: 0.252
Method: Least Squares F-statistic: 1860.
Date: Wed, 11 Mar 2026 Prob (F-statistic): 0.00
Time: 08:12:12 Log-Likelihood: -12346.
No. Observations: 5524 AIC: 2.470e+04
Df Residuals: 5522 BIC: 2.471e+04
Df Model: 1
Covariance Type: nonrobust
=================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------
mother_height 0.5449 0.013 43.125 0.000 0.520 0.570
const 29.7984 0.790 37.703 0.000 28.249 31.348
==============================================================================
Omnibus: 25.260 Durbin-Watson: 0.031
Prob(Omnibus): 0.000 Jarque-Bera (JB): 31.772
Skew: -0.074 Prob(JB): 1.26e-07
Kurtosis: 3.340 Cond. No. 1.62e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.62e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
Formula: daughter_height = 29.80 + 0.54*mother_height
Residual std dev (σ): 2.26 ± 0.02
MAD of residuals: 2.16
6.5 The paradox of regression to the mean¶
The apparent paradox of regression to the mean is this: if it operated deterministically, everyone would eventually converge to the mean and there would be no variation in heights. That is not what happens.
The model has an error term that captures variation in heights not explained by the parent's height. This error term preserves variation in the population even as the predictions pull toward the mean.
The prediction is that daughters of very tall parents will on average be shorter than their parents, and daughters of very short parents taller. The prediction pulls toward the mean, but individual heights still vary around that prediction. This is why we see regression to the mean without everyone actually regressing to it.
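A quick simulation shows why the error term preserves variation: if each generation follows child = a + b·parent + ε with b < 1, the population variance settles at σ²/(1 − b²) rather than collapsing to zero. The values below (b = 0.47, a stationary sd of 2.6 inches) are illustrative assumptions, not estimates from the data:

```python
import numpy as np

rng = np.random.default_rng(2)
a, b = 33.7, 0.47                         # illustrative intercept and slope
sd_target = 2.6                           # illustrative stationary population sd
sigma = sd_target * np.sqrt(1 - b ** 2)   # error sd that keeps the variance stable

heights = rng.normal(63.6, sd_target, 100_000)  # generation 0
for gen in range(1, 6):
    # Each child is pulled toward the mean (b < 1) but gets fresh noise
    heights = a + b * heights + rng.normal(0, sigma, heights.size)
    print(gen, round(heights.std(), 2))  # sd stays near 2.6, not shrinking
```

The deterministic pull of b < 1 shrinks variance each generation, and the fresh error term restores exactly that much, so the distribution is stable.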
How regression to the mean can confuse people about causal inference; demonstration using fake data¶
Regression to the mean can be confusing, and people often mistake it for a causal effect.
Fake data for 1000 students' scores on midterm and final exams. Each student has a true ability, normally distributed with mean 50 and standard deviation 10. The midterm score is true ability plus random noise (measurement error), normally distributed with mean 0 and standard deviation 10; the final score is true ability plus independent noise with the same distribution.
$\text{final score} = 23.39 + 0.52 \times \text{midterm score} + \epsilon$
The slope is 0.52: for every one-point increase in midterm score, the predicted final score increases by 0.52 points. Because this is less than 1, final scores regress toward the mean relative to midterm scores. It might seem natural to say that students who score high on the midterm get overconfident and do worse on the final, but that is not necessarily the case. This mistake is called the "regression fallacy".
The data were simulated and there is no causal relationship between midterm score and final score.
Summary: when you see things drifting back toward the average, don't rush to invent a causal explanation. First ask whether the "default" you're comparing against — that things should stay the same — even makes sense. If there's natural variation, regression to the mean will happen on its own.
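The summary can be made concrete with a sketch (using its own simulated data, on the same assumptions as above): the top midterm scorers do worse on the final purely because of noise, with no causal mechanism at all.

```python
import numpy as np

rng = np.random.default_rng(3)
n = 10_000
ability = rng.normal(50, 10, n)
midterm = ability + rng.normal(0, 10, n)  # noisy measure of ability
final = ability + rng.normal(0, 10, n)    # independent noisy measure

# Students in the top 10% on the midterm...
top = midterm >= np.quantile(midterm, 0.9)
# ...score markedly lower on the final, on average, with no
# "overconfidence" involved: their midterm luck simply doesn't repeat
print(round(midterm[top].mean(), 1), round(final[top].mean(), 1))
```

The top group's final average sits between their midterm average and the overall mean of 50, exactly the drift the regression fallacy misreads as a causal story.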
n = 1000
true_ability = np.random.normal(50, 10, n)
midterm_score = true_ability + np.random.normal(0, 10, n)
final_score = true_ability + np.random.normal(0, 10, n)
scores = pd.DataFrame({'true_ability': true_ability, 'midterm_score': midterm_score, 'final_score': final_score})
display(scores.head())
fit_and_plot_lm(scores, ['midterm_score'], 'final_score', add_constant=True, show_plot=True, scatter_kws=None, line_kws=None)
|   | true_ability | midterm_score | final_score |
|---|---|---|---|
| 0 | 53.575017 | 56.324755 | 50.121899 |
| 1 | 59.765044 | 66.258999 | 59.124500 |
| 2 | 65.054658 | 68.848283 | 76.527669 |
| 3 | 41.159978 | 42.237073 | 33.467692 |
| 4 | 48.079297 | 43.221099 | 52.619541 |
OLS Regression Results
==============================================================================
Dep. Variable: final_score R-squared: 0.255
Model: OLS Adj. R-squared: 0.255
Method: Least Squares F-statistic: 342.1
Date: Wed, 11 Mar 2026 Prob (F-statistic): 6.57e-66
Time: 08:29:13 Log-Likelihood: -3938.9
No. Observations: 1000 AIC: 7882.
Df Residuals: 998 BIC: 7892.
Df Model: 1
Covariance Type: nonrobust
=================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------
midterm_score 0.5228 0.028 18.497 0.000 0.467 0.578
const 23.3882 1.485 15.749 0.000 20.474 26.302
==============================================================================
Omnibus: 1.672 Durbin-Watson: 2.049
Prob(Omnibus): 0.433 Jarque-Bera (JB): 1.537
Skew: 0.069 Prob(JB): 0.464
Kurtosis: 3.133 Cond. No. 198.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Formula: final_score = 23.39 + 0.52*midterm_score
Residual std dev (σ): 12.44 ± 0.28
MAD of residuals: 12.52