1.1 The three challenges of statistics¶
Three challenges of statistical inference:
- Generalizing from sample to population - a familiar problem in statistics;
- Generalizing from treatment to control group - the problem of causal inference;
- Generalizing from observed measurements to the underlying constructs of interest - we often cannot measure what we really care about directly;
Inference: 'using mathematical models to make general claims from particular data'.
1.2 Why learn regression?¶
Regression can be used to 'summarise how predictions or average values of an outcome vary across individuals defined by a set of predictors'.
Below is an example of simple linear regression using the Hibbs dataset. To download the data, see: https://github.com/avehtari/ROS-Examples/blob/master/ElectionsEconomy/data/hibbs.dat. The dataset contains the incumbent party's vote share and the economic growth in the period before the election for a series of U.S. presidential elections. Assuming that future elections are in some way like past ones, we can use these data to predict vote share from economic growth.
import pandas as pd
import seaborn as sns
import scipy.stats
import matplotlib.pyplot as plt
import statsmodels.api as sm
# Import hibbs.dat from /ros_data
hibbs = pd.read_csv('../ros_data/hibbs.dat', sep=r'\s+')
# Display the first few rows of the dataset
display(hibbs.head(10))
# Add constant term for intercept
hibbs = sm.add_constant(hibbs, prepend=False)
# Fit and summarize the regression model
model = sm.OLS(hibbs['vote'], hibbs[['growth', 'const']])
results = model.fit()
# Print the summary of regression results
print(results.summary())
# Extract slope and intercept from the results
slope = results.params['growth']
intercept = results.params['const']
# Print the formula for the regression line
print(f"y = {intercept:.1f} + {slope:.1f}x")
# Print the median absolute error
print("Median Absolute Error:",
      sm.tools.eval_measures.medianabs(hibbs['vote'], results.fittedvalues))
# Print the standard deviation of the error
print("Standard Deviation of Error:",
sm.tools.eval_measures.stde(hibbs['vote'], results.fittedvalues))
# Calculate sigma (standard deviation of residuals)
print("Sigma (std of residuals):",
results.scale ** 0.5)
# Plot growth vs vote
ax = sns.regplot(data=hibbs,
x='growth',
y='vote')
# Add formula to the plot
ax.text(0.05, 0.95,
f"y = {intercept:.1f} + {slope:.1f}x",
transform=ax.transAxes,
verticalalignment='top')
plt.show()
# Simple formula:
print("Vote share = {:.1f} + {:.1f}(economic growth)".format(intercept, slope))
| | year | growth | vote | inc_party_candidate | other_candidate |
|---|---|---|---|---|---|
| 0 | 1952 | 2.40 | 44.60 | Stevenson | Eisenhower |
| 1 | 1956 | 2.89 | 57.76 | Eisenhower | Stevenson |
| 2 | 1960 | 0.85 | 49.91 | Nixon | Kennedy |
| 3 | 1964 | 4.21 | 61.34 | Johnson | Goldwater |
| 4 | 1968 | 3.02 | 49.60 | Humphrey | Nixon |
| 5 | 1972 | 3.62 | 61.79 | Nixon | McGovern |
| 6 | 1976 | 1.08 | 48.95 | Ford | Carter |
| 7 | 1980 | -0.39 | 44.70 | Carter | Reagan |
| 8 | 1984 | 3.86 | 59.17 | Reagan | Mondale |
| 9 | 1988 | 2.27 | 53.94 | Bush, Sr. | Dukakis |
OLS Regression Results
==============================================================================
Dep. Variable: vote R-squared: 0.580
Model: OLS Adj. R-squared: 0.550
Method: Least Squares F-statistic: 19.32
Date: Tue, 03 Feb 2026 Prob (F-statistic): 0.000610
Time: 09:40:19 Log-Likelihood: -42.839
No. Observations: 16 AIC: 89.68
Df Residuals: 14 BIC: 91.22
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
growth 3.0605 0.696 4.396 0.001 1.567 4.554
const 46.2476 1.622 28.514 0.000 42.769 49.726
==============================================================================
Omnibus: 5.392 Durbin-Watson: 2.379
Prob(Omnibus): 0.067 Jarque-Bera (JB): 2.828
Skew: -0.961 Prob(JB): 0.243
Kurtosis: 3.738 Cond. No. 4.54
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
y = 46.2 + 3.1x
Median Absolute Error: 1.658121283747942
Standard Deviation of Error: 3.520233251276052
Sigma (std of residuals): 3.7632876422297965
Vote share = 46.2 + 3.1(economic growth)
Some key uses of regression:
- Prediction: modelling existing data or predicting new data;
- Exploring associations: summarising how well one variable, or multiple variables, predicts the outcome;
- Extrapolation: adjusting for known differences between the sample and a population of interest;
- Causal inference: estimating treatment effects.
1.3 Some examples of regression¶
A randomized experiment on the effect of an educational television program¶
The data come from an evaluation of a TV program that aimed to improve children's reading skills. Classes with poor reading scores were paired, and within each pair one class was randomly assigned to watch the program (treatment group) while the other did not (control group). After the program, all children took a reading test and their scores were recorded. We want to estimate the effect of watching the program on reading test scores.
# Import electric.csv from /ros_data
electric = pd.read_csv('../ros_data/electric.csv', index_col=0)
# Print head of electric dataset
display(electric.head(10))
# Plot summary statistics
print(electric.describe())
# Plot distributions of post_test scores for treatment and control groups
sns.displot(data=electric, x="post_test", hue="treatment", col="grade", kind="kde", fill=True)
plt.show()
# Plot violin plots of post_test scores by treatment and grade
sns.violinplot(data=electric, x='grade', y='post_test', hue='treatment', split=True)
plt.show()
# Print mean post_test scores by treatment and grade group
print(electric.groupby(['treatment', 'grade'])['post_test'].mean())
| | post_test | pre_test | grade | treatment | supp | pair_id |
|---|---|---|---|---|---|---|
| 1 | 48.9 | 13.8 | 1 | 1 | 1.0 | 1 |
| 2 | 70.5 | 16.5 | 1 | 1 | 0.0 | 2 |
| 3 | 89.7 | 18.5 | 1 | 1 | 1.0 | 3 |
| 4 | 44.2 | 8.8 | 1 | 1 | 0.0 | 4 |
| 5 | 77.5 | 15.3 | 1 | 1 | 1.0 | 5 |
| 6 | 84.7 | 15.0 | 1 | 1 | 0.0 | 6 |
| 7 | 78.9 | 19.4 | 1 | 1 | 0.0 | 7 |
| 8 | 86.8 | 15.0 | 1 | 1 | 1.0 | 8 |
| 9 | 60.8 | 11.8 | 1 | 1 | 1.0 | 9 |
| 10 | 75.7 | 16.4 | 1 | 1 | 1.0 | 10 |
        post_test    pre_test       grade   treatment       supp     pair_id
count  192.000000  192.000000  192.000000  192.000000  96.000000  192.000000
mean    97.149479   72.224479    2.427083    0.500000   0.635417   48.500000
std     17.758700   33.933253    1.060917    0.501307   0.483840   27.783757
min     44.200000    8.800000    1.000000    0.000000   0.000000    1.000000
25%     86.950000   52.500000    2.000000    0.000000   0.000000   24.750000
50%    102.300000   80.750000    2.000000    0.500000   1.000000   48.500000
75%    111.000000  100.625000    3.000000    1.000000   1.000000   72.250000
max    122.000000  119.800000    4.000000    1.000000   1.000000   96.000000
treatment grade
0 1 68.790476
2 93.211765
3 106.175000
4 110.357143
1 1 77.090476
2 101.570588
3 106.510000
4 114.066667
Name: post_test, dtype: float64
The results above show a positive impact of watching the program on reading test scores. This is especially notable for the children in grades 1 and 2.
Estimating the effects of United Nations peacekeeping, using pre-treatment variables to adjust for differences between treatment and control groups¶
The data come from a study of the effect of peacekeeping missions on the duration of peace after civil wars. Some countries received peacekeeping missions (treatment group) while others did not (control group). The outcomes are whether a country returned to war and how long peace lasted after the ceasefire.
# Simple peacekeeping analysis (following the book)
import numpy as np
import matplotlib.pyplot as plt
# Load the peacekeeping dataset
peace_df = pd.read_stata('../ros_data/pk&pkept_old.dta')
# The authors did not provide variable labels or explanations, so this is my best guess:
# 'pk_dum' indicates presence of UN peacekeepers (1 = present, 0 = absent)
# 'morewar' indicates whether war returned (0 = no return to war, 1 = war returned)
# 'hazard1' is the pre-treatment hazard rate of return to war
# 'cfdate' is the date of ceasefire
# 'faildate' is the date when peace failed (war returned)
# 'pcw' indicates whether the case is post-civil-war (1 = post-civil-war, 0 = not)
# Create a copy of the dataset with selected variables
peace_df = peace_df[['pk_dum', 'morewar', 'hazard1', 'cfdate', 'faildate', 'pcw']].copy()
# Rename columns to more descriptive names
peace_df.columns = ['peacekeepers_present', 'war_returned', 'hazard_rate', 'ceasefire_date', 'failure_date', 'post_civil_war']
# Display the first few rows of the peacekeeping dataset
# display(peace_df.head(10))
# Print value counts for key variables
# print(peace_df['peacekeepers_present'].value_counts(dropna=False))
# print(peace_df['war_returned'].value_counts(dropna=False))
# print(peace_df['hazard_rate'].value_counts(dropna=False))
# print(peace_df['ceasefire_date'].value_counts(dropna=False))
# print(peace_df['failure_date'].value_counts(dropna=False))
# print(peace_df['post_civil_war'].value_counts(dropna=False))
# Convert ceasefire and failure dates to datetime
peace_df['ceasefire_date'] = pd.to_datetime(peace_df['ceasefire_date'])
peace_df['failure_date'] = pd.to_datetime(peace_df['failure_date'])
# Where nans in failure_date, replace with a censoring date (end of 2004)
peace_df['failure_date'] = peace_df['failure_date'].where(~peace_df['failure_date'].isna(), pd.to_datetime('2004-12-31'))
# Calculate delay in years from ceasefire to end of peace (or censoring)
peace_df['delay_years'] = (peace_df['failure_date'] - peace_df['ceasefire_date']).dt.days / 365.24
# Only keep post-civil-war cases with valid delay
peace_df = peace_df[(peace_df['post_civil_war'] == 1) & (~peace_df['delay_years'].isna())].copy()
# Print the proportion of countries that stayed at peace (with and without UN peacekeepers)
print(peace_df.groupby('peacekeepers_present')['war_returned'].value_counts(normalize=True).unstack())
# Plot the distribution of delay years by presence of peacekeepers
sns.displot(data=peace_df, x='delay_years', col='peacekeepers_present')
war_returned               0.0       1.0
peacekeepers_present
0.0                   0.338710  0.661290
1.0                   0.558824  0.441176
PROBLEM: This graph does not look like the graph in the book!
Selection bias can be a problem here, as peacekeepers may not be deployed to the highest-risk situations. The "treatment" (peacekeeping) was not randomly assigned: this is an observational study rather than an experiment, so we need to adjust for differences between the treatment and control groups using pre-treatment variables.
Censoring is another issue here, as some countries did not return to war during the observation period. This means that for these countries, we only know that the duration of peace is at least as long as the observation period.
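The bias from ignoring censoring can be illustrated with a small simulation (the exponential distribution and the 8-year cutoff below are hypothetical choices, not taken from the peacekeeping data):

```python
import numpy as np

rng = np.random.default_rng(0)

# Hypothetical durations: exponential with true mean 10 years;
# observation ends after 8 years, so longer durations are censored
true_duration = rng.exponential(scale=10, size=10_000)
observed = np.minimum(true_duration, 8.0)   # recorded duration, capped at the cutoff
censored = true_duration > 8.0              # True where we only know duration >= 8

print("True mean duration:    ", true_duration.mean())
print("Naive mean (censored): ", observed.mean())
print("Share censored:        ", censored.mean())
```

Averaging the observed durations as if they were complete understates the true mean, because each censored case contributes the cutoff value instead of its longer true duration.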
1.4 Challenges in building, understanding, and interpreting regressions¶
Two ways regression can be used for causal inference:
- Estimating a relationship: estimating the effect of a treatment variable on an outcome variable;
- Adjusting for background variables: controlling for confounding variables that may affect the relationship between the treatment and outcome variables.
Regression to estimate a relationship of interest¶
Consider the simplest example: estimating the effect of a treatment variable (X) on an outcome variable (Y), where units are randomly assigned to treatment and control groups. We can use a regression model to estimate the effect of X on Y: Y = β0 + β1X + ε.
These models can be fitted with binary predictors, continuous predictors, and more complex data types (see the examples below). We can fit linear models for continuous outcomes and nonlinear models for binary outcomes (e.g., logistic regression), and we can transform the outcome variable (e.g., a log transformation) so that the model assumptions (e.g., linearity) are more plausible.
Interactions: treatment effects may vary across groups or across levels of another variable (a moderator). For example, radon exposure may affect lung cancer risk differently for smokers and non-smokers. We can include interaction terms in the regression model to capture these varying effects.
Binary predictor¶
np.random.seed(1151) # Set seed for reproducibility
N = 50 # Number of samples
x = np.random.uniform(1, 5, N) # Generate uniform random x values
y = np.random.normal(10 + 3 * x, 3, N) # Generate y values based on linear model with noise
x_binary = np.where(x < 3, 0, 1) # Create binary variable based on x
data = pd.DataFrame({'x': x, 'y': y, 'x_binary': x_binary}) # Create DataFrame
# display(data.head(10)) # Display first 10 rows of the dataset
# Scatter plot of y vs x_binary with regression line
ax = sns.scatterplot(data=data, x='x_binary', y='y')
sns.regplot(data=data, x='x_binary', y='y', scatter=False, ax=ax, color='red')
ax.set_xlim(left=0) # Set x-axis to start at 0
plt.show()
data = sm.add_constant(data, prepend=False) # Add constant
# print(data.head(10)) # Display first 10 rows of the dataset
model = sm.OLS(data['y'], data[['x_binary', 'const']]) # Fit model: y ~ x_binary + const
results = model.fit() # Summarise model
print(results.summary()) # Print the summary of regression results
slope = results.params['x_binary'] # Extract slope
intercept = results.params['const'] # Extract intercept
print(f"y = {intercept:.1f} + {slope:.1f}*x_binary") # Print the formula for the regression line
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.401
Model: OLS Adj. R-squared: 0.388
Method: Least Squares F-statistic: 32.07
Date: Tue, 03 Feb 2026 Prob (F-statistic): 8.16e-07
Time: 09:40:19 Log-Likelihood: -125.46
No. Observations: 50 AIC: 254.9
Df Residuals: 48 BIC: 258.7
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
x_binary 4.8678 0.860 5.663 0.000 3.140 6.596
const 16.6639 0.620 26.886 0.000 15.418 17.910
==============================================================================
Omnibus: 1.154 Durbin-Watson: 1.800
Prob(Omnibus): 0.562 Jarque-Bera (JB): 1.186
Skew: 0.287 Prob(JB): 0.553
Kurtosis: 2.510 Cond. No. 2.67
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
y = 16.7 + 4.9*x_binary
Continuous predictor¶
# Scatter plot of y vs x with regression line
ax = sns.scatterplot(data=data, x='x', y='y')
# truncate=False extends the fitted line beyond the observed range of x
sns.regplot(data=data, x='x', y='y', scatter=False, ax=ax, color='red', truncate=False)
plt.show()
data = sm.add_constant(data, prepend=False) # Add constant
# print(data.head(10)) # Display first 10 rows of the dataset
model = sm.OLS(data['y'], data[['x', 'const']]) # Fit model: y ~ x + const
results = model.fit() # Summarise model
print(results.summary()) # Print the summary of regression results
slope = results.params['x'] # Extract slope
intercept = results.params['const'] # Extract intercept
print(f"y = {intercept:.1f} + {slope:.1f}*x") # Print the formula for the regression line
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.485
Model: OLS Adj. R-squared: 0.474
Method: Least Squares F-statistic: 45.20
Date: Tue, 03 Feb 2026 Prob (F-statistic): 1.96e-08
Time: 09:40:19 Log-Likelihood: -121.67
No. Observations: 50 AIC: 247.3
Df Residuals: 48 BIC: 251.2
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
x 2.7857 0.414 6.723 0.000 1.953 3.619
const 10.4407 1.362 7.668 0.000 7.703 13.179
==============================================================================
Omnibus: 3.603 Durbin-Watson: 1.961
Prob(Omnibus): 0.165 Jarque-Bera (JB): 2.686
Skew: 0.546 Prob(JB): 0.261
Kurtosis: 3.312 Cond. No. 12.2
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
y = 10.4 + 2.8*x
Nonlinear regression¶
# Non linear model
np.random.seed(1151) # Set seed for reproducibility
N = 50 # Number of samples
x = np.random.uniform(1, 5, N) # Generate uniform random x values
y = np.random.normal(5 + 30 * np.exp(-x), 2, N) # Generate y values based on non-linear model with noise
data = pd.DataFrame({'x': x, 'y': y}) # Create DataFrame
# Fit lowess
ax = sns.scatterplot(data=data, x='x', y='y')
sns.regplot(data=data, x='x', y='y', scatter=False, ax=ax, color='red', lowess=True)
plt.show()
# Fit linear
ax = sns.scatterplot(data=data, x='x', y='y')
sns.regplot(data=data, x='x', y='y', scatter=False, ax=ax, color='red')
plt.show()
from scipy.optimize import curve_fit

# Define the exponential decay function
def exp_decay(x, a, b):
    return a + b * np.exp(-x)

# Fit the model to the data
params, covariance = curve_fit(exp_decay, data['x'], data['y'], p0=[5, 30])
# Print estimated parameters
print("Estimated a:", params[0])
print("Estimated b:", params[1])
# Plot the fit
plt.scatter(data['x'], data['y'], label='Data')
plt.plot(np.sort(data['x']), params[0] + params[1] * np.exp(-np.sort(data['x'])), color='red', label='Fitted curve')
plt.legend()
plt.show()
Estimated a: 4.6802800130372
Estimated b: 32.50996224944559
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns

def fit_and_plot_lm(data, predictors, outcome, add_constant=True, show_plot=True, scatter_kws=None, line_kws=None):
    """
    Fit a linear model using statsmodels, print the summary, plot the fit, and show the formula.
    Args:
        data: pandas DataFrame
        predictors: list of predictor column names (str)
        outcome: outcome column name (str)
        add_constant: whether to add an intercept (default True)
        show_plot: whether to plot (default True)
        scatter_kws: dict, kwargs for the scatter plot
        line_kws: dict, kwargs for the regression line
    """
    X = data[predictors].copy()
    if add_constant:
        X = sm.add_constant(X, prepend=False)
    y = data[outcome]
    model = sm.OLS(y, X)
    results = model.fit()
    print(results.summary())
    # Print the fitted formula
    params = results.params
    terms = " + ".join([f"{params[name]:.2f}*{name}" for name in predictors])
    if add_constant:
        formula = f"{outcome} = {params['const']:.2f} + " + terms
    else:
        formula = f"{outcome} = " + terms
    print("Formula:", formula)
    # Plot only if there is a single predictor
    if show_plot and len(predictors) == 1:
        x_name = predictors[0]
        ax = sns.scatterplot(data=data, x=x_name, y=outcome, **(scatter_kws or {}))
        x_vals = np.linspace(data[x_name].min(), data[x_name].max(), 100)
        y_vals = params[x_name] * x_vals
        if add_constant:
            y_vals = y_vals + params['const']
        ax.plot(x_vals, y_vals, color='red', **(line_kws or {}))
        ax.set_title('Linear Regression Fit')
        plt.show()
Regression to adjust for differences between treatment and control groups¶
In real world research there are often systematic differences between treatment and control groups (imbalance in pre-treatment variables). For example, in an observational study on the effect of a new drug on blood pressure, patients who choose to take the drug may differ systematically from those who do not (e.g., age, gender, baseline health status).
To adjust for these differences, we can include pre-treatment variables (covariates) in the regression model. This helps to control for confounding factors that may bias the estimated treatment effect. The graphs below show an example for fabricated data.
This can be summarised as follows:
On average, the treated units scored about 3.5 units higher on the outcome (31.6-28.1=3.5), but the two groups also differed on the pre-treatment variable: the control group has a higher mean x (1.69 vs 0.49). The fitted slopes are similar (4.4 for the treated units, 4.9 for the controls), so a regression adjusting for x attributes the remaining gap between the fitted lines to treatment: the estimated treatment effect is the difference in intercepts, 29.5-19.9=9.6 units.
# Fabricated data for the adjustment example
import numpy as np
np.random.seed(1151)  # Set seed for reproducibility
N = 100
z = np.repeat([0, 1], N // 2)
# print(z)
xx = np.random.normal(0, np.where(z == 0, 1.2, 0.8), N) ** 2
yy = np.random.normal(20 + 5 * xx + 10 * z, 3)
data = pd.DataFrame({'xx': xx, 'z': z, 'yy': yy})
# Mean
print(data.groupby('z')['yy'].mean())
# Difference in means
print("Difference in means y", data[data['z'] == 1]['yy'].mean() - data[data['z'] == 0]['yy'].mean())
# Mean
print(data.groupby('z')['xx'].mean())
# Difference in means
print("Difference in means x", data[data['z'] == 1]['xx'].mean() - data[data['z'] == 0]['xx'].mean())
# display(data.head(10))
# Fit separate regression lines for the treated (z=1) and control (z=0) groups
ax = sns.scatterplot(data=data, x='xx', y='yy')
sns.regplot(data=data[data['z'] == 1], x='xx', y='yy', scatter=False, ax=ax, color='red', truncate=False)
sns.regplot(data=data[data['z'] == 0], x='xx', y='yy', scatter=False, ax=ax, color='green')
plt.show()
fit_and_plot_lm(data[data['z'] == 1], predictors=['xx'], outcome='yy')
fit_and_plot_lm(data[data['z'] == 0], predictors=['xx'], outcome='yy')
z
0    28.161774
1    31.649151
Name: yy, dtype: float64
Difference in means y 3.4873768848605664
z
0    1.688619
1    0.488257
Name: xx, dtype: float64
Difference in means x -1.2003618536469403
OLS Regression Results
==============================================================================
Dep. Variable: yy R-squared: 0.483
Model: OLS Adj. R-squared: 0.473
Method: Least Squares F-statistic: 44.90
Date: Tue, 03 Feb 2026 Prob (F-statistic): 2.12e-08
Time: 09:40:20 Log-Likelihood: -128.65
No. Observations: 50 AIC: 261.3
Df Residuals: 48 BIC: 265.1
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
xx 4.4098 0.658 6.701 0.000 3.087 5.733
const 29.4960 0.559 52.742 0.000 28.372 30.620
==============================================================================
Omnibus: 1.724 Durbin-Watson: 2.400
Prob(Omnibus): 0.422 Jarque-Bera (JB): 0.956
Skew: 0.289 Prob(JB): 0.620
Kurtosis: 3.352 Cond. No. 1.97
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Formula: yy = 29.50 + 4.41*xx
OLS Regression Results
==============================================================================
Dep. Variable: yy R-squared: 0.864
Model: OLS Adj. R-squared: 0.861
Method: Least Squares F-statistic: 304.3
Date: Tue, 03 Feb 2026 Prob (F-statistic): 2.06e-22
Time: 09:40:20 Log-Likelihood: -133.81
No. Observations: 50 AIC: 271.6
Df Residuals: 48 BIC: 275.4
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
xx 4.9022 0.281 17.444 0.000 4.337 5.467
const 19.8838 0.695 28.620 0.000 18.487 21.281
==============================================================================
Omnibus: 1.119 Durbin-Watson: 2.010
Prob(Omnibus): 0.572 Jarque-Bera (JB): 0.803
Skew: -0.310 Prob(JB): 0.669
Kurtosis: 2.981 Cond. No. 3.67
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Formula: yy = 19.88 + 4.90*xx
Interpreting coefficients in a predictive model¶
Consider earnings = 11,000 + 1,500 * (height - 60) + error, where the error is normally distributed with mean 0 and standard deviation 22,000. A model like this is useless for causal inference (height cannot be manipulated) and nearly useless for forecasting individuals (the error sd is huge), but it can be used for exploring associations: taller people tend to earn more on average.
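A quick simulation from this model (using the stated coefficients; the 60-75 inch height range is an assumption for illustration) shows why individual forecasts are poor: the error sd of 22,000 swamps the roughly 6,500-sd contribution of height.

```python
import numpy as np

rng = np.random.default_rng(0)

# Simulate earnings from the stated model over an assumed height range
height = rng.uniform(60, 75, 10_000)
earnings = 11_000 + 1_500 * (height - 60) + rng.normal(0, 22_000, 10_000)

# Spread of the systematic (height) component vs the error component
systematic_sd = (1_500 * (height - 60)).std()
print("SD of the height component:", round(systematic_sd))  # roughly 6,500
print("SD of the error term: 22000")
print("corr(height, earnings):", round(np.corrcoef(height, earnings)[0, 1], 2))
```

The height-earnings correlation is modest, consistent with a real association that is nonetheless useless for predicting any one person's earnings.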
Building, interpreting, and checking regression models¶
The four steps of statistical analysis:
- Model building: selecting predictors, interactions, and transformations;
- Model fitting: estimating model parameters using appropriate methods (e.g., maximum likelihood, Bayesian inference);
- Understanding: interpreting model coefficients and assessing their significance;
- Criticism: checking model assumptions, evaluating model fit, and validating predictions.
The challenge is to be critical without being paralyzed: we can still generalise from sample to population even if the inference is not perfect.
A key step in criticising claims is to trace the chain that links the claims back to the data and models used to generate them.
1.5 Classical and Bayesian inference¶
Most effort in statistics goes into fitting models to data and using those models to make predictions and inferences. Three concerns are common:
- What information is being used to fit the model?
- What assumptions are being made?
- How are the estimates and predictions interpreted?
The third concern can be addressed in a classical or a Bayesian framework; the difference lies in how uncertainty is quantified and interpreted.
Information¶
Regression starts with data on outcome, y, and predictors, X. In addition to data, we often know how they were collected (sampling design) and the context of the data collection (study design).
For example: whether a survey used random sampling or convenience sampling, or whether treatment was randomly assigned in an experiment.
Finally, prior information about the parameters or the data-generating process may be available from previous studies or expert knowledge. Caution is needed, as the published literature may be biased towards statistically significant results.
Assumptions¶
Three sorts of assumptions for outcome variable y, predictors X:
- Functional form: e.g., linearity (possibly after transformation), additivity, interactions. The choice of transformations and variables corresponds to assumptions about the data-generating process;
- Where the data come from: which data are observed and which are not (e.g., who responded to the survey and who did not); sampling and assignment mechanisms such as random sampling or random treatment assignment;
- Real-world relevance of the data: accuracy, reliability, validity. Measurement error can bias estimates and predictions.
Classical inference¶
Classical inference traditionally summarises the information in the data without using prior information, aiming for estimates with well-understood properties (low bias, low variance).
The guarantees are long-run: estimates are correct on average (unbiasedness), and confidence intervals cover the true value 95% of the time (coverage). An important principle is conservatism: sometimes the data are weak and we cannot make strong claims. In classical statistics there should be a clear, checkable link from data to inferences.
Problems arise when studies are small and the data are indirect or highly variable: small samples lead to unreliable estimates and wide confidence intervals, making it hard to draw meaningful conclusions.
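The long-run coverage property can be checked by simulation: repeatedly draw samples, form the usual 95% interval for the mean, and count how often it contains the true value (the parameter values below are arbitrary).

```python
import numpy as np

rng = np.random.default_rng(0)

# Repeated-sampling check of 95% confidence-interval coverage for a normal mean
true_mean, sigma, n, reps = 5.0, 2.0, 30, 5_000
covered = 0
for _ in range(reps):
    sample = rng.normal(true_mean, sigma, n)
    se = sample.std(ddof=1) / np.sqrt(n)                        # estimated standard error
    lo, hi = sample.mean() - 1.96 * se, sample.mean() + 1.96 * se
    covered += int(lo <= true_mean <= hi)

print("Empirical coverage:", covered / reps)  # should be close to 0.95
```

With n=30 the normal-based 1.96 interval covers slightly less than 95% of the time; a t-quantile would be exact, which is the kind of small-sample detail classical theory worries about.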
Bayesian inference¶
Bayesian inference combines information from data with prior information to make inferences about model parameters. This is done using Bayes' theorem, which updates prior beliefs based on observed data.
Pros: it can incorporate prior information and provides a more intuitive interpretation of uncertainty (credible intervals), which is particularly helpful with small sample sizes or complex models. Cons: it requires specifying prior distributions, which can be subjective and may influence the results.
Inferences are probabilistic and can be represented by random simulations.
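A minimal sketch of this idea (a beta-binomial example, not from the book's datasets): with a Beta prior on a proportion and binomial data, the posterior is again a Beta distribution, and we can represent it by random draws.

```python
import numpy as np

rng = np.random.default_rng(0)

# Beta-binomial updating: prior Beta(2, 2) on a proportion, then observe
# 7 successes in 10 trials; the posterior is Beta(2 + 7, 2 + 3)
prior_a, prior_b = 2, 2
successes, failures = 7, 3
posterior_draws = rng.beta(prior_a + successes, prior_b + failures, 10_000)

print("Posterior mean:", posterior_draws.mean())  # analytic value: 9/14 ≈ 0.643
print("95% credible interval:", np.percentile(posterior_draws, [2.5, 97.5]))
```

The credible interval is read directly as a probability statement about the parameter, which is the "more intuitive interpretation of uncertainty" mentioned above.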