3. Some basic methods in mathematics and probability¶
3.1 Weighted averages¶
Reweight data or inferences to adapt to a target population.
Example:
| Stratum | Label | Population | Average age |
|---|---|---|---|
| 1 | United States | 310 million | 36.8 |
| 2 | Mexico | 112 million | 26.7 |
| 3 | Canada | 34 million | 40.7 |
Weighted average age = (310 × 36.8 + 112 × 26.7 + 34 × 40.7) / (310 + 112 + 34) = 34.6 years

Weights = 310/456, 112/456, 34/456
Weighted average in summation notation: $$\frac{\sum_{j} N_j y_j}{\sum_{j} N_j}$$
where $j$ indexes the country and the sums run over all the strata.
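As a check, the weighted average from the table above can be computed directly. A minimal sketch with numpy:

```python
import numpy as np

# Strata populations (millions) and average ages from the table above
N = np.array([310, 112, 34])        # United States, Mexico, Canada
y = np.array([36.8, 26.7, 40.7])

weighted_avg = np.sum(N * y) / np.sum(N)
print(round(weighted_avg, 1))       # -> 34.6
```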
3.2 Vectors and matrices¶
List of numbers is a vector. A table (rectangular array) of numbers is a matrix.
Earlier example, predicted vote percentage = $$ 46.3 + 3 \times (growth rate of average personal income)$$ $$\hat{y} = 46.3 + 3x$$ $$\hat{y} = \hat{a} + \hat{b}x$$
The hats denote estimated parameters; $\hat{y}$ is the predicted value of $y$.
Applied to a vector of data, $x = (-1, 0, 3)$:
$$ \hat{y} = \hat{a} + \hat{b}x = 46.3 + 3x = \begin{bmatrix} 46.3 + 3 \times -1 \\ 46.3 + 3 \times 0 \\ 46.3 + 3 \times 3 \end{bmatrix} = \begin{bmatrix} 43.3 \\ 46.3 \\ 55.3 \end{bmatrix}$$
In matrix form:
$$\hat{y} = \hat{a} + \hat{b}x = \begin{bmatrix} 1 & -1 \\ 1 & 0 \\ 1 & 3 \end{bmatrix} \begin{bmatrix} 46.3 \\ 3 \end{bmatrix} = \begin{bmatrix} 43.3 \\ 46.3 \\ 55.3 \end{bmatrix}$$
Abstractly, we can write this as: $$\hat{y} = X \hat{\beta}$$
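The same computation in numpy, forming the design matrix $X$ with a column of ones for the intercept:

```python
import numpy as np

# Design matrix: a column of ones (for the intercept) next to the predictor x
X = np.array([[1., -1.],
              [1.,  0.],
              [1.,  3.]])
beta_hat = np.array([46.3, 3.0])    # estimated [intercept, slope]

y_hat = X @ beta_hat                # matrix-vector product
print(y_hat)                        # -> [43.3 46.3 55.3]
```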
3.3 Graphing a line¶
$$y = a + bx$$
- Intercept = $a$: when $x = 0$, $y = a$.
- Slope = $b$: upward slope if $b > 0$, downward slope if $b < 0$, flat if $b = 0$. The larger the absolute value of $b$, the steeper the slope.
Below I fit a linear model of the world-record mile time against year. The slope is negative: as time progresses, the record time decreases. The slope is also fairly steep, indicating a rapid decline in the record time.
y = 1006.88 - 0.39*year
The tendency is to describe the slope as "the mile record decreases by 0.39 seconds per year", but it is better described as "comparing any two years, the mile record is on average 0.39 seconds per year lower in the later year".
The intercept is 1006.88, the predicted mile record at year 0. This is not meaningful in context; it is a mathematical artifact of the linear model. Even for very early years the predicted record time is unrealistically high. This is a common issue with linear models when extrapolating beyond the range of the data.
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import statsmodels.api as sm

def fit_and_plot_lm(data, predictors, outcome, add_constant=True, show_plot=True, scatter_kws=None, line_kws=None):
    """
    Fit a linear model using statsmodels, print the summary, plot, and show the fitted formula.
    Args:
        data: pandas DataFrame
        predictors: list of predictor column names (str)
        outcome: outcome column name (str)
        add_constant: whether to add an intercept (default True)
        show_plot: whether to plot (default True)
        scatter_kws: dict, kwargs for the scatterplot
        line_kws: dict, kwargs for the regression line
    """
    X = data[predictors].copy()
    if add_constant:
        X = sm.add_constant(X, prepend=False)
    y = data[outcome]
    results = sm.OLS(y, X).fit()
    print(results.summary())
    # Print the fitted formula, constant term first if present
    params = results.params
    terms = [f"{params[name]:.2f}*{name}" for name in predictors]
    if add_constant:
        terms = [f"{params['const']:.2f}"] + terms
    print("Formula:", f"{outcome} = " + " + ".join(terms))
    # Plot only if there is a single predictor
    if show_plot and len(predictors) == 1:
        x_name = predictors[0]
        ax = sns.scatterplot(data=data, x=x_name, y=outcome, **(scatter_kws or {}))
        x_vals = np.linspace(data[x_name].min(), data[x_name].max(), 100)
        y_vals = params[x_name] * x_vals
        if add_constant:
            y_vals = y_vals + params['const']
        ax.plot(x_vals, y_vals, color='red', **(line_kws or {}))
        ax.set_title('Linear Regression Fit')
        plt.show()
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
mile = pd.read_csv('../ros_data/mile.csv')
display(mile)
fit_and_plot_lm(mile, ['year'], 'seconds', add_constant=True, show_plot=True, scatter_kws=None, line_kws=None)
| | yr | month | min | sec | year | seconds |
|---|---|---|---|---|---|---|
| 0 | 1913 | 5.0 | 4 | 14.40 | 1913.416667 | 254.40 |
| 1 | 1915 | 7.0 | 4 | 12.60 | 1915.583333 | 252.60 |
| 2 | 1923 | 8.0 | 4 | 10.40 | 1923.666667 | 250.40 |
| 3 | 1931 | 10.0 | 4 | 9.20 | 1931.833333 | 249.20 |
| 4 | 1933 | 7.0 | 4 | 7.60 | 1933.583333 | 247.60 |
| 5 | 1934 | 6.0 | 4 | 6.80 | 1934.500000 | 246.80 |
| 6 | 1937 | 8.0 | 4 | 6.40 | 1937.666667 | 246.40 |
| 7 | 1942 | 7.0 | 4 | 6.20 | 1942.583333 | 246.20 |
| 8 | 1942 | 7.0 | 4 | 6.20 | 1942.583333 | 246.20 |
| 9 | 1942 | 9.0 | 4 | 4.60 | 1942.750000 | 244.60 |
| 10 | 1943 | 7.0 | 4 | 2.60 | 1943.583333 | 242.60 |
| 11 | 1944 | 7.0 | 4 | 1.60 | 1944.583333 | 241.60 |
| 12 | 1945 | 7.0 | 4 | 1.40 | 1945.583333 | 241.40 |
| 13 | 1954 | 5.0 | 3 | 59.40 | 1954.416667 | 239.40 |
| 14 | 1954 | 6.0 | 3 | 58.00 | 1954.500000 | 238.00 |
| 15 | 1957 | 7.0 | 3 | 57.20 | 1957.583333 | 237.20 |
| 16 | 1958 | 8.0 | 3 | 54.50 | 1958.666667 | 234.50 |
| 17 | 1962 | 1.0 | 3 | 54.40 | 1962.083333 | 234.40 |
| 18 | 1964 | 11.0 | 3 | 54.10 | 1964.916667 | 234.10 |
| 19 | 1965 | 6.0 | 3 | 53.60 | 1965.500000 | 233.60 |
| 20 | 1966 | 7.0 | 3 | 51.30 | 1966.583333 | 231.30 |
| 21 | 1967 | 6.0 | 3 | 51.10 | 1967.500000 | 231.10 |
| 22 | 1975 | 5.0 | 3 | 51.00 | 1975.416667 | 231.00 |
| 23 | 1975 | 8.0 | 3 | 49.40 | 1975.666667 | 229.40 |
| 24 | 1979 | 7.0 | 3 | 49.00 | 1979.583333 | 229.00 |
| 25 | 1980 | 7.0 | 3 | 48.80 | 1980.583333 | 228.80 |
| 26 | 1981 | 8.0 | 3 | 48.53 | 1981.666667 | 228.53 |
| 27 | 1981 | 8.2 | 3 | 48.40 | 1981.683333 | 228.40 |
| 28 | 1981 | 8.3 | 3 | 47.33 | 1981.691667 | 227.33 |
| 29 | 1985 | 7.0 | 3 | 46.32 | 1985.583333 | 226.32 |
| 30 | 1993 | 9.0 | 3 | 44.39 | 1993.750000 | 224.39 |
| 31 | 1999 | 7.0 | 3 | 43.13 | 1999.583333 | 223.13 |
OLS Regression Results
==============================================================================
Dep. Variable: seconds R-squared: 0.977
Model: OLS Adj. R-squared: 0.976
Method: Least Squares F-statistic: 1277.
Date: Wed, 18 Feb 2026 Prob (F-statistic): 3.78e-26
Time: 09:56:41 Log-Likelihood: -54.745
No. Observations: 32 AIC: 113.5
Df Residuals: 30 BIC: 116.4
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
year -0.3930 0.011 -35.734 0.000 -0.416 -0.371
const 1006.8760 21.532 46.762 0.000 962.902 1050.850
==============================================================================
Omnibus: 0.318 Durbin-Watson: 0.840
Prob(Omnibus): 0.853 Jarque-Bera (JB): 0.115
Skew: 0.144 Prob(JB): 0.944
Kurtosis: 2.937 Cond. No. 1.72e+05
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.72e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
Formula: seconds = 1006.88 + -0.39*year
3.4 Exponential and power-law growth and decline; logarithmic and log-log relationships¶
Exponential growth and decline¶
$$y = a + bx$$
can be used to express exponential growth or decline by using the logarithm of $y$.
$$\log y = a + bx$$
This expresses exponential growth or decline. If $b > 0$, then $y$ grows exponentially with $x$. If $b < 0$, then $y$ declines exponentially with $x$. The larger the absolute value of $b$, the faster the growth or decline. Exponentiating both sides, the original equation can be rewritten as:
$$y = Ae^{bx}$$
where $A = e^a$.
Growth example: a population that starts at 1.5 billion in 1900 and doubles every 50 years. This can be expressed as:
$$y = (1.5 \times 10^9) \times 2^{(x-1900)/50}$$
Where $x$ is the year. This can be rewritten as:
$$y = (1.5 \times 10^9) \times e^{\ln(2) \times (x-1900)/50}$$
Exponential decay example: an asset worth \$1000 whose value declines by 20% per year. This can be expressed as: $$y = 1000 \times (1 - 0.2)^x$$
where $x$ is the number of years. This can be rewritten as: $$y = 1000 \times e^{\ln(0.8) \times x}$$
# Create a dataframe with some fabricated data
x = np.arange(1, 2000)
a = 1.5e9 # 1.5x10^9
b = np.log(2) / 50 # Doubling every 50 units
y = a * np.exp(b * (x - 1900)) # Shift x to start from 1900
data = pd.DataFrame({'x': x, 'y': y})
sns.scatterplot(data=data, x='x', y='y')
plt.title('Exponential growth y = a * exp(b * x), b > 0')
# x limit start 1900
plt.xlim(1900, 2000)
plt.ylim(1.5e9, 6e9)
plt.show()
# Exponential decline example
x = np.arange(1, 10)
a = 1000
b = np.log(0.8)
y = a * np.exp(b * x)
data = pd.DataFrame({'x': x, 'y': y})
sns.scatterplot(data=data, x='x', y='y')
plt.title('Exponential decline y = a * exp(b * x), b < 0')
plt.show()
Power-law growth and decline¶
$$\log y = a + b \log x$$
This expresses power-law growth or decline. If $b > 0$, then $y$ grows as a power of $x$. If $b < 0$, then $y$ declines as a power of $x$. The larger the absolute value of $b$, the faster the growth or decline. Exponentiating both sides, the original equation can be rewritten as:
$$y = Ax^b$$
where $A = e^a$.
# Create a dataframe with some fabricated data
x = np.arange(1, 4000)
a = 3.3
b = 0.74
y = a * x ** b
data = pd.DataFrame({'x': x, 'y': y})
sns.scatterplot(data=data, x='x', y='y')
plt.title('Power-law growth y = A * x^b, b > 0')
plt.show()
# Modify x and y to be log of the original values
data['x_log'] = np.log(data['x'])
data['y_log'] = np.log(data['y'])
sns.scatterplot(data=data, x='x_log', y='y_log')
plt.title('Power-law growth on log-transformed axes: log y vs log x is linear')
plt.show()
# Make x and y axis log scale
sns.scatterplot(data=data, x='x', y='y')
plt.xscale('log')
plt.yscale('log')
plt.title('Power-law growth y = A * x^b, b > 0 (log-log scale)')
plt.show()
3.5 Probability distributions¶
A straight line on its own is deterministic. We need probability distributions and random variables because data never fit a model exactly; probability distributions represent the unmodelled aspects of reality - the error.
$$ y = a + bx + \epsilon $$
Where $\epsilon$ is the error term, which is a random variable that represents the unmodelled aspects of reality. The probability distribution of $\epsilon$ captures the uncertainty in the model's predictions.
Distribution of data: $$ y_i, \quad i = 1, 2, \ldots, n $$
Distribution of errors: $$ \epsilon_i = y_i - (a + bx_i), \quad i = 1, 2, \ldots, n $$
Regression describes the typical range of values of the outcome given the predictors, in two steps: 1. predict the mean outcome given the predictors, 2. describe the variation that remains after predicting the mean. The first step is captured by the regression line, and the second step is captured by the distribution of the error term $\epsilon$.
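The two-step view can be illustrated by simulating data from the model and recovering the error distribution (the parameter values here are made up for illustration):

```python
import numpy as np

rng = np.random.default_rng(0)
a, b, sigma = 46.3, 3.0, 2.0                       # illustrative parameters
x = rng.uniform(-2, 4, size=1000)
y = a + b * x + rng.normal(0, sigma, size=1000)    # y = a + b*x + error

# Step 1: the deterministic prediction; step 2: the leftover variation
y_pred = a + b * x
eps = y - y_pred
print(round(eps.mean(), 2), round(eps.std(), 2))   # mean near 0, sd near sigma
```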
Mean and standard deviations of a probability distribution¶
Probability distribution of random variable z.
The mean is the average of all values, or the value that would be obtained on average from a random sample from the distribution. The mean is also called the expected value, written $E(z)$ or $\mu_z$.
The variance of a distribution is the average squared deviation from the mean, $E[(z-\mu)^2]$, denoted $\sigma^2$. The standard deviation is the square root of the variance, denoted $\sigma$. It is in the same units as the original data and measures the spread of the distribution: a larger standard deviation indicates a wider spread, a smaller one a narrower spread.
Normal distribution, mean and standard deviation¶
The central limit theorem states that the sum of many independent random variables will be approximately normally distributed, regardless of the underlying distribution of the individual random variables. Summation of independent components:
$$z = \sum_{i=1}^n z_i$$
Mean of z:
$$\mu_z = \sum_{i=1}^n \mu_{z_i}$$
Standard deviation of z:
$$\sigma_z = \sqrt{\sum_{i=1}^n \sigma_{z_i}^2}$$
A normal distribution with mean $\mu_z$ and standard deviation $\sigma_z$ is written z ~ normal($\mu_z$, $\sigma_z$), also denoted N($\mu_z$, $\sigma_z$). The central limit theorem holds well in practice when the individual $\sigma_{z_i}$ are small compared to the standard deviation $\sigma_z$ of the sum. For a normal distribution:
- 50% of values lie within 0.67 standard deviations of the mean,
- 68% within 1 standard deviation,
- 95% within 2 standard deviations,
- 99.7% within 3 standard deviations.
No reason to expect random variables to be normally distributed, but many are approximately normally distributed.
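The formulas for the mean and standard deviation of a sum, and the central limit theorem itself, can be checked with a quick simulation (uniform components are an arbitrary choice here):

```python
import numpy as np

rng = np.random.default_rng(1)
n, reps = 20, 100_000
# Sum of 20 independent uniform(0, 1) variables; each has mean 0.5, variance 1/12
z = rng.uniform(0, 1, size=(reps, n)).sum(axis=1)

mu_z = n * 0.5                      # sum of the component means
sigma_z = np.sqrt(n / 12)           # sqrt of the sum of the component variances
print(round(z.mean(), 2), round(z.std(), 2))       # close to 10.0 and 1.29
```

A histogram of z looks close to normal even though each component is uniform.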
The example below shows the distribution of heights of women and men in the US. It is not a great example, as the authors did not share the full data, only the binned counts for the histogram (so the x-axis shows bin indices rather than heights).
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Data from previous step
height_counts_women = np.array([
80, 107, 296, 695, 1612, 2680, 4645, 8201, 9948, 11733, 10270, 9942, 6181, 3990, 2131, 1154, 245, 257, 0, 0, 0, 0
]) * 10339 / 74167
weight_counts_women = np.array([
362, 1677, 4572, 9363, 11420, 12328, 9435, 7023, 5047, 3621, 2753, 2081, 1232, 887, 2366
]) * 10339 / 74167
height_counts_men = np.array([
0, 0, 0, 0, 0, 0, 0, 542, 668, 1221, 2175, 4213, 5535, 7980, 9566, 9578, 8867, 6716, 5019, 2745, 1464, 1263
]) * 9983 / 67552
# Round counts to nearest integer for sampling
height_counts_women_int = np.round(height_counts_women).astype(int)
weight_counts_women_int = np.round(weight_counts_women).astype(int)
height_counts_men_int = np.round(height_counts_men).astype(int)
# Create synthetic data arrays
height_data_women = np.concatenate([
np.full(count, bin_idx) for bin_idx, count in enumerate(height_counts_women_int)
])
weight_data_women = np.concatenate([
np.full(count, bin_idx) for bin_idx, count in enumerate(weight_counts_women_int)
])
height_data_men = np.concatenate([
np.full(count, bin_idx) for bin_idx, count in enumerate(height_counts_men_int)
])
# Create DataFrames with thousands of entries
height_df_women = pd.DataFrame({'height_bin': height_data_women})
weight_df_women = pd.DataFrame({'weight_bin': weight_data_women})
height_df_men = pd.DataFrame({'height_bin': height_data_men})
# Seaborn histograms men and women heights on single plot
plt.figure(figsize=(8, 4))
sns.histplot(data=height_df_women, x='height_bin', color='blue', label='Women', alpha=0.5)
sns.histplot(data=height_df_men, x='height_bin', color='orange', label='Men', alpha=0.5)
plt.xlabel("Height Bin")
plt.ylabel("Count")
plt.title("Distribution of Heights of Men and Women")
plt.legend()
plt.show()
Linear transformations¶
Linearly transforming a normal distribution results in another normal distribution.
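A quick simulated check (the particular numbers are arbitrary): if z ~ normal(10, 2), then w = 3 + 0.5z should be normal with mean 3 + 0.5×10 = 8 and standard deviation 0.5×2 = 1.

```python
import numpy as np

rng = np.random.default_rng(2)
z = rng.normal(10, 2, size=100_000)     # z ~ normal(10, 2)

# w = c + d*z is again normal, with mean c + d*mu and sd |d|*sigma
w = 3 + 0.5 * z
print(round(w.mean(), 1), round(w.std(), 2))   # near 8.0 and 1.0
```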
Mean and standard deviation of the sum of correlated random variables¶
If two random variables $u$ and $v$ have means $\mu_u$ and $\mu_v$ and standard deviations $\sigma_u$ and $\sigma_v$, then their correlation is defined as $$\rho_{uv} = \frac{E[(u-\mu_u)(v-\mu_v)]}{\sigma_u \sigma_v}$$
This must be between -1 and 1. If $\rho_{uv} = 0$, then u and v are uncorrelated. If $\rho_{uv} = 1$, then u and v are perfectly positively correlated. If $\rho_{uv} = -1$, then u and v are perfectly negatively correlated.
Knowing the correlation between u and v allows us to calculate the mean and standard deviation of their sum z = u + v: the sum has mean $\mu_z = \mu_u + \mu_v$ and standard deviation $\sigma_z = \sqrt{\sigma_u^2 + \sigma_v^2 + 2\rho_{uv}\sigma_u\sigma_v}$. More generally, the weighted sum z = au + bv has mean $\mu_z = a\mu_u + b\mu_v$ and standard deviation $\sigma_z = \sqrt{a^2\sigma_u^2 + b^2\sigma_v^2 + 2ab\rho_{uv}\sigma_u\sigma_v}$, and is itself normally distributed if u and v are jointly normal.
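The standard deviation formula for a correlated sum can be verified by simulation (the parameter values below are arbitrary):

```python
import numpy as np

rng = np.random.default_rng(3)
mu_u, mu_v = 1.0, 2.0
sigma_u, sigma_v, rho = 1.0, 2.0, 0.5

# Draw correlated (u, v) pairs from a bivariate normal distribution
cov = [[sigma_u**2, rho * sigma_u * sigma_v],
       [rho * sigma_u * sigma_v, sigma_v**2]]
u, v = rng.multivariate_normal([mu_u, mu_v], cov, size=200_000).T

z = u + v
sigma_z = np.sqrt(sigma_u**2 + sigma_v**2 + 2 * rho * sigma_u * sigma_v)
print(round(z.std(), 2), round(sigma_z, 2))    # both near sqrt(7), about 2.65
```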
Log normal distribution¶
The log transformation is useful because it does not allow negative values and can capture skewed distributions.
The exponentials of the mean and standard deviation of log z are the geometric mean and geometric standard deviation of z.
Logarithmic transformations are nonlinear.
Generally, logarithmic transformations are used to turn multiplicative relationships into additive relationships.
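To illustrate, a lognormal sample whose log has mean 1 and standard deviation 0.5 should have geometric mean exp(1) ≈ 2.72 and geometric standard deviation exp(0.5) ≈ 1.65 (values chosen arbitrarily):

```python
import numpy as np

rng = np.random.default_rng(4)
z = rng.lognormal(mean=1.0, sigma=0.5, size=100_000)   # log z ~ normal(1, 0.5)

# exp of the mean/sd of log z gives the geometric mean/sd of z
geo_mean = np.exp(np.log(z).mean())
geo_sd = np.exp(np.log(z).std())
print(round(geo_mean, 2), round(geo_sd, 2))    # near 2.72 and 1.65
```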
Binomial distribution¶
Basketball example: 20 shots, each with probability 0.3 of success, and independent (previous shots do not affect the success probability of future shots). The number of successful shots is said to have a binomial distribution with n=20, p=0.3.
For binary responses, the binomial distribution is more appropriate than the normal.
The binomial distribution with parameters n and p is denoted as binomial(n, p). The mean of a binomial distribution is np and the standard deviation is $\sqrt{np(1-p)}$.
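For the basketball example, the formulas give mean 20 × 0.3 = 6 and standard deviation sqrt(20 × 0.3 × 0.7) ≈ 2.05, which a simulation reproduces:

```python
import numpy as np

n, p = 20, 0.3                       # 20 shots, 0.3 success probability
mean = n * p                         # 6.0
sd = np.sqrt(n * p * (1 - p))        # sqrt(4.2), about 2.05

rng = np.random.default_rng(5)
shots = rng.binomial(n, p, size=100_000)       # simulated shooting sessions
print(round(shots.mean(), 2), round(shots.std(), 2))   # near 6.0 and 2.05
```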
Poisson distribution¶
Poisson distribution is used to model count data, such as the number of events that occur in a fixed interval of time or space.
It is almost always an idealisation, as it often ignores systematic differences between units and clustering.
It is generally recommended to expand the Poisson model to account for overdispersion.
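To see what overdispersion looks like, compare a Poisson sample (variance equals the mean) with a negative binomial sample of the same mean but larger variance. The negative binomial is one common overdispersed alternative; the parameters below are made up for illustration:

```python
import numpy as np

rng = np.random.default_rng(6)

# Poisson counts: variance equals the mean (both 5 here)
poisson = rng.poisson(lam=5.0, size=100_000)

# Negative binomial with mean n*(1-p)/p = 5 but variance n*(1-p)/p^2 = 10
negbin = rng.negative_binomial(n=5, p=0.5, size=100_000)

print(round(poisson.mean(), 1), round(poisson.var(), 1))   # both near 5
print(round(negbin.mean(), 1), round(negbin.var(), 1))     # near 5 and 10
```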
Unclassified distributions¶
Much data does not fit a named distribution, and that is fine. Named distributions are idealisations, useful for understanding the properties of data, but not necessary for analysis.
Probability distributions of error¶
In regression, we use a deterministic model to capture as much of the variation as possible, then use a probability distribution to model the remaining variation (the error): $y = \text{deterministic model} + \text{error}$.
For discrete data, error cannot be easily mathematically separated from the data, so we model the data directly with a probability distribution. For continuous data, we can separate the error from the data and model the error with a probability distribution.
Comparing distributions¶
We can look at the mean and other summary statistics. It can also make sense to consider shifts in quantiles. For example, after a treatment the 50th percentile might shift to the 54th percentile, which would indicate that the treatment has a positive effect on the outcome.
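The quantile-shift idea can be illustrated with simulated data; here a constant additive treatment effect of 1 point on a normal(50, 10) outcome is assumed:

```python
import numpy as np

rng = np.random.default_rng(7)
control = rng.normal(50, 10, size=100_000)
treated = control + 1.0             # hypothetical constant treatment effect

# The treated median sits at about the 54th percentile of the control
# distribution: the 50th percentile has shifted up
pct = (control < np.median(treated)).mean() * 100
print(round(pct, 1))                # close to 54
```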
3.6 Probability modeling¶
What is the probability that my vote is decisive in the election? That is, what is the probability that my vote will change the outcome of the election?
Consider an election with two candidates, A and B, and n voters. A tie is possible only if n is even, in which case my vote breaks it. Consider two ways to estimate the probability that my vote is decisive: empirical forecasting (recommended) and binomial modeling (not recommended).
Empirical forecasting¶
Let y be the vote share for candidate A and summarise our uncertainty in y with a normal distribution with mean 0.49 and standard deviation 0.04 - the candidate is predicted to lose, but there is uncertainty in the prediction.
The probability that the n votes are exactly split (possible only if n is even) can be approximated as 1/n times the forecast vote-share density at 0.5, i.e. 1/n times the normal density at 0.5 with mean 0.49 and standard deviation 0.04, which is approximately 9.7/n. For example, if n=200,000, norm.pdf(0.5, 0.49, 0.04) / 2e5 = 4.8e-05, or about 1 in 21,000 - very small. But if I convince 1000 people to change their vote, the probability of swinging the outcome is 1000*norm.pdf(0.5, 0.49, 0.04) / 2e5 = 0.048, or about 1 in 20 - much higher.
from scipy.stats import norm
print(norm.pdf(0.5, 0.49, 0.04) / 2e5)
print(1/norm.pdf(0.5, 0.49, 0.04) * 2e5)
4.8333514600356156e-05 20689.57757921108
Using a reasonable seeming but inappropriate model¶
Suppose n=200,000 voters each vote for a particular candidate independently with probability p. With p=0.5, the probability of a tie is 0.0018, about 1 in 556. But if we shift the probability to p=0.49, the probability of a tie drops to 7.5e-21 - effectively zero, which is not realistic.
Why?
Binomial assumption is not appropriate, as it assumes that each vote is independent and has the same probability of voting for a particular candidate. In reality, votes are not independent, as they are influenced by social networks, media, and other factors. Additionally, the probability of voting for a particular candidate is not the same for all voters, as it is influenced by demographic factors, political beliefs, and other factors. Therefore, the binomial model is not appropriate for modeling the probability of a tie in an election.
Key problem is that the binomial model does not account for the uncertainty in the vote share prediction. The empirical forecasting approach accounts for this uncertainty by using a normal distribution to model the vote share, which allows for a more realistic estimate of the probability of a tie.
Key lesson: be careful when using a model, always consider the assumptions of the model and whether they are reasonable in the context of the problem.
from scipy.stats import binom
print(binom.pmf(int(1e5), int(2e5), 0.5))
print(1/binom.pmf(int(1e5), int(2e5), 0.5))
print(binom.pmf(int(1e5), int(2e5), 0.49))
0.0017841218859990207 560.4998222641325 7.519171007276916e-21
General lessons for probability modeling¶
- Understand the assumptions of your model: Always consider what assumptions your model makes and whether they are reasonable in the context of the problem you're trying to solve.
- Use empirical data to inform your models: Whenever possible, use empirical data to inform your models and to validate their predictions. This can help ensure that your models are grounded in reality and are more likely to provide accurate predictions.