4. Statistical inference¶
Operations on data that yield estimates and uncertainty about predictions and parameters of some process or population. These probabilistic uncertainty estimates are based on an assumed probability model for the observed data.
Key point: it is a mistake to use hypothesis tests or statistical significance to claim certainty from noisy data.
4.1 Sampling distributions and generative models¶
Sampling, measurement error and model error¶
Three paradigms for the role of inference:
- Sampling model - learn characteristics of population from sample or subset.
- Measurement error model - learn about an underlying pattern or law (e.g., a and b in y = ax + b), but the data are measured with error (y = ax + b + e). Measurement error can be additive (y = ax + b + e) or multiplicative (y = (ax + b) * e).
- Model error model - the model itself is imperfect.
In practice, consider all three when constructing and working with models.
Example: predict student grades from pre-test scores. Sampling model: students in class are sample from population of all students. Measurement error model: pre-test scores and grades are measured with error - imperfect measure of ability. Model error model: relationship between pre-test scores and grades is not perfect.
Usual approach: $y_i = ax_i + b + e_i$, where $e_i$ is also interpretable as model error; $e_i$ is considered a random sample from a distribution (e.g., normal with mean 0 and standard deviation $\sigma$) that represents a hypothetical 'superpopulation' of all possible errors.
Sampling distribution¶
Sampling distribution: the set of possible datasets that could have been observed if data collection were re-done, along with the probabilities of those possible values. The sampling distribution is determined by the data collection process and the model for the data. It is a theoretical construct that is not directly observable, and the term is misleading (better would be 'probabilistic data model'), but it can be approximated by simulation or resampling methods (e.g., the bootstrap).
Example: for a pure random sample of size n from a population of size N, the sampling distribution is the set of all samples of size n, all with equal probability.
Example, pure measurement error: if observations $y_i, i=1,\ldots,n$ are generated by $y_i = ax_i + b + e_i$, where a and b are fixed parameters and the $e_i$ come from a specified distribution (e.g., normal with mean 0 and standard deviation $\sigma$), then the sampling distribution is the set of all possible datasets $y_i$ that could have been generated from the $x_i$ and the distribution of the $e_i$.
Sampling distribution is not known in practice, but can be estimated. Sampling distribution is a generative model in that it represents a random process which if known would allow us to generate new datasets that are similar to the observed data.
4.2 Estimates, standard errors, and confidence intervals¶
Parameters, estimands, and estimates¶
Parameters: unknown numbers that determine a model. E.g., in $y_i = ax_i + b + e_i$, the parameters are a and b (coefficients) and $\sigma$ (the scale, or 'variance parameter', of the error distribution). Parameters can be used to simulate data.
Estimand (or quantity of interest): a summary of parameters or data that we wish to learn about. E.g., in $y_i = ax_i + b + e_i$, the estimand could be a, b, or $\sigma$, or some function of them (e.g., a/b).
Use data to estimate parameters and estimands. The sampling distribution of an estimate is a byproduct of the sampling distribution of the data and the estimation procedure.
Standard errors, inferential uncertainty, and confidence intervals¶
Standard error: the standard deviation of an estimate. It gives a sense of the uncertainty in the estimate.
We usually summarise uncertainty using simulation, and give the term 'standard error' a looser meaning that covers any measure of uncertainty comparable to the posterior standard deviation.
Standard error is measure of variation in estimate and gets smaller as sample size increases, converging to zero as sample size goes to infinity. Standard error is not a measure of the probability that an estimate is correct, but rather a measure of the variability of the estimate across hypothetical repeated samples.
Confidence interval: range of values of parameter or quantity of interest that is consistent with the observed data, given the assumed sampling distribution. If the model is correct, then the confidence interval will contain the true value of the parameter or quantity of interest with a specified probability (e.g., 95%).
The usual 95% confidence interval for large samples, based on the assumption that the sampling distribution of the estimate is approximately normal, is the estimate plus or minus 1.96 times its standard error. This is because, for a normal distribution, approximately 95% of values lie within 1.96 standard deviations of the mean. The 50% interval is easy to understand: the true value is as likely to be within the interval as outside it.
Assuming the model is correct, it should happen only about 5% of the time that the estimate falls more than 2 standard errors away from the true parameter value.
Standard errors and confidence intervals for averages and proportions¶
Standard error of an average: given a sample of size n from a large (effectively infinite) population, the standard error of the sample mean is $\sigma / \sqrt{n}$, where $\sigma$ is the standard deviation of the population.
A proportion is a special case of an average where each observation is 0 or 1, with y yeses and n-y nos. The estimate of the proportion is $\hat{p} = y/n$ and its standard error is $\sqrt{\hat{p}(1-\hat{p})/n}$.
Confidence intervals for proportions can be calculated using the standard error formula. E.g., in a random sample of 1000 people, 700 support the death penalty and 300 oppose it, so $\hat{p} = 0.7$ and the standard error is $\sqrt{0.7 \times 0.3/1000} = 0.0145$. The 95% confidence interval is $0.7 \pm 1.96 \times 0.0145$, or (0.672, 0.728).
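A minimal check of this arithmetic in Python (the counts are from the example above):

```python
import numpy as np
from scipy import stats

# Death-penalty example: 700 support, 300 oppose, in a random sample of 1000
y, n = 700, 1000
p_hat = y / n                                        # estimated proportion
se = np.sqrt(p_hat * (1 - p_hat) / n)                # standard error of the proportion
ci_95 = p_hat + stats.norm.ppf([0.025, 0.975]) * se  # normal-approximation 95% CI
print(f"p_hat = {p_hat}, SE = {se:.4f}, 95% CI = [{ci_95[0]:.3f}, {ci_95[1]:.3f}]")
```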
Standard error and confidence interval for a proportion when y=0 or y=n¶
Conventionally, y and n-y should both be greater than 5 for the standard error formula to be valid. If y=0 or y=n, the standard error is zero and the confidence interval has zero width. A quick correction is $\hat{p} = (y + 2)/(n + 4)$, which gives a non-zero standard error and confidence interval.
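A sketch of the correction, using a hypothetical survey with n = 50 and zero yes responses (note the adjusted interval can extend slightly below zero; it is common to truncate it at the boundary):

```python
import numpy as np
from scipy import stats

# Hypothetical: none of n = 50 respondents answers yes, so the raw estimate
# is p_hat = 0 with SE = 0, giving a useless zero-width interval
y, n = 0, 50
p_adj = (y + 2) / (n + 4)                        # add 2 successes and 2 failures
se_adj = np.sqrt(p_adj * (1 - p_adj) / (n + 4))  # SE computed on the adjusted counts
ci_95 = p_adj + stats.norm.ppf([0.025, 0.975]) * se_adj
print(f"adjusted p = {p_adj:.3f}, SE = {se_adj:.4f}, 95% CI = [{ci_95[0]:.3f}, {ci_95[1]:.3f}]")
```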
Standard error for a comparison¶
Standard error of the difference of two independent quantities = $\sqrt{SE_1^2 + SE_2^2}$, where $SE_1$ and $SE_2$ are the standard errors of the two quantities.
Example: 1000 people, 400 men and 600 women, 57% men and 45% of women plan to vote for candidate A. Standard error of difference in proportions is $\sqrt{0.57*0.43/400 + 0.45*0.55/600} = 0.032$.
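The same calculation in Python (sample sizes and proportions from the example above):

```python
import numpy as np

# Voting example: 57% of 400 men and 45% of 600 women support candidate A
p_men, n_men = 0.57, 400
p_women, n_women = 0.45, 600

diff = p_men - p_women
se_men = np.sqrt(p_men * (1 - p_men) / n_men)        # SE of each proportion
se_women = np.sqrt(p_women * (1 - p_women) / n_women)
se_diff = np.sqrt(se_men**2 + se_women**2)           # independent SEs add in quadrature
print(f"difference = {diff:.2f}, SE = {se_diff:.3f}")
```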
Sampling distribution of the sample mean and standard deviation: normal and $\chi^2$ distributions¶
Suppose we draw n data points $y_1, \ldots, y_n$ from a normal distribution with mean $\mu$ and standard deviation $\sigma$, and compute the sample mean $\bar{y}=\sum y_i/n$ and sample standard deviation $s = \sqrt{\sum (y_i - \bar{y})^2/(n-1)}$. The sample mean is normally distributed with mean $\mu$ and standard deviation $\sigma/\sqrt{n}$. For the sample standard deviation, $s^2 (n-1)/\sigma^2$ is distributed as $\chi^2$ with n-1 degrees of freedom.
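These two sampling distributions can be checked by simulation; the values of $\mu$, $\sigma$, and n below are arbitrary:

```python
import numpy as np

rng = np.random.default_rng(0)
mu, sigma, n, n_sims = 5.0, 2.0, 10, 100_000

# Simulate many datasets of size n; record the sample mean and SD of each
draws = rng.normal(mu, sigma, size=(n_sims, n))
means = draws.mean(axis=1)
sds = draws.std(axis=1, ddof=1)

# Sample mean: normal with mean mu and sd sigma/sqrt(n) ≈ 0.632 here
print(means.mean(), means.std())

# (n-1) s^2 / sigma^2: chi-squared with n-1 df, whose mean is n-1 = 9 here
chi2_stat = (n - 1) * sds**2 / sigma**2
print(chi2_stat.mean())
```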
Degrees of freedom¶
Degrees of freedom arise with the $\chi^2$ distribution and in a few other places. They relate to the need to correct for overfitting when estimating the error of future predictions from a fitted model. Calculating predictive error on the same data used to fit the model will underestimate the true error, because the model overfits the noise in the data. Roughly, the data provide n degrees of freedom, but fitting a model with k parameters uses up k of them, leaving n-k degrees of freedom to estimate the error of future predictions. Lower degrees of freedom means more overfitting and more underestimation of the true error.
Confidence intervals from the t distribution¶
The t distribution is a family of distributions that are similar to the normal distribution but have heavier tails. A t distribution is characterised by its center, scale, and degrees of freedom (from 1 to $\infty$). Distributions in the t family with low degrees of freedom have heavier tails than the normal distribution, meaning they are more likely to produce values far from their center. As the degrees of freedom increase, the t distribution approaches the normal distribution.
When a standard error is estimated from n data points, we can account for uncertainty in the standard error by using the t distribution with n-1 degrees of freedom; the degrees of freedom are n-1 because the mean is estimated from the data and uses up one degree of freedom.
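A quick look at how the t multiplier for a 95% interval shrinks toward the normal value of 1.96 as the degrees of freedom increase:

```python
from scipy import stats

# 97.5th percentile (the 95%-interval multiplier) at various degrees of freedom;
# roughly 4.30 at df=2, 2.57 at df=5, 2.23 at df=10, 2.04 at df=30
for df in [2, 5, 10, 30, 100]:
    print(df, round(stats.t.ppf(0.975, df), 3))

# normal limit: 1.960
print("normal", round(stats.norm.ppf(0.975), 3))
```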
Inference for discrete data¶
We can use the same continuous formula for the standard error (sample standard deviation divided by $\sqrt{n}$).
Example: 1000 people; 600 have 0 dogs, 300 have 1 dog, 50 have 2 dogs, 30 have 3 dogs, and 20 have 4 dogs. What is the 95% confidence interval for the average number of dogs?
import numpy as np # for numerical operations
from scipy import stats # for statistical distributions
# Create the data
y = np.repeat([0, 1, 2, 3, 4], [600, 300, 50, 30, 20]) # repeat each value the specified number of times
n = len(y) # sample size (1000)
estimate = np.mean(y) # sample mean (point estimate)
se = np.std(y, ddof=1) / np.sqrt(n) # standard error = sample SD / sqrt(n), ddof=1 for sample SD
int_50 = estimate + stats.t.ppf([0.25, 0.75], df=n-1) * se # 50% confidence interval using t-distribution quantiles
int_95 = estimate + stats.t.ppf([0.025, 0.975], df=n-1) * se # 95% confidence interval using t-distribution quantiles
print(f"Estimate: {estimate:.3f}") # print the sample mean
print(f"SE: {se:.4f}") # print the standard error
print(f"50% CI: [{int_50[0]:.3f}, {int_50[1]:.3f}]") # print the 50% confidence interval
print(f"95% CI: [{int_95[0]:.3f}, {int_95[1]:.3f}]") # print the 95% confidence interval
Estimate: 0.570 SE: 0.0277 50% CI: [0.551, 0.589] 95% CI: [0.516, 0.624]
Linear transformations¶
In the example above, the 95% CI for the number of dogs per person is [0.516, 0.624]. What is the 95% CI for the number of dogs per 100 people? Just multiply the endpoints by 100: [51.6, 62.4].
Claude's layman's explanation¶
Based on our survey of 1,000 people, the average person owns about 0.57 dogs. We're 95% confident that if we could ask every adult in the country, the true average would be somewhere between 0.52 and 0.62 dogs per person.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats  # for the normal quantiles used in the 68% intervals below
# Read all lines from the file
with open('../ros_data/death_polls.dat', 'r') as f:
lines = [line.strip() for line in f.readlines()] # strip whitespace from each line
# Group every 4 lines into one record
records = []
for i in range(0, len(lines), 4): # step through lines in groups of 4
year_month = lines[i].split() # split "2002 10" into ["2002", "10"]
year = int(year_month[0]) # first value is the year
month = float(year_month[1]) # second value is the month (can be fractional)
val1 = int(lines[i+1]) # percent support
val2 = int(lines[i+2]) # percent oppose
val3 = int(lines[i+3]) # percent don't know
records.append([year, month, val1, val2, val3])
# Create a DataFrame
df = pd.DataFrame(records, columns=['year', 'month', 'support', 'oppose', 'dont_know'])
# val1, val2, val3 are percentages (sum to 100), not raw counts
# proportion supporting, among those with an opinion
df['percent_support'] = df['support'] / (df['support'] + df['oppose'])
# SE for a proportion requires actual sample size; Gallup polls typically survey ~1000 people
n_respondents = 1000
df['se'] = np.sqrt(df['percent_support'] * (1 - df['percent_support']) / n_respondents)
df['ci_68_lower'] = df['percent_support'] - stats.norm.ppf(0.84) * df['se']
df['ci_68_upper'] = df['percent_support'] + stats.norm.ppf(0.84) * df['se']
display(df.head())
fig, ax = plt.subplots(figsize=(10, 5))
# Convert year + fractional month to a decimal year for x-axis positioning
x = df['year'] + (df['month'] - 1) / 12
# Error bars from ci_68_lower to ci_68_upper
yerr_lower = (df['percent_support'] - df['ci_68_lower']) * 100
yerr_upper = (df['ci_68_upper'] - df['percent_support']) * 100
ax.errorbar(x, df['percent_support'] * 100, yerr=[yerr_lower, yerr_upper],
fmt='o', color='#1a1a2e', ecolor='#888', elinewidth=1.2,
capsize=3, capthick=1.2, markersize=5)
ax.set_xlabel('Year')
ax.set_ylabel('Support for death penalty (%)')
ax.set_title('Public support for the death penalty')
sns.despine()
plt.tight_layout()
plt.show()
| | year | month | support | oppose | dont_know | percent_support | se | ci_68_lower | ci_68_upper |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2002 | 10.0 | 70 | 25 | 5 | 0.736842 | 0.013925 | 0.722994 | 0.750690 |
| 1 | 2002 | 5.0 | 72 | 25 | 3 | 0.742268 | 0.013831 | 0.728513 | 0.756023 |
| 2 | 2001 | 10.0 | 68 | 26 | 6 | 0.723404 | 0.014145 | 0.709337 | 0.737471 |
| 3 | 2001 | 5.0 | 65 | 27 | 8 | 0.706522 | 0.014400 | 0.692202 | 0.720842 |
| 4 | 2001 | 2.0 | 67 | 25 | 8 | 0.728261 | 0.014068 | 0.714271 | 0.742250 |
Weighted averages¶
Confidence intervals can be determined by appropriately combining separate means and variances.
Example: surveys in France, Germany, and Italy yield estimates 0.55 ± 0.02, 0.61 ± 0.03, and 0.38 ± 0.03. The estimated proportion for all three countries combined is $\frac{N_1}{N_{total}} \times 0.55 + \frac{N_2}{N_{total}} \times 0.61 + \frac{N_3}{N_{total}} \times 0.38$, where $N_i$ is the population of country i and $N_{total}$ is the total population of all three countries. The standard error of this weighted average is $\sqrt{(\frac{N_1}{N_{total}} \times 0.02)^2 + (\frac{N_2}{N_{total}} \times 0.03)^2 + (\frac{N_3}{N_{total}} \times 0.03)^2}$. Given $N_i$, the estimate, and the standard error for each country, we can calculate the weighted average and its standard error, and then construct a confidence interval for the overall estimate.
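A sketch of the calculation; the population sizes below are made-up stand-ins (roughly the three countries' populations in millions), while the estimates and standard errors are from the example above:

```python
import numpy as np

# Assumed populations (millions) for France, Germany, Italy - hypothetical values
N = np.array([65.0, 83.0, 59.0])
p = np.array([0.55, 0.61, 0.38])    # country-level estimates (from the text)
se = np.array([0.02, 0.03, 0.03])   # country-level standard errors (from the text)

w = N / N.sum()                      # population weights
p_avg = np.sum(w * p)                # weighted average estimate
se_avg = np.sqrt(np.sum((w * se)**2))  # SE of the weighted average
ci_95 = p_avg + np.array([-1.96, 1.96]) * se_avg
print(f"weighted avg = {p_avg:.3f}, SE = {se_avg:.4f}, 95% CI = [{ci_95[0]:.3f}, {ci_95[1]:.3f}]")
```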
4.3 Bias and unmodeled uncertainty¶
Inferences assume unbiased measurements, random samples, and randomised experiments; however, real data collection is imperfect.
Bias in estimation¶
An estimate is unbiased if it is correct on average, across its sampling distribution.
Non-response or unrepresentative response can result in a biased sample and a biased estimate.
In reality, all estimates are biased in some way.
Bias depends on the sampling distribution, which is unknown.
Adjusting inferences to account for bias and unmodeled uncertainty¶
The standard error might not fully capture the real, practical uncertainty associated with an inference; there can be important sources of uncertainty that are not captured in the simple standard error.
For example: systematic differences between survey respondents and voters, variation in opinion over time, and inaccurate responses.
How to account for sources of error not in the model: improve data collection, expand the model, and increase the stated uncertainty.
Expand model: divide the sample into subgroups and assume a simple random sample within each. Not perfect, but this allows us to reduce bias in estimation by adjusting for known differences between sample and population.
Increase uncertainty: typically assume errors are independent and so capture additional uncertainty by adding variances. Variance is the square of the standard deviation.
Total uncertainty = $\sqrt{S_{1}^2 + S_{2}^2}$ where $S_{1}$ is the standard error from the sampling distribution and $S_{2}$ is the standard error from the additional source of uncertainty. The mathematics says that it will be most effective to reduce the largest source of uncertainty, so if $S_{1}$ is much larger than $S_{2}$, then reducing $S_{1}$ will have a bigger impact on reducing total uncertainty than reducing $S_{2}$.
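A small numerical illustration, using assumed values $S_1 = 0.02$ and $S_2 = 0.03$, showing that shrinking the larger source of uncertainty reduces the total more:

```python
import numpy as np

s1, s2 = 0.02, 0.03                        # assumed sampling SE and extra SE
total = np.sqrt(s1**2 + s2**2)             # combined uncertainty
print(round(total, 4))                     # 0.0361

# Halving each source in turn: reducing the larger one helps more
print(round(np.sqrt((s1/2)**2 + s2**2), 4))  # halve the smaller source: 0.0316
print(round(np.sqrt(s1**2 + (s2/2)**2), 4))  # halve the larger source: 0.025
```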
4.4 Statistical significance, hypothesis testing, and statistical errors¶
Concerns the possibility of mistakenly coming to strong conclusions that don't replicate or reflect reality. Statistical significance and hypothesis testing are common approaches to try to avoid this, but they have limitations and can be misused.
Statistical significance¶
Don't misinterpret a statistically significant result as evidence that an effect is stable or real.
Conventionally defined as p-value less than 0.05, relative to some null hypothesis. A p-value is the probability of observing data as extreme or more extreme than the observed data, given that the null hypothesis is true. A small p-value (e.g., less than 0.05) is often taken as evidence against the null hypothesis, suggesting that the observed data is unlikely to have occurred by chance alone under the null hypothesis.
An estimate is not statistically significant if it falls less than 2 standard errors from the null value - that is, if the observed value could reasonably be explained by simple chance variation.
Example: 20 coin tosses, 8 heads and 12 tails. Estimate of probability of heads is 0.4 with standard error of 0.11. Not statistically significant because 0.4 is less than 2 standard errors from null value of 0.5, so observed value could reasonably be explained by simple chance variation.
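The coin-toss check in Python (counts from the example above):

```python
import numpy as np

# 8 heads in 20 tosses: is this evidence against a fair coin?
y, n = 8, 20
p_hat = y / n
se = np.sqrt(p_hat * (1 - p_hat) / n)   # ≈ 0.11
z = (p_hat - 0.5) / se                  # distance from the null value, in SEs
print(f"estimate = {p_hat}, SE = {se:.2f}, z = {z:.2f}")
# |z| < 2, so the result is not statistically significant
```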
Hypothesis testing for simple comparisons¶
Simple example: a randomized experiment to compare the effectiveness of two drugs for lowering cholesterol. The mean and standard deviation of post-treatment cholesterol are $\bar{y}_T$ and $s_T$ for the $n_T$ patients in the treatment group, and $\bar{y}_C$ and $s_C$ for the $n_C$ patients in the control group.
Estimate, standard error and degrees of freedom. The parameter of interest is $\phi = \phi_T - \phi_C$, the difference in post-treatment cholesterol between treatment and control groups. The estimate of $\phi$ is $\hat{\phi} = \bar{y}_T - \bar{y}_C$, with standard error $se(\hat{\phi}) = \sqrt{\frac{s_T^2}{n_T} + \frac{s_C^2}{n_C}}$. The approximate 95% confidence interval is $\hat{\phi} \pm t_{n_T+n_C-2}^{0.975} \, se(\hat{\phi})$, where $t_{n_T+n_C-2}^{0.975}$ is the 97.5th percentile of the t distribution with $n_T+n_C-2$ degrees of freedom; as the sample sizes grow, this quantile approaches 1.96, corresponding to the 95% confidence interval for a normal distribution.
Null and alternative hypotheses. The null hypothesis is $\phi = 0$ (no difference between treatment and control groups), that is, $\phi_{T} = \phi_{C}$. The alternative hypothesis is $\phi \neq 0$ (there is a difference), that is, $\phi_{T} \neq \phi_{C}$.
A hypothesis test is based on a test statistic that summarises the deviation of the data from what would be expected under the null hypothesis. The conventional test statistic here is the absolute value of the t-score, $t=|\hat{\phi} - 0|/se(\hat{\phi})$, with the absolute value representing a two-sided test - so called because we are interested in deviations in either direction (treatment better than control or control better than treatment).
p-value. The deviation from the null hypothesis is summarised by the p-value: the probability of observing a test statistic as extreme or more extreme than the observed one, given that the null hypothesis is true.
If the standard error of $\hat{\phi}$ is known rather than estimated from the data, we can use the normal distribution (z-test) instead of the t distribution.
In common practice, the null hypothesis is said to be rejected if the p-value is less than 0.05 - equivalently, if the 95% confidence interval for the parameter excludes zero. This means that if the null hypothesis were true, we would expect to see a test statistic this extreme only about 5% of the time due to random chance alone.
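A sketch of the whole comparison, using made-up summary statistics for the two groups (the formulas are the ones given above):

```python
import numpy as np
from scipy import stats

# Hypothetical summary statistics for the two-drug cholesterol comparison
ybar_T, s_T, n_T = 185.0, 20.0, 50   # treatment group (made-up numbers)
ybar_C, s_C, n_C = 200.0, 25.0, 50   # control group (made-up numbers)

phi_hat = ybar_T - ybar_C                       # estimated difference
se = np.sqrt(s_T**2 / n_T + s_C**2 / n_C)       # SE of the difference
df = n_T + n_C - 2                              # degrees of freedom
t_stat = abs(phi_hat) / se                      # two-sided test statistic
p_value = 2 * stats.t.sf(t_stat, df)            # two-sided p-value
ci_95 = phi_hat + stats.t.ppf([0.025, 0.975], df) * se
print(f"estimate = {phi_hat:.1f}, SE = {se:.2f}, t = {t_stat:.2f}, p = {p_value:.4f}")
print(f"95% CI = [{ci_95[0]:.1f}, {ci_95[1]:.1f}]")
```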
Hypothesis testing: general formulation¶
Simplest form of hypothesis testing, null hypothesis $H_0$ represents a particular probability model, $p(y)$ with potential replication data $y^{rep}$. To perform a hypothesis test, we choose a test statistic $T$ that is a function of the data. For any given data y, the p-value is then $Pr(T(y^{rep}) \geq T(y) | H_0)$, which is the probability that the test statistic calculated from the replication data $y^{rep}$ is as extreme or more extreme than the test statistic calculated from the observed data y, given that the null hypothesis $H_0$ is true. If this p-value is small (e.g., less than 0.05), we reject the null hypothesis, suggesting that the observed data is unlikely to have occurred by chance alone under the null hypothesis.
In regression, testing is more complicated. The model can be written as $p(y|x,\phi)$, where $\phi$ is a set of parameters including coefficients, the residual standard deviation, and any other parameters of the model. A null hypothesis is that a particular parameter (e.g., a coefficient) equals zero, corresponding to the idea that there is no relationship between the predictor variable and the outcome variable.
Consider the model $y_i = a + b x_i + e_i$, where the errors are normally distributed with mean 0 and standard deviation $\sigma$. Then $\phi$ is the vector $(a, b, \sigma)$. The null hypothesis $b=0$ is composite: it corresponds to the regression model with parameters $(a, 0, \sigma)$ for any values of a and $\sigma$. The p-value $Pr(T(y^{rep}) \geq T(y) | H_0)$ then depends on a and $\sigma$, and what is typically done is to choose the maximum p-value over these (the most conservative choice). To put it another way, the hypothesis test is performed against the null distribution closest to the data.
Comparisons of parameters to fixed values and each other: interpreting confidence intervals as hypothesis tests¶
Hypothesis that parameter equals zero (or any other value) can be tested by fitting a model that includes the parameter and examining the 95% interval. If the interval excludes zero (or the value of interest), then the hypothesis is rejected at the 5% level.
Testing if two parameters are equal is the same as testing if the difference between the two parameters is equal to zero. This can be done by fitting a model that includes both parameters and examining the confidence interval for the difference between the two parameters. If the confidence interval for the difference excludes zero, then we reject the hypothesis that the two parameters are equal at the 5% level.
The hypothesis that one parameter is positive can be assessed by examining its confidence interval. Testing whether one parameter is greater than another can be assessed by examining the confidence interval for the difference between them. If the confidence interval for the difference is entirely above zero, then we reject the hypothesis that one parameter is less than or equal to the other at the 5% level.
Hypothesis test outcomes are "reject null hypothesis" or "fail to reject null hypothesis". We never say "accept null hypothesis" because failing to reject the null hypothesis does not mean that the null hypothesis is true, it just means that we do not have enough evidence to reject it.
Type 1 and type 2 errors and why we don't like talking about them¶
Type 1 error: rejecting the null hypothesis when it is actually true. Type 2 error: failing to reject the null hypothesis when it is actually false.
The fundamental problem is that in many problems we do not think the null hypothesis can be exactly true. For example, a political advert will change some opinions; we might frame the null hypothesis as the advert having no effect, but we are not really interested in type 1 error. We are more interested in type 2 error: failing to detect an effect when there is one.
Concern: When a study finds a "statistically significant" result (p < 0.05, say), people don't just say "there's an effect." They look at the estimated size of that effect and use it to make decisions. For example, "this drug lowers blood pressure by 10 mmHg" — that number matters for real-world choices. Because of this, we should not only care about whether an effect is statistically significant, but also about the accuracy of the effect size estimate and the uncertainty around that estimate. This is a problem, because statistically significant results tend to be overestimates of the true effect size ("winners curse" or "type M error" - magnitude error).
The type 1 / type 2 error framing is based on a deterministic view of effects, which might be appropriate for large, stable effects, but less so in modern sciences with small, highly variable effects.
Type M (magnitude) and type S (sign) errors¶
Type M and S errors can occur when a researcher makes a claim with confidence (traditionally, a p-value less than 0.05 or a confidence interval excluding zero). A type S error is when the sign of the estimated effect is opposite to the true effect. A type M error is when the magnitude of the estimated effect is much different from the true effect.
A statistical procedure can be characterised by its type S error rate - the probability of making a type S error - and its exaggeration factor, which is the expected magnitude of the estimated effect divided by the magnitude of the true effect, given that the estimate is statistically significant.
When a procedure is noisy, the type S error rate and exaggeration factor can be large, meaning that even when we get a statistically significant result, it can be in the wrong direction (type S error) or an overestimate of the true effect size (type M error). This is a problem because it can lead to false confidence in results that are not actually reliable.
In quantitative research, we are particularly concerned with type M errors or exaggeration factors, which can be understood in light of the "statistical significance filter" - the idea that when we only look at statistically significant results, we are more likely to see overestimates of the true effect size, because only the larger estimates will be statistically significant. Imagine the true effect of something is actually quite small. If your study is noisy (high standard error), your result won't reach that "twice the standard error" threshold unless you happen to get a lucky, inflated measurement by chance. So the only results from noisy studies that make it through to publication are the ones that got lucky and measured an effect that's bigger than the truth. Say the true effect is 3 units, but your standard error is 5. A result of 3 would never be published (not significant). Only results of 10+ might squeak through - and those are flukes. So readers only ever see the flukes. The noisier your study, the worse this problem gets. Noisy studies require bigger-looking results to get published, so they systematically overstate reality. This is why many published findings in social science, medicine, and psychology have failed to replicate - the real effects turn out to be much smaller than originally reported.
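The significance filter can be demonstrated by simulation, reusing the numbers above (true effect 3, standard error 5) and recording the type S error rate and exaggeration factor among significant estimates:

```python
import numpy as np

# Simulate the sampling distribution of an estimate with true effect 3, SE 5,
# and look only at the "statistically significant" replications
rng = np.random.default_rng(1)
true_effect, se = 3.0, 5.0
est = rng.normal(true_effect, se, size=1_000_000)
sig = np.abs(est) > 1.96 * se                     # significant at the 5% level

type_s_rate = np.mean(est[sig] < 0)               # wrong sign, given significance
exaggeration = np.mean(np.abs(est[sig])) / true_effect
print(f"P(significant)      = {sig.mean():.3f}")
print(f"type S error rate   = {type_s_rate:.3f}")
print(f"exaggeration factor = {exaggeration:.1f}")
```

With these numbers, significant estimates overstate the true effect by roughly a factor of four, and a few percent of them even have the wrong sign.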
Hypothesis testing and statistical practice¶
We generally don't use null hypothesis significance testing in practice, because we generally don't think the null hypothesis is true. The question is never really "is there an effect?" but rather "how big is the effect, and does it matter?". If you collect enough data, you can statistically "prove" almost anything is non-zero. So significance testing basically just tells you whether your sample was large enough - not whether the effect is meaningful or important. Instead we should focus on estimating effect sizes and their uncertainty, and on understanding the practical significance of our findings rather than just the statistical significance.
- When you fail to reject the null: This doesn't mean "there's no effect." It just means "our data isn't informative enough to say much beyond the baseline assumption." It's a signal about the limits of your data, not a claim about reality. Think of it like saying "we couldn't hear anything" rather than "there was no sound."
- When you do reject the null: The point isn't "aha, we proved the effect exists!" — they already assumed it existed before the study started. Instead, rejection means "our data contains enough of a signal that it's worth building a more detailed, complex model." It's a green light to dig deeper, not a finish line.
The key mental shift: In the standard view, rejecting the null is the goal. In their view, it's just a diagnostic tool — a way of asking whether your data has enough signal to be worth analyzing further. And failing to reject just means your data is too weak to say anything useful, not that the null is actually true.
Researchers often begin with a belief that there is an effect, their aim is to reject the null hypothesis of no effect, and then to claim that their theory is supported. But this is a flawed way of thinking about the scientific process for many reasons. E.g., the effect could be due to poor measurement, random variation, and other sources of error can lead to false positives (type 1 errors).
The issue is that a statistical hypothesis (e.g., B=0) is different from a scientific hypothesis.
4.5 Problems with the concept of statistical significance¶
The approach of summarising results by statistical significance and drawing a sharp distinction between "significant" and "not significant" is problematic for several reasons.
Statistical significance is not the same as practical significance¶
A result can be statistically significant but not practically significant. A result could also fail to be statistically significant but still be practically significant.
E.g., a treatment that increases annual earnings by $10 could be statistically significant (with a large enough sample) but not practically significant. Conversely, a treatment that increases annual earnings by $10,000 could fail to reach statistical significance (in a small or noisy study) yet be practically significant.
Non significant results are not evidence of no effect¶
Failing to reject the null hypothesis does not mean that the null hypothesis is true. It just means that we do not have enough evidence to reject it. There could still be an effect, but our data is not informative enough to detect it. So a non-significant result is not evidence of no effect, it's just evidence of insufficient data.
E.g., observed average difference in treadmill time was 16.6 seconds with standard error of 10 seconds, 95% confidence interval included zero and p-value of 0.2, so not statistically significant. Fair to say results are uncertain. But lack of significance does not mean that there is no effect of the treatment on treadmill time, it just means that we do not have enough evidence to reject the null hypothesis of no effect. The true effect could still be positive or negative, but our data is not informative enough to say which.
The difference between “significant” and “not significant” is not itself statistically significant¶
A common mistake is to compare two estimates and conclude that one is significantly different from zero while the other is not, and then to claim that the two estimates are significantly different from each other. However, this is not necessarily the case.
E.g., a shift in the p-value from 0.051 to 0.049 crosses the conventional significance threshold, yet could result from a very small, non-significant change in the data.
Researcher degrees of freedom, p-hacking, and forking paths¶
Researchers have many degrees of freedom in how they collect, analyze, and report data. This can lead to p-hacking, which is the practice of trying multiple analyses or data collection strategies until you get a statistically significant result.
The standard multiple comparisons problem is familiar: if you run 20 tests, one will likely appear significant by chance alone. Most researchers know they shouldn't do this obvious kind of fishing.
The sneakier version is that you can have the same problem without ever consciously running multiple tests. Here's how: At every stage of data analysis, researchers make small judgment calls — "researcher degrees of freedom":
Do I include or exclude these outliers? Do I control for this variable or not? Do I log-transform this data? Do I define the outcome this way or that way? Do I drop observations from this subgroup?
Each of these decisions creates a fork in the road. Taken together, the number of possible analyses you could have run is enormous — even if you only actually ran one.
The problem: If the path you chose was influenced at all by what the data seemed to be showing — even subconsciously — then you've effectively searched through many possible analyses and landed on one that looked good. You did the equivalent of running 50 tests without realizing it.
The key word is "contingent" — meaning your analytical choices depended on the data itself. When that happens, your single final analysis isn't truly independent of the data, which is what statistical tests assume.
A concrete example: You run your regression, notice the results look weak, and think "hmm, maybe I should drop those unusual observations" or "maybe I should add this control variable." You make the change, the result becomes significant, and you report it. You only ran one final analysis — but the road to it was shaped by peeking at results along the way.
In short: the multiple comparisons problem isn't just about how many tests you ran — it's about how many you implicitly considered while making decisions. And that number is almost always much larger than it appears.
Four ways a researcher might conduct a statistical test:
- One test, decided in advance, with no data peeking. Clean and unambiguous.
- Menu of analyses, decided in advance, with no data peeking. Still clean, but more flexible.
- Run only one analysis, but it was influenced by peeking at the data. This is the most common and most problematic scenario. Data shapes the analysis.
- Run many analyses and report only the significant one. This is the classic p-hacking scenario.
In short: the problem isn't just deliberate cheating. It's that letting data influence your analytical decisions — even innocently — corrupts your results in the same way that explicit fishing does.
Adjusting p-values for multiple comparisons can help, but it doesn't solve the underlying problem of data-contingent analysis.
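The arithmetic behind the 20-tests intuition is easy to check by simulation. A minimal sketch, assuming independent tests all conducted under the null (so p-values are uniform on [0, 1]), estimates how often at least one of 20 tests comes out "significant" at the 5% level:

```python
import numpy as np

rng = np.random.default_rng(42)
n_studies, n_tests = 10_000, 20
# Under the null hypothesis, p-values are uniformly distributed on [0, 1]
pvals = rng.uniform(size=(n_studies, n_tests))
# Fraction of simulated "studies" where at least one of the 20 tests is significant
any_sig = (pvals < 0.05).any(axis=1).mean()
print(f"P(at least one 'significant' test out of 20) ~= {any_sig:.2f}")
print(f"Theory: 1 - 0.95**20 = {1 - 0.95**20:.2f}")
```

Both numbers come out around 0.64, so even with no real effects anywhere, a researcher exploring 20 forks will usually find something that looks significant.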
Authors suggest:
- Test how robust your results are to different analytical choices. If your result only holds under one specific set of decisions, that's a red flag.
- Stop treating p-values as the finish line. "We found a modest, uncertain effect" is legitimate and honest.
The statistical significance filter¶
Statistically significant estimates tend to be overestimates of the true effect size, because only the larger estimates will be statistically significant.
E.g., in a study with high noise, the standard error is large, so only estimates that are much larger than the true effect will reach statistical significance.
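A quick simulation illustrates the filter. Assuming (hypothetically) a true effect of 2 measured with standard error 8, the estimates that clear the 5% significance threshold are far larger than the truth:

```python
import numpy as np

rng = np.random.default_rng(1)
true_effect, se = 2.0, 8.0                  # assumed: modest true effect, very noisy study
est = rng.normal(true_effect, se, size=100_000)  # replications of the noisy estimate
sig = np.abs(est) > 1.96 * se               # the statistical significance filter
filtered_mean = np.abs(est[sig]).mean()     # average size of the estimates that survive
print(f"share significant: {sig.mean():.3f}")
print(f"mean |estimate| after the filter: {filtered_mean:.1f} (truth: {true_effect})")
```

Only a few percent of replications are significant, and those that are overstate the true effect by nearly an order of magnitude.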
Example: A flawed study of ovulation and political attitudes¶
Example showing both the multiple-comparisons (forking paths) problem and the statistical significance filter.
Claims: "Ovulation led single women to become more liberal, less religious, and more likely to vote for Barack Obama. In contrast, ovulation led married women to become more conservative, more religious, and more likely to vote for Mitt Romney. In addition, ovulatory-induced changes in political orientation mediated women’s voting behavior."
But many other comparisons could have been made. Choices include: which days of the month are classed as peak fertility, how single vs. married is defined (unmarried but partnered women were counted as married), and so on. With enough comparisons, some will be statistically significant by chance alone, and those are the ones that get reported.
Effects seem implausibly large (a type M error), which is consistent with the statistical significance filter: only the larger estimates reach statistical significance, so the reported estimates tend to overestimate the true effect size.
4.6 Example of hypothesis testing: 55,000 residents need your help!¶
Suspicion that votes have been rigged. Tallies are reported after the first 600 votes, then the next 600, the next 1244, the next 1000, the next 1000, and the final 1109, totalling 5553.
- Null hypothesis: votes arrive in random order, so each batch is a random sample of all voters.
- Test statistic: standard deviation of proportion of votes for candidate i across the 6 batches of votes.
- Theoretical distribution of the data if the null hypothesis is true. Under the null, the 6 subsets are random samples of the voters. If $\pi_{i}$ is the overall proportion of voters for candidate i, then the proportion who vote for i during time t, $p_{i,t}$, follows a distribution with mean $\pi_{i}$ and variance $\pi_{i}(1-\pi_{i})/n_t$. Under the null hypothesis, the variance of the $p_{i,t}$ across time should on average equal the average of the six batch variances, so the variance of the $p_{i,t}$, whose square root is our test statistic, should be approximately $\mathrm{avg}_{t=1}^{6}\, \pi_{i}(1-\pi_{i})/n_t$. The true proportions are not known, so following standard practice we insert the empirical proportions $p_i$, giving an expected value of the test statistic for each candidate i of $T_{i}^{\text{theory}} = \sqrt{p_{i}(1-p_{i})\, \mathrm{avg}_{t=1}^{6}\, 1/n_t}$.
- Comparing the test statistic to its theoretical distribution.
- Summary comparison using a $\chi^2$ test. Under the null, the probability of a candidate receiving votes is independent of time, so we can compute the summary statistic $\chi^2 = \sum_{j=1}^{2} \sum_{t=1}^{6} (\mathrm{observed}_{j,t} - \mathrm{expected}_{j,t})^2/\mathrm{expected}_{j,t}$ and compare it to its theoretical distribution: under the null hypothesis this statistic has a $\chi^2$ distribution with 5 degrees of freedom. If the observed $\chi^2$ statistic is much larger than expected under the null (a small p-value), we reject the null hypothesis and conclude that there is evidence of rigging. If it is not much larger than expected, we fail to reject the null and conclude that there is not enough evidence to suggest rigging.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
votes = pd.read_csv('../ros_data/Riverbay.csv', skiprows=0, names=['1_tally', '2_tally', '3_tally', '4_tally', '5_tally', '6_tally', 'candidate'])
# print the unique candidate names and count of unique candidate names
print(votes['candidate'].unique())
print(len(votes['candidate'].unique()))
# drop rows where the candidate name is missing (NaN)
votes.dropna(subset=['candidate'], inplace=True)
#reset the index to be the candidate names
votes.set_index('candidate', inplace=True)
# filter for candidates of interest
candidates_of_interest = ['Hal Spitzer', 'Margie Best', 'Greg Stevens', 'Josh Walker', 'Clotelia Smith', 'Dave Barron', 'Alphonse Preston', 'Andy Willis']
votes = votes[votes.index.isin(candidates_of_interest)]
display(votes.head())
# print unique names (index)
print(votes.index.unique())
# for each tally calculate the difference from the previous tally
votes['diff_2'] = votes['2_tally'] - votes['1_tally']
votes['diff_3'] = votes['3_tally'] - votes['2_tally']
votes['diff_4'] = votes['4_tally'] - votes['3_tally']
votes['diff_5'] = votes['5_tally'] - votes['4_tally']
votes['diff_6'] = votes['6_tally'] - votes['5_tally']
# copy the differences into a new DataFrame for plotting
diffs = votes[['1_tally', 'diff_2', 'diff_3', 'diff_4', 'diff_5', 'diff_6']].copy()
# rename columns for better plotting
diffs.columns = ['1', '2', '3', '4', '5', '6']
# calculate the sum for each tally
print(diffs[['1', '2', '3', '4', '5', '6']].sum())
# calculate the total for each candidate across all tallies
diffs['total'] = diffs[['1', '2', '3', '4', '5', '6']].sum(axis=1)
# display(diffs.head())
# for each candidate, calculate the percentage of votes they received in each tally, out of the total votes in that tally
for col in ['1', '2', '3', '4', '5', '6']:
    diffs[col] = diffs[col] / diffs[col].sum() * 100
display(diffs.head())
cols = ['1', '2', '3', '4', '5', '6']
# plot the percentage of votes for each candidate across the tallies, include legend and labels
plt.figure(figsize=(10, 6))
for candidate in diffs.index:
    plt.plot(cols, diffs.loc[candidate, cols], marker='o', label=candidate)
plt.xlabel('Tally')
plt.ylabel('Percentage of Votes (%)')
plt.title('Percentage of Votes for Each Candidate Across Tallies')
plt.legend()
plt.grid()
plt.tight_layout()
plt.show()
# Calculate the standard deviation of votes for each candidate across the 6 tallies
diffs['std_dev'] = diffs[['1', '2', '3', '4', '5', '6']].std(axis=1)
display(diffs[['std_dev']])
# plot the standard deviation of votes for each candidate on y axis and the total votes for each candidate on the x axis
plt.figure(figsize=(10, 6))
plt.scatter(diffs['total'], diffs['std_dev'])
plt.xlabel('Total Votes')
plt.ylabel('Standard Deviation of Votes Across Tallies')
plt.title('Standard Deviation of Votes Across Tallies vs Total Votes')
plt.grid()
plt.tight_layout()
plt.show()
['Clotelia Smith' 'Earl Coppin' 'Clarissa Montes' nan 'Hal Spitzer' 'Margie Best' 'Josh Walker' 'Greg Stevens' 'Dave Barron' 'Andy Willis' 'Alphonse Preston']
11
| candidate | 1_tally | 2_tally | 3_tally | 4_tally | 5_tally | 6_tally |
|---|---|---|---|---|---|---|
| Clotelia Smith | 208 | 416 | 867 | 1259 | 1610 | 2020 |
| Hal Spitzer | 333 | 650 | 1326 | 1870 | 2418 | 3040 |
| Margie Best | 236 | 483 | 1017 | 1422 | 1821 | 2300 |
| Josh Walker | 229 | 450 | 922 | 1318 | 1688 | 2131 |
| Greg Stevens | 235 | 462 | 970 | 1342 | 1724 | 2176 |
Index(['Clotelia Smith', 'Hal Spitzer', 'Margie Best', 'Josh Walker',
'Greg Stevens', 'Dave Barron', 'Andy Willis', 'Alphonse Preston'],
dtype='object', name='candidate')
1 1803
2 1833
3 3869
4 3050
5 3020
6 3491
dtype: int64
| candidate | 1 | 2 | 3 | 4 | 5 | 6 | total |
|---|---|---|---|---|---|---|---|
| Clotelia Smith | 11.536328 | 11.347518 | 11.656759 | 12.852459 | 11.622517 | 11.744486 | 2020 |
| Hal Spitzer | 18.469218 | 17.294053 | 17.472215 | 17.836066 | 18.145695 | 17.817244 | 3040 |
| Margie Best | 13.089296 | 13.475177 | 13.802016 | 13.278689 | 13.211921 | 13.720997 | 2300 |
| Josh Walker | 12.701054 | 12.056738 | 12.199535 | 12.983607 | 12.251656 | 12.689774 | 2131 |
| Greg Stevens | 13.033833 | 12.384070 | 13.130008 | 12.196721 | 12.649007 | 12.947579 | 2176 |
| candidate | std_dev |
|---|---|
| Clotelia Smith | 0.536054 |
| Hal Spitzer | 0.429701 |
| Margie Best | 0.286932 |
| Josh Walker | 0.362337 |
| Greg Stevens | 0.376835 |
| Dave Barron | 0.203392 |
| Andy Willis | 0.884638 |
| Alphonse Preston | 0.461595 |
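The $\chi^2$ test described above can be computed directly from the per-batch figures. A sketch for one candidate, using Clotelia Smith's cumulative tallies from the table above (per-batch votes are the successive differences) and assuming the batch sizes from the text apply as the denominators; `scipy.stats.chi2` supplies the reference distribution:

```python
import numpy as np
from scipy import stats

# Batch sizes (voters per reporting period) and one candidate's cumulative tallies
n_t = np.array([600, 600, 1244, 1000, 1000, 1109])
cumulative = np.array([208, 416, 867, 1259, 1610, 2020])  # Clotelia Smith
votes_for = np.diff(cumulative, prepend=0)  # per-batch votes for the candidate
votes_against = n_t - votes_for             # per-batch voters not choosing her

p_i = votes_for.sum() / n_t.sum()           # empirical overall proportion
expected_for = p_i * n_t
expected_against = (1 - p_i) * n_t

# chi-squared statistic over the 2 x 6 table of (for, against) x batch
chi2_stat = (((votes_for - expected_for) ** 2 / expected_for)
             + ((votes_against - expected_against) ** 2 / expected_against)).sum()
p_value = stats.chi2.sf(chi2_stat, df=5)    # 6 batches - 1 = 5 degrees of freedom
print(f"chi2 = {chi2_stat:.2f}, p = {p_value:.2f}")
```

For this candidate the statistic is unremarkable for a $\chi^2$ with 5 degrees of freedom (the p-value is far from 0), so the batch-to-batch variation is consistent with random sampling rather than rigging.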
4.7 Moving beyond hypothesis testing¶
Hypothesis testing has issues, but it allows conclusions to be drawn from noisy data and provides a check on over-interpretation of noise. How to avoid overconfidence and exaggeration:
- Analyse all data - don't discard data; present and analyse it all.
- Present all comparisons, not just the significant ones. This allows readers to see the full picture and avoid being misled by the statistical significance filter.
- Make data available.
Good analysis is no substitute for good data.
We must move beyond the idea that effects are simply there or not, and the idea that the goal of a study is to reject the null hypothesis.
Think about variation when generalising. Does p < 0.05 represent eternal or even local truth? No, for two reasons: (1) uncertainty: with small effects, a large proportion of significant results can be in the wrong direction (type S error) or can overestimate the true effect size (type M error); (2) variation: even with perfect data and perfect analysis, the true effect size can vary across contexts, populations, and time, so rejecting the null hypothesis in one study doesn't mean the effect will be the same in another.
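Both failure modes are easy to demonstrate by simulation. Assuming (hypothetically) a tiny true effect of 0.1 with standard error 1, the replications that reach significance misbehave in both ways:

```python
import numpy as np

rng = np.random.default_rng(7)
true_effect, se = 0.1, 1.0                       # assumed: tiny effect, sizeable noise
est = rng.normal(true_effect, se, size=200_000)  # replications of the study
sig = np.abs(est) > 1.96 * se                    # "statistically significant" at 5%

type_s = (est[sig] < 0).mean()                   # significant but wrong sign
type_m = np.abs(est[sig]).mean() / true_effect   # average exaggeration factor
print(f"type S rate: {type_s:.2f}, type M (exaggeration): {type_m:.1f}x")
```

Under these assumptions, roughly four in ten significant results point in the wrong direction, and the significant estimates overstate the true effect by more than an order of magnitude.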
In short: an estimated large effect is typically too good to be true, and a small effect can be lost in the noise.
We move forward accepting uncertainty and embracing variation through Bayesian methods.