import pandas as pd
import seaborn as sns
import scipy.stats
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Load the country-level table of health spending and lifespan.
# The separator is a regular expression matching runs of whitespace; it is
# written as a raw string because '\s' inside a normal string literal is an
# invalid escape sequence and triggers a SyntaxWarning.
healthexpenditure = pd.read_csv(
    '../ros_data/healthdata.txt',
    sep=r'\s+',
)

display(healthexpenditure.head())  # Display first few rows of the dataset

# Scatterplot of spending vs lifespan, one labelled point per country.
# Columns used: country, spending, lifespan.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(healthexpenditure['spending'], healthexpenditure['lifespan'])

# Write each country's name next to its point.
labelled_points = zip(
    healthexpenditure['country'],
    healthexpenditure['spending'],
    healthexpenditure['lifespan'],
)
for country, spending, lifespan in labelled_points:
    ax.annotate(country, (spending, lifespan), fontsize=9, alpha=0.7)

ax.set_title('Health Expenditure vs Lifespan by Country')
ax.set_xlabel('Health Expenditure per Capita (USD)')
ax.set_ylabel('Average Lifespan (Years)')
plt.show()

<>:8: SyntaxWarning: invalid escape sequence '\s'
<>:8: SyntaxWarning: invalid escape sequence '\s'
/var/folders/6m/c2f6bq0j2fbclrsw82pn99380000gn/T/ipykernel_3242/3270023866.py:8: SyntaxWarning: invalid escape sequence '\s'
  sep='\s+')

# Read the names table, keep only the rows whose sex is 'M', and record the
# final character of each name in a 'last_letter' column.
allnames = pd.read_csv('../ros_data/allnames_clean.csv')
is_male = allnames['sex'] == 'M'
allnames = allnames.loc[is_male].assign(
    last_letter=lambda males: males['name'].str[-1]
)

display(allnames.head())

# Reshape from wide (one X<year> column per year: X1880, X1881, ...) to long
# (one row per name and year). The bare 'X' column is not a year column and
# is excluded.
year_columns = [
    column for column in allnames.columns
    if column.startswith('X') and column != 'X'
]
allnames_long = pd.melt(
    allnames,
    id_vars=['name', 'sex', 'last_letter'],
    value_vars=year_columns,
    var_name='year',
    value_name='count',
)

display(allnames_long.head())

# Drop the leading 'X' from each year label and store the year as an integer.
year_digits = allnames_long['year'].str.slice(start=1)
allnames_long['year'] = year_digits.astype(int)

# Build a year-by-letter table: one row per year, one column per last letter,
# each cell holding the summed count (0 where a combination is absent).
names_pivot = pd.pivot_table(
    allnames_long,
    values='count',
    index='year',
    columns='last_letter',
    aggfunc='sum',
    fill_value=0,
)

display(names_pivot)

# Bar chart of last-letter shares for 1906, 1956 and 2006.
# x-axis: last letter; y-axis: that letter's percentage of the year's total.
years_to_plot = [1906, 1956, 2006]
percentage_data = {
    year: (names_pivot.loc[year] / names_pivot.loc[year].sum()) * 100
    for year in years_to_plot
}
percentage_df = pd.DataFrame(percentage_data)

percentage_df.plot.bar(figsize=(12, 6))
plt.title('Last Letter Frequencies in Male Names for Selected Years')
plt.xlabel('Last Letter')
plt.ylabel('Percentage of Names (%)')
plt.legend(title='Year')
plt.show()

# Line chart of every last letter's share of the yearly total, across all
# years present in names_pivot.
names_pivot_percentage = names_pivot.div(names_pivot.sum(axis=1), axis=0) * 100

# Take the year range for the title from the data itself; a hard-coded range
# goes stale (or is simply wrong) when the table covers different years.
first_year = names_pivot_percentage.index.min()
last_year = names_pivot_percentage.index.max()

plt.figure(figsize=(14, 8))
for letter in names_pivot_percentage.columns:
    plt.plot(names_pivot_percentage.index, names_pivot_percentage[letter], label=letter)
plt.title(f'Trends in Last Letter Frequencies in Male Names ({first_year}-{last_year})')
plt.xlabel('Year')
plt.ylabel('Percentage of Names (%)')
# Place the legend outside the axes so it does not cover the lines.
plt.legend(title='Last Letter', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

# Load the death-rate table and keep only ages 45 through 54 (inclusive).
# The separator is a regular expression matching runs of whitespace; it is
# written as a raw string because '\s' inside a normal string literal is an
# invalid escape sequence and triggers a SyntaxWarning.
data = pd.read_csv(
    '../ros_data/white_nonhisp_death_rates_from_1999_to_2013_by_sex.txt',
    sep=r'\s+',
)
data = data[(data['Age'] >= 45) & (data['Age'] <= 54)]

# Deaths and population summed within each (Year, Age) cell, plus the
# resulting death rate (deaths divided by population).
by_year_age = (
    data.groupby(['Year', 'Age'])[['Deaths', 'Population']]
    .sum()
    .reset_index()
    .assign(death_rate=lambda totals: totals['Deaths'] / totals['Population'])
)
display(by_year_age.head())

# Same totals and rate, pooled over all ages within each year.
by_year = (
    data.groupby('Year')[['Deaths', 'Population']]
    .sum()
    .reset_index()
    .assign(death_rate=lambda totals: totals['Deaths'] / totals['Population'])
)
display(by_year)

# Raw (unadjusted) yearly trend.
sns.lineplot(x="Year", y="death_rate", data=by_year)
plt.ylabel('Death Rate')
plt.show()

# Age-adjusted yearly rate built from the year-age breakdown: every age gets
# equal weight, so the per-age death rates are averaged within each year.
# Averaging (rather than summing) keeps the result on the scale of a rate,
# directly comparable with the pooled rate Deaths / Population.
# Deaths are still totalled across ages.
by_year_adjusted = (
    by_year_age.groupby('Year')
    .agg({'death_rate': 'mean', 'Deaths': 'sum'})
    .reset_index()
)
display(by_year_adjusted)

# Plot adjusted trend
sns.lineplot(data=by_year_adjusted, x="Year", y="death_rate")
plt.ylabel('Adjusted Death Rate')
plt.show()

# Death rate within each (Year, Age, Male) cell, then combined across ages
# for each year and each value of 'Male'.
by_year_age_gender = data.groupby(['Year', 'Age', 'Male'])[['Deaths', 'Population']].sum().reset_index()
by_year_age_gender['death_rate'] = by_year_age_gender['Deaths'] / by_year_age_gender['Population']
# The per-age rates are summed here; the result is divided by its 1999 value
# below, so only the change relative to 1999 is kept.
by_year_age_gender = by_year_age_gender.groupby(['Year', 'Male'])[['death_rate']].sum().reset_index()

# Normalise each group's series by its own 1999 value so both start at 1.0.
# The baseline is looked up explicitly per value of 'Male' rather than by
# indexing each group with a boolean mask built from the whole frame, which
# only worked through implicit index alignment.
is_baseline_year = by_year_age_gender['Year'] == 1999
baseline_rate = by_year_age_gender.loc[is_baseline_year].set_index('Male')['death_rate']
by_year_age_gender['death_rate'] = (
    by_year_age_gender['death_rate'] / by_year_age_gender['Male'].map(baseline_rate)
)
print('by_year_age_gender')
display(by_year_age_gender.head())

# Plot adjusted trend, one line per value of 'Male'
sns.lineplot(data=by_year_age_gender, x="Year", y="death_rate", hue='Male')
plt.ylabel('Adjusted Death Rate')
plt.show()

<>:2: SyntaxWarning: invalid escape sequence '\s'
<>:2: SyntaxWarning: invalid escape sequence '\s'
/var/folders/6m/c2f6bq0j2fbclrsw82pn99380000gn/T/ipykernel_3242/3699307391.py:2: SyntaxWarning: invalid escape sequence '\s'
  data = pd.read_csv('../ros_data/white_nonhisp_death_rates_from_1999_to_2013_by_sex.txt', sep='\s+')

# Bare expression: as the last statement of a notebook cell this shows the frame.
by_year_age_gender

	country	spending	lifespan
0	Australia	3357	81.4
1	Austria	3763	80.1
2	Belgium	3595	79.8
3	Canada	3895	80.7
4	Czech	1626	77.0

	X	name	sex	X1880	X1881	X1882	X1883	X1884	X1885	X1886	...	X2002	X2003	X2004	X2005	X2006	X2007	X2008	X2009	X2010	last_letter
61353	61407	John	M	9655	8769	9557	8894	9387	8756	9026	...	17429	17206	16429	15747	15140	14405	13273	12048	11424	n
61354	61408	William	M	9533	8524	9298	8387	8897	8044	8252	...	20103	19976	20213	19025	18915	18839	18337	17852	16870	m
61355	61409	James	M	5927	5442	5892	5224	5693	5175	5355	...	16941	16880	16431	16108	16213	15908	15108	14121	13714	s
61356	61410	Charles	M	5348	4637	5092	4826	4802	4599	4533	...	7203	7689	7642	7918	7999	7440	7259	7254	7028	s
61357	61411	George	M	5126	4664	5193	4736	4961	4674	4671	...	3010	2909	2734	2820	2699	2755	2544	2375	2344	e

last_letter	a	b	c	d	e	f	g	h	i	j	...	q	r	s	t	u	v	w	x	y	z
year
1880	754	509	349	9177	13466	108	147	4040	200	0	...	0	7454	18384	6934	25	33	853	304	8328	29
1881	742	470	331	8387	12389	83	146	3761	206	0	...	0	7273	16320	6198	26	18	748	267	7767	8
1882	766	506	349	9702	14584	135	146	4162	207	0	...	0	7988	18182	7051	10	43	875	355	8774	31
1883	734	452	299	8796	13153	106	135	3905	167	0	...	0	7510	16489	6724	16	44	794	283	8236	12
1884	788	496	319	9856	14599	131	160	4210	158	0	...	0	8983	17657	6983	15	36	826	343	9155	27
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2006	36070	42600	26635	51410	142937	1968	1929	98359	33558	1241	...	430	176451	143207	43509	2201	2010	43217	13992	123621	3086
2007	34565	42123	26864	50595	143645	2090	2040	99302	35231	1254	...	431	177166	142137	43393	2311	2295	40251	14306	123330	3301
2008	32841	39945	25318	47910	140905	2195	2059	100155	38151	1381	...	339	174595	137037	43798	2405	2418	36937	14834	122565	3473
2009	31378	38862	24048	46167	135456	2212	2396	99881	40912	1416	...	377	173177	129848	43637	2417	2589	33181	16640	112873	3633
2010	28392	38859	23125	44398	128951	2255	2666	98015	42956	1459	...	342	166047	123649	43370	2318	2723	30656	16352	110392	3476

	Year	Age	Deaths	Population	death_rate
0	1999	45	8304	3166393	0.002623
1	1999	46	8809	3007083	0.002929
2	1999	47	9135	2986252	0.003059
3	1999	48	9461	2805975	0.003372
4	1999	49	10265	2859406	0.003590

	Year	Deaths	Population	death_rate
0	1999	106808	27995805	0.003815
1	2000	111964	28669184	0.003905
2	2001	117086	29733531	0.003938
3	2002	119812	29880552	0.004010
4	2003	121832	30260532	0.004026
5	2004	123037	30629390	0.004017
6	2005	126669	31024600	0.004083
7	2006	127969	31365190	0.004080
8	2007	127898	31561137	0.004052
9	2008	130423	31631081	0.004123
10	2009	130740	31600635	0.004137
11	2010	128034	31444802	0.004072
12	2011	127836	30850953	0.004144
13	2012	124142	30209438	0.004109
14	2013	122531	29497978	0.004154

2. Data and measurement

2.1 Examining where data come from

Details of measurement can be important

2.2 Validity and reliability

2.3 All graphs are comparisons

Simple scatter plots

Display more information on a graph

Multiple plots

Grids of plots

Applying graphical principles to numerical displays and communication more generally

Graphics for understanding statistical models

2.4 Data and adjustment: trends in mortality rates

2.5 Bibliographic note

2.6 Exercises

	Year	death_rate	Deaths
0	1999	0.039082	106808
1	2000	0.039782	111964
2	2001	0.039642	117086
3	2002	0.040536	119812
4	2003	0.040720	121832
5	2004	0.040581	123037
6	2005	0.041233	126669
7	2006	0.041120	127969
8	2007	0.040724	127898
9	2008	0.041305	130423
10	2009	0.041330	130740
11	2010	0.040599	128034
12	2011	0.041051	127836
13	2012	0.040540	124142
14	2013	0.040836	122531

	Year	Male	death_rate
0	1999	0	1.000000
1	1999	1	1.000000
2	2000	0	1.009832
3	2000	1	1.022359
4	2001	0	1.010740