# Required packages
library("tidyverse")
library("archive")
library("assertr")
library("dataverse")
library("gssr")
library("WDI")
# Create '_raw' subdirectory to store downloads, and 'data' for final cleaned files
# Ensure the '_raw' download cache directory exists before anything is fetched
if (!dir.exists("_raw")) {
  dir.create("_raw")
}
if (!dir.exists("data"))
dir.create("data")

Appendix B — Data Sources
B.1 anes_2020.csv: Subset of 2020 ANES survey
This file contains a small number of variables from the 2020 wave of the American National Election Studies Time Series Study.
Despite the direct download link on ANES’s website, they’ve got it locked down to prevent programmatic access from utilities like download.file(). (Not running glimpse() this time because the data has 1700+ columns.)
# Path where the manually-downloaded ANES 2020 Time Series csv must live
anes_file <- "_raw/anes_timeseries_2020_csv_20220210.csv"

# ANES blocks programmatic downloads, so fail with instructions if the raw
# file hasn't been fetched by hand yet
if (!file.exists(anes_file)) {
  stop("Need to download data manually from ANES website")
}
df_anes_raw <- read_csv(anes_file)

Extract the few columns we care about and convert numeric codes to understandable values:
# Recode the raw ANES data into a small, analysis-ready subset. ANES stores
# responses as numeric codes (negative values are refused / don't know /
# inapplicable codes); each recode below maps valid codes to readable values
# and lets unmatched codes fall through to NA. `.keep = "none"` drops every
# raw V2xxxxx column, so only the variables created here survive.
df_anes <- df_anes_raw |>
  mutate(
    # Sequential respondent identifier (row position in the raw file)
    id = row_number(),
    # V201014b: state, coded with numeric FIPS codes. Gaps in the sequence
    # (3, 7, 14, 43, 52) are unused FIPS codes, not omissions.
    state = case_match(
      V201014b,
      1 ~ "Alabama",
      2 ~ "Alaska",
      4 ~ "Arizona",
      5 ~ "Arkansas",
      6 ~ "California",
      8 ~ "Colorado",
      9 ~ "Connecticut",
      10 ~ "Delaware",
      11 ~ "District of Columbia",
      12 ~ "Florida",
      13 ~ "Georgia",
      15 ~ "Hawaii",
      16 ~ "Idaho",
      17 ~ "Illinois",
      18 ~ "Indiana",
      19 ~ "Iowa",
      20 ~ "Kansas",
      21 ~ "Kentucky",
      22 ~ "Louisiana",
      23 ~ "Maine",
      24 ~ "Maryland",
      25 ~ "Massachusetts",
      26 ~ "Michigan",
      27 ~ "Minnesota",
      28 ~ "Mississippi",
      29 ~ "Missouri",
      30 ~ "Montana",
      31 ~ "Nebraska",
      32 ~ "Nevada",
      33 ~ "New Hampshire",
      34 ~ "New Jersey",
      35 ~ "New Mexico",
      36 ~ "New York",
      37 ~ "North Carolina",
      38 ~ "North Dakota",
      39 ~ "Ohio",
      40 ~ "Oklahoma",
      41 ~ "Oregon",
      42 ~ "Pennsylvania",
      44 ~ "Rhode Island",
      45 ~ "South Carolina",
      46 ~ "South Dakota",
      47 ~ "Tennessee",
      48 ~ "Texas",
      49 ~ "Utah",
      50 ~ "Vermont",
      51 ~ "Virginia",
      53 ~ "Washington",
      54 ~ "West Virginia",
      55 ~ "Wisconsin",
      56 ~ "Wyoming"
    ),
    # V201600 recoded to a 0/1 indicator (code 2 -> 1; per the ANES
    # codebook 2 = female -- confirm against codebook if reused)
    female = case_match(
      V201600,
      1 ~ 0,
      2 ~ 1
    ),
    # V201601: codes 2-4 collapsed into a single LGBT indicator
    lgbt = case_match(
      V201601,
      1 ~ 0,
      2:4 ~ 1
    ),
    # V201549x: summary race/ethnicity categories
    race = case_match(
      V201549x,
      1 ~ "White",
      2 ~ "Black",
      3 ~ "Hispanic",
      4 ~ "Asian",
      5 ~ "Native American",
      6 ~ "Multiracial"
    ),
    # V201507x: age in years; non-positive values are missing-data codes
    age = if_else(V201507x > 0, V201507x, NA),
    # V201511x: summary education level
    education = case_match(
      V201511x,
      1 ~ "Less than high school",
      2 ~ "High school",
      3 ~ "Some college",
      4 ~ "Bachelor's degree",
      5 ~ "Graduate degree"
    ),
    # V201517: employment indicator (1 -> employed, 2 -> not employed)
    employed = case_match(
      V201517,
      1 ~ 1,
      2 ~ 0
    ),
    # V201527: weekly hours worked; -1 (inapplicable, i.e. not working) is
    # treated as 0 hours, other negative codes become NA
    hours_worked = case_when(
      V201527 == -1 ~ 0,
      V201527 > 0 ~ V201527,
      TRUE ~ NA
    ),
    # V201630c/d: TV program indicators; -1 (inapplicable) and 0 (not
    # selected) both count as "does not watch"
    watch_tucker = case_match(
      V201630c,
      c(-1, 0) ~ 0,
      1 ~ 1
    ),
    watch_maddow = case_match(
      V201630d,
      c(-1, 0) ~ 0,
      1 ~ 1
    ),
    # Feeling thermometers: valid responses are 0-100; anything outside
    # that range is a missing-data code and becomes NA
    therm_biden = if_else(V201151 %in% 0:100, V201151, NA),
    therm_trump = if_else(V201152 %in% 0:100, V201152, NA),
    therm_harris = if_else(V201153 %in% 0:100, V201153, NA),
    therm_pence = if_else(V201154 %in% 0:100, V201154, NA),
    therm_obama = if_else(V201155 %in% 0:100, V201155, NA),
    therm_dem_party = if_else(V201156 %in% 0:100, V201156, NA),
    therm_rep_party = if_else(V201157 %in% 0:100, V201157, NA),
    therm_feminists = if_else(V202160 %in% 0:100, V202160, NA),
    therm_liberals = if_else(V202161 %in% 0:100, V202161, NA),
    therm_labor_unions = if_else(V202162 %in% 0:100, V202162, NA),
    therm_big_business = if_else(V202163 %in% 0:100, V202163, NA),
    therm_conservatives = if_else(V202164 %in% 0:100, V202164, NA),
    therm_supreme_court = if_else(V202165 %in% 0:100, V202165, NA),
    therm_congress = if_else(V202167 %in% 0:100, V202167, NA),
    therm_police = if_else(V202171 %in% 0:100, V202171, NA),
    therm_scientists = if_else(V202173 %in% 0:100, V202173, NA),
    # V202019: contributed money to a party (1 -> yes, 2 -> no)
    contributed_to_party = case_match(
      V202019,
      1 ~ 1,
      2 ~ 0
    ),
    # V202068x: turnout summary; codes 0-1 -> did not vote, 2 -> voted
    voted = case_match(
      V202068x,
      0:1 ~ 0,
      2 ~ 1
    ),
    # V202073: presidential vote choice. Negative codes become NA first,
    # then one 0/1 indicator is built per major-party candidate
    # (1 = Biden, 2 = Trump, 3-8 = other candidates).
    voted_for_biden = if_else(V202073 < 0, NA, V202073),
    voted_for_biden = case_match(
      voted_for_biden,
      1 ~ 1,
      2:8 ~ 0
    ),
    voted_for_trump = if_else(V202073 < 0, NA, V202073),
    voted_for_trump = case_match(
      voted_for_trump,
      2 ~ 1,
      c(1, 3:8) ~ 0
    ),
    # Drop all raw ANES columns; keep only the variables defined above
    .keep = "none"
  )
glimpse(df_anes)

Rows: 8,280
Columns: 31
$ id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13…
$ state <chr> "Oklahoma", "Idaho", "Virginia", "Califor…
$ female <dbl> 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0,…
$ lgbt <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ race <chr> "Hispanic", "Asian", "White", "Asian", "N…
$ age <dbl> 46, 37, 40, 41, 72, 71, 37, 45, 70, 43, 3…
$ education <chr> "Bachelor's degree", "Some college", "Hig…
$ employed <dbl> 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,…
$ hours_worked <dbl> 40, 40, 0, 40, 0, 0, 30, 40, 0, 30, 25, 5…
$ watch_tucker <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ watch_maddow <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ therm_biden <dbl> 0, 0, 65, 70, 15, 85, 50, 50, 85, 85, 100…
$ therm_trump <dbl> 100, 0, 0, 15, 85, 0, 75, 100, 0, 0, 0, 0…
$ therm_harris <dbl> 0, 0, 65, 85, 15, 85, 15, 50, 85, 50, 100…
$ therm_pence <dbl> 85, 0, 0, 15, 90, 0, 75, 50, 0, 50, 0, 50…
$ therm_obama <dbl> 0, 50, 90, 85, 10, 60, 15, 50, 60, 100, 1…
$ therm_dem_party <dbl> 0, 0, 60, 50, 20, 85, 15, 50, NA, 60, 100…
$ therm_rep_party <dbl> 85, 50, 0, 70, 70, 15, 75, 100, NA, 50, 0…
$ therm_feminists <dbl> 65, 100, 75, 70, 30, 60, 60, 100, 50, 50,…
$ therm_liberals <dbl> 30, 0, 75, 70, 10, 70, 0, NA, 30, 50, 50,…
$ therm_labor_unions <dbl> 30, 70, 75, 70, 50, 50, 50, 0, 30, 50, 50…
$ therm_big_business <dbl> 70, 50, 0, 85, 0, 40, 50, 0, 50, 15, 50, …
$ therm_conservatives <dbl> 85, 15, 0, 70, 60, 40, 60, NA, 50, 50, 50…
$ therm_supreme_court <dbl> 100, 50, 25, 85, 60, 60, 70, 50, 50, 50, …
$ therm_congress <dbl> 40, 15, 0, 100, 10, 85, 50, 50, 50, 40, 5…
$ therm_police <dbl> 85, 90, 40, 100, 70, 70, 60, 100, 60, 70,…
$ therm_scientists <dbl> 100, 70, 100, 85, 60, 85, 85, NA, 60, 50,…
$ contributed_to_party <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ voted <dbl> 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,…
$ voted_for_biden <dbl> NA, 0, 1, 1, 0, 1, 0, NA, NA, 1, 1, 1, 0,…
$ voted_for_trump <dbl> NA, 0, 0, 0, 1, 0, 1, NA, NA, 0, 0, 0, 1,…
Save to anes_2020.csv:
write_csv(df_anes, "data/anes_2020.csv")

B.2 county_pres.csv: County-level presidential election returns, 2000–2024
This file contains data from the MIT Election Lab dataset on County Presidential Election returns.
Obtain raw data from Harvard Dataverse:
# Download raw data from Harvard Dataverse repository
#
# The download only runs when the local cache is missing; the result is
# written to '_raw' so reruns of this script stay offline.
county_pres_file <- "_raw/countypres_2000-2024.csv"
if (!file.exists(county_pres_file)) {
  # Fetch a pinned version of the MIT Election Lab file by DOI so the
  # cleaning below stays reproducible even if the dataset is updated
  df_county_pres_dataverse <- get_dataframe_by_name(
    filename = "countypres_2000-2024.tab",
    dataset = "10.7910/DVN/VOQCHQ",
    server = "dataverse.harvard.edu",
    version = "15.0"
  )
  write_csv(df_county_pres_dataverse, county_pres_file)
}
# Read from the local cache (whether freshly downloaded or pre-existing)
df_county_pres_raw <- read_csv(county_pres_file)
glimpse(df_county_pres_raw)

Rows: 94,409
Columns: 12
$ year <dbl> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2…
$ state <chr> "ALABAMA", "ALABAMA", "ALABAMA", "ALABAMA", "ALABAMA", …
$ state_po <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "…
$ county_name <chr> "AUTAUGA", "AUTAUGA", "AUTAUGA", "AUTAUGA", "BALDWIN", …
$ county_fips <chr> "01001", "01001", "01001", "01001", "01003", "01003", "…
$ office <chr> "US PRESIDENT", "US PRESIDENT", "US PRESIDENT", "US PRE…
$ candidate <chr> "AL GORE", "GEORGE W. BUSH", "OTHER", "RALPH NADER", "A…
$ party <chr> "DEMOCRAT", "REPUBLICAN", "OTHER", "GREEN", "DEMOCRAT",…
$ candidatevotes <dbl> 4942, 11993, 113, 160, 13997, 40872, 578, 1033, 5188, 5…
$ totalvotes <dbl> 17208, 17208, 17208, 17208, 56480, 56480, 56480, 56480,…
$ version <dbl> 20250712, 20250712, 20250712, 20250712, 20250712, 20250…
$ mode <chr> "TOTAL", "TOTAL", "TOTAL", "TOTAL", "TOTAL", "TOTAL", "…
Clean data to have one row per county-year:
# Collapse the candidate-level returns to one row per county-year, keeping
# Democratic and Republican vote totals plus derived margin measures.
# Rows with missing party or zero recorded votes are dropped first so the
# percentage calculations below are well-defined.
df_county_pres <- df_county_pres_raw |>
  filter(!is.na(party), totalvotes > 0) |>
  rename(county = county_name) |>
  group_by(year, state, county) |>
  summarize(
    # FIPS code and total votes are constant within a county-year, so
    # first() simply carries them through the aggregation
    county_fips = first(county_fips),
    total_votes = first(totalvotes),
    dem_votes = sum(candidatevotes[party == "DEMOCRAT"]),
    rep_votes = sum(candidatevotes[party == "REPUBLICAN"]),
    .groups = "drop"
  ) |>
  mutate(
    # Census-style region for each state; unmatched states -> "Unknown"
    region = fct_collapse(
      state,
      Northeast = c(
        "CONNECTICUT", "MAINE", "MASSACHUSETTS", "NEW HAMPSHIRE", "RHODE ISLAND",
        "VERMONT", "NEW JERSEY", "NEW YORK", "PENNSYLVANIA"
      ),
      Midwest = c(
        "ILLINOIS", "INDIANA", "MICHIGAN", "OHIO", "WISCONSIN", "IOWA", "KANSAS",
        "MINNESOTA", "MISSOURI", "NEBRASKA", "NORTH DAKOTA", "SOUTH DAKOTA"
      ),
      South = c(
        "DELAWARE", "DISTRICT OF COLUMBIA", "FLORIDA", "GEORGIA", "MARYLAND",
        "NORTH CAROLINA", "SOUTH CAROLINA", "VIRGINIA", "WEST VIRGINIA", "ALABAMA",
        "KENTUCKY", "MISSISSIPPI", "TENNESSEE", "ARKANSAS", "LOUISIANA", "OKLAHOMA",
        "TEXAS"
      ),
      West = c(
        "ARIZONA", "COLORADO", "IDAHO", "MONTANA", "NEVADA", "NEW MEXICO", "UTAH",
        "WYOMING", "ALASKA", "CALIFORNIA", "HAWAII", "OREGON", "WASHINGTON"
      ),
      other_level = "Unknown"
    ),
    # Dem-minus-Rep margin, in raw votes and as a share of all votes cast
    margin = dem_votes - rep_votes,
    pct_margin = margin / total_votes,
    # Ordinal competitiveness scale from -3 (safe R) to +3 (safe D);
    # |pct_margin| < 0.04 counts as a toss-up (0)
    competitiveness = case_when(
      pct_margin < -0.2 ~ -3,
      pct_margin < -0.1 ~ -2,
      pct_margin < -0.04 ~ -1,
      pct_margin < 0.04 ~ 0,
      pct_margin < 0.1 ~ 1,
      pct_margin < 0.2 ~ 2,
      TRUE ~ 3
    ),
  ) |>
  # Statewide Democratic-win indicator.
  # NOTE(review): grouping is by state only, so this pools county votes
  # across ALL election years rather than within each year -- confirm
  # that is the intended definition.
  group_by(state) |>
  mutate(
    dem_win_state = as.numeric(sum(dem_votes) > sum(rep_votes)),
  ) |>
  ungroup() |>
  # Fail loudly if any missing values slipped through the cleaning
  assert(not_na, everything()) |>
select(year, state, region, everything())

Save to county_pres.csv:
write_csv(df_county_pres, "data/county_pres.csv")

B.3 crises.csv: International crises
This file contains the International Crisis Behavior actor-level data, version 16. There’s purposely no additional cleaning since it’s used in the data wrangling lecture.
The raw data is stored via a Box link that doesn’t work with download.file(), and my efforts to get ChatGPT to help me get to the underlying data were unsuccessful.
# The ICB actor-level file must be fetched by hand (its Box link rejects
# programmatic downloads), so bail out with instructions when it's absent
crises_file <- "_raw/icb2v16.csv"
if (!file.exists(crises_file)) {
  stop("Need to download data manually from ICB website")
}

# Load the raw actor-level crisis data from the local cache
df_crises_raw <- read_csv(crises_file)
glimpse(df_crises_raw)

Rows: 1,131
Columns: 95
$ icb2 <chr> "ICB2", "ICB2", "ICB2", "ICB2", "ICB2", "ICB2", "ICB2…
$ crisno <dbl> 1, 2, 2, 3, 4, 4, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 8, 8,…
$ cracno <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
$ cracid <dbl> 365, 93, 94, 365, 365, 366, 368, 367, 315, 290, 310, …
$ actor <chr> "RUS", "NIC", "COS", "RUS", "RUS", "EST", "LIT", "LAT…
$ systrgyr <dbl> 1918, 1918, 1918, 1918, 1918, 1918, 1918, 1918, 1919,…
$ systrgmo <dbl> 5, 5, 5, 6, 11, 11, 11, 11, 1, 1, 3, 3, 3, 3, 3, 3, 4…
$ systrgda <dbl> NA, 25, 25, 23, 18, 18, 18, 18, 15, 15, 20, 20, 20, 2…
$ crisname <chr> "RUSSIAN CIVIL WAR I", "COSTA RICAN COUP", "COSTA RIC…
$ triggr <dbl> 9, 7, 4, 7, 6, 9, 9, 9, 2, 7, 2, 9, 2, 9, 7, 7, 2, 7,…
$ yrtrig <dbl> 1918, 1918, 1919, 1918, 1918, 1918, 1918, 1918, 1919,…
$ motrig <dbl> 5, 5, 1, 6, 11, 11, 12, 12, 1, 1, 3, 5, 6, 7, 3, 5, 4…
$ datrig <dbl> NA, 25, 25, 23, 18, 22, NA, NA, 15, 23, 20, 11, 8, 20…
$ trigent <dbl> 996, 94, 996, 997, 366, 365, 365, 365, 290, 315, 997,…
$ trigloc <dbl> 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ southv <dbl> 220, 94, 93, 200, 366, 365, 365, 365, 290, 315, 360, …
$ southpow <dbl> 3, 1, 1, 3, 1, 3, 3, 3, 2, 1, 1, 1, 3, 1, 3, 2, 3, 1,…
$ sizedu <dbl> 1, NA, NA, 1, 1, NA, NA, NA, NA, NA, 3, 3, 3, NA, 3, …
$ strcdu <dbl> 1, 1, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, 1, NA, 1,…
$ comlev <dbl> 7, 1, 1, 7, 8, 8, 8, 8, 3, 3, 1, NA, 1, NA, NA, 3, 1,…
$ majres <dbl> 8, 3, 6, 8, 8, 9, 9, 9, 8, 8, 8, 8, 1, 8, 6, 8, 8, 8,…
$ yerres <dbl> 1918, 1918, 1919, 1918, 1918, 1918, 1918, 1918, 1919,…
$ monres <dbl> 5, 5, 1, 7, 11, 11, 12, 12, 1, 1, 3, 5, 6, 7, 5, 5, 5…
$ dayres <dbl> 28, 30, 28, 1, 22, 22, NA, NA, 23, 23, 28, 11, 16, 24…
$ trgresra <dbl> 14, 6, 4, 9, 5, 1, NA, NA, 9, 1, 9, 1, 9, 5, 76, NA, …
$ crismg <dbl> 8, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 1, 8, 7, 7, 7, 7,…
$ cenvio <dbl> 4, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 4, 2, 3, 3, 3,…
$ sevvio <dbl> 3, 1, 1, 3, 3, 3, 3, 3, 3, 3, 4, 4, 1, 4, 2, 2, 4, 4,…
$ usinv <dbl> 7, 7, 7, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 3, 3, 1, 1,…
$ usfavr <dbl> 3, 1, 3, 3, 3, 1, 1, 1, 3, 3, 5, 5, 5, 5, 1, 3, 5, 5,…
$ suinv <dbl> 9, 1, 1, 9, 9, 8, 8, 8, 1, 1, 3, 3, 1, 3, 1, 1, 5, 5,…
$ sufavr <dbl> 8, 5, 5, 8, 8, 3, 3, 3, 5, 5, 1, 3, 5, 3, 5, 5, 2, 2,…
$ gbinv <dbl> 7, 1, 1, 8, 3, 6, 3, 3, 3, 3, 3, 1, 1, 1, 3, 3, 8, 9,…
$ gbfavr <dbl> NA, 5, 5, NA, NA, NA, NA, NA, 3, 3, NA, NA, NA, NA, 1…
$ frinv <dbl> 2, 1, 1, 8, 3, 1, 3, 3, 3, 3, 8, 8, 3, 1, 3, 3, 1, 1,…
$ frfavr <dbl> NA, 5, 5, NA, NA, NA, NA, NA, 3, 3, NA, NA, NA, NA, 1…
$ itinv <dbl> 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 7, 7, 1, 1, 8, 9, 1, 1,…
$ itfavr <dbl> NA, 5, 5, NA, NA, NA, NA, NA, 3, 3, NA, NA, NA, NA, 3…
$ grinv <dbl> 1, 1, 1, 1, 8, 1, 1, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ grfavr <dbl> NA, 5, 5, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ jpinv <dbl> 7, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1,…
$ jpfavr <dbl> NA, 5, 5, NA, NA, NA, NA, NA, 3, 3, NA, NA, NA, NA, 5…
$ globorg <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ globact <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ globfavr <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ regorg <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ regact <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ rofavr <dbl> 0, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ outcom <dbl> 1, 1, 4, 1, 4, 1, 1, 1, 2, 2, 4, 1, 4, 1, 2, 2, 2, 2,…
$ outfor <dbl> 6, 4, 4, 6, 9, 8, 8, 8, 9, 9, 6, 1, 7, 4, 1, 1, 1, 1,…
$ outevl <dbl> 2, 2, 3, 2, 3, 2, 2, 2, 4, 4, 3, 2, 3, 2, 1, 1, 1, 1,…
$ outesr <dbl> 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2,…
$ yrterm <dbl> 1920, 1918, 1919, 1919, 1920, 1920, 1920, 1920, 1920,…
$ moterm <dbl> 4, 12, 9, 9, 8, 2, 7, 8, 7, 7, 8, 6, 6, 8, 7, 7, 8, 8…
$ daterm <dbl> 1, 15, 3, 27, 11, 2, 12, 11, 28, 28, 3, 24, 16, 3, 29…
$ trgterra <dbl> 686, 205, 222, 462, 632, 438, 574, 603, 560, 552, 137…
$ resterra <dbl> 673, 199, 218, 453, 627, 438, 574, 603, 551, 552, 128…
$ actloc <dbl> 30, 42, 42, 30, 30, 34, 34, 34, 31, 31, 31, 31, 31, 3…
$ geog <dbl> 30, 42, 42, 30, 34, 34, 34, 34, 31, 31, 31, 31, 31, 3…
$ cractloc <dbl> 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 3, 2, 4, 1, 1,…
$ noactr <dbl> 7, 5, 6, 5, 8, 8, 8, 8, 3, 3, 7, 7, 3, 7, 5, 5, 3, 3,…
$ stainsys <dbl> 47, 47, 49, 47, 47, 47, 47, 47, 49, 49, 49, 49, 49, 4…
$ period <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ syslev <dbl> 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ pc <dbl> 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ pcid <dbl> 27, 6, 6, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ viol <dbl> 3, 1, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 1, 4, 2, 2, 4, 4,…
$ iwc <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 3, 7, 1, 1, 1, 1,…
$ powdis <dbl> NA, 1, -1, NA, 12, -12, -12, -12, -1, 1, NA, NA, NA, …
$ gpinv <dbl> 7, 4, 4, 7, 7, 7, 7, 7, 3, 3, 5, 4, 4, 4, 6, 6, 6, 6,…
$ powinv <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ age <dbl> 1, 3, 3, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 1,…
$ territ <dbl> 3, 1, 1, 3, 3, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2,…
$ regime <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 1, 4, 2, 1, 2, 2, 1, 1, 2, 1,…
$ durreg <dbl> 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 3, 3,…
$ allycap <dbl> 4, 2, 1, 4, 4, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 1, 2, 4,…
$ globmemb <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,…
$ nuclear <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ powsta <dbl> 3, 1, 1, 3, 3, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 3, 1, 3,…
$ issue <dbl> 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,…
$ chissu <dbl> 4, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 1, 6,…
$ gravty <dbl> 2, 1, 2, 2, 3, 6, 6, 6, 3, 3, 6, 3, 5, 3, 3, 4, 2, 4,…
$ pethin <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,…
$ col <dbl> 1, NA, NA, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, 1, N…
$ unemp <dbl> NA, NA, NA, NA, NA, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, N…
$ inflat <dbl> 1, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, 1, NA,…
$ foodpr <dbl> 1, NA, NA, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, 1, N…
$ labstr <dbl> 1, NA, NA, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, 1, N…
$ short <dbl> 1, NA, NA, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, 2, N…
$ econdt <dbl> 1, NA, NA, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, 1, N…
$ regrep <dbl> NA, 1, 1, NA, NA, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, NA,…
$ socunr <dbl> 1, NA, NA, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, NA, …
$ massvl <dbl> 1, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, 2, NA,…
$ gvinst <dbl> 1, 2, NA, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, NA, N…
$ sourdt <dbl> 3, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, NA, 3, 1, 1…
Save to crises.csv:
write_csv(df_crises_raw, "data/crises.csv")

B.4 fed_papers.csv: Federalist Papers corpus
This file contains the full text of each of the Federalist Papers, per the public domain text archived by Project Gutenberg.
Obtain the raw text from Project Gutenberg:
# Local cache location for the Project Gutenberg plain-text edition of the
# Federalist Papers, plus its source URL
fed_papers_file <- "_raw/fed_papers.txt"
fed_papers_url <- "https://www.gutenberg.org/files/18/18-0.txt"

# Only hit Gutenberg's servers when no local copy exists yet
if (!file.exists(fed_papers_file)) {
  download.file(url = fed_papers_url, destfile = fed_papers_file)
}
fed_papers_raw <- readLines(fed_papers_file)

Parse text and assemble into data frame:
## Eliminate table of contents and other non-text content: drop the first
## 98 lines (Gutenberg header and TOC) and the final 2 lines of boilerplate
fed_papers <- fed_papers_raw |>
  tail(-98) |>
  head(-2)
## Combine into single string so papers can be split out with one regex pass
fed_papers <- str_c(fed_papers, collapse = "\n")
## Split into individual papers -- each paper begins with this header line
fed_papers <- fed_papers |>
  str_split("THE FEDERALIST.\n") |>
  unlist()
## Eliminate the empty first entry (the split leaves the text before the
## first header as element 1), as well as the duplicate of #70 that the
## Gutenberg edition includes
fed_papers <- fed_papers[-1]
fed_papers <- fed_papers[-70]
## Extract author(s) of each paper from the attribution line; the
## alternatives cover the joint and disputed attributions
author_id_regex <- "\\n\\n(HAMILTON|JAY|MADISON|HAMILTON AND MADISON|HAMILTON OR MADISON)\\n\\n\\n"
paper_author <- fed_papers |>
  str_extract(author_id_regex) |>
  str_remove_all("\\n") |>
  str_to_lower()
## Start each paper text after author identifier
##
## This will keep our classifiers from "peeking" by directly using author info
paper_text <- fed_papers |>
  str_split_i(author_id_regex, i = 2)
## Combine into a data frame: one row per paper, with a sequential id
df_fed_papers <- tibble(
  paper_id = seq_along(fed_papers),
  author = paper_author,
  text = paper_text
)
glimpse(df_fed_papers)

Rows: 85
Columns: 3
$ paper_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
$ author <chr> "hamilton", "jay", "jay", "jay", "jay", "hamilton", "…
$ text <chr> "To the People of the State of New York:\n\nAfter an …
Save to fed_papers.csv:
write_csv(df_fed_papers, "data/fed_papers.csv")

B.6 military.csv: Military spending and personnel
This file contains data from the Correlates of War project’s dataset on National Material Capabilities, version 6.0.
Obtain the raw data by extracting from the zip on the COW website:
# Download zip file containing raw data
#
# This is convoluted because the csv is inside a zip within the zip:
# the documentation zip from COW contains NMC-60-abridged.zip, which in
# turn contains the csv we want.
military_url <- "https://correlatesofwar.org/wp-content/uploads/NMC_Documentation-6.0.zip"
military_file <- "_raw/NMC-60-abridged.csv"
if (!file.exists(military_file)) {
  # Outer zip goes to a throwaway temp file; only the extracted csv is kept
  military_zip_outer <- tempfile(fileext = ".zip")
  download.file(url = military_url, destfile = military_zip_outer)
  # NOTE(review): archive_read() here receives a connection (the inner zip
  # opened from the outer archive) as its first argument -- this relies on
  # the archive package accepting connections as archive sources; confirm
  # against the package docs if this ever breaks.
  military_zip_inner <- archive_read(military_zip_outer, "NMC-60-abridged.zip")
  military_csv <- read_csv(archive_read(military_zip_inner, "NMC-60-abridged.csv"))
  write_csv(military_csv, military_file)
}
# Read in raw data from the local cache
df_military_raw <- read_csv(military_file)
glimpse(df_military_raw)

Rows: 15,951
Columns: 11
$ stateabb <chr> "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA"…
$ ccode <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…
$ year <dbl> 1816, 1817, 1818, 1819, 1820, 1821, 1822, 1823, 1824, 1825, 1…
$ milex <dbl> 3823, 2466, 1910, 2301, 1556, 1612, 1079, 1170, 1261, 1336, 1…
$ milper <dbl> 17, 15, 14, 13, 15, 11, 10, 11, 11, 11, 12, 12, 11, 12, 12, 1…
$ irst <dbl> 80, 80, 90, 90, 110, 100, 100, 110, 110, 120, 120, 130, 130, …
$ pec <dbl> 254, 277, 302, 293, 303, 321, 332, 345, 390, 424, 502, 556, 6…
$ tpop <dbl> 8659, 8899, 9139, 9379, 9618, 9939, 10268, 10596, 10924, 1125…
$ upop <dbl> 101, 106, 112, 118, 124, 130, 136, 143, 151, 158, 166, 175, 1…
$ cinc <dbl> 0.03969749, 0.03581661, 0.03612655, 0.03713325, 0.03708687, 0…
$ version <dbl> 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2…
Convert to “long” format containing only spending and personnel, for pedagogical purposes:
# Reshape the capabilities data into long format with one row per
# country-year-indicator, keeping only spending and personnel. The long
# layout is used deliberately for the data-wrangling teaching material.
df_military <- df_military_raw |>
  rename(spending = milex, personnel = milper) |>
  select(ccode, stateabb, year, spending, personnel) |>
  pivot_longer(
    cols = c(spending, personnel),
    names_to = "mil_indicator",
    values_to = "amount"
  )
glimpse(df_military)

Rows: 31,902
Columns: 5
$ ccode <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
$ stateabb <chr> "USA", "USA", "USA", "USA", "USA", "USA", "USA",…
$ year <dbl> 1816, 1816, 1817, 1817, 1818, 1818, 1819, 1819, …
$ mil_indicator <chr> "spending", "personnel", "spending", "personnel"…
$ amount <dbl> 3823, 17, 2466, 15, 1910, 14, 2301, 13, 1556, 15…
Save to military.csv:
write_csv(df_military, "data/military.csv")

B.7 turnout.csv: US voter turnout, 2000–2022
This file uses data from the University of Florida Election Lab, specifically version 1.2 of the General Election Turnout Rates dataset.
Obtain the raw data:
# Source URL and local cache path for the UF Election Lab turnout file
turnout_file <- "_raw/Turnout_1980_2022_v1.2.csv"
turnout_url <- "https://election.lab.ufl.edu/data-downloads/turnoutdata/Turnout_1980_2022_v1.2.csv"

# Fetch only when no cached copy is present
if (!file.exists(turnout_file)) {
  download.file(url = turnout_url, destfile = turnout_file)
}

# Load the raw turnout data
df_turnout_raw <- read_csv(turnout_file)
glimpse(df_turnout_raw)

Rows: 1,144
Columns: 15
$ YEAR <dbl> 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022…
$ STATE <chr> "United States", "Alabama", "Alaska", "Arizona…
$ STATE_ABV <chr> NA, "AL", "AK", "AZ", "AR", "CA", "CO", "CT", …
$ TOTAL_BALLOTS_COUNTED <dbl> 112030874, 1424087, 267047, 2592313, 914227, 1…
$ VOTE_FOR_HIGHEST_OFFICE <chr> NA, "https://www.eac.gov/sites/default/files/2…
$ VAP <dbl> 260725069, 3956111, 556592, 5796801, 2347291, …
$ NONCITIZEN_PCT <chr> "7.50%", "2.54%", "3.56%", "7.78%", "3.76%", "…
$ INELIGIBLE_PRISON <dbl> 1175823, 25403, 4778, 31441, 17331, 97608, 163…
$ INELIGIBLE_PROBATION <dbl> 1074600, 27469, 1872, 47515, 28009, 0, 0, 0, 7…
$ INELIGIBLE_PAROLE <dbl> 412595, 7815, 865, 7022, 23829, 0, 0, 0, 344, …
$ INELIGIBLE_FELONS_TOTAL <dbl> 2663018, 60687, 7515, 85978, 69169, 97608, 163…
$ ELIGIBLE_OVERSEAS <dbl> 4400000, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ VEP <dbl> 242907672, 3794939, 529263, 5259832, 2189865, …
$ VEP_TURNOUT_RATE <chr> "46.12%", "37.53%", "50.46%", "49.29%", "41.75…
$ VAP_TURNOUT_RATE <chr> "42.97%", "36.00%", "47.98%", "44.72%", "38.95…
Cleaning up into the data file used for class:
# Reduce the raw turnout file to national-level rows with numeric,
# consistently-scaled columns.
df_turnout <- df_turnout_raw |>
  # Only want national level data
  filter(STATE == "United States") |>
  # Grab and rename the columns we want; .keep = "none" drops the rest
  mutate(
    year = YEAR,
    voting_age_pop = VAP,
    voting_eligible_pop = VEP,
    ballots_counted = TOTAL_BALLOTS_COUNTED,
    highest_office = VOTE_FOR_HIGHEST_OFFICE,
    noncitizen_pct = NONCITIZEN_PCT,
    ineligible_felons = INELIGIBLE_FELONS_TOTAL,
    eligible_overseas = ELIGIBLE_OVERSEAS,
    .keep = "none",
  ) |>
  # Clean up highest_office and noncitizen_pct columns to be numeric:
  # noncitizen_pct arrives as a string like "7.50%", highest_office as a
  # comma-separated count.
  # NOTE(review): the raw VOTE_FOR_HIGHEST_OFFICE column holds citation
  # URLs for some state rows; as.numeric() coerces those to NA with a
  # warning. That appears intentional since ballots_counted takes priority
  # below -- confirm if the warning matters.
  mutate(
    noncitizen_pct = str_replace(noncitizen_pct, "\\%", ""),
    noncitizen_pct = as.numeric(noncitizen_pct) / 100,
    highest_office = str_replace_all(highest_office, ",", ""),
    highest_office = as.numeric(highest_office),
  ) |>
  # Calculate number of noncitizens (share of voting-age population)
  mutate(
    ineligible_noncitizens = noncitizen_pct * voting_age_pop,
  ) |>
  select(-noncitizen_pct) |>
  # For vote total, use ballots counted where available, otherwise just use
  # votes for highest office
  mutate(
    votes_counted = if_else(
      !is.na(ballots_counted),
      ballots_counted,
      highest_office
    )
  ) |>
  # Convert population counts to millions (every column except year)
  mutate(across(-year, \(x) x / 1e6)) |>
  # Remove columns no longer needed
  select(
    year, votes_counted, voting_age_pop, voting_eligible_pop,
    ineligible_felons, ineligible_noncitizens, eligible_overseas
  ) |>
  # Order from earliest to latest
  arrange(year)
glimpse(df_turnout)

Rows: 22
Columns: 7
$ year <dbl> 1980, 1982, 1984, 1986, 1988, 1990, 199…
$ votes_counted <dbl> 86.51522, 67.61558, 92.65268, 64.99113,…
$ voting_age_pop <dbl> 164.4455, 166.0276, 173.9946, 177.9223,…
$ voting_eligible_pop <dbl> 159.6909, 160.4088, 167.7085, 170.4089,…
$ ineligible_felons <dbl> 0.801977, 0.959637, 1.165246, 1.367117,…
$ ineligible_noncitizens <dbl> 5.755592, 6.641105, 7.481768, 8.362350,…
$ eligible_overseas <dbl> 1.803021, 1.981895, 2.360867, 2.216053,…
Double check that the manually calculated voting eligible calculation lines up with the one reported in the data frame:
# Sanity check: rebuild the voting-eligible population from its components
# (VAP minus ineligible felons and noncitizens, plus eligible overseas
# voters) and compare against the VEP reported in the data. Differences
# should be floating-point noise, barring genuine source discrepancies.
df_turnout |>
  mutate(
    vep_manual = voting_age_pop - ineligible_felons -
      ineligible_noncitizens + eligible_overseas,
    # Relative absolute difference between reported and reconstructed VEP
    vep_difference = abs(voting_eligible_pop - vep_manual) / voting_eligible_pop,
  ) |>
  select(year, voting_eligible_pop, vep_manual, vep_difference) |>
  print(n = Inf)# A tibble: 22 × 4
year voting_eligible_pop vep_manual vep_difference
<dbl> <dbl> <dbl> <dbl>
1 1980 160. 160. 2.35e- 9
2 1982 160. 160. 1.99e- 9
3 1984 168. 168. 1.37e- 9
4 1986 170. 170. 2.88e- 9
5 1988 174. 174. 1.82e- 9
6 1990 177. 177. 1.44e- 9
7 1992 180. 180. 2.12e- 9
8 1994 183. 183. 2.19e- 9
9 1996 186. 186. 1.91e- 9
10 1998 190. 190. 1.50e- 9
11 2000 194. 194. 2.14e- 9
12 2002 198. 198. 4.03e-10
13 2004 203. 203. 2.18e- 9
14 2006 207. 207. 1.52e- 9
15 2008 213. 213. 2.25e- 9
16 2010 222. 222. 1.94e- 4
17 2012 222. 222. 2.66e- 9
18 2014 227. 227. 7.00e-10
19 2016 231. 231. 7.06e-10
20 2018 237. 237. 4.22e-10
21 2020 242. 242. 6.20e-10
22 2022 243. 243. 4.84e- 9
Save cleaned data to turnout.csv:
write_csv(df_turnout, "data/turnout.csv")

B.8 wdi.csv: World Development Indicators, 2019
This file contains data from the World Bank’s World Development Indicators dataset.
Obtain raw data using the WDI package:
# Download raw data via WDI package
#
# The named indicator vector maps World Bank series IDs to the readable
# column names used in the rest of the materials. Results are cached to
# '_raw' so reruns stay offline.
wdi_file <- "_raw/wdi_2019.csv"
if (!file.exists(wdi_file)) {
  df_wdi_pkg <- WDI(
    country = "all",
    indicator = c(
      "gdp_per_capita" = "NY.GDP.PCAP.CD",
      "gdp_growth" = "NY.GDP.MKTP.KD.ZG",
      "population" = "SP.POP.TOTL",
      "inflation" = "FP.CPI.TOTL.ZG",
      "unemployment" = "SL.UEM.TOTL.ZS",
      "life_expectancy" = "SP.DYN.LE00.IN"
    ),
    # Single-year snapshot for 2019
    start = 2019, end = 2019,
    # extra = TRUE adds country metadata (region, income group, etc.)
    extra = TRUE
  )
  write_csv(df_wdi_pkg, wdi_file)
}
# Read from the local cache (whether freshly downloaded or pre-existing)
df_wdi_raw <- read_csv(wdi_file)
glimpse(df_wdi_raw)

Rows: 266
Columns: 18
$ country <chr> "Afghanistan", "Africa Eastern and Southern", "Africa …
$ iso2c <chr> "AF", "ZH", "ZI", "AL", "DZ", "AS", "AD", "AO", "AG", …
$ iso3c <chr> "AFG", "AFE", "AFW", "ALB", "DZA", "ASM", "AND", "AGO"…
$ year <dbl> 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, …
$ status <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ lastupdated <date> 2025-07-01, 2025-07-01, 2025-07-01, 2025-07-01, 2025-…
$ gdp_per_capita <dbl> 496.6025, 1493.8179, 1798.3407, 5460.4305, 4468.4534, …
$ gdp_growth <dbl> 3.9116034, 2.2003404, 3.2821630, 2.0625779, 0.9000000,…
$ population <dbl> 37856121, 675950189, 463365429, 2854191, 43294546, 502…
$ inflation <dbl> 2.3023725, 4.6449672, 1.9830923, 1.4110908, 1.9517682,…
$ unemployment <dbl> 11.185000, 7.584419, 4.395271, 11.466000, 12.259000, N…
$ life_expectancy <dbl> 62.94100, 63.85726, 57.14985, 79.46700, 75.68200, 72.7…
$ region <chr> "South Asia", "Aggregates", "Aggregates", "Europe & Ce…
$ capital <chr> "Kabul", NA, NA, "Tirane", "Algiers", "Pago Pago", "An…
$ longitude <dbl> 69.17610, NA, NA, 19.81720, 3.05097, -170.69100, 1.521…
$ latitude <dbl> 34.52280, NA, NA, 41.33170, 36.73970, -14.28460, 42.50…
$ income <chr> "Low income", "Aggregates", "Aggregates", "Upper middl…
$ lending <chr> "IDA", "Aggregates", "Aggregates", "IBRD", "IBRD", "No…
Minor cleaning to remove unwanted rows and columns:
# Tidy the raw WDI download: drop location/metadata columns, remove the
# World Bank's regional aggregates and unclassified economies, and prefix
# income groups with sort keys so they order sensibly in factors/plots.
df_wdi <- df_wdi_raw |>
  as_tibble() |>
  select(-iso2c, -status, -lastupdated, -capital, -longitude, -latitude) |>
  filter(region != "Aggregates", income != "Not classified") |>
  mutate(
    income = case_match(
      income,
      "Low income" ~ "1. Low",
      "Lower middle income" ~ "2. Lower-middle",
      "Upper middle income" ~ "3. Upper-middle",
      "High income" ~ "4. High"
    )
  )
glimpse(df_wdi)

Rows: 215
Columns: 12
$ country <chr> "Afghanistan", "Albania", "Algeria", "American…
$ iso3c <chr> "AFG", "ALB", "DZA", "ASM", "AND", "AGO", "ATG…
$ year <dbl> 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019…
$ gdp_per_capita <dbl> 496.6025, 5460.4305, 4468.4534, 12886.1360, 41…
$ gdp_growth <dbl> 3.9116034, 2.0625779, 0.9000000, -0.4878049, 2…
$ population <dbl> 37856121, 2854191, 43294546, 50209, 76474, 323…
$ inflation <dbl> 2.3023725, 1.4110908, 1.9517682, NA, NA, 17.08…
$ unemployment <dbl> 11.185, 11.466, 12.259, NA, NA, 16.497, NA, 9.…
$ life_expectancy <dbl> 62.94100, 79.46700, 75.68200, 72.75100, 84.098…
$ region <chr> "South Asia", "Europe & Central Asia", "Middle…
$ income <chr> "1. Low", "3. Upper-middle", "3. Upper-middle"…
$ lending <chr> "IDA", "IBRD", "IBRD", "Not classified", "Not …
Save to wdi.csv:
write_csv(df_wdi, "data/wdi.csv")