# Required packages
library("tidyverse")
library("archive")
library("assertr")
library("dataverse")
library("gssr")
library("WDI")

# Create '_raw' subdirectory to store downloads, and 'data' for final cleaned files
for (dir_name in c("_raw", "data")) {
  if (!dir.exists(dir_name)) {
    dir.create(dir_name)
  }
}
Appendix B — Data Sources
B.1 anes_2020.csv
: Subset of 2020 ANES survey
This file contains a small number of variables from the 2020 wave of the American National Election Studies Time Series Study.
Despite the direct download link on ANES’s website, they’ve got it locked down to prevent programmatic access from utilities like download.file(), so the file must be downloaded manually. (We skip glimpse() this time because the data has 1,700+ columns.)
# Path to the manually downloaded ANES 2020 Time Series csv; halt with an
# instruction if it hasn't been placed in _raw/ yet (ANES blocks programmatic
# downloads, so this can't be automated)
anes_file <- "_raw/anes_timeseries_2020_csv_20220210.csv"
if (!file.exists(anes_file)) {
  stop("Need to download data manually from ANES website")
}

df_anes_raw <- read_csv(anes_file)
Extract the few columns we care about and convert numeric codes to understandable values:
# Extract the subset of ANES variables used in class and recode the numeric
# survey codes into readable values. Negative ANES codes are interview
# artifacts (refused / don't know / not asked); case_match() leaves any
# unmatched code as NA, which is the behavior we want for those.
df_anes <- df_anes_raw |>
  mutate(
    id = row_number(),
    # V201014b: state of registration (FIPS codes)
    state = case_match(
      V201014b,
      1 ~ "Alabama",
      2 ~ "Alaska",
      4 ~ "Arizona",
      5 ~ "Arkansas",
      6 ~ "California",
      8 ~ "Colorado",
      9 ~ "Connecticut",
      10 ~ "Delaware",
      11 ~ "District of Columbia",
      12 ~ "Florida",
      13 ~ "Georgia",
      15 ~ "Hawaii",
      16 ~ "Idaho",
      17 ~ "Illinois",
      18 ~ "Indiana",
      19 ~ "Iowa",
      20 ~ "Kansas",
      21 ~ "Kentucky",
      22 ~ "Louisiana",
      23 ~ "Maine",
      24 ~ "Maryland",
      25 ~ "Massachusetts",
      26 ~ "Michigan",
      27 ~ "Minnesota",
      28 ~ "Mississippi",
      29 ~ "Missouri",
      30 ~ "Montana",
      31 ~ "Nebraska",
      32 ~ "Nevada",
      33 ~ "New Hampshire",
      34 ~ "New Jersey",
      35 ~ "New Mexico",
      36 ~ "New York",
      37 ~ "North Carolina",
      38 ~ "North Dakota",
      39 ~ "Ohio",
      40 ~ "Oklahoma",
      41 ~ "Oregon",
      42 ~ "Pennsylvania",
      44 ~ "Rhode Island",
      45 ~ "South Carolina",
      46 ~ "South Dakota",
      47 ~ "Tennessee",
      48 ~ "Texas",
      49 ~ "Utah",
      50 ~ "Vermont",
      51 ~ "Virginia",
      53 ~ "Washington",
      54 ~ "West Virginia",
      55 ~ "Wisconsin",
      56 ~ "Wyoming"
    ),
    # V201600: self-reported sex (1 = male, 2 = female)
    female = case_match(
      V201600,
      1 ~ 0,
      2 ~ 1
    ),
    # V201601: sexual orientation (1 = heterosexual; 2-4 = gay/bi/other)
    lgbt = case_match(
      V201601,
      1 ~ 0,
      2:4 ~ 1
    ),
    # V201549x: race/ethnicity summary
    race = case_match(
      V201549x,
      1 ~ "White",
      2 ~ "Black",
      3 ~ "Hispanic",
      4 ~ "Asian",
      5 ~ "Native American",
      6 ~ "Multiracial"
    ),
    # V201507x: age in years; nonpositive codes mean missing
    age = if_else(V201507x > 0, V201507x, NA),
    # V201511x: highest educational attainment
    education = case_match(
      V201511x,
      1 ~ "Less than high school",
      2 ~ "High school",
      3 ~ "Some college",
      4 ~ "Bachelor's degree",
      5 ~ "Graduate degree"
    ),
    # V201517: currently employed indicator
    employed = case_match(
      V201517,
      1 ~ 1,
      2 ~ 0
    ),
    # V201527: hours worked last week; -1 means "not asked" (not employed),
    # which we treat as zero hours
    hours_worked = case_when(
      V201527 == -1 ~ 0,
      V201527 > 0 ~ V201527,
      TRUE ~ NA
    ),
    # TV program viewership: -1/0 collapse to "doesn't watch"
    watch_tucker = case_match(
      V201630c,
      c(-1, 0) ~ 0,
      1 ~ 1
    ),
    watch_maddow = case_match(
      V201630d,
      c(-1, 0) ~ 0,
      1 ~ 1
    ),
    # Feeling thermometers: valid answers are 0-100; anything else becomes NA
    therm_biden = if_else(V201151 %in% 0:100, V201151, NA),
    therm_trump = if_else(V201152 %in% 0:100, V201152, NA),
    therm_harris = if_else(V201153 %in% 0:100, V201153, NA),
    therm_pence = if_else(V201154 %in% 0:100, V201154, NA),
    therm_obama = if_else(V201155 %in% 0:100, V201155, NA),
    therm_dem_party = if_else(V201156 %in% 0:100, V201156, NA),
    therm_rep_party = if_else(V201157 %in% 0:100, V201157, NA),
    therm_feminists = if_else(V202160 %in% 0:100, V202160, NA),
    therm_liberals = if_else(V202161 %in% 0:100, V202161, NA),
    therm_labor_unions = if_else(V202162 %in% 0:100, V202162, NA),
    therm_big_business = if_else(V202163 %in% 0:100, V202163, NA),
    therm_conservatives = if_else(V202164 %in% 0:100, V202164, NA),
    therm_supreme_court = if_else(V202165 %in% 0:100, V202165, NA),
    therm_congress = if_else(V202167 %in% 0:100, V202167, NA),
    therm_police = if_else(V202171 %in% 0:100, V202171, NA),
    therm_scientists = if_else(V202173 %in% 0:100, V202173, NA),
    # V202019: contributed money to a party (1 = yes, 2 = no)
    contributed_to_party = case_match(
      V202019,
      1 ~ 1,
      2 ~ 0
    ),
    # V202068x: turnout summary (0/1 = did not vote, 2 = voted)
    voted = case_match(
      V202068x,
      0:1 ~ 0,
      2 ~ 1
    ),
    # V202073: presidential vote choice; negative codes (refused / no vote)
    # become NA first, then 1 = Biden, 2 = Trump, 3-8 = other candidates
    voted_for_biden = if_else(V202073 < 0, NA, V202073),
    voted_for_biden = case_match(
      voted_for_biden,
      1 ~ 1,
      2:8 ~ 0
    ),
    voted_for_trump = if_else(V202073 < 0, NA, V202073),
    voted_for_trump = case_match(
      voted_for_trump,
      2 ~ 1,
      c(1, 3:8) ~ 0
    ),
    # Drop all original ANES columns, keeping only what we created above
    .keep = "none"
  )

glimpse(df_anes)
Rows: 8,280
Columns: 31
$ id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13…
$ state <chr> "Oklahoma", "Idaho", "Virginia", "Califor…
$ female <dbl> 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0,…
$ lgbt <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ race <chr> "Hispanic", "Asian", "White", "Asian", "N…
$ age <dbl> 46, 37, 40, 41, 72, 71, 37, 45, 70, 43, 3…
$ education <chr> "Bachelor's degree", "Some college", "Hig…
$ employed <dbl> 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,…
$ hours_worked <dbl> 40, 40, 0, 40, 0, 0, 30, 40, 0, 30, 25, 5…
$ watch_tucker <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ watch_maddow <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ therm_biden <dbl> 0, 0, 65, 70, 15, 85, 50, 50, 85, 85, 100…
$ therm_trump <dbl> 100, 0, 0, 15, 85, 0, 75, 100, 0, 0, 0, 0…
$ therm_harris <dbl> 0, 0, 65, 85, 15, 85, 15, 50, 85, 50, 100…
$ therm_pence <dbl> 85, 0, 0, 15, 90, 0, 75, 50, 0, 50, 0, 50…
$ therm_obama <dbl> 0, 50, 90, 85, 10, 60, 15, 50, 60, 100, 1…
$ therm_dem_party <dbl> 0, 0, 60, 50, 20, 85, 15, 50, NA, 60, 100…
$ therm_rep_party <dbl> 85, 50, 0, 70, 70, 15, 75, 100, NA, 50, 0…
$ therm_feminists <dbl> 65, 100, 75, 70, 30, 60, 60, 100, 50, 50,…
$ therm_liberals <dbl> 30, 0, 75, 70, 10, 70, 0, NA, 30, 50, 50,…
$ therm_labor_unions <dbl> 30, 70, 75, 70, 50, 50, 50, 0, 30, 50, 50…
$ therm_big_business <dbl> 70, 50, 0, 85, 0, 40, 50, 0, 50, 15, 50, …
$ therm_conservatives <dbl> 85, 15, 0, 70, 60, 40, 60, NA, 50, 50, 50…
$ therm_supreme_court <dbl> 100, 50, 25, 85, 60, 60, 70, 50, 50, 50, …
$ therm_congress <dbl> 40, 15, 0, 100, 10, 85, 50, 50, 50, 40, 5…
$ therm_police <dbl> 85, 90, 40, 100, 70, 70, 60, 100, 60, 70,…
$ therm_scientists <dbl> 100, 70, 100, 85, 60, 85, 85, NA, 60, 50,…
$ contributed_to_party <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ voted <dbl> 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,…
$ voted_for_biden <dbl> NA, 0, 1, 1, 0, 1, 0, NA, NA, 1, 1, 1, 0,…
$ voted_for_trump <dbl> NA, 0, 0, 0, 1, 0, 1, NA, NA, 0, 0, 0, 1,…
Save to anes_2020.csv
:
write_csv(df_anes, "data/anes_2020.csv")
B.2 county_pres.csv
: County-level presidential election returns, 2000–2024
This file contains data from the MIT Election Lab dataset on County Presidential Election returns.
Obtain raw data from Harvard Dataverse:
# Download raw data from the Harvard Dataverse repository (MIT Election Lab,
# County Presidential Election Returns), caching the csv locally so the
# network fetch only happens once
county_pres_file <- "_raw/countypres_2000-2024.csv"
if (!file.exists(county_pres_file)) {
  df_county_pres_dataverse <- get_dataframe_by_name(
    filename = "countypres_2000-2024.tab",
    dataset = "10.7910/DVN/VOQCHQ",
    server = "dataverse.harvard.edu",
    version = "15.0"
  )
  write_csv(df_county_pres_dataverse, county_pres_file)
}

df_county_pres_raw <- read_csv(county_pres_file)

glimpse(df_county_pres_raw)
Rows: 94,409
Columns: 12
$ year <dbl> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2…
$ state <chr> "ALABAMA", "ALABAMA", "ALABAMA", "ALABAMA", "ALABAMA", …
$ state_po <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "…
$ county_name <chr> "AUTAUGA", "AUTAUGA", "AUTAUGA", "AUTAUGA", "BALDWIN", …
$ county_fips <chr> "01001", "01001", "01001", "01001", "01003", "01003", "…
$ office <chr> "US PRESIDENT", "US PRESIDENT", "US PRESIDENT", "US PRE…
$ candidate <chr> "AL GORE", "GEORGE W. BUSH", "OTHER", "RALPH NADER", "A…
$ party <chr> "DEMOCRAT", "REPUBLICAN", "OTHER", "GREEN", "DEMOCRAT",…
$ candidatevotes <dbl> 4942, 11993, 113, 160, 13997, 40872, 578, 1033, 5188, 5…
$ totalvotes <dbl> 17208, 17208, 17208, 17208, 56480, 56480, 56480, 56480,…
$ version <dbl> 20250712, 20250712, 20250712, 20250712, 20250712, 20250…
$ mode <chr> "TOTAL", "TOTAL", "TOTAL", "TOTAL", "TOTAL", "TOTAL", "…
Clean data to have one row per county-year:
# Collapse candidate-level returns into one row per county-year, then add
# region labels, vote margins, a competitiveness scale, and a state-winner flag
df_county_pres <- df_county_pres_raw |>
  # Drop rows with missing party or zero reported votes
  filter(!is.na(party), totalvotes > 0) |>
  rename(county = county_name) |>
  group_by(year, state, county) |>
  summarize(
    county_fips = first(county_fips),
    total_votes = first(totalvotes),
    dem_votes = sum(candidatevotes[party == "DEMOCRAT"]),
    rep_votes = sum(candidatevotes[party == "REPUBLICAN"]),
    .groups = "drop"
  ) |>
  mutate(
    # Census-style regions; anything not listed falls into "Unknown"
    region = fct_collapse(
      state,
      Northeast = c(
        "CONNECTICUT", "MAINE", "MASSACHUSETTS", "NEW HAMPSHIRE", "RHODE ISLAND",
        "VERMONT", "NEW JERSEY", "NEW YORK", "PENNSYLVANIA"
      ),
      Midwest = c(
        "ILLINOIS", "INDIANA", "MICHIGAN", "OHIO", "WISCONSIN", "IOWA", "KANSAS",
        "MINNESOTA", "MISSOURI", "NEBRASKA", "NORTH DAKOTA", "SOUTH DAKOTA"
      ),
      South = c(
        "DELAWARE", "DISTRICT OF COLUMBIA", "FLORIDA", "GEORGIA", "MARYLAND",
        "NORTH CAROLINA", "SOUTH CAROLINA", "VIRGINIA", "WEST VIRGINIA", "ALABAMA",
        "KENTUCKY", "MISSISSIPPI", "TENNESSEE", "ARKANSAS", "LOUISIANA", "OKLAHOMA",
        "TEXAS"
      ),
      West = c(
        "ARIZONA", "COLORADO", "IDAHO", "MONTANA", "NEVADA", "NEW MEXICO", "UTAH",
        "WYOMING", "ALASKA", "CALIFORNIA", "HAWAII", "OREGON", "WASHINGTON"
      ),
      other_level = "Unknown"
    ),
    # Positive margin = Democratic advantage
    margin = dem_votes - rep_votes,
    pct_margin = margin / total_votes,
    # Ordinal scale from -3 (safe R) through 0 (tossup) to 3 (safe D)
    competitiveness = case_when(
      pct_margin < -0.2 ~ -3,
      pct_margin < -0.1 ~ -2,
      pct_margin < -0.04 ~ -1,
      pct_margin < 0.04 ~ 0,
      pct_margin < 0.1 ~ 1,
      pct_margin < 0.2 ~ 2,
      TRUE ~ 3
    ),
  ) |>
  # Statewide winner, computed by summing county votes within each state-year
  group_by(state) |>
  mutate(
    dem_win_state = as.numeric(sum(dem_votes) > sum(rep_votes)),
  ) |>
  ungroup() |>
  # Fail loudly if any missing values slipped through the cleaning above
  assert(not_na, everything()) |>
  select(year, state, region, everything())
Save to county_pres.csv
:
write_csv(df_county_pres, "data/county_pres.csv")
B.3 crises.csv
: International crises
This file contains the International Crisis Behavior actor-level data, version 16. There’s purposely no additional cleaning since it’s used in the data wrangling lecture.
The raw data is stored via a Box link that doesn’t work with download.file()
, and my efforts to get ChatGPT to help me get to the underlying data were unsuccessful.
# Read in raw data; the ICB download is behind a Box link that can't be
# fetched programmatically, so the csv must be placed in _raw/ by hand
crises_file <- "_raw/icb2v16.csv"
if (!file.exists(crises_file)) {
  stop("Need to download data manually from ICB website")
}

df_crises_raw <- read_csv(crises_file)

glimpse(df_crises_raw)
Rows: 1,131
Columns: 95
$ icb2 <chr> "ICB2", "ICB2", "ICB2", "ICB2", "ICB2", "ICB2", "ICB2…
$ crisno <dbl> 1, 2, 2, 3, 4, 4, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 8, 8,…
$ cracno <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
$ cracid <dbl> 365, 93, 94, 365, 365, 366, 368, 367, 315, 290, 310, …
$ actor <chr> "RUS", "NIC", "COS", "RUS", "RUS", "EST", "LIT", "LAT…
$ systrgyr <dbl> 1918, 1918, 1918, 1918, 1918, 1918, 1918, 1918, 1919,…
$ systrgmo <dbl> 5, 5, 5, 6, 11, 11, 11, 11, 1, 1, 3, 3, 3, 3, 3, 3, 4…
$ systrgda <dbl> NA, 25, 25, 23, 18, 18, 18, 18, 15, 15, 20, 20, 20, 2…
$ crisname <chr> "RUSSIAN CIVIL WAR I", "COSTA RICAN COUP", "COSTA RIC…
$ triggr <dbl> 9, 7, 4, 7, 6, 9, 9, 9, 2, 7, 2, 9, 2, 9, 7, 7, 2, 7,…
$ yrtrig <dbl> 1918, 1918, 1919, 1918, 1918, 1918, 1918, 1918, 1919,…
$ motrig <dbl> 5, 5, 1, 6, 11, 11, 12, 12, 1, 1, 3, 5, 6, 7, 3, 5, 4…
$ datrig <dbl> NA, 25, 25, 23, 18, 22, NA, NA, 15, 23, 20, 11, 8, 20…
$ trigent <dbl> 996, 94, 996, 997, 366, 365, 365, 365, 290, 315, 997,…
$ trigloc <dbl> 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ southv <dbl> 220, 94, 93, 200, 366, 365, 365, 365, 290, 315, 360, …
$ southpow <dbl> 3, 1, 1, 3, 1, 3, 3, 3, 2, 1, 1, 1, 3, 1, 3, 2, 3, 1,…
$ sizedu <dbl> 1, NA, NA, 1, 1, NA, NA, NA, NA, NA, 3, 3, 3, NA, 3, …
$ strcdu <dbl> 1, 1, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, 1, NA, 1,…
$ comlev <dbl> 7, 1, 1, 7, 8, 8, 8, 8, 3, 3, 1, NA, 1, NA, NA, 3, 1,…
$ majres <dbl> 8, 3, 6, 8, 8, 9, 9, 9, 8, 8, 8, 8, 1, 8, 6, 8, 8, 8,…
$ yerres <dbl> 1918, 1918, 1919, 1918, 1918, 1918, 1918, 1918, 1919,…
$ monres <dbl> 5, 5, 1, 7, 11, 11, 12, 12, 1, 1, 3, 5, 6, 7, 5, 5, 5…
$ dayres <dbl> 28, 30, 28, 1, 22, 22, NA, NA, 23, 23, 28, 11, 16, 24…
$ trgresra <dbl> 14, 6, 4, 9, 5, 1, NA, NA, 9, 1, 9, 1, 9, 5, 76, NA, …
$ crismg <dbl> 8, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 1, 8, 7, 7, 7, 7,…
$ cenvio <dbl> 4, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 4, 2, 3, 3, 3,…
$ sevvio <dbl> 3, 1, 1, 3, 3, 3, 3, 3, 3, 3, 4, 4, 1, 4, 2, 2, 4, 4,…
$ usinv <dbl> 7, 7, 7, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 3, 3, 1, 1,…
$ usfavr <dbl> 3, 1, 3, 3, 3, 1, 1, 1, 3, 3, 5, 5, 5, 5, 1, 3, 5, 5,…
$ suinv <dbl> 9, 1, 1, 9, 9, 8, 8, 8, 1, 1, 3, 3, 1, 3, 1, 1, 5, 5,…
$ sufavr <dbl> 8, 5, 5, 8, 8, 3, 3, 3, 5, 5, 1, 3, 5, 3, 5, 5, 2, 2,…
$ gbinv <dbl> 7, 1, 1, 8, 3, 6, 3, 3, 3, 3, 3, 1, 1, 1, 3, 3, 8, 9,…
$ gbfavr <dbl> NA, 5, 5, NA, NA, NA, NA, NA, 3, 3, NA, NA, NA, NA, 1…
$ frinv <dbl> 2, 1, 1, 8, 3, 1, 3, 3, 3, 3, 8, 8, 3, 1, 3, 3, 1, 1,…
$ frfavr <dbl> NA, 5, 5, NA, NA, NA, NA, NA, 3, 3, NA, NA, NA, NA, 1…
$ itinv <dbl> 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 7, 7, 1, 1, 8, 9, 1, 1,…
$ itfavr <dbl> NA, 5, 5, NA, NA, NA, NA, NA, 3, 3, NA, NA, NA, NA, 3…
$ grinv <dbl> 1, 1, 1, 1, 8, 1, 1, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ grfavr <dbl> NA, 5, 5, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ jpinv <dbl> 7, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1,…
$ jpfavr <dbl> NA, 5, 5, NA, NA, NA, NA, NA, 3, 3, NA, NA, NA, NA, 5…
$ globorg <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ globact <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ globfavr <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ regorg <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ regact <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ rofavr <dbl> 0, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ outcom <dbl> 1, 1, 4, 1, 4, 1, 1, 1, 2, 2, 4, 1, 4, 1, 2, 2, 2, 2,…
$ outfor <dbl> 6, 4, 4, 6, 9, 8, 8, 8, 9, 9, 6, 1, 7, 4, 1, 1, 1, 1,…
$ outevl <dbl> 2, 2, 3, 2, 3, 2, 2, 2, 4, 4, 3, 2, 3, 2, 1, 1, 1, 1,…
$ outesr <dbl> 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2,…
$ yrterm <dbl> 1920, 1918, 1919, 1919, 1920, 1920, 1920, 1920, 1920,…
$ moterm <dbl> 4, 12, 9, 9, 8, 2, 7, 8, 7, 7, 8, 6, 6, 8, 7, 7, 8, 8…
$ daterm <dbl> 1, 15, 3, 27, 11, 2, 12, 11, 28, 28, 3, 24, 16, 3, 29…
$ trgterra <dbl> 686, 205, 222, 462, 632, 438, 574, 603, 560, 552, 137…
$ resterra <dbl> 673, 199, 218, 453, 627, 438, 574, 603, 551, 552, 128…
$ actloc <dbl> 30, 42, 42, 30, 30, 34, 34, 34, 31, 31, 31, 31, 31, 3…
$ geog <dbl> 30, 42, 42, 30, 34, 34, 34, 34, 31, 31, 31, 31, 31, 3…
$ cractloc <dbl> 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 3, 2, 4, 1, 1,…
$ noactr <dbl> 7, 5, 6, 5, 8, 8, 8, 8, 3, 3, 7, 7, 3, 7, 5, 5, 3, 3,…
$ stainsys <dbl> 47, 47, 49, 47, 47, 47, 47, 47, 49, 49, 49, 49, 49, 4…
$ period <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ syslev <dbl> 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ pc <dbl> 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ pcid <dbl> 27, 6, 6, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ viol <dbl> 3, 1, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 1, 4, 2, 2, 4, 4,…
$ iwc <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 3, 7, 1, 1, 1, 1,…
$ powdis <dbl> NA, 1, -1, NA, 12, -12, -12, -12, -1, 1, NA, NA, NA, …
$ gpinv <dbl> 7, 4, 4, 7, 7, 7, 7, 7, 3, 3, 5, 4, 4, 4, 6, 6, 6, 6,…
$ powinv <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ age <dbl> 1, 3, 3, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 1,…
$ territ <dbl> 3, 1, 1, 3, 3, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2,…
$ regime <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 1, 4, 2, 1, 2, 2, 1, 1, 2, 1,…
$ durreg <dbl> 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 3, 3,…
$ allycap <dbl> 4, 2, 1, 4, 4, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 1, 2, 4,…
$ globmemb <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,…
$ nuclear <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ powsta <dbl> 3, 1, 1, 3, 3, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 3, 1, 3,…
$ issue <dbl> 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,…
$ chissu <dbl> 4, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 1, 6,…
$ gravty <dbl> 2, 1, 2, 2, 3, 6, 6, 6, 3, 3, 6, 3, 5, 3, 3, 4, 2, 4,…
$ pethin <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,…
$ col <dbl> 1, NA, NA, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, 1, N…
$ unemp <dbl> NA, NA, NA, NA, NA, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, N…
$ inflat <dbl> 1, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, 1, NA,…
$ foodpr <dbl> 1, NA, NA, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, 1, N…
$ labstr <dbl> 1, NA, NA, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, 1, N…
$ short <dbl> 1, NA, NA, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, 2, N…
$ econdt <dbl> 1, NA, NA, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, 1, N…
$ regrep <dbl> NA, 1, 1, NA, NA, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, NA,…
$ socunr <dbl> 1, NA, NA, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, NA, …
$ massvl <dbl> 1, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, 2, NA,…
$ gvinst <dbl> 1, 2, NA, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, NA, N…
$ sourdt <dbl> 3, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, NA, 3, 1, 1…
Save to crises.csv
:
write_csv(df_crises_raw, "data/crises.csv")
B.4 fed_papers.csv
: Federalist Papers corpus
This file contains the full text of each of the Federalist Papers, per the public domain text archived by Project Gutenberg.
Obtain the raw text from Project Gutenberg:
# Download the Project Gutenberg plain-text Federalist Papers, caching the
# file locally so the network fetch only happens once
fed_papers_url <- "https://www.gutenberg.org/files/18/18-0.txt"
fed_papers_file <- "_raw/fed_papers.txt"

if (!file.exists(fed_papers_file)) {
  download.file(url = fed_papers_url, destfile = fed_papers_file)
}

fed_papers_raw <- readLines(fed_papers_file)
Parse text and assemble into data frame:
## Eliminate table of contents and other non-text content
## (first 98 lines are front matter; last 2 are Gutenberg boilerplate)
fed_papers <- fed_papers_raw |>
  tail(-98) |>
  head(-2)

## Combine into single string
fed_papers <- str_c(fed_papers, collapse = "\n")

## Split into individual papers on the recurring "THE FEDERALIST." header
fed_papers <- fed_papers |>
  str_split("THE FEDERALIST.\n") |>
  unlist()

## Eliminate the empty first entry, as well as the duplicate of #70
fed_papers <- fed_papers[-1]
fed_papers <- fed_papers[-70]

## Extract author(s) of each paper from the attribution line that follows
## the paper title
author_id_regex <- "\\n\\n(HAMILTON|JAY|MADISON|HAMILTON AND MADISON|HAMILTON OR MADISON)\\n\\n\\n"
paper_author <- fed_papers |>
  str_extract(author_id_regex) |>
  str_remove_all("\\n") |>
  str_to_lower()

## Start each paper text after author identifier
##
## This will keep our classifiers from "peeking" by directly using author info
paper_text <- fed_papers |>
  str_split_i(author_id_regex, i = 2)

## Combine into a data frame: one row per paper
df_fed_papers <- tibble(
  paper_id = seq_along(fed_papers),
  author = paper_author,
  text = paper_text
)

glimpse(df_fed_papers)
Rows: 85
Columns: 3
$ paper_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
$ author <chr> "hamilton", "jay", "jay", "jay", "jay", "hamilton", "…
$ text <chr> "To the People of the State of New York:\n\nAfter an …
Save to fed_papers.csv
:
write_csv(df_fed_papers, "data/fed_papers.csv")
B.6 military.csv
: Military spending and personnel
This file contains data from the Correlates of War project’s dataset on National Material Capabilities, version 6.0.
Obtain the raw data by extracting from the zip on the COW website:
# Download zip file containing raw data
#
# This is convoluted because the csv is inside a zip within the zip
military_url <- "https://correlatesofwar.org/wp-content/uploads/NMC_Documentation-6.0.zip"
military_file <- "_raw/NMC-60-abridged.csv"
if (!file.exists(military_file)) {
  # Outer zip goes to a throwaway temp file; only the extracted csv is cached
  military_zip_outer <- tempfile(fileext = ".zip")
  download.file(url = military_url, destfile = military_zip_outer)
  military_zip_inner <- archive_read(military_zip_outer, "NMC-60-abridged.zip")
  military_csv <- read_csv(archive_read(military_zip_inner, "NMC-60-abridged.csv"))
  write_csv(military_csv, military_file)
}

# Read in raw data
df_military_raw <- read_csv(military_file)

glimpse(df_military_raw)
Rows: 15,951
Columns: 11
$ stateabb <chr> "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA"…
$ ccode <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…
$ year <dbl> 1816, 1817, 1818, 1819, 1820, 1821, 1822, 1823, 1824, 1825, 1…
$ milex <dbl> 3823, 2466, 1910, 2301, 1556, 1612, 1079, 1170, 1261, 1336, 1…
$ milper <dbl> 17, 15, 14, 13, 15, 11, 10, 11, 11, 11, 12, 12, 11, 12, 12, 1…
$ irst <dbl> 80, 80, 90, 90, 110, 100, 100, 110, 110, 120, 120, 130, 130, …
$ pec <dbl> 254, 277, 302, 293, 303, 321, 332, 345, 390, 424, 502, 556, 6…
$ tpop <dbl> 8659, 8899, 9139, 9379, 9618, 9939, 10268, 10596, 10924, 1125…
$ upop <dbl> 101, 106, 112, 118, 124, 130, 136, 143, 151, 158, 166, 175, 1…
$ cinc <dbl> 0.03969749, 0.03581661, 0.03612655, 0.03713325, 0.03708687, 0…
$ version <dbl> 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2…
Convert to “long” format containing only spending and personnel, for pedagogical purposes:
# Convert to "long" format containing only spending and personnel: each
# country-year becomes two rows, keyed by mil_indicator
df_military <- df_military_raw |>
  select(ccode, stateabb, year, spending = milex, personnel = milper) |>
  pivot_longer(
    cols = c(spending, personnel),
    names_to = "mil_indicator",
    values_to = "amount"
  )

glimpse(df_military)
Rows: 31,902
Columns: 5
$ ccode <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
$ stateabb <chr> "USA", "USA", "USA", "USA", "USA", "USA", "USA",…
$ year <dbl> 1816, 1816, 1817, 1817, 1818, 1818, 1819, 1819, …
$ mil_indicator <chr> "spending", "personnel", "spending", "personnel"…
$ amount <dbl> 3823, 17, 2466, 15, 1910, 14, 2301, 13, 1556, 15…
Save to military.csv
:
write_csv(df_military, "data/military.csv")
B.7 turnout.csv
: US voter turnout, 2000–2022
This file uses data from the University of Florida Election Lab, specifically version 1.2 of the General Election Turnout Rates dataset.
Obtain the raw data:
# Download raw data from the UF Election Lab, caching locally so the
# network fetch only happens once
turnout_url <- "https://election.lab.ufl.edu/data-downloads/turnoutdata/Turnout_1980_2022_v1.2.csv"
turnout_file <- "_raw/Turnout_1980_2022_v1.2.csv"
if (!file.exists(turnout_file)) {
  download.file(url = turnout_url, destfile = turnout_file)
}

# Read in raw data
df_turnout_raw <- read_csv(turnout_file)

glimpse(df_turnout_raw)
Rows: 1,144
Columns: 15
$ YEAR <dbl> 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022…
$ STATE <chr> "United States", "Alabama", "Alaska", "Arizona…
$ STATE_ABV <chr> NA, "AL", "AK", "AZ", "AR", "CA", "CO", "CT", …
$ TOTAL_BALLOTS_COUNTED <dbl> 112030874, 1424087, 267047, 2592313, 914227, 1…
$ VOTE_FOR_HIGHEST_OFFICE <chr> NA, "https://www.eac.gov/sites/default/files/2…
$ VAP <dbl> 260725069, 3956111, 556592, 5796801, 2347291, …
$ NONCITIZEN_PCT <chr> "7.50%", "2.54%", "3.56%", "7.78%", "3.76%", "…
$ INELIGIBLE_PRISON <dbl> 1175823, 25403, 4778, 31441, 17331, 97608, 163…
$ INELIGIBLE_PROBATION <dbl> 1074600, 27469, 1872, 47515, 28009, 0, 0, 0, 7…
$ INELIGIBLE_PAROLE <dbl> 412595, 7815, 865, 7022, 23829, 0, 0, 0, 344, …
$ INELIGIBLE_FELONS_TOTAL <dbl> 2663018, 60687, 7515, 85978, 69169, 97608, 163…
$ ELIGIBLE_OVERSEAS <dbl> 4400000, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ VEP <dbl> 242907672, 3794939, 529263, 5259832, 2189865, …
$ VEP_TURNOUT_RATE <chr> "46.12%", "37.53%", "50.46%", "49.29%", "41.75…
$ VAP_TURNOUT_RATE <chr> "42.97%", "36.00%", "47.98%", "44.72%", "38.95…
Cleaning up into the data file used for class:
# Clean the raw turnout data into the national-level file used for class
df_turnout <- df_turnout_raw |>
  # Only want national level data
  filter(STATE == "United States") |>
  # Grab and rename the columns we want
  mutate(
    year = YEAR,
    voting_age_pop = VAP,
    voting_eligible_pop = VEP,
    ballots_counted = TOTAL_BALLOTS_COUNTED,
    highest_office = VOTE_FOR_HIGHEST_OFFICE,
    noncitizen_pct = NONCITIZEN_PCT,
    ineligible_felons = INELIGIBLE_FELONS_TOTAL,
    eligible_overseas = ELIGIBLE_OVERSEAS,
    .keep = "none",
  ) |>
  # Clean up highest_office and noncitizen_pct columns to be numeric
  # (percent sign and thousands separators stripped; non-numeric entries
  # become NA with a coercion warning)
  mutate(
    noncitizen_pct = str_replace(noncitizen_pct, "\\%", ""),
    noncitizen_pct = as.numeric(noncitizen_pct) / 100,
    highest_office = str_replace_all(highest_office, ",", ""),
    highest_office = as.numeric(highest_office),
  ) |>
  # Calculate number of noncitizens
  mutate(
    ineligible_noncitizens = noncitizen_pct * voting_age_pop,
  ) |>
  select(-noncitizen_pct) |>
  # For vote total, use ballots counted where available, otherwise just use
  # votes for highest office
  mutate(
    votes_counted = if_else(
      !is.na(ballots_counted),
      ballots_counted,
      highest_office
    )
  ) |>
  # Convert population counts to millions
  mutate(across(-year, \(x) x / 1e6)) |>
  # Remove columns no longer needed
  select(
    year, votes_counted, voting_age_pop, voting_eligible_pop,
    ineligible_felons, ineligible_noncitizens, eligible_overseas
  ) |>
  # Order from earliest to latest
  arrange(year)

glimpse(df_turnout)
Rows: 22
Columns: 7
$ year <dbl> 1980, 1982, 1984, 1986, 1988, 1990, 199…
$ votes_counted <dbl> 86.51522, 67.61558, 92.65268, 64.99113,…
$ voting_age_pop <dbl> 164.4455, 166.0276, 173.9946, 177.9223,…
$ voting_eligible_pop <dbl> 159.6909, 160.4088, 167.7085, 170.4089,…
$ ineligible_felons <dbl> 0.801977, 0.959637, 1.165246, 1.367117,…
$ ineligible_noncitizens <dbl> 5.755592, 6.641105, 7.481768, 8.362350,…
$ eligible_overseas <dbl> 1.803021, 1.981895, 2.360867, 2.216053,…
Double-check that the manually calculated voting-eligible population lines up with the VEP column reported in the data frame:
# Sanity check: recompute VEP from its components (VAP minus ineligible
# felons and noncitizens, plus eligible overseas voters) and compare the
# relative difference against the reported voting_eligible_pop
df_turnout |>
  mutate(
    vep_manual = voting_age_pop - ineligible_felons -
      ineligible_noncitizens + eligible_overseas,
    vep_difference = abs(voting_eligible_pop - vep_manual) / voting_eligible_pop,
  ) |>
  select(year, voting_eligible_pop, vep_manual, vep_difference) |>
  print(n = Inf)
# A tibble: 22 × 4
year voting_eligible_pop vep_manual vep_difference
<dbl> <dbl> <dbl> <dbl>
1 1980 160. 160. 2.35e- 9
2 1982 160. 160. 1.99e- 9
3 1984 168. 168. 1.37e- 9
4 1986 170. 170. 2.88e- 9
5 1988 174. 174. 1.82e- 9
6 1990 177. 177. 1.44e- 9
7 1992 180. 180. 2.12e- 9
8 1994 183. 183. 2.19e- 9
9 1996 186. 186. 1.91e- 9
10 1998 190. 190. 1.50e- 9
11 2000 194. 194. 2.14e- 9
12 2002 198. 198. 4.03e-10
13 2004 203. 203. 2.18e- 9
14 2006 207. 207. 1.52e- 9
15 2008 213. 213. 2.25e- 9
16 2010 222. 222. 1.94e- 4
17 2012 222. 222. 2.66e- 9
18 2014 227. 227. 7.00e-10
19 2016 231. 231. 7.06e-10
20 2018 237. 237. 4.22e-10
21 2020 242. 242. 6.20e-10
22 2022 243. 243. 4.84e- 9
Save cleaned data to turnout.csv
:
write_csv(df_turnout, "data/turnout.csv")
B.8 wdi.csv
: World Development Indicators, 2019
This file contains data from the World Bank’s World Development Indicators dataset.
Obtain raw data using the WDI package:
# Download raw data via WDI package
<- "_raw/wdi_2019.csv"
wdi_file if (!file.exists(wdi_file)) {
<- WDI(
df_wdi_pkg country = "all",
indicator = c(
"gdp_per_capita" = "NY.GDP.PCAP.CD",
"gdp_growth" = "NY.GDP.MKTP.KD.ZG",
"population" = "SP.POP.TOTL",
"inflation" = "FP.CPI.TOTL.ZG",
"unemployment" = "SL.UEM.TOTL.ZS",
"life_expectancy" = "SP.DYN.LE00.IN"
),start = 2019, end = 2019,
extra = TRUE
)write_csv(df_wdi_pkg, wdi_file)
}
<- read_csv(wdi_file)
df_wdi_raw
glimpse(df_wdi_raw)
Rows: 266
Columns: 18
$ country <chr> "Afghanistan", "Africa Eastern and Southern", "Africa …
$ iso2c <chr> "AF", "ZH", "ZI", "AL", "DZ", "AS", "AD", "AO", "AG", …
$ iso3c <chr> "AFG", "AFE", "AFW", "ALB", "DZA", "ASM", "AND", "AGO"…
$ year <dbl> 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, …
$ status <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ lastupdated <date> 2025-07-01, 2025-07-01, 2025-07-01, 2025-07-01, 2025-…
$ gdp_per_capita <dbl> 496.6025, 1493.8179, 1798.3407, 5460.4305, 4468.4534, …
$ gdp_growth <dbl> 3.9116034, 2.2003404, 3.2821630, 2.0625779, 0.9000000,…
$ population <dbl> 37856121, 675950189, 463365429, 2854191, 43294546, 502…
$ inflation <dbl> 2.3023725, 4.6449672, 1.9830923, 1.4110908, 1.9517682,…
$ unemployment <dbl> 11.185000, 7.584419, 4.395271, 11.466000, 12.259000, N…
$ life_expectancy <dbl> 62.94100, 63.85726, 57.14985, 79.46700, 75.68200, 72.7…
$ region <chr> "South Asia", "Aggregates", "Aggregates", "Europe & Ce…
$ capital <chr> "Kabul", NA, NA, "Tirane", "Algiers", "Pago Pago", "An…
$ longitude <dbl> 69.17610, NA, NA, 19.81720, 3.05097, -170.69100, 1.521…
$ latitude <dbl> 34.52280, NA, NA, 41.33170, 36.73970, -14.28460, 42.50…
$ income <chr> "Low income", "Aggregates", "Aggregates", "Upper middl…
$ lending <chr> "IDA", "Aggregates", "Aggregates", "IBRD", "IBRD", "No…
Minor cleaning to remove unwanted rows and columns:
# Minor cleaning: drop unneeded columns, remove aggregate/unclassified rows,
# and relabel income groups with a sortable prefix
df_wdi <- df_wdi_raw |>
  as_tibble() |>
  select(-iso2c, -status, -lastupdated, -capital, -longitude, -latitude) |>
  # Drop regional/income aggregates so only individual countries remain
  filter(region != "Aggregates") |>
  filter(income != "Not classified") |>
  mutate(income = case_match(
    income,
    "Low income" ~ "1. Low",
    "Lower middle income" ~ "2. Lower-middle",
    "Upper middle income" ~ "3. Upper-middle",
    "High income" ~ "4. High"
  ))

glimpse(df_wdi)
Rows: 215
Columns: 12
$ country <chr> "Afghanistan", "Albania", "Algeria", "American…
$ iso3c <chr> "AFG", "ALB", "DZA", "ASM", "AND", "AGO", "ATG…
$ year <dbl> 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019…
$ gdp_per_capita <dbl> 496.6025, 5460.4305, 4468.4534, 12886.1360, 41…
$ gdp_growth <dbl> 3.9116034, 2.0625779, 0.9000000, -0.4878049, 2…
$ population <dbl> 37856121, 2854191, 43294546, 50209, 76474, 323…
$ inflation <dbl> 2.3023725, 1.4110908, 1.9517682, NA, NA, 17.08…
$ unemployment <dbl> 11.185, 11.466, 12.259, NA, NA, 16.497, NA, 9.…
$ life_expectancy <dbl> 62.94100, 79.46700, 75.68200, 72.75100, 84.098…
$ region <chr> "South Asia", "Europe & Central Asia", "Middle…
$ income <chr> "1. Low", "3. Upper-middle", "3. Upper-middle"…
$ lending <chr> "IDA", "IBRD", "IBRD", "Not classified", "Not …
Save to wdi.csv
:
write_csv(df_wdi, "data/wdi.csv")