Appendix B — Data Sources

# Required packages
library("tidyverse")
library("archive")
library("assertr")
library("dataverse")
library("gssr")
library("WDI")

# Create '_raw' subdirectory to store downloads, and 'data' for final cleaned files
if (!dir.exists("_raw"))
  dir.create("_raw")
if (!dir.exists("data"))
  dir.create("data")

B.1 `anes_2020.csv`: Subset of 2020 ANES survey

This file contains a small number of variables from the 2020 wave of the American National Election Studies Time Series Study.

Despite the direct download link on ANES’s website, they’ve got it locked down to prevent programmatic access from utilities like download.file(). (Not running glimpse() this time because the data has 1700+ columns.)

anes_file <- "_raw/anes_timeseries_2020_csv_20220210.csv"
if (!file.exists(anes_file))
  stop("Need to download data manually from ANES website")

df_anes_raw <- read_csv(anes_file)

Extract the few columns we care about and convert numeric codes to understandable values:

df_anes <- df_anes_raw |>
  mutate(
    id = row_number(),
    state = case_match(
      V201014b,
      1 ~ "Alabama",
      2 ~ "Alaska",
      4 ~ "Arizona",
      5 ~ "Arkansas",
      6 ~ "California",
      8 ~ "Colorado",
      9 ~ "Connecticut",
      10 ~ "Delaware",
      11 ~ "District of Columbia",
      12 ~ "Florida",
      13 ~ "Georgia",
      15 ~ "Hawaii",
      16 ~ "Idaho",
      17 ~ "Illinois",
      18 ~ "Indiana",
      19 ~ "Iowa",
      20 ~ "Kansas",
      21 ~ "Kentucky",
      22 ~ "Louisiana",
      23 ~ "Maine",
      24 ~ "Maryland",
      25 ~ "Massachusetts",
      26 ~ "Michigan",
      27 ~ "Minnesota",
      28 ~ "Mississippi",
      29 ~ "Missouri",
      30 ~ "Montana",
      31 ~ "Nebraska",
      32 ~ "Nevada",
      33 ~ "New Hampshire",
      34 ~ "New Jersey",
      35 ~ "New Mexico",
      36 ~ "New York",
      37 ~ "North Carolina",
      38 ~ "North Dakota",
      39 ~ "Ohio",
      40 ~ "Oklahoma",
      41 ~ "Oregon",
      42 ~ "Pennsylvania",
      44 ~ "Rhode Island",
      45 ~ "South Carolina",
      46 ~ "South Dakota",
      47 ~ "Tennessee",
      48 ~ "Texas",
      49 ~ "Utah",
      50 ~ "Vermont",
      51 ~ "Virginia",
      53 ~ "Washington",
      54 ~ "West Virginia",
      55 ~ "Wisconsin",
      56 ~ "Wyoming"
    ),
    female = case_match(
      V201600,
      1 ~ 0,
      2 ~ 1
    ),
    lgbt = case_match(
      V201601,
      1 ~ 0,
      2:4 ~ 1
    ),
    race = case_match(
      V201549x,
      1 ~ "White",
      2 ~ "Black",
      3 ~ "Hispanic",
      4 ~ "Asian",
      5 ~ "Native American",
      6 ~ "Multiracial"
    ),
    age = if_else(V201507x > 0, V201507x, NA),
    education = case_match(
      V201511x,
      1 ~ "Less than high school",
      2 ~ "High school",
      3 ~ "Some college",
      4 ~ "Bachelor's degree",
      5 ~ "Graduate degree"
    ),
    employed = case_match(
      V201517,
      1 ~ 1,
      2 ~ 0
    ),
    hours_worked = case_when(
      V201527 == -1 ~ 0,
      V201527 > 0 ~ V201527,
      TRUE ~ NA
    ),
    watch_tucker = case_match(
      V201630c,
      c(-1, 0) ~ 0,
      1 ~ 1
    ),
    watch_maddow = case_match(
      V201630d,
      c(-1, 0) ~ 0,
      1 ~ 1
    ),
    therm_biden = if_else(V201151 %in% 0:100, V201151, NA),
    therm_trump = if_else(V201152 %in% 0:100, V201152, NA),
    therm_harris = if_else(V201153 %in% 0:100, V201153, NA),
    therm_pence = if_else(V201154 %in% 0:100, V201154, NA),
    therm_obama = if_else(V201155 %in% 0:100, V201155, NA),
    therm_dem_party = if_else(V201156 %in% 0:100, V201156, NA),
    therm_rep_party = if_else(V201157 %in% 0:100, V201157, NA),
    therm_feminists = if_else(V202160 %in% 0:100, V202160, NA),
    therm_liberals = if_else(V202161 %in% 0:100, V202161, NA),
    therm_labor_unions = if_else(V202162 %in% 0:100, V202162, NA),
    therm_big_business = if_else(V202163 %in% 0:100, V202163, NA),
    therm_conservatives = if_else(V202164 %in% 0:100, V202164, NA),
    therm_supreme_court = if_else(V202165 %in% 0:100, V202165, NA),
    therm_congress = if_else(V202167 %in% 0:100, V202167, NA),
    therm_police = if_else(V202171 %in% 0:100, V202171, NA),
    therm_scientists = if_else(V202173 %in% 0:100, V202173, NA),
    contributed_to_party = case_match(
      V202019,
      1 ~ 1,
      2 ~ 0
    ),
    voted = case_match(
      V202068x,
      0:1 ~ 0,
      2 ~ 1
    ),
    voted_for_biden = if_else(V202073 < 0, NA, V202073),
    voted_for_biden = case_match(
      voted_for_biden,
      1 ~ 1,
      2:8 ~ 0
    ),
    voted_for_trump = if_else(V202073 < 0, NA, V202073),
    voted_for_trump = case_match(
      voted_for_trump,
      2 ~ 1,
      c(1, 3:8) ~ 0
    ),
    .keep = "none"
  )

glimpse(df_anes)

Rows: 8,280
Columns: 31
$ id                   <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13…
$ state                <chr> "Oklahoma", "Idaho", "Virginia", "Califor…
$ female               <dbl> 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0,…
$ lgbt                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ race                 <chr> "Hispanic", "Asian", "White", "Asian", "N…
$ age                  <dbl> 46, 37, 40, 41, 72, 71, 37, 45, 70, 43, 3…
$ education            <chr> "Bachelor's degree", "Some college", "Hig…
$ employed             <dbl> 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,…
$ hours_worked         <dbl> 40, 40, 0, 40, 0, 0, 30, 40, 0, 30, 25, 5…
$ watch_tucker         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ watch_maddow         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ therm_biden          <dbl> 0, 0, 65, 70, 15, 85, 50, 50, 85, 85, 100…
$ therm_trump          <dbl> 100, 0, 0, 15, 85, 0, 75, 100, 0, 0, 0, 0…
$ therm_harris         <dbl> 0, 0, 65, 85, 15, 85, 15, 50, 85, 50, 100…
$ therm_pence          <dbl> 85, 0, 0, 15, 90, 0, 75, 50, 0, 50, 0, 50…
$ therm_obama          <dbl> 0, 50, 90, 85, 10, 60, 15, 50, 60, 100, 1…
$ therm_dem_party      <dbl> 0, 0, 60, 50, 20, 85, 15, 50, NA, 60, 100…
$ therm_rep_party      <dbl> 85, 50, 0, 70, 70, 15, 75, 100, NA, 50, 0…
$ therm_feminists      <dbl> 65, 100, 75, 70, 30, 60, 60, 100, 50, 50,…
$ therm_liberals       <dbl> 30, 0, 75, 70, 10, 70, 0, NA, 30, 50, 50,…
$ therm_labor_unions   <dbl> 30, 70, 75, 70, 50, 50, 50, 0, 30, 50, 50…
$ therm_big_business   <dbl> 70, 50, 0, 85, 0, 40, 50, 0, 50, 15, 50, …
$ therm_conservatives  <dbl> 85, 15, 0, 70, 60, 40, 60, NA, 50, 50, 50…
$ therm_supreme_court  <dbl> 100, 50, 25, 85, 60, 60, 70, 50, 50, 50, …
$ therm_congress       <dbl> 40, 15, 0, 100, 10, 85, 50, 50, 50, 40, 5…
$ therm_police         <dbl> 85, 90, 40, 100, 70, 70, 60, 100, 60, 70,…
$ therm_scientists     <dbl> 100, 70, 100, 85, 60, 85, 85, NA, 60, 50,…
$ contributed_to_party <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ voted                <dbl> 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,…
$ voted_for_biden      <dbl> NA, 0, 1, 1, 0, 1, 0, NA, NA, 1, 1, 1, 0,…
$ voted_for_trump      <dbl> NA, 0, 0, 0, 1, 0, 1, NA, NA, 0, 0, 0, 1,…

Save to anes_2020.csv:

write_csv(df_anes, "data/anes_2020.csv")

B.2 `county_pres.csv`: County-level presidential election returns, 2000–2024

This file contains data from the MIT Election Lab dataset on County Presidential Election returns.

Obtain raw data from Harvard Dataverse:

# Download raw data from Harvard Dataverse repository
county_pres_file <- "_raw/countypres_2000-2024.csv"
if (!file.exists(county_pres_file)) {
  df_county_pres_dataverse <- get_dataframe_by_name(
    filename = "countypres_2000-2024.tab",
    dataset = "10.7910/DVN/VOQCHQ",
    server = "dataverse.harvard.edu",
    version = "15.0"
  )
  write_csv(df_county_pres_dataverse, county_pres_file)
}

df_county_pres_raw <- read_csv(county_pres_file)

glimpse(df_county_pres_raw)

Rows: 94,409
Columns: 12
$ year           <dbl> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2…
$ state          <chr> "ALABAMA", "ALABAMA", "ALABAMA", "ALABAMA", "ALABAMA", …
$ state_po       <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "…
$ county_name    <chr> "AUTAUGA", "AUTAUGA", "AUTAUGA", "AUTAUGA", "BALDWIN", …
$ county_fips    <chr> "01001", "01001", "01001", "01001", "01003", "01003", "…
$ office         <chr> "US PRESIDENT", "US PRESIDENT", "US PRESIDENT", "US PRE…
$ candidate      <chr> "AL GORE", "GEORGE W. BUSH", "OTHER", "RALPH NADER", "A…
$ party          <chr> "DEMOCRAT", "REPUBLICAN", "OTHER", "GREEN", "DEMOCRAT",…
$ candidatevotes <dbl> 4942, 11993, 113, 160, 13997, 40872, 578, 1033, 5188, 5…
$ totalvotes     <dbl> 17208, 17208, 17208, 17208, 56480, 56480, 56480, 56480,…
$ version        <dbl> 20250712, 20250712, 20250712, 20250712, 20250712, 20250…
$ mode           <chr> "TOTAL", "TOTAL", "TOTAL", "TOTAL", "TOTAL", "TOTAL", "…

Clean data to have one row per county-year:

df_county_pres <- df_county_pres_raw |>
  filter(!is.na(party), totalvotes > 0) |>
  rename(county = county_name) |>
  group_by(year, state, county) |>
  summarize(
    county_fips = first(county_fips),
    total_votes = first(totalvotes),
    dem_votes = sum(candidatevotes[party == "DEMOCRAT"]),
    rep_votes = sum(candidatevotes[party == "REPUBLICAN"]),
    .groups = "drop"
  ) |>
  mutate(
    region = fct_collapse(
      state,
      Northeast = c(
        "CONNECTICUT", "MAINE", "MASSACHUSETTS", "NEW HAMPSHIRE", "RHODE ISLAND",
        "VERMONT", "NEW JERSEY", "NEW YORK", "PENNSYLVANIA"
      ),
      Midwest = c(
        "ILLINOIS", "INDIANA", "MICHIGAN", "OHIO", "WISCONSIN", "IOWA", "KANSAS",
        "MINNESOTA", "MISSOURI", "NEBRASKA", "NORTH DAKOTA", "SOUTH DAKOTA"
      ),
      South = c(
        "DELAWARE", "DISTRICT OF COLUMBIA", "FLORIDA", "GEORGIA", "MARYLAND",
        "NORTH CAROLINA", "SOUTH CAROLINA", "VIRGINIA", "WEST VIRGINIA", "ALABAMA",
        "KENTUCKY", "MISSISSIPPI", "TENNESSEE", "ARKANSAS", "LOUISIANA", "OKLAHOMA",
        "TEXAS"
      ),
      West = c(
        "ARIZONA", "COLORADO", "IDAHO", "MONTANA", "NEVADA", "NEW MEXICO", "UTAH",
        "WYOMING", "ALASKA", "CALIFORNIA", "HAWAII", "OREGON", "WASHINGTON"
      ),
      other_level = "Unknown"
    ),
    margin = dem_votes - rep_votes,
    pct_margin = margin / total_votes,
    competitiveness = case_when(
      pct_margin < -0.2 ~ -3,
      pct_margin < -0.1 ~ -2,
      pct_margin < -0.04 ~ -1,
      pct_margin < 0.04 ~ 0,
      pct_margin < 0.1 ~ 1,
      pct_margin < 0.2 ~ 2,
      TRUE ~ 3
    ),
  ) |>
  group_by(state) |>
  mutate(
    dem_win_state = as.numeric(sum(dem_votes) > sum(rep_votes)),
  ) |>
  ungroup() |>
  assert(not_na, everything()) |>
  select(year, state, region, everything())

Save to county_pres.csv:

write_csv(df_county_pres, "data/county_pres.csv")

B.3 `crises.csv`: International crises

This file contains the International Crisis Behavior actor-level data, version 16. There’s purposely no additional cleaning since it’s used in the data wrangling lecture.

The raw data is stored via a Box link that doesn’t work with download.file(), and my efforts to get ChatGPT to help me get to the underlying data were unsuccessful.

# Read in raw data
crises_file <- "_raw/icb2v16.csv"
if (!file.exists(crises_file))
  stop("Need to download data manually from ICB website")
df_crises_raw <- read_csv(crises_file)

glimpse(df_crises_raw)

Rows: 1,131
Columns: 95
$ icb2     <chr> "ICB2", "ICB2", "ICB2", "ICB2", "ICB2", "ICB2", "ICB2…
$ crisno   <dbl> 1, 2, 2, 3, 4, 4, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 8, 8,…
$ cracno   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
$ cracid   <dbl> 365, 93, 94, 365, 365, 366, 368, 367, 315, 290, 310, …
$ actor    <chr> "RUS", "NIC", "COS", "RUS", "RUS", "EST", "LIT", "LAT…
$ systrgyr <dbl> 1918, 1918, 1918, 1918, 1918, 1918, 1918, 1918, 1919,…
$ systrgmo <dbl> 5, 5, 5, 6, 11, 11, 11, 11, 1, 1, 3, 3, 3, 3, 3, 3, 4…
$ systrgda <dbl> NA, 25, 25, 23, 18, 18, 18, 18, 15, 15, 20, 20, 20, 2…
$ crisname <chr> "RUSSIAN CIVIL WAR I", "COSTA RICAN COUP", "COSTA RIC…
$ triggr   <dbl> 9, 7, 4, 7, 6, 9, 9, 9, 2, 7, 2, 9, 2, 9, 7, 7, 2, 7,…
$ yrtrig   <dbl> 1918, 1918, 1919, 1918, 1918, 1918, 1918, 1918, 1919,…
$ motrig   <dbl> 5, 5, 1, 6, 11, 11, 12, 12, 1, 1, 3, 5, 6, 7, 3, 5, 4…
$ datrig   <dbl> NA, 25, 25, 23, 18, 22, NA, NA, 15, 23, 20, 11, 8, 20…
$ trigent  <dbl> 996, 94, 996, 997, 366, 365, 365, 365, 290, 315, 997,…
$ trigloc  <dbl> 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ southv   <dbl> 220, 94, 93, 200, 366, 365, 365, 365, 290, 315, 360, …
$ southpow <dbl> 3, 1, 1, 3, 1, 3, 3, 3, 2, 1, 1, 1, 3, 1, 3, 2, 3, 1,…
$ sizedu   <dbl> 1, NA, NA, 1, 1, NA, NA, NA, NA, NA, 3, 3, 3, NA, 3, …
$ strcdu   <dbl> 1, 1, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, 1, NA, 1,…
$ comlev   <dbl> 7, 1, 1, 7, 8, 8, 8, 8, 3, 3, 1, NA, 1, NA, NA, 3, 1,…
$ majres   <dbl> 8, 3, 6, 8, 8, 9, 9, 9, 8, 8, 8, 8, 1, 8, 6, 8, 8, 8,…
$ yerres   <dbl> 1918, 1918, 1919, 1918, 1918, 1918, 1918, 1918, 1919,…
$ monres   <dbl> 5, 5, 1, 7, 11, 11, 12, 12, 1, 1, 3, 5, 6, 7, 5, 5, 5…
$ dayres   <dbl> 28, 30, 28, 1, 22, 22, NA, NA, 23, 23, 28, 11, 16, 24…
$ trgresra <dbl> 14, 6, 4, 9, 5, 1, NA, NA, 9, 1, 9, 1, 9, 5, 76, NA, …
$ crismg   <dbl> 8, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 1, 8, 7, 7, 7, 7,…
$ cenvio   <dbl> 4, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 4, 2, 3, 3, 3,…
$ sevvio   <dbl> 3, 1, 1, 3, 3, 3, 3, 3, 3, 3, 4, 4, 1, 4, 2, 2, 4, 4,…
$ usinv    <dbl> 7, 7, 7, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 3, 3, 1, 1,…
$ usfavr   <dbl> 3, 1, 3, 3, 3, 1, 1, 1, 3, 3, 5, 5, 5, 5, 1, 3, 5, 5,…
$ suinv    <dbl> 9, 1, 1, 9, 9, 8, 8, 8, 1, 1, 3, 3, 1, 3, 1, 1, 5, 5,…
$ sufavr   <dbl> 8, 5, 5, 8, 8, 3, 3, 3, 5, 5, 1, 3, 5, 3, 5, 5, 2, 2,…
$ gbinv    <dbl> 7, 1, 1, 8, 3, 6, 3, 3, 3, 3, 3, 1, 1, 1, 3, 3, 8, 9,…
$ gbfavr   <dbl> NA, 5, 5, NA, NA, NA, NA, NA, 3, 3, NA, NA, NA, NA, 1…
$ frinv    <dbl> 2, 1, 1, 8, 3, 1, 3, 3, 3, 3, 8, 8, 3, 1, 3, 3, 1, 1,…
$ frfavr   <dbl> NA, 5, 5, NA, NA, NA, NA, NA, 3, 3, NA, NA, NA, NA, 1…
$ itinv    <dbl> 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 7, 7, 1, 1, 8, 9, 1, 1,…
$ itfavr   <dbl> NA, 5, 5, NA, NA, NA, NA, NA, 3, 3, NA, NA, NA, NA, 3…
$ grinv    <dbl> 1, 1, 1, 1, 8, 1, 1, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ grfavr   <dbl> NA, 5, 5, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ jpinv    <dbl> 7, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1,…
$ jpfavr   <dbl> NA, 5, 5, NA, NA, NA, NA, NA, 3, 3, NA, NA, NA, NA, 5…
$ globorg  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ globact  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ globfavr <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ regorg   <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ regact   <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ rofavr   <dbl> 0, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ outcom   <dbl> 1, 1, 4, 1, 4, 1, 1, 1, 2, 2, 4, 1, 4, 1, 2, 2, 2, 2,…
$ outfor   <dbl> 6, 4, 4, 6, 9, 8, 8, 8, 9, 9, 6, 1, 7, 4, 1, 1, 1, 1,…
$ outevl   <dbl> 2, 2, 3, 2, 3, 2, 2, 2, 4, 4, 3, 2, 3, 2, 1, 1, 1, 1,…
$ outesr   <dbl> 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2,…
$ yrterm   <dbl> 1920, 1918, 1919, 1919, 1920, 1920, 1920, 1920, 1920,…
$ moterm   <dbl> 4, 12, 9, 9, 8, 2, 7, 8, 7, 7, 8, 6, 6, 8, 7, 7, 8, 8…
$ daterm   <dbl> 1, 15, 3, 27, 11, 2, 12, 11, 28, 28, 3, 24, 16, 3, 29…
$ trgterra <dbl> 686, 205, 222, 462, 632, 438, 574, 603, 560, 552, 137…
$ resterra <dbl> 673, 199, 218, 453, 627, 438, 574, 603, 551, 552, 128…
$ actloc   <dbl> 30, 42, 42, 30, 30, 34, 34, 34, 31, 31, 31, 31, 31, 3…
$ geog     <dbl> 30, 42, 42, 30, 34, 34, 34, 34, 31, 31, 31, 31, 31, 3…
$ cractloc <dbl> 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 3, 2, 4, 1, 1,…
$ noactr   <dbl> 7, 5, 6, 5, 8, 8, 8, 8, 3, 3, 7, 7, 3, 7, 5, 5, 3, 3,…
$ stainsys <dbl> 47, 47, 49, 47, 47, 47, 47, 47, 49, 49, 49, 49, 49, 4…
$ period   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ syslev   <dbl> 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ pc       <dbl> 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ pcid     <dbl> 27, 6, 6, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ viol     <dbl> 3, 1, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 1, 4, 2, 2, 4, 4,…
$ iwc      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 3, 7, 1, 1, 1, 1,…
$ powdis   <dbl> NA, 1, -1, NA, 12, -12, -12, -12, -1, 1, NA, NA, NA, …
$ gpinv    <dbl> 7, 4, 4, 7, 7, 7, 7, 7, 3, 3, 5, 4, 4, 4, 6, 6, 6, 6,…
$ powinv   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ age      <dbl> 1, 3, 3, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 1,…
$ territ   <dbl> 3, 1, 1, 3, 3, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2,…
$ regime   <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 1, 4, 2, 1, 2, 2, 1, 1, 2, 1,…
$ durreg   <dbl> 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 3, 3,…
$ allycap  <dbl> 4, 2, 1, 4, 4, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 1, 2, 4,…
$ globmemb <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,…
$ nuclear  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ powsta   <dbl> 3, 1, 1, 3, 3, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 3, 1, 3,…
$ issue    <dbl> 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,…
$ chissu   <dbl> 4, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 1, 6,…
$ gravty   <dbl> 2, 1, 2, 2, 3, 6, 6, 6, 3, 3, 6, 3, 5, 3, 3, 4, 2, 4,…
$ pethin   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,…
$ col      <dbl> 1, NA, NA, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, 1, N…
$ unemp    <dbl> NA, NA, NA, NA, NA, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, N…
$ inflat   <dbl> 1, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, 1, NA,…
$ foodpr   <dbl> 1, NA, NA, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, 1, N…
$ labstr   <dbl> 1, NA, NA, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, 1, N…
$ short    <dbl> 1, NA, NA, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, 2, N…
$ econdt   <dbl> 1, NA, NA, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, 1, N…
$ regrep   <dbl> NA, 1, 1, NA, NA, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, NA,…
$ socunr   <dbl> 1, NA, NA, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, NA, …
$ massvl   <dbl> 1, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, 2, NA,…
$ gvinst   <dbl> 1, 2, NA, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, NA, NA, NA, N…
$ sourdt   <dbl> 3, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, NA, 3, 1, 1…

Save to crises.csv:

write_csv(df_crises_raw, "data/crises.csv")

B.4 `fed_papers.csv`: Federalist Papers corpus

This file contains the full text of each of the Federalist Papers, per the public domain text archived by Project Gutenberg.

Obtain the raw text from Project Gutenberg:

fed_papers_url <- "https://www.gutenberg.org/files/18/18-0.txt"
fed_papers_file <- "_raw/fed_papers.txt"

if (!file.exists(fed_papers_file))
  download.file(url = fed_papers_url, destfile = fed_papers_file)

fed_papers_raw <- readLines(fed_papers_file)

Parse text and assemble into data frame:

## Eliminate table of contents and other non-text content
fed_papers <- fed_papers_raw |>
  tail(-98) |>
  head(-2)

## Combine into single string
fed_papers <- str_c(fed_papers, collapse = "\n")

## Split into individual papers
fed_papers <- fed_papers |>
  str_split("THE FEDERALIST.\n") |>
  unlist()

## Eliminate the empty first entry, as well as the duplicate of #70
fed_papers <- fed_papers[-1]
fed_papers <- fed_papers[-70]

## Extract author(s) of each paper
author_id_regex <- "\\n\\n(HAMILTON|JAY|MADISON|HAMILTON AND MADISON|HAMILTON OR MADISON)\\n\\n\\n"
paper_author <- fed_papers |>
  str_extract(author_id_regex) |>
  str_remove_all("\\n") |>
  str_to_lower()

## Start each paper text after author identifier
##
## This will keep our classifiers from "peeking" by directly using author info
paper_text <- fed_papers |>
  str_split_i(author_id_regex, i = 2)

## Combine into a data frame
df_fed_papers <- tibble(
  paper_id = seq_along(fed_papers),
  author = paper_author,
  text = paper_text
)

glimpse(df_fed_papers)

Rows: 85
Columns: 3
$ paper_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
$ author   <chr> "hamilton", "jay", "jay", "jay", "jay", "hamilton", "…
$ text     <chr> "To the People of the State of New York:\n\nAfter an …

Save to fed_papers.csv:

write_csv(df_fed_papers, "data/fed_papers.csv")

B.5 `gss_2024.csv`: Subset of 2024 General Social Survey

This file contains a small number of variables from the 2024 wave of the General Social Survey.

Using Kieran Healy’s gssr package to download the data:

gss_file <- "_raw/gss_2024.csv"
if (!file.exists(gss_file)) {
  df_gss_pkg <- gss_get_yr(2024)
  write_csv(df_gss_pkg, gss_file)
}

df_gss_raw <- read_csv(gss_file)

df_gss_raw

# A tibble: 3,309 × 639
    year    id wrkstat  hrs1  hrs2 evwork marital martype divorce
   <dbl> <dbl>   <dbl> <dbl> <dbl>  <dbl>   <dbl>   <dbl>   <dbl>
 1  2024     1       1    43    NA     NA       5      NA      NA
 2  2024     2       5    NA    NA      1       5      NA      NA
 3  2024     3       5    NA    NA      1       1       1       2
 4  2024     4       2    20    NA     NA       5      NA      NA
 5  2024     5       5    NA    NA      1       3      NA      NA
 6  2024     6       4    NA    NA     NA       1       1       2
 7  2024     7       1    80    NA     NA       1       1       1
 8  2024     8       6    NA    NA      1       3      NA      NA
 9  2024     9       1    50    NA     NA       1      NA      NA
10  2024    10       4    NA    NA     NA       5      NA      NA
# ℹ 3,299 more rows
# ℹ 630 more variables: widowed <dbl>, spwrksta <dbl>, sphrs1 <dbl>,
#   sphrs2 <dbl>, spevwork <dbl>, cowrksta <dbl>, coevwork <dbl>,
#   cohrs1 <dbl>, cohrs2 <dbl>, sibs <dbl>, childs <dbl>, age <dbl>,
#   educ <dbl>, speduc <dbl>, coeduc <dbl>, codeg <dbl>, degree <dbl>,
#   padeg <dbl>, madeg <dbl>, spdeg <dbl>, sex <dbl>, race <dbl>,
#   res16 <dbl>, reg16 <dbl>, mobile16 <dbl>, family16 <dbl>, …

Extract the few columns we care about and do some mild cleaning:

df_gss <- df_gss_raw |>
  mutate(
    id = id,
    region = case_match(
      region,
      1 ~ "New England",
      2 ~ "Middle Atlantic",
      c(3, 4) ~ "North Central",
      5 ~ "South Atlantic",
      c(6, 7) ~ "South Central",
      8 ~ "Mountain",
      9 ~ "Pacific"
    ),
    age = age,
    educ = educ,
    female = if_else(sex == 2, 1, 0),
    race = case_match(
      racecen1,
      1 ~ "White",
      2 ~ "Black",
      3 ~ "American Indian",
      4 ~ "Asian",
      5 ~ "Pacific Islander",
      6 ~ "Other"
    ),
    hispanic = if_else(hispanic == 1, 0, 1),
    polviews = polviews,
    trump_rating = ratechall124,
    repub_rating = raterepp,
    maga_rating = ratemaga,
    .keep = "none"
  ) |>
  select(
    id, region, age, educ, female, race, hispanic,
    polviews, trump_rating, repub_rating, maga_rating
  )

glimpse(df_gss)

Rows: 3,309
Columns: 11
$ id           <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
$ region       <chr> "New England", "New England", "New England", "New…
$ age          <dbl> 33, 64, 69, 19, 70, 53, 48, 30, 60, 25, 23, 68, 4…
$ educ         <dbl> 16, 16, 14, 12, 13, 14, 13, 14, 14, 12, 18, 5, 14…
$ female       <dbl> 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0…
$ race         <chr> "Black", "White", "White", "Other", "White", "Oth…
$ hispanic     <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
$ polviews     <dbl> 4, 3, 1, NA, 4, NA, 2, 4, 4, 4, 2, 1, NA, 1, 4, 3…
$ trump_rating <dbl> 0, NA, 0, 50, 60, NA, 0, 30, 0, 50, 0, 0, NA, 0, …
$ repub_rating <dbl> 6, NA, 0, 50, 60, NA, 15, 50, 30, 50, 0, 15, NA, …
$ maga_rating  <dbl> 0, NA, 0, 50, 60, NA, 0, 40, 0, 50, 0, 15, NA, 0,…

Save to gss_2024.csv:

write_csv(df_gss, "data/gss_2024.csv")

B.6 `military.csv`: Military spending and personnel

This file contains data from the Correlates of War project’s dataset on National Material Capabilities, version 6.0.

Obtain the raw data by extracting from the zip on the COW website:

# Download zip file containing raw data
#
# This is convoluted because the csv is inside a zip within the zip
military_url <- "https://correlatesofwar.org/wp-content/uploads/NMC_Documentation-6.0.zip"
military_file <- "_raw/NMC-60-abridged.csv"
if (!file.exists(military_file)) {
  military_zip_outer <- tempfile(fileext = ".zip")
  download.file(url = military_url, destfile = military_zip_outer)
  military_zip_inner <- archive_read(military_zip_outer, "NMC-60-abridged.zip")
  military_csv <- read_csv(archive_read(military_zip_inner, "NMC-60-abridged.csv"))
  write_csv(military_csv, military_file)
}

# Read in raw data
df_military_raw <- read_csv(military_file)

glimpse(df_military_raw)

Rows: 15,951
Columns: 11
$ stateabb <chr> "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA"…
$ ccode    <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…
$ year     <dbl> 1816, 1817, 1818, 1819, 1820, 1821, 1822, 1823, 1824, 1825, 1…
$ milex    <dbl> 3823, 2466, 1910, 2301, 1556, 1612, 1079, 1170, 1261, 1336, 1…
$ milper   <dbl> 17, 15, 14, 13, 15, 11, 10, 11, 11, 11, 12, 12, 11, 12, 12, 1…
$ irst     <dbl> 80, 80, 90, 90, 110, 100, 100, 110, 110, 120, 120, 130, 130, …
$ pec      <dbl> 254, 277, 302, 293, 303, 321, 332, 345, 390, 424, 502, 556, 6…
$ tpop     <dbl> 8659, 8899, 9139, 9379, 9618, 9939, 10268, 10596, 10924, 1125…
$ upop     <dbl> 101, 106, 112, 118, 124, 130, 136, 143, 151, 158, 166, 175, 1…
$ cinc     <dbl> 0.03969749, 0.03581661, 0.03612655, 0.03713325, 0.03708687, 0…
$ version  <dbl> 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2…

Convert to “long” format containing only spending and personnel, for pedagogical purposes:

df_military <- df_military_raw |>
  select(ccode, stateabb, year, spending = milex, personnel = milper) |>
  pivot_longer(
    cols = c(spending, personnel),
    names_to = "mil_indicator",
    values_to = "amount"
  )

glimpse(df_military)

Rows: 31,902
Columns: 5
$ ccode         <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
$ stateabb      <chr> "USA", "USA", "USA", "USA", "USA", "USA", "USA",…
$ year          <dbl> 1816, 1816, 1817, 1817, 1818, 1818, 1819, 1819, …
$ mil_indicator <chr> "spending", "personnel", "spending", "personnel"…
$ amount        <dbl> 3823, 17, 2466, 15, 1910, 14, 2301, 13, 1556, 15…

Save to military.csv:

write_csv(df_military, "data/military.csv")

B.7 `turnout.csv`: US voter turnout, 2000–2022

This file uses data from the University of Florida Election Lab, specifically version 1.2 of the General Election Turnout Rates dataset.

Obtain the raw data:

# Download raw data
turnout_url <- "https://election.lab.ufl.edu/data-downloads/turnoutdata/Turnout_1980_2022_v1.2.csv"
turnout_file <- "_raw/Turnout_1980_2022_v1.2.csv"
if (!file.exists(turnout_file))
  download.file(url = turnout_url, destfile = turnout_file)

# Read in raw data
df_turnout_raw <- read_csv(turnout_file)

glimpse(df_turnout_raw)

Rows: 1,144
Columns: 15
$ YEAR                    <dbl> 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022…
$ STATE                   <chr> "United States", "Alabama", "Alaska", "Arizona…
$ STATE_ABV               <chr> NA, "AL", "AK", "AZ", "AR", "CA", "CO", "CT", …
$ TOTAL_BALLOTS_COUNTED   <dbl> 112030874, 1424087, 267047, 2592313, 914227, 1…
$ VOTE_FOR_HIGHEST_OFFICE <chr> NA, "https://www.eac.gov/sites/default/files/2…
$ VAP                     <dbl> 260725069, 3956111, 556592, 5796801, 2347291, …
$ NONCITIZEN_PCT          <chr> "7.50%", "2.54%", "3.56%", "7.78%", "3.76%", "…
$ INELIGIBLE_PRISON       <dbl> 1175823, 25403, 4778, 31441, 17331, 97608, 163…
$ INELIGIBLE_PROBATION    <dbl> 1074600, 27469, 1872, 47515, 28009, 0, 0, 0, 7…
$ INELIGIBLE_PAROLE       <dbl> 412595, 7815, 865, 7022, 23829, 0, 0, 0, 344, …
$ INELIGIBLE_FELONS_TOTAL <dbl> 2663018, 60687, 7515, 85978, 69169, 97608, 163…
$ ELIGIBLE_OVERSEAS       <dbl> 4400000, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ VEP                     <dbl> 242907672, 3794939, 529263, 5259832, 2189865, …
$ VEP_TURNOUT_RATE        <chr> "46.12%", "37.53%", "50.46%", "49.29%", "41.75…
$ VAP_TURNOUT_RATE        <chr> "42.97%", "36.00%", "47.98%", "44.72%", "38.95…

Cleaning up into the data file used for class:

df_turnout <- df_turnout_raw |>
  # Only want national level data
  filter(STATE == "United States") |>
  # Grab and rename the columns we want
  mutate(
    year = YEAR,
    voting_age_pop = VAP,
    voting_eligible_pop = VEP,
    ballots_counted = TOTAL_BALLOTS_COUNTED,
    highest_office = VOTE_FOR_HIGHEST_OFFICE,
    noncitizen_pct = NONCITIZEN_PCT,
    ineligible_felons = INELIGIBLE_FELONS_TOTAL,
    eligible_overseas = ELIGIBLE_OVERSEAS,
    .keep = "none",
  ) |>
  # Clean up highest_office and noncitizen_pct columns to be numeric
  mutate(
    noncitizen_pct = str_replace(noncitizen_pct, "\\%", ""),
    noncitizen_pct = as.numeric(noncitizen_pct) / 100,
    highest_office = str_replace_all(highest_office, ",", ""),
    highest_office = as.numeric(highest_office),
  ) |>
  # Calculate number of noncitizens
  mutate(
    ineligible_noncitizens = noncitizen_pct * voting_age_pop,
  ) |>
  select(-noncitizen_pct) |>
  # For vote total, use ballots counted where available, otherwise just use
  # votes for highest office
  mutate(
    votes_counted = if_else(
      !is.na(ballots_counted),
      ballots_counted,
      highest_office
    )
  ) |>
  # Convert population counts to millions
  mutate(across(-year, \(x) x / 1e6)) |>
  # Remove columns no longer needed
  select(
    year, votes_counted, voting_age_pop, voting_eligible_pop,
    ineligible_felons, ineligible_noncitizens, eligible_overseas
  ) |>
  # Order from earliest to latest
  arrange(year)

glimpse(df_turnout)

Rows: 22
Columns: 7
$ year                   <dbl> 1980, 1982, 1984, 1986, 1988, 1990, 199…
$ votes_counted          <dbl> 86.51522, 67.61558, 92.65268, 64.99113,…
$ voting_age_pop         <dbl> 164.4455, 166.0276, 173.9946, 177.9223,…
$ voting_eligible_pop    <dbl> 159.6909, 160.4088, 167.7085, 170.4089,…
$ ineligible_felons      <dbl> 0.801977, 0.959637, 1.165246, 1.367117,…
$ ineligible_noncitizens <dbl> 5.755592, 6.641105, 7.481768, 8.362350,…
$ eligible_overseas      <dbl> 1.803021, 1.981895, 2.360867, 2.216053,…

Double check that the manually calculated voting eligible calculation lines up with the one reported in the data frame:

df_turnout |>
  mutate(
    vep_manual = voting_age_pop - ineligible_felons -
      ineligible_noncitizens + eligible_overseas,
    vep_difference = abs(voting_eligible_pop - vep_manual) / voting_eligible_pop,
  ) |>
  select(year, voting_eligible_pop, vep_manual, vep_difference) |>
  print(n = Inf)

# A tibble: 22 × 4
    year voting_eligible_pop vep_manual vep_difference
   <dbl>               <dbl>      <dbl>          <dbl>
 1  1980                160.       160.       2.35e- 9
 2  1982                160.       160.       1.99e- 9
 3  1984                168.       168.       1.37e- 9
 4  1986                170.       170.       2.88e- 9
 5  1988                174.       174.       1.82e- 9
 6  1990                177.       177.       1.44e- 9
 7  1992                180.       180.       2.12e- 9
 8  1994                183.       183.       2.19e- 9
 9  1996                186.       186.       1.91e- 9
10  1998                190.       190.       1.50e- 9
11  2000                194.       194.       2.14e- 9
12  2002                198.       198.       4.03e-10
13  2004                203.       203.       2.18e- 9
14  2006                207.       207.       1.52e- 9
15  2008                213.       213.       2.25e- 9
16  2010                222.       222.       1.94e- 4
17  2012                222.       222.       2.66e- 9
18  2014                227.       227.       7.00e-10
19  2016                231.       231.       7.06e-10
20  2018                237.       237.       4.22e-10
21  2020                242.       242.       6.20e-10
22  2022                243.       243.       4.84e- 9

Save cleaned data to turnout.csv:

write_csv(df_turnout, "data/turnout.csv")

B.8 `wdi.csv`: World Development Indicators, 2019

This file contains data from the World Bank’s World Development Indicators dataset.

Obtain raw data using the WDI package:

# Download raw data via WDI package
wdi_file <- "_raw/wdi_2019.csv"
if (!file.exists(wdi_file)) {
  df_wdi_pkg <- WDI(
    country = "all",
    indicator = c(
      "gdp_per_capita" = "NY.GDP.PCAP.CD",
      "gdp_growth" = "NY.GDP.MKTP.KD.ZG",
      "population" = "SP.POP.TOTL",
      "inflation" = "FP.CPI.TOTL.ZG",
      "unemployment" = "SL.UEM.TOTL.ZS",
      "life_expectancy" = "SP.DYN.LE00.IN"
    ),
    start = 2019, end = 2019,
    extra = TRUE
  )
  write_csv(df_wdi_pkg, wdi_file)
}

df_wdi_raw <- read_csv(wdi_file)

glimpse(df_wdi_raw)

Rows: 266
Columns: 18
$ country         <chr> "Afghanistan", "Africa Eastern and Southern", "Africa …
$ iso2c           <chr> "AF", "ZH", "ZI", "AL", "DZ", "AS", "AD", "AO", "AG", …
$ iso3c           <chr> "AFG", "AFE", "AFW", "ALB", "DZA", "ASM", "AND", "AGO"…
$ year            <dbl> 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, …
$ status          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ lastupdated     <date> 2025-07-01, 2025-07-01, 2025-07-01, 2025-07-01, 2025-…
$ gdp_per_capita  <dbl> 496.6025, 1493.8179, 1798.3407, 5460.4305, 4468.4534, …
$ gdp_growth      <dbl> 3.9116034, 2.2003404, 3.2821630, 2.0625779, 0.9000000,…
$ population      <dbl> 37856121, 675950189, 463365429, 2854191, 43294546, 502…
$ inflation       <dbl> 2.3023725, 4.6449672, 1.9830923, 1.4110908, 1.9517682,…
$ unemployment    <dbl> 11.185000, 7.584419, 4.395271, 11.466000, 12.259000, N…
$ life_expectancy <dbl> 62.94100, 63.85726, 57.14985, 79.46700, 75.68200, 72.7…
$ region          <chr> "South Asia", "Aggregates", "Aggregates", "Europe & Ce…
$ capital         <chr> "Kabul", NA, NA, "Tirane", "Algiers", "Pago Pago", "An…
$ longitude       <dbl> 69.17610, NA, NA, 19.81720, 3.05097, -170.69100, 1.521…
$ latitude        <dbl> 34.52280, NA, NA, 41.33170, 36.73970, -14.28460, 42.50…
$ income          <chr> "Low income", "Aggregates", "Aggregates", "Upper middl…
$ lending         <chr> "IDA", "Aggregates", "Aggregates", "IBRD", "IBRD", "No…

Minor cleaning to remove unwanted rows and columns:

df_wdi <- df_wdi_raw |>
  as_tibble() |>
  select(-iso2c, -status, -lastupdated, -capital, -longitude, -latitude) |>
  filter(region != "Aggregates") |>
  filter(income != "Not classified") |>
  mutate(income = case_match(
    income,
    "Low income" ~ "1. Low",
    "Lower middle income" ~ "2. Lower-middle",
    "Upper middle income" ~ "3. Upper-middle",
    "High income" ~ "4. High"
  ))


glimpse(df_wdi)

Rows: 215
Columns: 12
$ country         <chr> "Afghanistan", "Albania", "Algeria", "American…
$ iso3c           <chr> "AFG", "ALB", "DZA", "ASM", "AND", "AGO", "ATG…
$ year            <dbl> 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019…
$ gdp_per_capita  <dbl> 496.6025, 5460.4305, 4468.4534, 12886.1360, 41…
$ gdp_growth      <dbl> 3.9116034, 2.0625779, 0.9000000, -0.4878049, 2…
$ population      <dbl> 37856121, 2854191, 43294546, 50209, 76474, 323…
$ inflation       <dbl> 2.3023725, 1.4110908, 1.9517682, NA, NA, 17.08…
$ unemployment    <dbl> 11.185, 11.466, 12.259, NA, NA, 16.497, NA, 9.…
$ life_expectancy <dbl> 62.94100, 79.46700, 75.68200, 72.75100, 84.098…
$ region          <chr> "South Asia", "Europe & Central Asia", "Middle…
$ income          <chr> "1. Low", "3. Upper-middle", "3. Upper-middle"…
$ lending         <chr> "IDA", "IBRD", "IBRD", "Not classified", "Not …

Save to wdi.csv:

write_csv(df_wdi, "data/wdi.csv")

B.1 anes_2020.csv: Subset of 2020 ANES survey

B.2 county_pres.csv: County-level presidential election returns, 2000–2024

B.3 crises.csv: International crises

B.4 fed_papers.csv: Federalist Papers corpus

B.5 gss_2024.csv: Subset of 2024 General Social Survey

B.6 military.csv: Military spending and personnel

B.7 turnout.csv: US voter turnout, 2000–2022

B.8 wdi.csv: World Development Indicators, 2019