library(tidyverse)
library(stringr)

continent_codes <- read_csv("continent-abbreviations.csv",
                            col_names = c("code","continent"),
                            na = c(""))

# Some countries lie in more than one continent in the continents.txt
# table from Wikipedia; we simplify it by deciding on one of them,
# even though this is not exactly correct.
continents_raw <- read_delim("continents.txt", delim = "|", comment = "|-",
                         trim_ws = TRUE, na = c("","null")) %>%
  # ignore the warning about missing column names...
  select(-starts_with("X"))

continents <- continents_raw %>%
  # some countries span multiple continents; we retain only one of them
  filter(!(Name %in% c("Armenia, Republic of", "Azerbaijan, Republic of",
                       "Cyprus, Republic of", "Georgia",
                       "Kazakhstan, Republic of", "Turkey, Republic of")
           & CC == "EU")) %>%
  filter(!(Name == "Russian Federation" & CC == "AS")) %>%
  filter(!(Name == "United States Minor Outlying Islands" & CC == "OC")) %>%
  mutate(Name = str_to_upper(Name)) %>%
  separate(Name, into = c("name.main", "name.pre"), sep = ", ",
           fill = "right") %>%   # many of the country names are written
                                 # as "name, prefix"
  replace_na(list(name.pre = "")) %>%
  unite("name.full", "name.pre", "name.main", sep = " ", remove = FALSE) %>%
  mutate(name.full = trimws(name.full)) %>%
  select(-"name.pre") %>%
  # two countries' main names are duplicated; we remove the duplication
  filter(! name.full %in% c("REPUBLIC OF THE CONGO",
                            "DEMOCRATIC PEOPLE'S REPUBLIC OF KOREA")) %>%
  # Kosovo doesn't appear in the Wikipedia list
  add_row(CC = "EU", `a-2` = "XK", name.full = "REPUBLIC OF KOSOVO",
          name.main = "KOSOVO")

caa_data <- read_csv("caa-flight-delays.csv") %>%
    filter(arrival_departure == "D", number_flights_matched > 0,
           reporting_airport %in% c("GATWICK", "HEATHROW", "LONDON CITY",
                                    "LUTON", "STANSTED", "SOUTHEND")) %>%
    filter(origin_destination_country != "Unknown") %>%
    mutate(destination_country =
             str_replace(origin_destination_country, "\\(.*\\)$", ""))

# This is for exploration...
# Which countries appear in the CAA data?
caa_countries <- caa_data %>%
  group_by(destination_country) %>% count()
# Which of those don't appear in the continents table?
missing_countries <- caa_countries %>%
  anti_join(continents, by = c("destination_country" = "name.main"))

# Now we know which countries need renaming, we go ahead and do so...
# This table was created manually; the list of CAA names was taken
# from missing_countries, but the Wikipedia name was found by looking
# through the table by hand.  It was not overly time-consuming for
# this small number of special cases.
renames <- tribble(
  ~continents_name, ~caa_name,
  "BOSNIA AND HERZEGOVINA", "BOSNIA-HERZEGOVINA",
  "BRUNEI DARUSSALAM", "BRUNEI",
  "CAPE VERDE", "CAPE VERDE ISLANDS",
  "CONGO", "DEMOCRATIC REPUBLIC OF CONGO",
  "IRELAND", "IRISH REPUBLIC",
  "CURAÇAO", "ISLE OF CURACAO NETH.ANTILLES",
  "KYRGYZ REPUBLIC", "KYRGYZSTAN",
  "MALDIVES", "MALDIVE ISLANDS",
  "KOREA", "REPUBLIC OF KOREA",
  "MOLDOVA", "REPUBLIC OF MOLDOVA",
  "MONTENEGRO", "REPUBLIC OF MONTENEGRO",
  "SERBIA", "REPUBLIC OF SERBIA",
  "SOUTH AFRICA", "REPUBLIC OF SOUTH AFRICA",
  "RUSSIAN FEDERATION", "RUSSIA",
  "SLOVAKIA (SLOVAK REPUBLIC)", "SLOVAK REPUBLIC",
  "SAINT LUCIA", "ST LUCIA",
  "UNITED KINGDOM OF GREAT BRITAIN & NORTHERN IRELAND", "UNITED KINGDOM",
  "UNITED STATES OF AMERICA", "USA"
)

# and add a column to continents with the CAA name, either the same as
# name.main, or the replacement in the renames tibble we've just made
continents <- continents %>%
  left_join(renames, by = c("name.main" = "continents_name")) %>%
  mutate(caa_name = if_else(! is.na(caa_name), caa_name, name.main))

# We count the UK separately
continent_codes <- continent_codes %>%
  add_row(code = "UK", continent = "Domestic")
continents$CC[continents$caa_name == "UNITED KINGDOM"] <- "UK"

# And now, after all of the preparatory work, we can do the counts
# we're actually interested in!
caa_continents_data <- caa_data %>%
  left_join(continents, by = c("destination_country" = "caa_name"))

continents_count <- caa_continents_data %>%
  group_by(CC) %>%
  summarise(total_flights = sum(number_flights_matched)) %>%
  left_join(continent_codes, by = c("CC" = "code")) %>%
  transmute(Continent = continent, `Total flights` = total_flights) %>%
  mutate(Proportion = `Total flights` / sum(`Total flights`)) %>%
  arrange(desc(`Total flights`))

write_csv(continents_count, "continents-count.csv")