# Scrape the ARRL "FCC License Counts" page and append the current table,
# stamped with the date shown on the page, to a running CSV file.
#
# Steps:
#   1. Download the page.
#   2. Extract the "as of" date from the page text.
#   3. Extract the first table matched by the table XPath, prepend the date
#      column and insert an all-NA "Tech-Plus" column before `General`.
#   4. Append the rows to the CSV, then re-read the file, drop exact
#      duplicate rows and write it back.

library(rvest)
library(dplyr)
library(tidyr)

arrl_url <- "https://www.arrl.org/fcc-license-counts"
out_file <- "out/arrl-fcc-licenses-scraped.csv"

# Read the page
d_raw <- read_html(arrl_url)

# Get date ----
# The text before the last ", " is stripped and the remainder is parsed as
# day-abbreviated month-year.
# NOTE(review): `%b` is matched against month abbreviations of the current
# locale, so this presumably needs an English locale -- confirm for the
# environment the script runs in.
date_raw <- d_raw %>%
  html_nodes(
    xpath = "/html/body/div[1]/div/div[2]/div/div[2]/div[1]/p[2]/em"
  ) %>%
  html_text() %>%
  gsub(".*, ", "", .) %>%
  as.Date(format = "%d-%b-%Y")

# Fail loudly instead of writing rows with a missing date: rows whose Date is
# NA would be silently discarded by the `Date != "Date"` filter further down.
if (length(date_raw) != 1L || is.na(date_raw)) {
  stop(
    "Could not extract a single valid date from ", arrl_url,
    "; the page layout or date format may have changed.",
    call. = FALSE
  )
}

# Get table and clean up ----
tables <- d_raw %>%
  html_nodes(
    xpath = "/html/body/div[1]/div/div[2]/div/div[2]/div[1]/table"
  ) %>%
  html_table()

if (length(tables) == 0L) {
  stop(
    "Could not find the license-count table at ", arrl_url,
    "; the page layout may have changed.",
    call. = FALSE
  )
}

tab <- tables[[1]] %>%
  # Add date col
  bind_cols(Date = date_raw, .) %>%
  # Insert Tech Plus for compatibility
  mutate("Tech-Plus" = NA, .before = General)

# Append table ----
# Only emit the header when starting a new (or empty) file; appending a header
# to an existing file would inject it as a data row and triggers a warning
# from write.table().
needs_header <- !file.exists(out_file) || file.size(out_file) == 0
write.table(tab,
  file = out_file, sep = ",",
  append = TRUE, quote = FALSE,
  col.names = needs_header, row.names = FALSE
)

# Remove duplicates ----
# Sometimes the page isn't updated regularly, so the same rows can be scraped
# more than once. Every column is read as character so rows are compared on
# their textual values and written back unchanged.
db <- read.csv(out_file, colClasses = "character")
db2 <- db %>%
  distinct(.keep_all = TRUE) %>%
  # Guard against stray header rows left in the file by earlier runs that
  # appended the header on every invocation.
  filter(Date != "Date")
write.csv(db2, out_file,
  quote = FALSE,
  row.names = FALSE
)