library(rvest)
library(dplyr)
library(tidyr)

# Scrape the license-count table from the ARRL page, stamp every row with the
# date shown on the page, append the rows to a CSV, then rewrite that CSV
# without duplicate rows.

arrl_url <- "https://www.arrl.org/fcc-license-counts"
out_file <- "out/arrl-fcc-licenses-scraped.csv"

# Read the page ----
d_raw <- read_html(arrl_url)

# Get date ----
# The text after the last ", " in the matched <em> node is parsed as
# day-abbreviated month-year. `%b` is matched against the month names of the
# current locale, so this relies on the session locale agreeing with the page.
date_raw <- d_raw %>%
  html_nodes(xpath = "/html/body/div[1]/div/div[2]/div/div[2]/div[1]/p[2]/em") %>%
  html_text() %>%
  gsub(".*, ", "", .) %>%
  as.Date(format = "%d-%b-%Y")

# Fail loudly: rows with an NA date would be appended and then silently
# discarded by the `Date != "Date"` filter below (filter() drops NA results),
# so a page-layout change would otherwise look like a successful run.
if (length(date_raw) != 1L || is.na(date_raw)) {
  stop("Could not extract a single valid date from ", arrl_url, call. = FALSE)
}

# Get table and clean up ----
tables <- d_raw %>%
  html_nodes(xpath = "/html/body/div[1]/div/div[2]/div/div[2]/div[1]/table") %>%
  html_table()

if (length(tables) < 1L) {
  stop("Could not find the license-count table at ", arrl_url, call. = FALSE)
}

tab <- tables[[1]] %>%
  # Add date col
  bind_cols(Date = date_raw, .) %>%
  # Insert Tech Plus for compatibility
  mutate("Tech-Plus" = NA, .before = General)

# Append table ----
dir.create(dirname(out_file), showWarnings = FALSE, recursive = TRUE)

# Only a new (or empty) file gets a header line; appending one on every run
# would put a header row in the middle of the data.
write_header <- !file.exists(out_file) || file.size(out_file) == 0

# quote = FALSE keeps the existing file format.
# NOTE(review): a cell containing a comma would shift columns -- confirm the
# scraped table never contains one.
write.table(
  tab,
  file = out_file,
  sep = ",",
  append = TRUE,
  quote = FALSE,
  col.names = write_header,
  row.names = FALSE
)

# Clean up table to remove any duplicates ----
# (sometimes the page isn't updated regularly)
# Everything is read as character so column types do not depend on whether a
# stray header row is present in the data.
db <- read.csv(out_file, colClasses = "character")

db2 <- db %>%
  distinct(.keep_all = TRUE) %>%
  # Drops header rows that earlier runs appended into the data.
  filter(Date != "Date")

write.csv(db2, out_file, quote = FALSE, row.names = FALSE)