library(rvest)
library(dplyr)
library(tidyr)

# Scheduling note (crontab entry used to run this script):
# sudo crontab -e
# 5 9 * * * su matt -c "cd /home/matt/ham-radio-licenses/; Rscript /home/matt/ham-radio-licenses/scrape-license-counts.R">/dev/null 2>&1

###### ARRL ######

arrl_url <- "https://www.arrl.org/fcc-license-counts"

# Read the page
d_raw <- read_html(arrl_url)

# Get date: take the text of the <em> node, drop everything up to and
# including the last ", ", and parse the remainder as day-month-year
# (abbreviated month name, e.g. the "%d-%b-%Y" pattern).
date_raw <- d_raw %>%
  html_elements(
    xpath = "/html/body/div[1]/div/div[2]/div/div[2]/div[1]/p[2]/em"
  ) %>%
  html_text() %>%
  gsub(".*, ", "", .) %>%
  as.Date(format = "%d-%b-%Y")

# Get table and clean up
tab <- d_raw %>%
  html_elements(
    xpath = "/html/body/div[1]/div/div[2]/div/div[2]/div[1]/table"
  ) %>%
  html_table() %>%
  .[[1]] %>%
  # Add date col
  bind_cols(Date = date_raw, .) %>%
  # Insert Tech Plus for compatibility
  mutate("Tech-Plus" = NA, .before = General) %>%
  # Empty placeholder columns, then the provenance columns. The source_detail
  # literal is kept exactly as before so appended rows match existing ones.
  mutate(
    a = NA, b = NA, c = NA, d = NA, e = NA, f = NA, g = NA,
    source_name = "ARRL FCC License Counts",
    source_detail = "http://www.arrl.org/fcc-license-counts"
  )

# Append table (no header row is written; rows are added to the existing file)
write.table(
  tab,
  file = "out/arrl-fcc-licenses-scraped.csv",
  sep = ",",
  append = TRUE,
  quote = FALSE,
  col.names = FALSE,
  row.names = FALSE,
  na = ""
)

# Clean up table to remove any duplicates (sometimes the page isn't updated regularly)
# db <- read.csv("out/arrl-fcc-licenses-scraped.csv")
# db2 <- db %>% distinct(.keep_all = TRUE) %>%
#   filter(Date != "Date")
# write.csv(db2, "out/arrl-fcc-licenses-scraped.csv",
#           quote = F,
#           row.names = F,
#           na = "")

###### HamCall ######

hamcall_url <- "https://hamcall.net/hamcallcounts.html"

# Read the page
hamcall_raw <- read_html(hamcall_url)

# Get date (weird for HamCall because no xpath): strip the page text before
# the "as of" marker and after the date, then parse what is left.
hamcall_date <- hamcall_raw %>%
  html_text() %>%
  as.character() %>%
  gsub(".*All counts current as of ", "", .) %>%
  gsub("\n\r\n\r\nAll Current US Hams.*", "", .) %>%
  as.Date()

# Get tables and clean up. Tables are picked by their position on the page.
hamcall_tables <- hamcall_raw %>%
  html_elements(xpath = "//table") %>%
  html_table()

hamcall_table_all_hams_raw <- hamcall_tables[[2]]
hamcall_table_class_raw <- hamcall_tables[[3]]
hamcall_table_city_raw <- hamcall_tables[[4]]
hamcall_table_state_raw <- hamcall_tables[[5]]

# Total licenses and class counts
hamcall_table_class_pivot <- hamcall_table_class_raw %>%
  pivot_wider(names_from = "Class", values_from = "Count") %>%
  # Grab total and date
  mutate(
    date = hamcall_date,
    state = "TOTAL",
    techplus = NA,
    total = hamcall_table_all_hams_raw[1, 2] %>% pull()
  ) %>%
  # Arrange columns. N, T, G, A, E are columns created by the pivot from the
  # "Class" values; `T` here is a column name, not the logical shorthand.
  relocate(date, state, N, T, techplus, G, A, E, total) %>%
  # Keep only the nine columns arranged above
  select(1:9) %>%
  mutate(
    a = NA,
    b = NA,
    club = hamcall_table_all_hams_raw[2, 2] %>% pull(),
    military = hamcall_table_all_hams_raw[3, 2] %>% pull(),
    c = NA,
    d = NA,
    e = NA,
    source_name = "HamCall",
    source_detail = hamcall_url
  )

# City counts
hamcall_table_city <- hamcall_table_city_raw %>%
  mutate(
    date = hamcall_date,
    source_name = "HamCall",
    source_detail = hamcall_url
  ) %>%
  relocate(date)

# State counts
hamcall_table_state <- hamcall_table_state_raw %>%
  mutate(
    date = hamcall_date,
    source_name = "HamCall",
    source_detail = hamcall_url
  ) %>%
  relocate(date, State, Count, source_name, source_detail)

###### AE7Q ######

ae7q_url <- "https://www.ae7q.com/query/stat/LicenseUSA.php"

# This section is still work in progress and its result is not written
# anywhere yet, so a failure here (network error, page layout change) is
# reported as a warning instead of stopping the script before the HamCall
# tables are appended. On failure ae7q_table_state is NULL.
ae7q_table_state <- tryCatch(
  {
    # Read the page
    ae7q_raw <- read_html(ae7q_url)

    # Get tables and clean up
    ae7q_tables <- ae7q_raw %>%
      html_elements(xpath = "//table") %>%
      html_table()

    state_tbl <- ae7q_tables[[20]]

    # Fix names: the first data row holds the real column headers. Pass them
    # as a plain character vector rather than a one-row tibble.
    names(state_tbl) <- as.character(unlist(state_tbl[1, ], use.names = FALSE))
    state_tbl[-1, ]
  },
  error = function(e) {
    warning("AE7Q scrape failed: ", conditionMessage(e), call. = FALSE)
    NULL
  }
)

# TODO
# split percents out into other columns (separate_wider_delim() ?)
# etc.
##### Append tables #####

# Map each output CSV to the HamCall table appended to it. The list order is
# the write order: class totals, then cities, then states.
hamcall_outputs <- list(
  "out/hamcall-licenses-scraped.csv" = hamcall_table_class_pivot,
  "out/hamcall-cities-scraped.csv" = hamcall_table_city,
  "out/hamcall-states-scraped.csv" = hamcall_table_state
)

# Append every table with identical settings: comma-separated, unquoted,
# no header row, no row names, and NA written as an empty field.
for (out_path in names(hamcall_outputs)) {
  write.table(
    hamcall_outputs[[out_path]],
    file = out_path,
    sep = ",",
    append = TRUE,
    quote = FALSE,
    col.names = FALSE,
    row.names = FALSE,
    na = ""
  )
}