Add a script to scrape the ARRL FCC page.

2024-04-07 09:40:36 -05:00
parent ddcc8aa5c8
commit 3b9d756f6f
1 changed files with 42 additions and 0 deletions
--- a/bin/scrape-arrl-fcc.R
+++ b/bin/scrape-arrl-fcc.R
@@ -0,0 +1,42 @@
 library(rvest)
 library(dplyr)
 library(tidyr)
 # Page that publishes the ARRL FCC license counts.
 arrl_url <- "https://www.arrl.org/fcc-license-counts"
 # Download and parse the page once; later steps query this parsed document.
 d_raw <- read_html(arrl_url)
 # Locate the <em> element holding the date text.
 # NOTE(review): this absolute XPath depends on the page's current layout;
 # if the layout changes it will match nothing -- confirm it still resolves.
 date_node <- html_nodes(
   d_raw,
   xpath = "/html/body/div[1]/div/div[2]/div/div[2]/div[1]/p[2]/em"
 )
 date_text <- html_text(date_node)
 # Strip everything up to and including the last ", " so only the date
 # remains, then parse it as day-abbreviated month-year (e.g. 05-Apr-2024).
 # NOTE(review): "%b" matches month abbreviations in the current locale, so
 # this presumably needs an English locale -- confirm where the script runs.
 date_raw <- as.Date(gsub(".*, ", "", date_text), format = "%d-%b-%Y")
 # Extract the table from the parsed page and shape it for storage.
 # NOTE(review): as with the date, this absolute XPath depends on the page's
 # current layout -- confirm it still resolves.
 table_nodes <- html_nodes(
   d_raw,
   xpath = "/html/body/div[1]/div/div[2]/div/div[2]/div[1]/table"
 )
 # html_table() returns one data frame per matched node; keep the first.
 counts <- html_table(table_nodes)[[1]]
 # Put the scrape date in front as a "Date" column.
 dated <- bind_cols(Date = date_raw, counts)
 # Add an all-NA "Tech-Plus" column immediately before "General" for
 # compatibility (presumably with an existing column layout -- confirm).
 tab <- mutate(dated, "Tech-Plus" = NA, .before = General)
 # Persist the scraped table: append it to the running CSV, then rewrite the
 # file without duplicate rows (the page is not always updated between runs,
 # so the same snapshot can be scraped more than once).
 out_path <- "out/arrl-fcc-licenses-scraped.csv"
 # Make sure the output directory exists so the first run does not fail.
 dir.create(dirname(out_path), showWarnings = FALSE, recursive = TRUE)
 # Only write the header when starting a new (or empty) file. Appending the
 # column names on every run triggers a write.table() warning and leaves
 # stray header rows in the middle of the data.
 needs_header <- !file.exists(out_path) || file.size(out_path) == 0
 write.table(tab, file = out_path, sep = ",",
             append = TRUE, quote = FALSE,
             col.names = needs_header, row.names = FALSE)
 # Read every column back as text so values round-trip exactly as written
 # (no numeric re-formatting) and rows compare consistently across runs.
 db <- read.csv(out_path, colClasses = "character")
 # Drop exact duplicate rows. The Date filter removes header rows that older
 # runs appended into the data; note that filter() also drops rows whose
 # Date is NA.
 db2 <- db %>%
   distinct() %>%
   filter(Date != "Date")
 write.csv(db2, out_path,
           quote = FALSE,
           row.names = FALSE)