2024-04-07 09:40:36 -05:00
|
|
|
|
|
|
|
library(rvest)
|
|
|
|
library(dplyr)
|
|
|
|
library(tidyr)
|
|
|
|
|
2024-04-07 10:17:22 -05:00
|
|
|
# sudo crontab -e
|
|
|
|
# 5 9 * * * su matt -c "cd /home/matt/ham-radio-licenses/; Rscript /home/matt/ham-radio-licenses/scrape-arrl-fcc.R">/dev/null 2>&1
|
|
|
|
|
2024-04-07 09:40:36 -05:00
|
|
|
# Page that publishes the FCC license counts
arrl_url <- "https://www.arrl.org/fcc-license-counts"

# Read the page
# The document is downloaded and parsed once; both the report date and the
# counts table are extracted from this parsed object.
d_raw <- arrl_url %>%
  read_html()
|
|
|
|
|
|
|
|
# Get date
# The date is taken from the text of an <em> element located by an absolute
# XPath. Everything up to and including the last ", " is stripped, and the
# remainder is parsed as day-abbreviated month-year (e.g. "05-Apr-2024").
# NOTE(review): "%b" is matched against the session's LC_TIME month names, so
# this presumably relies on an English/C locale -- confirm for the cron
# environment.
date_raw <- d_raw %>%
  html_nodes(xpath = "/html/body/div[1]/div/div[2]/div/div[2]/div[1]/p[2]/em") %>%
  html_text() %>%
  gsub(".*, ", "", .) %>%
  as.Date(format = "%d-%b-%Y")

# If the page layout changes, the XPath matches nothing (zero-length result)
# or the text no longer parses (NA). Stop here instead of letting a missing
# or recycled date flow into the rows that get written to disk.
if (length(date_raw) != 1L || is.na(date_raw)) {
  stop("Could not parse a single report date from ", arrl_url, call. = FALSE)
}
|
|
|
|
|
|
|
|
# Get table and clean up
# html_table() returns one data frame per matched <table> node; only the
# first match is used.
scraped_tables <- d_raw %>%
  html_nodes(xpath = "/html/body/div[1]/div/div[2]/div/div[2]/div[1]/table") %>%
  html_table()

# Add date col as the first column, then pad the table with extra columns
tab <- bind_cols(Date = date_raw, scraped_tables[[1]]) %>%
  # Insert Tech Plus for compatibility (empty column placed before General)
  mutate(`Tech-Plus` = NA, .before = General) %>%
  # a through g are all-NA columns; presumably placeholders that keep the
  # column layout aligned with an existing data file -- TODO confirm
  mutate(
    a = NA,
    b = NA,
    c = NA,
    d = NA,
    e = NA,
    f = NA,
    g = NA,
    source_name = "ARRL FCC License Counts",
    source_detail = "http://www.arrl.org/fcc-license-counts"
  )
|
2024-04-07 09:40:36 -05:00
|
|
|
|
|
|
|
# Append table
# Write the header only when the file is new or empty. Appending with
# col.names = TRUE on every run would insert a header line into the middle of
# the CSV (and makes write.table() warn about appending column names).
out_file <- "out/arrl-fcc-licenses-scraped.csv"
write_header <- !file.exists(out_file) || file.size(out_file) == 0

# NOTE(review): quote = FALSE assumes no value contains a comma -- confirm
# against the scraped table.
write.table(tab, file = out_file, sep = ",",
            append = TRUE, quote = FALSE,
            col.names = write_header, row.names = FALSE,
            na = "")
|
2024-04-07 09:40:36 -05:00
|
|
|
|
|
|
|
# Clean up table to remove any duplicates (sometimes the page isn't updated regularly)
# NOTE: read.csv() defaults to check.names = TRUE, so a header such as
# "Tech-Plus" comes back as "Tech.Plus" and is written out under that name.
db <- read.csv("out/arrl-fcc-licenses-scraped.csv")

db2 <- db %>%
  # Keep one copy of each fully identical row
  distinct(.keep_all = TRUE) %>%
  # Drop any header line that was appended to the file and read back as data
  filter(Date != "Date")

# Overwrite the file with the de-duplicated rows. TRUE/FALSE are spelled out
# because T and F are ordinary variables that can be reassigned.
write.csv(db2, "out/arrl-fcc-licenses-scraped.csv",
          quote = FALSE,
          row.names = FALSE,
          na = "")
|
2024-04-07 09:40:36 -05:00
|
|
|
|