@@ -14845,24 +14845,77 @@ reformat_camels_for_ms <- function(){
1484514845}
1484614846
# Scrape data-source links and download datetimes out of the raw documentation
# files and record them in the citation google sheet.
#
# Walks ./vault/raw_documentation_files/<network>/<domain>/raw/documentation/.
# For each domain, only documentation files whose name matches one of the
# macrosheds_prodcode values listed for that network + domain in the citation
# sheet are read. The first line of each such file is parsed into a link (text
# before the first 4-digit-year date) and a download datetime (the date and
# everything after it), which are written into the `link` and
# `link_download_datetime` columns of the matching sheet rows.
#
# Takes no arguments; reads the sheet id from the global `conf$citation_gsheet`.
# Side effect: overwrites the "timeseries" tab of that google sheet.
# Returns whatever googlesheets4::sheet_write() returns.
scrape_data_download_urls <- function() {

    # read all 24 columns as character
    # NOTE(review): no `sheet` is given here, so the default tab is read, while
    # the result is written to the "timeseries" tab below -- confirm these are
    # the same tab.
    citation_gsheet <- googlesheets4::read_sheet(
        conf$citation_gsheet,
        na = c('', 'NA'),
        col_types = paste0(rep('c', 24), collapse = ''))

    # the real column headers live in the 6th data row of the sheet
    # NOTE(review): that header row (and the rows above it) stay in the data
    # and are written back as ordinary rows -- confirm this is intended.
    colnames(citation_gsheet) <- citation_gsheet[6, ]

    # fail early with a clear message instead of midway through the scrape
    required_cols <- c('network', 'domain', 'macrosheds_prodcode',
                       'link', 'link_download_datetime')
    missing_cols <- setdiff(required_cols, colnames(citation_gsheet))
    if(length(missing_cols) > 0) {
        stop('citation gsheet is missing expected column(s): ',
             paste(missing_cols, collapse = ', '),
             call. = FALSE)
    }

    raw_fp <- "./vault/raw_documentation_files/"

    networks <- list.files(raw_fp)

    for(network in networks) {
        domains <- list.files(file.path(raw_fp, network))

        for(domain in domains) {
            writeLines(paste('reading documentation for data source:', domain))

            # list all documentation files in this domain of this network
            doc_dir <- file.path(raw_fp, network, domain, "raw", "documentation")
            product_docs <- list.files(doc_dir, full.names = TRUE)

            # rows of the gsheet belonging to this network + domain
            # (%in% never yields NA, so rows with missing network/domain drop out)
            in_domain <- citation_gsheet$network %in% network &
                citation_gsheet$domain %in% domain

            dmn_prodcodes <- citation_gsheet$macrosheds_prodcode[in_domain]
            dmn_prodcodes <- unique(
                dmn_prodcodes[! is.na(dmn_prodcodes) & nzchar(dmn_prodcodes)])

            # without this guard an empty prodcode list collapses to the
            # pattern "", which matches every documentation file
            if(length(dmn_prodcodes) == 0 || length(product_docs) == 0) {
                next
            }

            # keep only docs whose file name matches a prodcode from the gsheet
            # (prodcodes are used as regex alternatives, matched as substrings)
            dmn_prodcodes_grep <- paste0(dmn_prodcodes, collapse = "|")
            cited_products <- product_docs[
                grepl(dmn_prodcodes_grep, basename(product_docs))]

            for(file in cited_products) {

                # doc files are named <something>__<prodcode>.txt; parse the
                # file name only, so "__" in a directory name cannot interfere
                cited_filename <- stringr::str_split(
                    basename(file), "__", simplify = TRUE)[2]
                cited_prodcode <- sub("\\.txt$", "", cited_filename)

                # no "__" in the file name, or nothing after it: an NA/empty
                # pattern would match no row or every row, so skip the file
                if(is.na(cited_prodcode) || ! nzchar(cited_prodcode)) {
                    next
                }

                data_source_doc <- readLines(file)

                # only the first piece/match of the first line is used:
                # link = text before the first YYYY- date,
                # datetime = the YYYY- date and whatever follows it
                data_source_link <- trimws(stringr::str_split(
                    data_source_doc, "[^)][0-9]{4}\\-", simplify = TRUE)[1])
                data_source_dt <- trimws(stringr::str_extract_all(
                    data_source_doc, "[0-9]{4}\\-.*[^)]", simplify = TRUE))[1]

                print(paste(network, domain, data_source_dt))

                # when a URL is present, keep only the first
                # whitespace-delimited token
                if(grepl("https://", data_source_link)) {
                    data_source_link <- stringr::str_split(
                        data_source_link, " ", simplify = TRUE)[1]
                }

                # NOTE: documentation files are not mecha-standardized, this
                # function scrapes the best standard. this *should* capture all
                # prodcodes in the citation gsheet, giving NA for versionless.

                # rows to update: this network + domain, prodcode matching the
                # doc file. restricting by network as well as domain keeps
                # same-named domains in other networks untouched.
                # NOTE(review): prodcode is matched as a substring, so code "1"
                # also hits "10" -- confirm prodcodes are unambiguous.
                is_target <- in_domain &
                    grepl(cited_prodcode, citation_gsheet$macrosheds_prodcode)

                citation_gsheet$link[is_target] <- data_source_link
                citation_gsheet$link_download_datetime[is_target] <- data_source_dt
            }
        }
    }

    # then, we write the edited df to the actual google sheet
    googlesheets4::sheet_write(citation_gsheet,
                               ss = conf$citation_gsheet,
                               sheet = "timeseries")
}
0 commit comments