Skip to content

Commit 489e698

Browse files
func write raw data download link and dt to citation gsheet
1 parent 7b608ed commit 489e698

1 file changed

Lines changed: 64 additions & 11 deletions

File tree

src/global/global_helpers.R

Lines changed: 64 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9367,7 +9367,7 @@ ms_write_confdata <- function(x,
93679367

93689368
type_string <- case_when(
93699369
which_dataset == 'ms_vars' ~ 'cccccccnnccnn',
9370-
which_dataset == 'site_data' ~ 'ccccccccnnnnncccc',
9370+
which_dataset == 'site_data' ~ 'ccccccccnnnnnccccc',
93719371
which_dataset == 'ws_delin_specs' ~ 'cccncnnccl',
93729372
TRUE ~ 'placeholder')
93739373

@@ -14845,24 +14845,77 @@ reformat_camels_for_ms <- function(){
1484514845
}
1484614846

1484714847
# Scrape the raw-data download link and download datetime recorded in each
# product's documentation file, and write them to the citation Google Sheet.
#
# Steps:
#   1. read the sheet identified by conf$citation_gsheet (24 columns, all read
#      as character) and take its column names from the 6th row that was read
#   2. walk ./vault/raw_documentation_files/<network>/<domain>/raw/documentation/
#   3. for every documentation file whose path matches a macrosheds_prodcode
#      listed for that network + domain, parse the link and the datetime string
#      out of the file and store them in the `link` and
#      `link_download_datetime` columns of the matching sheet rows
#   4. write the edited table to the "timeseries" sheet
#
# Takes no arguments. Relies on the `conf` object and on dplyr verbs / `%>%`
# being available in the calling environment. Prints progress as it goes.
# Returns the value of googlesheets4::sheet_write().
scrape_data_download_urls <- function() {

    # connecting to gsheet; all 24 columns are read as character
    citation_gsheet <- googlesheets4::read_sheet(
        conf$citation_gsheet,
        na = c('', 'NA'),
        col_types = paste0(rep('c', 24), collapse = ''))

    # the usable column headers are taken from the 6th row of what was read
    colnames(citation_gsheet) <- citation_gsheet[6,]

    raw_fp <- "./vault/raw_documentation_files/"

    networks <- list.files(raw_fp)

    for(network in networks) {
        ## network <- networks[5]
        domains <- list.files(file.path(raw_fp, network))

        for(domain in domains) {
            ## domain <- domains[4]
            writeLines(paste('reading documentation for data source:', domain))

            # list all documentation files in this domain of this network
            product_docs <- list.files(
                file.path(raw_fp, network, domain, "raw", "documentation"),
                full.names = TRUE)

            # filter gsheet to domain, get all prodcodes
            dmn_citation_gsheet <- citation_gsheet %>%
                filter(network == !!network,
                       domain == !!domain)

            # drop missing/empty prodcodes: an NA would be pasted into the
            # pattern as the literal text "NA", and an empty pattern matches
            # every file
            dmn_prodcodes <- dmn_citation_gsheet$macrosheds_prodcode
            dmn_prodcodes <- dmn_prodcodes[!is.na(dmn_prodcodes) & nzchar(dmn_prodcodes)]

            # nothing cited for this network + domain, so nothing to scrape
            if(length(dmn_prodcodes) == 0) next

            dmn_prodcodes_grep <- paste0(dmn_prodcodes, collapse = "|")

            # filter product docs to only those with prodcode text matching gsheet
            cited_products <- product_docs[grepl(dmn_prodcodes_grep, product_docs)]

            for(file in cited_products) {
                ## file <- cited_products[1]

                # the prodcode is taken as the text between the first "__" and
                # ".txt" in the file name. Work on the base name so a "__" in a
                # directory name cannot shift the pieces, and treat ".txt"
                # literally rather than as a regex.
                cited_filename <- stringr::str_split(basename(file), "__", simplify = TRUE)[2]
                cited_prodcode <- stringr::str_split(cited_filename, stringr::fixed(".txt"), simplify = TRUE)[1]

                # without a usable prodcode we cannot tell which rows to update
                # (an empty pattern would match every row of the domain)
                if(is.na(cited_prodcode) || ! nzchar(cited_prodcode)) next

                data_source_doc <- readLines(file)

                # link: text of the first line that precedes the first
                # "<char other than ')'><4 digits>-" sequence
                data_source_link <- trimws(stringr::str_split(data_source_doc, "[^)][0-9]{4}\\-", simplify = TRUE)[1])

                # datetime: first match of "<4 digits>-..." up to the last
                # character that is not ")"
                data_source_dt <- trimws(stringr::str_extract_all(data_source_doc, "[0-9]{4}\\-.*[^)]", simplify = TRUE))[1]

                print(paste(network, domain, data_source_dt))

                # when the scraped text contains a URL, keep only its first
                # space-delimited token
                if(grepl("https://", data_source_link)) {
                    data_source_link <- stringr::str_split(data_source_link, " ", simplify = TRUE)[1]
                }

                # NOTE: documentation files are not mecha-standardized, this function scrapes the best standard
                # this *should* capture all prodcodes in citation gsheet, giving NA for versionless

                # now, for this prodcode in the citation_gsheet df, we put the scraped
                # link and datetime in the "link" and "link_download_datetime" columns.
                # rows are matched on network as well as domain, consistent with
                # how the prodcodes were selected above.
                citation_gsheet <- citation_gsheet %>%
                    mutate(
                        link = case_when(
                            grepl(cited_prodcode, macrosheds_prodcode) &
                                network == !!network &
                                domain == !!domain ~ data_source_link,
                            TRUE ~ link),
                        link_download_datetime = case_when(
                            grepl(cited_prodcode, macrosheds_prodcode) &
                                network == !!network &
                                domain == !!domain ~ data_source_dt,
                            TRUE ~ link_download_datetime)
                        ## link = case_when(link == "NA" ~ NA, TRUE ~ link)
                    )
            }
        }
    }

    # then, we write the edited df to the actual google sheet
    googlesheets4::sheet_write(citation_gsheet, ss = conf$citation_gsheet, sheet = "timeseries")
    ## return(citation_gsheet)
}

0 commit comments

Comments
 (0)