Skip to content

Commit 453a858

Browse files
authored
Merge pull request #162 from westonslaughter/master
scrape download URL and datetime from raw docs
2 parents 2b839bd + 2833138 commit 453a858

6 files changed

Lines changed: 1788 additions & 18 deletions

File tree

src/acquisition_master.R

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,13 @@ ms_globals <- c(ls(all.names = TRUE), 'ms_globals')
316316

317317
dir.create('logs', showWarnings = FALSE)
318318

319+
# NOTE: this should be moved I believe, and made to work with the raw data
320+
# documentation of the latest iteration...
321+
# this function will update the citation sheet with the date and url of raw data download
322+
scrape_data_download_urls()
323+
324+
## change string in line below to find row index of your desired domain
325+
## dmnrow <- which(network_domain$domain == 'loch_vale')
319326
for(dmnrow in 1:nrow(network_domain)){
320327

321328
# drop_automated_entries('.') #use with caution!
@@ -346,12 +353,15 @@ for(dmnrow in 1:nrow(network_domain)){
346353
n = network,
347354
d = domain))
348355

356+
# this should only run when you have your products.csv
357+
# and processing kernels prod information matching
349358
update_product_statuses(network = network,
350359
domain = domain)
351360

352361
get_all_local_helpers(network = network,
353362
domain = domain)
354363

364+
# stop here and go to processing_kernels.R to continue
355365
ms_retrieve(network = network,
356366
# prodname_filter = c('stream_chemistry'),
357367
domain = domain)

src/global/global_helpers.R

Lines changed: 63 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14845,24 +14845,77 @@ reformat_camels_for_ms <- function(){
1484514845
}
1484614846

1484714847
#' Scrape raw-data download URLs and datetimes into the citation sheet
#'
#' Reads the citation google sheet (conf$citation_gsheet), walks every
#' network/domain directory under ./vault/raw_documentation_files/, and for
#' each documentation file whose name contains a prodcode listed in the
#' sheet, extracts the download URL and download datetime recorded in that
#' file. Matching rows of the sheet's "link" and "link_download_datetime"
#' columns are updated, then the whole sheet is written back to the
#' "timeseries" tab of the same google sheet.
#'
#' Side effects: reads and writes the google sheet; prints progress to
#' stdout. Called for its side effects; no useful return value.
scrape_data_download_urls <- function() {

    # read the whole citation sheet as character columns (sheet has 24 cols)
    citation_gsheet <- googlesheets4::read_sheet(
        conf$citation_gsheet,
        na = c('', 'NA'),
        col_types = strrep('c', 24))

    # the real header row sits on row 6 of the sheet layout
    colnames(citation_gsheet) <- citation_gsheet[6, ]

    raw_fp <- "./vault/raw_documentation_files/"

    for(network in list.files(raw_fp)) {

        for(domain in list.files(file.path(raw_fp, network))) {

            writeLines(paste('reading documentation for data source:', domain))

            # all documentation files for this domain of this network
            doc_dir <- file.path(raw_fp, network, domain, "raw", "documentation")
            product_docs <- list.files(doc_dir, full.names = TRUE)

            # prodcodes the citation sheet lists for this network/domain
            dmn_prodcodes <- citation_gsheet %>%
                filter(network == !!network,
                       domain == !!domain) %>%
                pull(macrosheds_prodcode)

            # drop NAs so they can't enter the regex, and skip the domain
            # if no prodcodes remain (an empty alternation pattern would
            # otherwise match every file)
            dmn_prodcodes <- dmn_prodcodes[! is.na(dmn_prodcodes)]
            if(length(dmn_prodcodes) == 0) next

            dmn_prodcodes_grep <- paste0(dmn_prodcodes, collapse = "|")

            # keep only docs whose filename contains a known prodcode
            cited_products <- product_docs[grepl(dmn_prodcodes_grep, product_docs)]

            for(file in cited_products) {

                # filenames look like <prefix>__<prodcode>.txt; strip the
                # literal ".txt" suffix (anchored/escaped, not a wildcard)
                cited_filename <- stringr::str_split(file, "__", simplify = TRUE)[2]
                cited_prodcode <- stringr::str_remove(cited_filename, "\\.txt$")

                # NOTE: documentation files are not fully standardized; this
                # scrapes the most common layout and leaves NA for
                # versionless products
                data_source_doc <- readLines(file)
                data_source_link <- trimws(
                    stringr::str_split(data_source_doc, "[^)][0-9]{4}\\-",
                                       simplify = TRUE)[1])
                data_source_dt <- trimws(
                    stringr::str_extract_all(data_source_doc, "[0-9]{4}\\-.*[^)]",
                                             simplify = TRUE))[1]

                print(paste(network, domain, data_source_dt))

                # if the link line carries trailing text, keep only the URL
                if(grepl("https://", data_source_link)) {
                    data_source_link <-
                        stringr::str_split(data_source_link, " ",
                                           simplify = TRUE)[1]
                }

                # write the scraped url/datetime into every sheet row whose
                # prodcode matches, restricted to the current domain
                citation_gsheet <- citation_gsheet %>%
                    mutate(
                        link = case_when(
                            grepl(cited_prodcode, macrosheds_prodcode) &
                                domain == !!domain ~ data_source_link,
                            TRUE ~ link),
                        link_download_datetime = case_when(
                            grepl(cited_prodcode, macrosheds_prodcode) &
                                domain == !!domain ~ data_source_dt,
                            TRUE ~ link_download_datetime))
            }
        }
    }

    # push the updated citation table back to the google sheet
    googlesheets4::sheet_write(citation_gsheet,
                               ss = conf$citation_gsheet,
                               sheet = "timeseries")
}

src/webb/loch_vale/domain_helpers.R

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,3 @@
1-
## run these before working inside retrieve_product func
2-
## network = network
3-
## domain = domain
4-
## prodname_ms = prodname_ms
5-
## site_code = site_code
6-
## tracker = held_data
7-
## url = prod_info$url[i]
8-
91
retrieve_sleepers_product <- function(network,
102
domain,
113
prodname_ms,

0 commit comments

Comments
 (0)