Skip to content

Commit a7cc63c

Browse files
committed
merged webb updates, gitignored .RData
2 parents 093389b + 453a858 commit a7cc63c

29 files changed

Lines changed: 5928 additions & 19 deletions

.RData

-1.33 MB
Binary file not shown.

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,5 @@ old_dataset_dirs.tar.gz
2929
old_logs
3030
eml/data_links
3131
eml/eml_out
32+
vault/*
33+
.Rdata

src/acquisition_master.R

Lines changed: 85 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,16 @@ ms_init <- function(use_gpu = FALSE,
103103
successes <- 0
104104
which_machine <- 'unknown'
105105

106+
# example dev computer 'registration' code block
107+
## res <- try(setwd('~/your/file/path/to/macrosheds/data_processing'), silent=TRUE) # example
108+
## if(! 'try-error' %in% class(res)){
109+
## successes <- successes + 1
110+
## which_machine <- 'your_machine' # machine name is completely up to you, does not matter
111+
## instance_type <- 'dev' # instance type is 'dev' for all personal computers
112+
## machine_status <- 'n00b' # unless you have > 32GB of RAM and > 8 CPUs, you're 'n00b'
113+
## op_system <- 'mac' # what's your OS?
114+
## }
115+
106116
res <- try(setwd('~/macrosheds_data_processing'), silent=TRUE) #DCC
107117
if(! 'try-error' %in% class(res)){
108118
successes <- successes + 1
@@ -129,6 +139,15 @@ ms_init <- function(use_gpu = FALSE,
129139
machine_status <- '1337'
130140
op_system <- 'windows'
131141
}
142+
143+
res <- try(setwd('/Users/hectorontiveros/Applications/data_processing'), silent=FALSE) #Hector
144+
if(! 'try-error' %in% class(res)){
145+
successes <- successes + 1
146+
which_machine <- 'hec'
147+
instance_type <- 'dev'
148+
machine_status <- 'n00b'
149+
op_system <- 'macOS'
150+
}
132151

133152
res <- try(setwd('~/desktop/macrosheds/data_acquisition'), silent=TRUE) #spencer
134153
if(! 'try-error' %in% class(res)){
@@ -139,6 +158,15 @@ ms_init <- function(use_gpu = FALSE,
139158
op_system <- 'mac'
140159
}
141160

161+
res <- try(setwd('~/Desktop/MacroSheds/data_processing/src/data_acquisition'), silent=TRUE) #pranavi
162+
if(! 'try-error' %in% class(res)){
163+
successes <- successes + 1
164+
which_machine <- 'Pranavi'
165+
instance_type <- 'dev'
166+
machine_status <- 'n00b'
167+
op_system <- 'mac'
168+
}
169+
142170
res <- try(setwd('C:/Users/gubbi/Documents/macrosheds/data_processing'), silent=TRUE) #Nick
143171
if(! 'try-error' %in% class(res)){
144172
successes <- successes + 1
@@ -174,6 +202,15 @@ ms_init <- function(use_gpu = FALSE,
174202
# machine_status <- '1337'
175203
# }
176204

205+
res <- try(setwd('/home/weston/science/macrosheds/data_processing'), silent=TRUE) # wes
206+
if(! 'try-error' %in% class(res)){
207+
successes <- successes + 1
208+
which_machine <- 'wes'
209+
instance_type <- 'dev'
210+
machine_status <- '1337'
211+
op_system <- 'linux'
212+
}
213+
177214
res <- try(setwd('/home/macrosheds/data_acquisition'), silent=TRUE) #server
178215
if(! 'try-error' %in% class(res)){
179216
successes <- successes + 1
@@ -183,6 +220,16 @@ ms_init <- function(use_gpu = FALSE,
183220
op_system <- NA
184221
}
185222

223+
224+
res <- try(setwd('C:/Users/Dell/Documents/Projects/data_processing'), silent=TRUE) #bini (dev machine; '#server' label was a copy-paste error)
225+
if(! 'try-error' %in% class(res)){
226+
successes <- successes + 1
227+
which_machine <- 'bini'
228+
instance_type <- 'dev'
229+
machine_status <- 'noob'
230+
op_system <- 'windows'
231+
}
232+
186233
if(successes > 1){
187234
stop(glue('more than one working directory was available. must set the ',
188235
'correct one manually'))
@@ -210,24 +257,29 @@ ms_instance <- ms_init(use_ms_error_handling = FALSE,
210257
config_storage_location = 'remote')
211258

212259
#load authorization file for macrosheds google sheets
213-
googlesheets4::gs4_auth(path = 'googlesheet_service_accnt.json')
260+
## googlesheets4::gs4_auth(path = 'googlesheet_service_accnt.json')
214261

215262
#read in secrets
216263
conf <- jsonlite::fromJSON('config.json',
217264
simplifyDataFrame = FALSE)
218265

266+
219267
#connect rgee to earth engine and python
220268
gee_login <- case_when(
221269
ms_instance$which_machine %in% c('Mike', 'BM1') ~ conf$gee_login_mike,
222-
ms_instance$which_machine %in% c('Spencer', 'BM0', 'BM2') ~ conf$gee_login_spencer,
223-
ms_instance$which_machine %in% c('Nick') ~ conf$gee_login_spencer,
270+
ms_instance$which_machine %in% c('Spencer', 'BM0', 'BM2', 'Nick') ~ conf$gee_login_spencer,
271+
ms_instance$which_machine %in% c('Hector','Biniam','Pranavi', 'Wes') ~conf$gee_login_ms,
224272
TRUE ~ 'UNKNOWN')
225273

226-
try(rgee::ee_Initialize(user = gee_login,
227-
drive = TRUE))
228-
274+
#load authorization file for macrosheds google sheets and drive
275+
#same account must have GEE and GDrive access
276+
googlesheets4::gs4_auth(email = gee_login)
229277
googledrive::drive_auth(email = gee_login)
230278

279+
#initialize and authorize GEE account
280+
try(rgee::ee_Initialize(user = gee_login,
281+
drive = TRUE))
282+
231283
#set up global logger. network-domain loggers are set up later
232284
logging::basicConfig()
233285
logging::addHandler(logging::writeToFile,
@@ -241,7 +293,7 @@ if(ms_instance$use_ms_error_handling){
241293
source_decoratees('src/global/global_helpers.R') #parse decorators
242294
}
243295

244-
#puts ms_vars, site_data, ws_delin_specs, univ_products into the global environment
296+
#puts (google sheets) ms_vars, site_data, ws_delin_specs, univ_products into the global environment
245297
load_config_datasets(from_where = ms_instance$config_data_storage)
246298

247299

@@ -266,8 +318,13 @@ ms_globals <- c(ls(all.names = TRUE), 'ms_globals')
266318

267319
dir.create('logs', showWarnings = FALSE)
268320

269-
# dmnrow = 8
270-
# print(network_domain, n=50)
321+
# NOTE: this should be moved I believe, and made to work with the raw data
322+
# documentation of the latest iteration...
323+
# this function will update the citation sheet with the data and url of raw data download
324+
scrape_data_download_urls()
325+
326+
## change string in line below to find row index of your desired domain
327+
## dmnrow <- which(network_domain$domain == 'loch_vale')
271328
for(dmnrow in 1:nrow(network_domain)){
272329

273330
# drop_automated_entries('.') #use with caution!
@@ -276,17 +333,19 @@ for(dmnrow in 1:nrow(network_domain)){
276333
network <- network_domain$network[dmnrow]
277334
domain <- network_domain$domain[dmnrow]
278335

279-
# held_data = get_data_tracker(network, domain)
336+
held_data = get_data_tracker(network, domain)
280337

281-
# held_data = invalidate_tracked_data(network, domain, 'munge')
282-
# owrite_tracker(network, domain)
283-
# held_data = invalidate_tracked_data(network, domain, 'derive')
284-
# owrite_tracker(network, domain)
338+
## dangerous lines - use at your own risk! :0
339+
## held_data = invalidate_tracked_data(network, domain, 'munge')
340+
## owrite_tracker(network, domain)
341+
## held_data = invalidate_tracked_data(network, domain, 'derive')
342+
## owrite_tracker(network, domain)
285343

286-
# held_data = invalidate_tracked_data(network, domain, 'munge', 'stream_chemistry')
287-
# owrite_tracker(network, domain)
288-
# held_data = invalidate_tracked_data(network, domain, 'derive', 'stream_flux_inst')
289-
# owrite_tracker(network, domain)
344+
## less dangerous version below, clears tracker for just a specified product
345+
## held_data = invalidate_tracked_data(network, domain, 'munge', 'stream_chemistry')
346+
## owrite_tracker(network, domain)
347+
## held_data = invalidate_tracked_data(network, domain, 'derive', 'stream_flux_inst')
348+
## owrite_tracker(network, domain)
290349

291350
logger_module <- set_up_logger(network = network,
292351
domain = domain)
@@ -296,23 +355,30 @@ for(dmnrow in 1:nrow(network_domain)){
296355
n = network,
297356
d = domain))
298357

358+
# this should only run when you have your products.csv
359+
# and processing kernels prod information matching
299360
update_product_statuses(network = network,
300361
domain = domain)
362+
301363
get_all_local_helpers(network = network,
302364
domain = domain)
303365

366+
# stop here and go to processing_kernels.R to continue
304367
ms_retrieve(network = network,
305368
# prodname_filter = c('stream_chemistry'),
306369
domain = domain)
370+
307371
ms_munge(network = network,
308372
# prodname_filter = c('stream_chemistry'),
309373
domain = domain)
374+
310375
if(domain != 'mcmurdo'){
311376
sw(ms_delineate(network = network,
312377
domain = domain,
313378
dev_machine_status = ms_instance$machine_status,
314379
verbose = TRUE))
315380
}
381+
316382
ms_derive(network = network,
317383
prodname_filter = c('stream_chemistry'),
318384
domain = domain)
@@ -348,3 +414,4 @@ if(length(email_err_msgs)){
348414

349415
loginfo(msg = 'Run complete',
350416
logger = logger_module)
417+

src/global/global_helpers.R

Lines changed: 78 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9372,7 +9372,7 @@ ms_write_confdata <- function(x,
93729372

93739373
type_string <- case_when(
93749374
which_dataset == 'ms_vars' ~ 'cccccccnnccnn',
9375-
which_dataset == 'site_data' ~ 'ccccccccnnnnncccc',
9375+
which_dataset == 'site_data' ~ 'ccccccccnnnnnccccc',
93769376
which_dataset == 'ws_delin_specs' ~ 'cccncnnccl',
93779377
TRUE ~ 'placeholder')
93789378

@@ -12130,6 +12130,7 @@ retrieve_versionless_product <- function(network,
1213012130

1213112131
rt <- tracker[[prodname_ms]][[site_code]]$retrieve
1213212132

12133+
## i = 1
1213312134
for(i in 1:nrow(rt)){
1213412135

1213512136
held_dt <- as.POSIXct(rt$held_version[i],
@@ -15177,3 +15178,79 @@ reformat_camels_for_ms <- function(){
1517715178
}
1517815179
}
1517915180
}
15181+
15182+
scrape_data_download_urls <- function() {

    # Scrape raw-data download URLs and retrieval datetimes out of the
    # documentation files under vault/, then write them into the "link" and
    # "link_download_datetime" columns of the timeseries citation gsheet.
    #
    # Relies on globals: `conf` (config list; $citation_gsheet is the sheet
    # ID) and an already-authorized googlesheets4 session. Also assumes
    # dplyr is attached (%>%, filter, mutate, case_when).
    # No return value; called for its side effect on the google sheet.

    # read all 24 columns as character so nothing is coerced on the way in
    citation_cols <- paste0(rep('c', 24), collapse = '')
    citation_gsheet <- googlesheets4::read_sheet(conf$citation_gsheet,
                                                 na = c('', 'NA'),
                                                 col_types = citation_cols)

    # row 6 of the sheet holds the true column headers
    colnames(citation_gsheet) <- as.character(unlist(citation_gsheet[6, ]))

    raw_fp <- './vault/raw_documentation_files/'

    networks <- list.files(raw_fp)

    for(network in networks){

        domains <- list.files(file.path(raw_fp, network))

        for(domain in domains){

            writeLines(paste('reading documentation for data source:', domain))

            # all documentation files for this network-domain
            product_docs <- list.files(file.path(raw_fp, network, domain,
                                                 'raw', 'documentation'),
                                       full.names = TRUE)

            # prodcodes the citation sheet knows about for this domain
            dmn_citation_gsheet <- citation_gsheet %>%
                filter(network == !!network,
                       domain == !!domain)

            dmn_prodcodes <- dmn_citation_gsheet$macrosheds_prodcode
            dmn_prodcodes <- dmn_prodcodes[! is.na(dmn_prodcodes)]

            # an empty prodcode set would collapse to the regex "", which
            # matches every file, so skip the domain instead
            if(length(dmn_prodcodes) == 0) next

            dmn_prodcodes_grep <- paste0(dmn_prodcodes, collapse = '|')

            # keep only docs whose filename contains a known prodcode
            cited_products <- product_docs[grepl(dmn_prodcodes_grep,
                                                 product_docs)]

            for(file in cited_products){

                # filenames look like <something>__<prodcode>.txt
                cited_filename <- stringr::str_split(file, '__',
                                                     simplify = TRUE)[2]
                cited_prodcode <- stringr::str_split(cited_filename, '.txt',
                                                     simplify = TRUE)[1]

                # skip files that don't follow the naming convention
                # (grepl errors on an NA pattern)
                if(is.na(cited_prodcode)) next

                data_source_doc <- readLines(file)

                # NOTE: documentation files are not mecha-standardized; this
                # split on a "YYYY-" boundary is best-effort. It *should*
                # capture all prodcodes in the citation gsheet, giving NA for
                # versionless products
                data_source_link <- trimws(
                    stringr::str_split(data_source_doc,
                                       '[^)][0-9]{4}\\-',
                                       simplify = TRUE)[1])
                data_source_dt <- trimws(
                    stringr::str_extract_all(data_source_doc,
                                             '[0-9]{4}\\-.*[^)]',
                                             simplify = TRUE))[1]

                print(paste(network, domain, data_source_dt))

                # if the link line carries trailing text, keep just the URL
                if(grepl('https://', data_source_link)){
                    data_source_link <- stringr::str_split(data_source_link,
                                                           ' ',
                                                           simplify = TRUE)[1]
                }

                # write link and datetime into the matching rows. match on
                # network as well as domain so same-named domains in
                # different networks don't clobber each other
                citation_gsheet <- citation_gsheet %>%
                    mutate(
                        link = case_when(
                            grepl(cited_prodcode, macrosheds_prodcode) &
                                domain == !!domain &
                                network == !!network ~ data_source_link,
                            TRUE ~ link),
                        link_download_datetime = case_when(
                            grepl(cited_prodcode, macrosheds_prodcode) &
                                domain == !!domain &
                                network == !!network ~ data_source_dt,
                            TRUE ~ link_download_datetime))
            }
        }
    }

    # push the updated frame back to the actual google sheet
    googlesheets4::sheet_write(citation_gsheet,
                               ss = conf$citation_gsheet,
                               sheet = 'timeseries')
}

0 commit comments

Comments
 (0)