Skip to content

Commit 4199ae2

Browse files
committed
#*# wildcard, niwot fixes
1 parent dae8726 commit 4199ae2

6 files changed

Lines changed: 178 additions & 162 deletions

File tree

src/acquisition_master.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -264,7 +264,7 @@ ms_globals <- c(ls(all.names = TRUE), 'ms_globals')
264264

265265
dir.create('logs', showWarnings = FALSE)
266266

267-
# dmnrow = 14
267+
# dmnrow = 12
268268
# print(network_domain, n=50)
269269
for(dmnrow in 1:nrow(network_domain)){
270270

src/dev/dev_helpers.R

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1423,3 +1423,17 @@ insert_retrieval_datetimes <- function(){
14231423
write_lines(rt, f)
14241424
}
14251425
}
1426+
1427+
get_nonnumerics <- function(d){
1428+
1429+
#gets unique nonnumeric values by column. useful for identifying quality codes
1430+
#within data columns
1431+
1432+
nonnumerics = apply(d, 2, function(x){
1433+
xx = as.numeric(x)
1434+
nonnumerics = is.na(xx)
1435+
out = unique(x[nonnumerics])
1436+
})
1437+
1438+
return(nonnumerics)
1439+
}

src/global/function_aliases.R

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,5 +50,9 @@ map = purrr::map
5050
map2 = purrr::map
5151
st_read = sf::st_read
5252
errors = errors::errors
53+
drop_errors = errors::drop_errors
54+
set_errors = errors::set_errors
5355
pivot_wider = tidyr::pivot_wider
5456
pivot_longer = tidyr::pivot_longer
57+
rename = dplyr::rename
58+
where = tidyselect:::where

src/global/general_kernels.R

Lines changed: 5 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1822,36 +1822,24 @@ process_3_ms824 <- function(network, domain, prodname_ms, site_code,
18221822

18231823
googledrive::drive_rm('GEE/rgee.csv', verbose = FALSE)
18241824

1825-
1826-
final <- fin_table %>%
1825+
fin_table <- fin_table %>%
18271826
select(date, site_code, dayl, prcp, srad, swe, tmax, tmin, vp)
18281827

1829-
if(nrow(final) == 0){
1828+
if(nrow(fin_table) == 0){
18301829
return(generate_ms_exception(glue('No data was retrived for {s}',
18311830
s = site_code)))
18321831
}
18331832

18341833
dir.create(glue('data/{n}/{d}/ws_traits/daymet/',
18351834
n = network,
1836-
d = domain))
1835+
d = domain),
1836+
showWarnings = FALSE)
18371837

18381838
file_path <- glue('data/{n}/{d}/ws_traits/daymet/domain_climate.feather',
18391839
n = network,
18401840
d = domain)
18411841

1842-
write_feather(final, file_path)
1843-
1844-
# type <- str_split_fixed(prodname_ms, '__', n = Inf)[,1]
1845-
#
1846-
# dir <- glue('data/{n}/{d}/ws_traits/{v}/',
1847-
# n = network, d = domain, v = type)
1848-
#
1849-
# final <- append_unprod_prefix(final, prodname_ms)
1850-
# final_sum <- append_unprod_prefix(final_sum, prodname_ms)
1851-
#
1852-
# save_general_files(final_file = final_sum,
1853-
# raw_file = final,
1854-
# domain_dir = dir)
1842+
write_feather(fin_table, file_path)
18551843

18561844
return()
18571845
}

src/global/global_helpers.R

Lines changed: 101 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -409,8 +409,8 @@ identify_sampling_bypass <- function(df,
409409
prodname_ms,
410410
sampling_type = NULL){
411411

412-
#This case is used (primarily for neon) when use of d_raw and
413-
# ms_cast_flag are not used because of incaomptable data structures
412+
#This case is used (primarily for neon) when use of ms_read_raw_csv and
413+
# ms_cast_and_reflag are prohibited because of incompatible data structures
414414

415415
#checks
416416
if(!is.logical(is_sensor)){
@@ -700,8 +700,10 @@ ms_read_raw_csv <- function(filepath,
700700
#set_to_NA: For values such as 9999 that are proxies for NA values.
701701
#convert_to_BDL_flag: character vector of QC flags that should be interpreted
702702
# as "below detection limit". For numeric codes, e.g. -888, give their
703-
# character representations, i.e. "-888".
704-
# This is only for below-detection-limit flags within data columns.
703+
# character representations, i.e. "-888". Accepts '#*#' as a wildcard that
704+
# can stand in for any numeral or a decimal. Wildcard is useful for forms like
705+
# "<0.03", "<0.05", etc. Instead of listing these, you can just pass "<#*#".
706+
# This parameter is only for below-detection-limit flags within data columns.
705707
# Codes will be standardized to "BDL" and extracted into the variable-flag column
706708
# corresponding to each data variable. Variable-flag columns will be created
707709
# as necessary. See ms_cast_and_reflag for the next step in handling BDL data.
@@ -973,18 +975,30 @@ ms_read_raw_csv <- function(filepath,
973975
#which will be converted to 1/2 detlim downstream
974976
bdl_cols_do_not_drop <- c()
975977
new_varflag_cols <- c()
978+
all_datacols <- c(data_cols, alt_datacols)
976979
for(i in seq_along(convert_to_BDL_flag)){
977980

978981
bdl_flag <- convert_to_BDL_flag[i]
979-
all_datacols <- c(data_cols, alt_datacols)
982+
if(grepl('#*#', bdl_flag)){
983+
bdl_flag <- sub('#*#', '[0-9\\.]+', bdl_flag, fixed = TRUE)
984+
has_wildcard <- TRUE
985+
} else {
986+
has_wildcard <- FALSE
987+
}
988+
980989
for(j in seq_along(all_datacols)){
981990

982991
d_varcode <- unname(all_datacols)[j]
983992
d_colname <- names(all_datacols)[j]
984993
d_clm <- d[[d_colname]]
985994
if(is.null(d_clm)) next #column doesn't exist
986995

987-
bdl_inds <- ! is.na(d_clm) & d_clm == bdl_flag
996+
if(has_wildcard){
997+
bdl_inds <- ! is.na(d_clm) & grepl(bdl_flag, d_clm)
998+
} else {
999+
bdl_inds <- ! is.na(d_clm) & d_clm == bdl_flag
1000+
}
1001+
9881002
if(! any(bdl_inds)) next #this bdl code doesn't exist in this column
9891003

9901004
if(! (length(var_flagcols) == 1 && is.na(var_flagcols))){
@@ -995,6 +1009,10 @@ ms_read_raw_csv <- function(filepath,
9951009
var_flagcol_already_exists <- FALSE
9961010
}
9971011

1012+
if(candidate_flagcol %in% new_varflag_cols){
1013+
var_flagcol_already_exists <- TRUE
1014+
}
1015+
9981016
if(! var_flagcol_already_exists){
9991017
d[[candidate_flagcol]] <- NA_character_
10001018
new_varflag_cols <- c(new_varflag_cols, candidate_flagcol)
@@ -5947,11 +5965,9 @@ shortcut_idw <- function(encompassing_dem,
59475965
d_from_elev[d_from_elev < 0] <- 0
59485966

59495967
#get weighted mean of both approaches:
5950-
#weight on idw is 1; weight on elev-predicted is |R^2|
5951-
abs_r2 <- abs(cor(d_elev$d, mod$fitted.values)^2)
5952-
# abs_adjr2 <- abs(1 - (1 - r2) * ((nobs(mod) - 1) / mod$df.residual))
5953-
5954-
d_idw <- (d_idw + d_from_elev * abs_r2) / (1 + abs_r2)
5968+
#weight on idw is 1; weight on elev-predicted is R^2
5969+
rsq <- cor(d_elev$d, mod$fitted.values)^2
5970+
d_idw <- (d_idw + d_from_elev * rsq) / (1 + rsq)
59555971
}
59565972

59575973
ws_mean[k] <- mean(d_idw, na.rm=TRUE)
@@ -11549,6 +11565,12 @@ approxjoin_datetime <- function(x,
1154911565
indices_only = FALSE){
1155011566
#direction = 'forward'){
1155111567

11568+
#TODO: update to match nearest non-NA value by column if we ever go sub-daily.
11569+
#some code for this in place below, but note that implementing this will
11570+
#break indices_only. also there will be no good way to select the matching
11571+
#date in cases where there are multiple data columns. i.e. there will be
11572+
#separate matched dates for each column... not sure how to handle.
11573+
1155211574
#x and y: macrosheds standard tibbles with only one site_code,
1155311575
# which must be the same in x and y. Nonstandard tibbles may also work,
1155411576
# so long as they have datetime columns, but the only case where we need
@@ -11608,37 +11630,47 @@ approxjoin_datetime <- function(x,
1160811630
}
1160911631
if(! is.logical(indices_only)) stop('indices_only must be a logical')
1161011632

11611-
#deal with the case of x or y being a specialized "flow" tibble
11612-
# x_is_flowtibble <- y_is_flowtibble <- FALSE
11613-
# if('flow' %in% colnames(x)) x_is_flowtibble <- TRUE
11614-
# if('flow' %in% colnames(y)) y_is_flowtibble <- TRUE
11615-
# if(x_is_flowtibble && ! y_is_flowtibble){
11616-
# varname <- y$var[1]
11617-
# y$var = NULL
11618-
# } else if(y_is_flowtibble && ! x_is_flowtibble){
11619-
# varname <- x$var[1]
11620-
# x$var = NULL
11621-
# } else if(! x_is_flowtibble && ! y_is_flowtibble){
11622-
# varname <- x$var[1]
11623-
# x$var = NULL
11624-
# y$var = NULL
11625-
# } else {
11626-
# stop('x and y are both "flow" tibbles. There should be no need for this')
11627-
# }
11628-
# if(x_is_flowtibble) x <- rename(x, val = flow)
11629-
# if(y_is_flowtibble) y <- rename(y, val = flow)
11630-
1163111633
#data.table doesn't work with the errors package, so error needs
11632-
#to be separated into its own column. also give same-name columns suffixes
11634+
#to be separated into its own column and handled with care.
11635+
11636+
# #this will be useful if we go sub-daily
11637+
# if(any(c('val', 'ms_status') %in% colnames(x))){
11638+
#
11639+
# x <- x %>%
11640+
# mutate(
11641+
# # across(where(~inherits(., 'errors')),
11642+
# # ~case_when(! is.na(.) & is.na(errors(.)) ~ set_errors(., 0), TRUE ~ .)),
11643+
# across(where(~inherits(., 'errors')),
11644+
# ~errors(.),
11645+
# .names = '{.col}_err'),
11646+
# across(where(~inherits(., 'errors')),
11647+
# ~drop_errors(.))) %>%
11648+
# rename_with(.fn = ~paste0(., '_x'),
11649+
# .cols = everything()) %>%
11650+
# # rename(datetime_x = datetime) %>%
11651+
# as.data.table()
11652+
#
11653+
# y <- y %>%
11654+
# mutate(
11655+
# # across(where(~inherits(., 'errors')),
11656+
# # ~case_when(! is.na(.) & is.na(errors(.)) ~ set_errors(., 0), TRUE ~ .)),
11657+
# across(where(~inherits(., 'errors')),
11658+
# ~errors(.),
11659+
# .names = '{.col}_err'),
11660+
# across(where(~inherits(., 'errors')),
11661+
# ~drop_errors(.))) %>%
11662+
# rename_with(.fn = ~paste0(., '_y'),
11663+
# .cols = everything()) %>%
11664+
# # rename(datetime_y = datetime) %>%
11665+
# as.data.table()
11666+
11667+
if('val' %in% colnames(x)){
1163311668

11634-
if('val' %in% colnames(x)){ #crude catch for nonstandard ms tibbles (fine for now)
1163511669
x <- x %>%
1163611670
mutate(err = errors(val),
1163711671
val = errors::drop_errors(val)) %>%
1163811672
rename_with(.fn = ~paste0(., '_x'),
1163911673
.cols = everything()) %>%
11640-
# .cols = any_of(c('site_code', 'var', 'val',
11641-
# 'ms_status', 'ms_interp'))) %>%
1164211674
as.data.table()
1164311675

1164411676
y <- y %>%
@@ -11647,9 +11679,23 @@ approxjoin_datetime <- function(x,
1164711679
rename_with(.fn = ~paste0(., '_y'),
1164811680
.cols = everything()) %>%
1164911681
as.data.table()
11682+
1165011683
} else {
11651-
x <- dplyr::rename(x, datetime_x = datetime) %>% as.data.table()
11652-
y <- dplyr::rename(y, datetime_y = datetime) %>% as.data.table()
11684+
11685+
if(indices_only){
11686+
x <- rename(x, datetime_x = datetime) %>%
11687+
mutate(across(where(~inherits(., 'errors')),
11688+
~drop_errors(.))) %>%
11689+
as.data.table()
11690+
11691+
y <- rename(y, datetime_y = datetime) %>%
11692+
mutate(across(where(~inherits(., 'errors')),
11693+
~drop_errors(.))) %>%
11694+
as.data.table()
11695+
} else {
11696+
stop('this case not yet handled')
11697+
}
11698+
1165311699
}
1165411700

1165511701
#alternative implementation of the "on" argument in data.table joins...
@@ -11683,9 +11729,12 @@ approxjoin_datetime <- function(x,
1168311729
#for any datetimes in x or y that were matched more than once, keep only
1168411730
#the nearest match
1168511731
joined[, `:=` (datetime_match_diff = abs(datetime_x - datetime_y_orig))]
11686-
joined = joined[order(datetime_match_diff),
11687-
lapply(.SD, function(z) first(na.omit(z))),
11688-
by = datetime_x]
11732+
joined <- joined[, .SD[which.min(datetime_match_diff)], by = datetime_x]
11733+
joined <- joined[, .SD[which.min(datetime_match_diff)], by = datetime_y_orig]
11734+
#this will grab the nearest non-NA for each column, but that messes up the datetime indices
11735+
# joined = joined[order(datetime_match_diff),
11736+
# lapply(.SD, function(z) dplyr::first(na.omit(z))),
11737+
# by = datetime_x]
1168911738

1169011739
if(indices_only){
1169111740
y_indices <- which(y$datetime_y %in% joined$datetime_y_orig)
@@ -11702,31 +11751,25 @@ approxjoin_datetime <- function(x,
1170211751
setnames(joined, 'datetime_y_orig', 'datetime')
1170311752
}
1170411753

11705-
#restore error objects, var column, original column names (with suffixes).
11706-
#original column order
11754+
# #restore error objects, var column, original column names (with suffixes).
11755+
# #original column order (incomplete. execution always returns before this point
11756+
# #in idw, which is the only place where it would be necessary)
11757+
# ernames = grep('_err_[xy]$', colnames(joined), value = TRUE)
11758+
# ernames = ernames[sub('err_', '', ernames) %in% colnames(joined)]
11759+
# for(erc in ernames){
11760+
# dac = sub('err_', '', erc)
11761+
# if(dac %in%
11762+
# set(joined, j = dac,
11763+
# value = set_errors(joined[[dac]], joined[[erc]]))
11764+
# }
11765+
1170711766
joined <- as_tibble(joined) %>%
1170811767
mutate(val_x = errors::set_errors(val_x, err_x),
1170911768
val_y = errors::set_errors(val_y, err_y)) %>%
1171011769
select(-err_x, -err_y)
11711-
# mutate(var = !!varname)
11712-
11713-
# if(x_is_flowtibble) joined <- rename(joined,
11714-
# flow = val_x,
11715-
# ms_status_flow = ms_status_x,
11716-
# ms_interp_flow = ms_interp_x)
11717-
# if(y_is_flowtibble) joined <- rename(joined,
11718-
# flow = val_y,
11719-
# ms_status_flow = ms_status_y,
11720-
# ms_interp_flow = ms_interp_y)
11721-
11722-
# if(! sum(grepl('^val_[xy]$', colnames(joined))) > 1){
11723-
# joined <- rename(joined, val = matches('^val_[xy]$'))
11724-
# }
1172511770

1172611771
joined <- select(joined,
1172711772
datetime,
11728-
# matches('^val_?[xy]?$'),
11729-
# any_of('flow'),
1173011773
starts_with('site_code'),
1173111774
any_of(c(starts_with('var_'), matches('^var$'))),
1173211775
any_of(c(starts_with('val_'), matches('^val$'))),

0 commit comments

Comments
 (0)