@@ -409,8 +409,8 @@ identify_sampling_bypass <- function(df,
409409 prodname_ms,
410410 sampling_type = NULL){
411411
412- #This case is used (primarily for neon) when use of d_raw and
413- # ms_cast_flag are not used because of incaomptable data structures
412+ #This case is used (primarily for neon) when use of ms_read_raw_csv and
413+ # ms_cast_and_reflag are prohibited because of incompatible data structures
414414
415415 #checks
416416 if(!is.logical(is_sensor)){
@@ -700,8 +700,10 @@ ms_read_raw_csv <- function(filepath,
700700 #set_to_NA: For values such as 9999 that are proxies for NA values.
701701 #convert_to_BDL_flag: character vector of QC flags that should be interpreted
702702 # as "below detection limit". For numeric codes, e.g. -888, give their
703- # character representations, i.e. "-888".
704- # This is only for below-detection-limit flags within data columns.
703+ # character representations, i.e. "-888". Accepts '#*#' as a wildcard that
704+ # can stand in for any numeral or a decimal. Wildcard is useful for forms like
705+ # "<0.03", "<0.05", etc. Instead of listing these, you can just pass "<#*#".
706+ # This parameter is only for below-detection-limit flags within data columns.
705707 # Codes will be standardized to "BDL" and extracted into the variable-flag column
706708 # corresponding to each data variable. Variable-flag columns will be created
707709 # as necessary. See ms_cast_and_reflag for the next step in handling BDL data.
@@ -973,18 +975,30 @@ ms_read_raw_csv <- function(filepath,
973975 #which will be converted to 1/2 detlim downstream
974976 bdl_cols_do_not_drop <- c()
975977 new_varflag_cols <- c()
978+ all_datacols <- c(data_cols, alt_datacols)
976979 for(i in seq_along(convert_to_BDL_flag)){
977980
978981 bdl_flag <- convert_to_BDL_flag[i]
979- all_datacols <- c(data_cols, alt_datacols)
982+ if(grepl('#*#', bdl_flag)){
983+ bdl_flag <- sub('#*#', '[0-9\\.]+', bdl_flag, fixed = TRUE)
984+ has_wildcard <- TRUE
985+ } else {
986+ has_wildcard <- FALSE
987+ }
988+
980989 for(j in seq_along(all_datacols)){
981990
982991 d_varcode <- unname(all_datacols)[j]
983992 d_colname <- names(all_datacols)[j]
984993 d_clm <- d[[d_colname]]
985994 if(is.null(d_clm)) next #column doesn't exist
986995
987- bdl_inds <- ! is.na(d_clm) & d_clm == bdl_flag
996+ if(has_wildcard){
997+ bdl_inds <- ! is.na(d_clm) & grepl(bdl_flag, d_clm)
998+ } else {
999+ bdl_inds <- ! is.na(d_clm) & d_clm == bdl_flag
1000+ }
1001+
9881002 if(! any(bdl_inds)) next #this bdl code doesn't exist in this column
9891003
9901004 if(! (length(var_flagcols) == 1 && is.na(var_flagcols))){
@@ -995,6 +1009,10 @@ ms_read_raw_csv <- function(filepath,
9951009 var_flagcol_already_exists <- FALSE
9961010 }
9971011
1012+ if(candidate_flagcol %in% new_varflag_cols){
1013+ var_flagcol_already_exists <- TRUE
1014+ }
1015+
9981016 if(! var_flagcol_already_exists){
9991017 d[[candidate_flagcol]] <- NA_character_
10001018 new_varflag_cols <- c(new_varflag_cols, candidate_flagcol)
@@ -5947,11 +5965,9 @@ shortcut_idw <- function(encompassing_dem,
59475965 d_from_elev[d_from_elev < 0] <- 0
59485966
59495967 #get weighted mean of both approaches:
5950- #weight on idw is 1; weight on elev-predicted is |R^2|
5951- abs_r2 <- abs(cor(d_elev$d, mod$fitted.values)^2)
5952- # abs_adjr2 <- abs(1 - (1 - r2) * ((nobs(mod) - 1) / mod$df.residual))
5953-
5954- d_idw <- (d_idw + d_from_elev * abs_r2) / (1 + abs_r2)
5968+ #weight on idw is 1; weight on elev-predicted is R^2
5969+ rsq <- cor(d_elev$d, mod$fitted.values)^2
5970+ d_idw <- (d_idw + d_from_elev * rsq) / (1 + rsq)
59555971 }
59565972
59575973 ws_mean[k] <- mean(d_idw, na.rm=TRUE)
@@ -11549,6 +11565,12 @@ approxjoin_datetime <- function(x,
1154911565 indices_only = FALSE){
1155011566 #direction = 'forward'){
1155111567
11568+ #TODO: update to match nearest non-NA value by column if we ever go sub-daily.
11569+ #some code for this in place below, but note that implementing this will
11570+  #break indices_only. also there will be no good way to select the matching
11571+  #date in cases where there are multiple data columns. i.e. there will be
11572+  #separate matched dates for each column... not sure how to handle.
11573+
1155211574 #x and y: macrosheds standard tibbles with only one site_code,
1155311575 # which must be the same in x and y. Nonstandard tibbles may also work,
1155411576 # so long as they have datetime columns, but the only case where we need
@@ -11608,37 +11630,47 @@ approxjoin_datetime <- function(x,
1160811630 }
1160911631 if(! is.logical(indices_only)) stop('indices_only must be a logical')
1161011632
11611- #deal with the case of x or y being a specialized "flow" tibble
11612- # x_is_flowtibble <- y_is_flowtibble <- FALSE
11613- # if('flow' %in% colnames(x)) x_is_flowtibble <- TRUE
11614- # if('flow' %in% colnames(y)) y_is_flowtibble <- TRUE
11615- # if(x_is_flowtibble && ! y_is_flowtibble){
11616- # varname <- y$var[1]
11617- # y$var = NULL
11618- # } else if(y_is_flowtibble && ! x_is_flowtibble){
11619- # varname <- x$var[1]
11620- # x$var = NULL
11621- # } else if(! x_is_flowtibble && ! y_is_flowtibble){
11622- # varname <- x$var[1]
11623- # x$var = NULL
11624- # y$var = NULL
11625- # } else {
11626- # stop('x and y are both "flow" tibbles. There should be no need for this')
11627- # }
11628- # if(x_is_flowtibble) x <- rename(x, val = flow)
11629- # if(y_is_flowtibble) y <- rename(y, val = flow)
11630-
1163111633 #data.table doesn't work with the errors package, so error needs
11632- #to be separated into its own column. also give same-name columns suffixes
11634+ #to be separated into its own column and handled with care.
11635+
11636+ # #this will be useful if we go sub-daily
11637+ # if(any(c('val', 'ms_status') %in% colnames(x))){
11638+ #
11639+ # x <- x %>%
11640+ # mutate(
11641+ # # across(where(~inherits(., 'errors')),
11642+ # # ~case_when(! is.na(.) & is.na(errors(.)) ~ set_errors(., 0), TRUE ~ .)),
11643+ # across(where(~inherits(., 'errors')),
11644+ # ~errors(.),
11645+ # .names = '{.col}_err'),
11646+ # across(where(~inherits(., 'errors')),
11647+ # ~drop_errors(.))) %>%
11648+ # rename_with(.fn = ~paste0(., '_x'),
11649+ # .cols = everything()) %>%
11650+ # # rename(datetime_x = datetime) %>%
11651+ # as.data.table()
11652+ #
11653+ # y <- y %>%
11654+ # mutate(
11655+ # # across(where(~inherits(., 'errors')),
11656+ # # ~case_when(! is.na(.) & is.na(errors(.)) ~ set_errors(., 0), TRUE ~ .)),
11657+ # across(where(~inherits(., 'errors')),
11658+ # ~errors(.),
11659+ # .names = '{.col}_err'),
11660+ # across(where(~inherits(., 'errors')),
11661+ # ~drop_errors(.))) %>%
11662+ # rename_with(.fn = ~paste0(., '_y'),
11663+ # .cols = everything()) %>%
11664+ # # rename(datetime_y = datetime) %>%
11665+ # as.data.table()
11666+
11667+ if('val' %in% colnames(x)){
1163311668
11634- if('val' %in% colnames(x)){ #crude catch for nonstandard ms tibbles (fine for now)
1163511669 x <- x %>%
1163611670 mutate(err = errors(val),
1163711671 val = errors::drop_errors(val)) %>%
1163811672 rename_with(.fn = ~paste0(., '_x'),
1163911673 .cols = everything()) %>%
11640- # .cols = any_of(c('site_code', 'var', 'val',
11641- # 'ms_status', 'ms_interp'))) %>%
1164211674 as.data.table()
1164311675
1164411676 y <- y %>%
@@ -11647,9 +11679,23 @@ approxjoin_datetime <- function(x,
1164711679 rename_with(.fn = ~paste0(., '_y'),
1164811680 .cols = everything()) %>%
1164911681 as.data.table()
11682+
1165011683 } else {
11651- x <- dplyr::rename(x, datetime_x = datetime) %>% as.data.table()
11652- y <- dplyr::rename(y, datetime_y = datetime) %>% as.data.table()
11684+
11685+ if(indices_only){
11686+ x <- rename(x, datetime_x = datetime) %>%
11687+ mutate(across(where(~inherits(., 'errors')),
11688+ ~drop_errors(.))) %>%
11689+ as.data.table()
11690+
11691+ y <- rename(y, datetime_y = datetime) %>%
11692+ mutate(across(where(~inherits(., 'errors')),
11693+ ~drop_errors(.))) %>%
11694+ as.data.table()
11695+ } else {
11696+ stop('this case not yet handled')
11697+ }
11698+
1165311699 }
1165411700
1165511701 #alternative implementation of the "on" argument in data.table joins...
@@ -11683,9 +11729,12 @@ approxjoin_datetime <- function(x,
1168311729 #for any datetimes in x or y that were matched more than once, keep only
1168411730 #the nearest match
1168511731 joined[, `:=` (datetime_match_diff = abs(datetime_x - datetime_y_orig))]
11686- joined = joined[order(datetime_match_diff),
11687- lapply(.SD, function(z) first(na.omit(z))),
11688- by = datetime_x]
11732+ joined <- joined[, .SD[which.min(datetime_match_diff)], by = datetime_x]
11733+ joined <- joined[, .SD[which.min(datetime_match_diff)], by = datetime_y_orig]
11734+  #this will grab the nearest non-NA for each column, but that messes up the datetime indices
11735+ # joined = joined[order(datetime_match_diff),
11736+ # lapply(.SD, function(z) dplyr::first(na.omit(z))),
11737+ # by = datetime_x]
1168911738
1169011739 if(indices_only){
1169111740 y_indices <- which(y$datetime_y %in% joined$datetime_y_orig)
@@ -11702,31 +11751,25 @@ approxjoin_datetime <- function(x,
1170211751 setnames(joined, 'datetime_y_orig', 'datetime')
1170311752 }
1170411753
11705- #restore error objects, var column, original column names (with suffixes).
11706- #original column order
11754+ # #restore error objects, var column, original column names (with suffixes).
11755+ # #original column order (incomplete. execution always returns before this point
11756+ # #in idw, which is the only place where it would be necessary)
11757+ # ernames = grep('_err_[xy]$', colnames(joined), value = TRUE)
11758+ # ernames = ernames[sub('err_', '', ernames) %in% colnames(joined)]
11759+ # for(erc in ernames){
11760+ # dac = sub('err_', '', erc)
11761+ # if(dac %in%
11762+ # set(joined, j = dac,
11763+ # value = set_errors(joined[[dac]], joined[[erc]]))
11764+ # }
11765+
1170711766 joined <- as_tibble(joined) %>%
1170811767 mutate(val_x = errors::set_errors(val_x, err_x),
1170911768 val_y = errors::set_errors(val_y, err_y)) %>%
1171011769 select(-err_x, -err_y)
11711- # mutate(var = !!varname)
11712-
11713- # if(x_is_flowtibble) joined <- rename(joined,
11714- # flow = val_x,
11715- # ms_status_flow = ms_status_x,
11716- # ms_interp_flow = ms_interp_x)
11717- # if(y_is_flowtibble) joined <- rename(joined,
11718- # flow = val_y,
11719- # ms_status_flow = ms_status_y,
11720- # ms_interp_flow = ms_interp_y)
11721-
11722- # if(! sum(grepl('^val_[xy]$', colnames(joined))) > 1){
11723- # joined <- rename(joined, val = matches('^val_[xy]$'))
11724- # }
1172511770
1172611771 joined <- select(joined,
1172711772 datetime,
11728- # matches('^val_?[xy]?$'),
11729- # any_of('flow'),
1173011773 starts_with('site_code'),
1173111774 any_of(c(starts_with('var_'), matches('^var$'))),
1173211775 any_of(c(starts_with('val_'), matches('^val$'))),
0 commit comments