@@ -409,8 +409,8 @@ identify_sampling_bypass <- function(df,
409409 prodname_ms,
410410 sampling_type = NULL){
411411
412- #This case is used (primarily for neon) when use of d_raw and
413- # ms_cast_flag are not used because of incaomptable data structures
412+ #This case is used (primarily for neon) when use of ms_read_raw_csv and
413+ # ms_cast_and_reflag are prohibited because of incompatible data structures
414414
415415 #checks
416416 if(!is.logical(is_sensor)){
@@ -700,8 +700,10 @@ ms_read_raw_csv <- function(filepath,
700700 #set_to_NA: For values such as 9999 that are proxies for NA values.
701701 #convert_to_BDL_flag: character vector of QC flags that should be interpreted
702702 # as "below detection limit". For numeric codes, e.g. -888, give their
703- # character representations, i.e. "-888".
704- # This is only for below-detection-limit flags within data columns.
703+ # character representations, i.e. "-888". Accepts '#*#' as a wildcard that
704+ # can stand in for any numeral or a decimal. Wildcard is useful for forms like
705+ # "<0.03", "<0.05", etc. Instead of listing these, you can just pass "<#*#".
706+ # This parameter is only for below-detection-limit flags within data columns.
705707 # Codes will be standardized to "BDL" and extracted into the variable-flag column
706708 # corresponding to each data variable. Variable-flag columns will be created
707709 # as necessary. See ms_cast_and_reflag for the next step in handling BDL data.
@@ -973,18 +975,30 @@ ms_read_raw_csv <- function(filepath,
973975 #which will be converted to 1/2 detlim downstream
974976 bdl_cols_do_not_drop <- c()
975977 new_varflag_cols <- c()
978+ all_datacols <- c(data_cols, alt_datacols)
976979 for(i in seq_along(convert_to_BDL_flag)){
977980
978981 bdl_flag <- convert_to_BDL_flag[i]
979- all_datacols <- c(data_cols, alt_datacols)
982+ if(grepl('#*#', bdl_flag)){
983+ bdl_flag <- sub('#*#', '[0-9\\.]+', bdl_flag, fixed = TRUE)
984+ has_wildcard <- TRUE
985+ } else {
986+ has_wildcard <- FALSE
987+ }
988+
980989 for(j in seq_along(all_datacols)){
981990
982991 d_varcode <- unname(all_datacols)[j]
983992 d_colname <- names(all_datacols)[j]
984993 d_clm <- d[[d_colname]]
985994 if(is.null(d_clm)) next #column doesn't exist
986995
987- bdl_inds <- ! is.na(d_clm) & d_clm == bdl_flag
996+ if(has_wildcard){
997+ bdl_inds <- ! is.na(d_clm) & grepl(bdl_flag, d_clm)
998+ } else {
999+ bdl_inds <- ! is.na(d_clm) & d_clm == bdl_flag
1000+ }
1001+
9881002 if(! any(bdl_inds)) next #this bdl code doesn't exist in this column
9891003
9901004 if(! (length(var_flagcols) == 1 && is.na(var_flagcols))){
@@ -995,6 +1009,10 @@ ms_read_raw_csv <- function(filepath,
9951009 var_flagcol_already_exists <- FALSE
9961010 }
9971011
1012+ if(candidate_flagcol %in% new_varflag_cols){
1013+ var_flagcol_already_exists <- TRUE
1014+ }
1015+
9981016 if(! var_flagcol_already_exists){
9991017 d[[candidate_flagcol]] <- NA_character_
10001018 new_varflag_cols <- c(new_varflag_cols, candidate_flagcol)
@@ -5947,11 +5965,9 @@ shortcut_idw <- function(encompassing_dem,
59475965 d_from_elev[d_from_elev < 0] <- 0
59485966
59495967 #get weighted mean of both approaches:
5950- #weight on idw is 1; weight on elev-predicted is |R^2|
5951- abs_r2 <- abs(cor(d_elev$d, mod$fitted.values)^2)
5952- # abs_adjr2 <- abs(1 - (1 - r2) * ((nobs(mod) - 1) / mod$df.residual))
5953-
5954- d_idw <- (d_idw + d_from_elev * abs_r2) / (1 + abs_r2)
5968+ #weight on idw is 1; weight on elev-predicted is R^2
5969+ rsq <- cor(d_elev$d, mod$fitted.values)^2
5970+ d_idw <- (d_idw + d_from_elev * rsq) / (1 + rsq)
59555971 }
59565972
59575973 ws_mean[k] <- mean(d_idw, na.rm=TRUE)
@@ -11549,6 +11565,12 @@ approxjoin_datetime <- function(x,
1154911565 indices_only = FALSE){
1155011566 #direction = 'forward'){
1155111567
11568+ #TODO: update to match nearest non-NA value by column if we ever go sub-daily.
11569+ #some code for this in place below, but note that implementing this will
11570+  #break indices_only. also there will be no good way to select the matching
11571+  #date in cases where there are multiple data columns. i.e. there will be
11572+  #separate matched dates for each column... not sure how to handle.
11573+
1155211574 #x and y: macrosheds standard tibbles with only one site_code,
1155311575 # which must be the same in x and y. Nonstandard tibbles may also work,
1155411576 # so long as they have datetime columns, but the only case where we need
@@ -11608,37 +11630,47 @@ approxjoin_datetime <- function(x,
1160811630 }
1160911631 if(! is.logical(indices_only)) stop('indices_only must be a logical')
1161011632
11611- #deal with the case of x or y being a specialized "flow" tibble
11612- # x_is_flowtibble <- y_is_flowtibble <- FALSE
11613- # if('flow' %in% colnames(x)) x_is_flowtibble <- TRUE
11614- # if('flow' %in% colnames(y)) y_is_flowtibble <- TRUE
11615- # if(x_is_flowtibble && ! y_is_flowtibble){
11616- # varname <- y$var[1]
11617- # y$var = NULL
11618- # } else if(y_is_flowtibble && ! x_is_flowtibble){
11619- # varname <- x$var[1]
11620- # x$var = NULL
11621- # } else if(! x_is_flowtibble && ! y_is_flowtibble){
11622- # varname <- x$var[1]
11623- # x$var = NULL
11624- # y$var = NULL
11625- # } else {
11626- # stop('x and y are both "flow" tibbles. There should be no need for this')
11627- # }
11628- # if(x_is_flowtibble) x <- rename(x, val = flow)
11629- # if(y_is_flowtibble) y <- rename(y, val = flow)
11630-
1163111633 #data.table doesn't work with the errors package, so error needs
11632- #to be separated into its own column. also give same-name columns suffixes
11634+ #to be separated into its own column and handled with care.
11635+
11636+ # #this will be useful if we go sub-daily
11637+ # if(any(c('val', 'ms_status') %in% colnames(x))){
11638+ #
11639+ # x <- x %>%
11640+ # mutate(
11641+ # # across(where(~inherits(., 'errors')),
11642+ # # ~case_when(! is.na(.) & is.na(errors(.)) ~ set_errors(., 0), TRUE ~ .)),
11643+ # across(where(~inherits(., 'errors')),
11644+ # ~errors(.),
11645+ # .names = '{.col}_err'),
11646+ # across(where(~inherits(., 'errors')),
11647+ # ~drop_errors(.))) %>%
11648+ # rename_with(.fn = ~paste0(., '_x'),
11649+ # .cols = everything()) %>%
11650+ # # rename(datetime_x = datetime) %>%
11651+ # as.data.table()
11652+ #
11653+ # y <- y %>%
11654+ # mutate(
11655+ # # across(where(~inherits(., 'errors')),
11656+ # # ~case_when(! is.na(.) & is.na(errors(.)) ~ set_errors(., 0), TRUE ~ .)),
11657+ # across(where(~inherits(., 'errors')),
11658+ # ~errors(.),
11659+ # .names = '{.col}_err'),
11660+ # across(where(~inherits(., 'errors')),
11661+ # ~drop_errors(.))) %>%
11662+ # rename_with(.fn = ~paste0(., '_y'),
11663+ # .cols = everything()) %>%
11664+ # # rename(datetime_y = datetime) %>%
11665+ # as.data.table()
11666+
11667+ if('val' %in% colnames(x)){
1163311668
11634- if('val' %in% colnames(x)){ #crude catch for nonstandard ms tibbles (fine for now)
1163511669 x <- x %>%
1163611670 mutate(err = errors(val),
1163711671 val = errors::drop_errors(val)) %>%
1163811672 rename_with(.fn = ~paste0(., '_x'),
1163911673 .cols = everything()) %>%
11640- # .cols = any_of(c('site_code', 'var', 'val',
11641- # 'ms_status', 'ms_interp'))) %>%
1164211674 as.data.table()
1164311675
1164411676 y <- y %>%
@@ -11647,9 +11679,23 @@ approxjoin_datetime <- function(x,
1164711679 rename_with(.fn = ~paste0(., '_y'),
1164811680 .cols = everything()) %>%
1164911681 as.data.table()
11682+
1165011683 } else {
11651- x <- dplyr::rename(x, datetime_x = datetime) %>% as.data.table()
11652- y <- dplyr::rename(y, datetime_y = datetime) %>% as.data.table()
11684+
11685+ if(indices_only){
11686+ x <- rename(x, datetime_x = datetime) %>%
11687+ mutate(across(where(~inherits(., 'errors')),
11688+ ~drop_errors(.))) %>%
11689+ as.data.table()
11690+
11691+ y <- rename(y, datetime_y = datetime) %>%
11692+ mutate(across(where(~inherits(., 'errors')),
11693+ ~drop_errors(.))) %>%
11694+ as.data.table()
11695+ } else {
11696+ stop('this case not yet handled')
11697+ }
11698+
1165311699 }
1165411700
1165511701 #alternative implementation of the "on" argument in data.table joins...
@@ -11683,9 +11729,12 @@ approxjoin_datetime <- function(x,
1168311729 #for any datetimes in x or y that were matched more than once, keep only
1168411730 #the nearest match
1168511731 joined[, `:=` (datetime_match_diff = abs(datetime_x - datetime_y_orig))]
11686- joined = joined[order(datetime_match_diff),
11687- lapply(.SD, function(z) first(na.omit(z))),
11688- by = datetime_x]
11732+ joined <- joined[, .SD[which.min(datetime_match_diff)], by = datetime_x]
11733+ joined <- joined[, .SD[which.min(datetime_match_diff)], by = datetime_y_orig]
11734+  #this will grab the nearest non-NA for each column, but that messes up the datetime indices
11735+ # joined = joined[order(datetime_match_diff),
11736+ # lapply(.SD, function(z) dplyr::first(na.omit(z))),
11737+ # by = datetime_x]
1168911738
1169011739 if(indices_only){
1169111740 y_indices <- which(y$datetime_y %in% joined$datetime_y_orig)
@@ -11702,31 +11751,25 @@ approxjoin_datetime <- function(x,
1170211751 setnames(joined, 'datetime_y_orig', 'datetime')
1170311752 }
1170411753
11705- #restore error objects, var column, original column names (with suffixes).
11706- #original column order
11754+ # #restore error objects, var column, original column names (with suffixes).
11755+ # #original column order (incomplete. execution always returns before this point
11756+ # #in idw, which is the only place where it would be necessary)
11757+ # ernames = grep('_err_[xy]$', colnames(joined), value = TRUE)
11758+ # ernames = ernames[sub('err_', '', ernames) %in% colnames(joined)]
11759+ # for(erc in ernames){
11760+ # dac = sub('err_', '', erc)
11761+ # if(dac %in%
11762+ # set(joined, j = dac,
11763+ # value = set_errors(joined[[dac]], joined[[erc]]))
11764+ # }
11765+
1170711766 joined <- as_tibble(joined) %>%
1170811767 mutate(val_x = errors::set_errors(val_x, err_x),
1170911768 val_y = errors::set_errors(val_y, err_y)) %>%
1171011769 select(-err_x, -err_y)
11711- # mutate(var = !!varname)
11712-
11713- # if(x_is_flowtibble) joined <- rename(joined,
11714- # flow = val_x,
11715- # ms_status_flow = ms_status_x,
11716- # ms_interp_flow = ms_interp_x)
11717- # if(y_is_flowtibble) joined <- rename(joined,
11718- # flow = val_y,
11719- # ms_status_flow = ms_status_y,
11720- # ms_interp_flow = ms_interp_y)
11721-
11722- # if(! sum(grepl('^val_[xy]$', colnames(joined))) > 1){
11723- # joined <- rename(joined, val = matches('^val_[xy]$'))
11724- # }
1172511770
1172611771 joined <- select(joined,
1172711772 datetime,
11728- # matches('^val_?[xy]?$'),
11729- # any_of('flow'),
1173011773 starts_with('site_code'),
1173111774 any_of(c(starts_with('var_'), matches('^var$'))),
1173211775 any_of(c(starts_with('val_'), matches('^val$'))),
0 commit comments