ethzplus · mmyrte · Jun 11, 2026 · May 12, 2026 · May 15, 2026 · May 15, 2026
diff --git a/.github/workflows/R-CMD-check-light.yml b/.github/workflows/R-CMD-check-light.yml
@@ -21,7 +21,7 @@ jobs:
       R_KEEP_PKG_SOURCE: yes
 
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
 
       - uses: r-lib/actions/setup-r-dependencies@v2
         with:

diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -20,15 +20,18 @@ jobs:
       matrix:
         config:
           - { os: macos-latest, r: "release" }
-          - { os: windows-latest, r: "release" }
           - { os: ubuntu-latest, r: "release" }
+          # r cmd build apparently fails to install the package - no errors or warnings
+          # only becomes evident when trying to build vignette
+          # pak not installing system dependencies?
+          # - { os: windows-latest, r: "release" }
 
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
       R_KEEP_PKG_SOURCE: yes
 
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
 
       - uses: r-lib/actions/setup-pandoc@v2
 
@@ -47,13 +50,11 @@ jobs:
       - name: Cache macOS system dependencies
         if: runner.os == 'macos'
         id: cache-brew
-        uses: actions/cache@v4
+        uses: actions/cache@v5
         with:
           path: |
-            /opt/homebrew/Cellar/gdal
-            /opt/homebrew/Cellar/proj
-            /usr/local/Cellar/gdal
-            /usr/local/Cellar/proj
+            /opt/homebrew/Cellar/
+            /usr/local/Cellar/
           key: ${{ runner.os }}-brew-gdal-proj-${{ hashFiles('brew-versions.txt') }}
 
       - name: Install or link macOS dependencies
@@ -74,7 +75,7 @@ jobs:
       - name: Cache Ubuntu system dependencies
         if: runner.os == 'linux'
         id: cache-apt
-        uses: actions/cache@v4
+        uses: actions/cache@v5
         with:
           path: /var/cache/apt/archives
           key: ${{ runner.os }}-apt-gdal-proj-${{ hashFiles('apt-versions.txt') }}

diff --git a/.github/workflows/format-suggest.yaml b/.github/workflows/format-suggest.yaml
@@ -28,7 +28,7 @@ jobs:
       pull-requests: write
 
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
         with:
           ref: ${{ github.event.pull_request.head.sha }}
 

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -10,7 +10,6 @@ URL: https://ethzplus.github.io/evoland-plus, https://github.com/ethzplus/evolan
 BugReports: https://github.com/ethzplus/evoland-plus/issues
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.3
 Depends:
     R (>= 4.2)
 Imports:
@@ -70,3 +69,4 @@ Collate:
     'util_dinamica.R'
     'util_download.R'
     'util_terra.R'
+Config/roxygen2/version: 8.0.0
diff --git a/R/evoland_db.R b/R/evoland_db.R
@@ -104,6 +104,31 @@ evoland_db <- R6::R6Class(
       create_method_binding(upsert_new_neighbors)
     },
 
+    #' @description Add a predictor using the currently active id_run, see [add_predictor()]
+    #' @param pred_data_raw Data frame with columns `id_coord`, `id_period`, and `value`
+    #' @param name Unique short name
+    #' @param fill_value Value to substitute if a coordinate point in [coords_t] does
+    #' not have an explicit associated value
+    #' @param pretty_name char, Friendly name for use in reporting
+    #' @param description char, For use in reporting.
+    #' @param orig_format char, Format description of the underlying data (raster, vector…)
+    #' @param sources data.frame with url/md5sum columns, used for keeping track of
+    #' underlying raw data, see [download_and_verify()]
+    #' @param unit char, SI unit for physical predictors, or descriptions like
+    #' "bed nights/year" as a proxy for touristic activity
+    add_predictor = function(
+      pred_data_raw,
+      name,
+      fill_value,
+      pretty_name = name,
+      description = NA_character_,
+      orig_format = NA_character_,
+      sources = data.frame(url = character(0), md5sum = character(0)),
+      unit = NA_character_
+    ) {
+      create_method_binding(add_predictor)
+    },
+
     #' @description Get transitions along with their predictor data in a wide
     #' data.table, see [trans_pred_data_v()]
     #' @param id_trans Integer transition ID, see [trans_meta_t]

diff --git a/R/evoland_db_views.R b/R/evoland_db_views.R
@@ -20,8 +20,7 @@
 #'   for a specific period.
 #'
 #' @name evoland_db_views
-#' @aliases lulc_meta_long_v pred_sources_v trans_v coords_minimal
-#' trans_rates_dinamica_v
+#' @aliases lulc_meta_long_v pred_sources_v trans_v coords_minimal trans_rates_dinamica_v
 #' @include evoland_db.R
 NULL
 

diff --git a/R/pred_data_t.R b/R/pred_data_t.R
@@ -78,7 +78,8 @@ print.pred_data_t <- function(x, nrow = 10, ...) {
     n_coords <- data.table::uniqueN(x[["id_coord"]])
 
     cat(glue::glue(
-      "Raw Predictor Data Table\n",
+      "Raw Predictor Data Table (values as floats)\n",
+      "Recover original values through [pred_meta_t]\n",
       "Observations: {nrow(x)}\n",
       "Runs: {n_runs}, Periods: {n_periods}, Predictors: {n_preds}, Coordinates: {n_coords}\n\n"
     ))
@@ -213,23 +214,109 @@ set_pred_coltypes <- function(result, pred_meta_t) {
     cast_type <- dtype <- as.character(meta_row$data_type)
 
     # manually reconstructing factors: cast to int, then add attrs
-    cast_type <- ifelse(dtype == "factor", "int", cast_type)
+    cast_type <- ifelse(dtype %in% c("factor", "ordered"), "int", cast_type)
 
     cast_dt_col(result, col, cast_type)
     if (dtype == "factor") {
       lvls <- meta_row$factor_levels[[1L]]
       data.table::setattr(result[[col]], "levels", lvls)
       data.table::setattr(result[[col]], "class", "factor")
+    } else if (dtype == "ordered") {
+      lvls <- meta_row$factor_levels[[1L]]
+      data.table::setattr(result[[col]], "levels", lvls)
+      data.table::setattr(result[[col]], "class", c("ordered", "factor"))
     }
 
+    # if col is factor, fill_value being a character is safe
+    # dt set() can add a new level if it's not already present
     fill_value <- meta_row$fill_value |> type.convert(as.is = TRUE)
     if (!is.na(fill_value)) {
       data.table::set(
         result,
         i = which(is.na(result[[col]])),
         j = col,
-        value = meta_row$fill_value
+        value = fill_value
       )
     }
   }
 }
+
+#' @describeIn pred_data_t Add a predictor to the database, given a data.table with
+#' columns `id_coord`, `id_period`, and `value` (predictor value). Uses the current
+#' `id_run`.
+#' @param self an [evoland_db] instance
+#' @param pred_data_raw data.table with columns id_coord, id_period, and value
+#' (predictor value); the data type of the value is stored in [pred_meta_t]
+#' @param name Character scalar, unique name of predictor. If already present in
+#' `pred_meta_t`, this will simply update the information.
+#' @param fill_value Value to use for coordinates registered in [coords_t] but not in
+#' the provided `pred_data_raw`. e.g. where no known population is registered,
+#' assume pop. 0. For a factor variable, this could be a base case, e.g. for
+#' different nature reserve types, this could be the "not in a reserve" type.
+#' @param pretty_name opt. Character scalar, friendly name for plots/output
+#' @param description opt. Character scalar. Long description / operationalisation
+#' @param orig_format opt. Character scalar. Original format description
+#' @param sources opt. list of lists: each list containing one `url` and a
+#' `md5sum` field, see [create_pred_meta_t()] and [download_and_verify()]
+#' @param unit opt. Character scalar. SI unit for physical properties, or more complex
+#' descriptors like "bed nights/year" as a proxy for touristic activity
+add_predictor <- function(
+  self,
+  pred_data_raw,
+  name,
+  fill_value,
+  pretty_name = name,
+  description = NA_character_,
+  orig_format = NA_character_,
+  sources = list(),
+  unit = NA_character_
+) {
+  id_pred <- self$column_max("pred_meta_t", "id_pred") + 1L
+  if (id_pred > 1L) {
+    # if id_pred == 1, this is the first entry in pred_meta_t
+    # if higher, we check if this predictor is already in DB
+    existing_pred <- self$fetch("pred_meta_t", where = glue::glue("name = '{name}'"))
+    if (nrow(existing_pred) > 0L) {
+      # use pre-existing id_pred if already exists
+      id_pred <- existing_pred[["id_pred"]][1L]
+    }
+  }
+
+  new_meta_row <- data.table::data.table(
+    id_pred = id_pred,
+    name = name,
+    pretty_name = pretty_name,
+    description = description,
+    orig_format = orig_format,
+    sources = list(sources),
+    unit = unit,
+    data_type = switch(
+      class(pred_data_raw[["value"]])[[1]],
+      integer = "int",
+      numeric = "float",
+      logical = "bool",
+      factor = "factor",
+      ordered = "ordered",
+      stop("Unsupported data type for value column")
+    ),
+    fill_value = fill_value,
+    factor_levels = {
+      if (is.factor(pred_data_raw[["value"]])) {
+        list(levels(pred_data_raw[["value"]]))
+      } else {
+        list(character(0))
+      }
+    }
+  )
+
+  # upsert
+  self$pred_meta_t <- as_pred_meta_t(new_meta_row)
+
+  # construct valid pred data
+  pred_data_to_add <- data.table::copy(pred_data_raw)
+  pred_data_to_add[, id_run := self$id_run]
+  pred_data_to_add[, id_pred := id_pred]
+
+  # upsert
+  self$pred_data_t <- as_pred_data_t(pred_data_to_add)
+}
diff --git a/R/pred_meta_t.R b/R/pred_meta_t.R
@@ -15,7 +15,7 @@
 #'   - `orig_format`: Original format description
 #'   - `sources`: Sources, list column of data.frames with cols `url` and `md5sum`
 #'   - `unit`: SI units for physical properties, or more complex descriptors
-#'     like "number of annual visitors"
+#'     like "bed nights/year" as a proxy for touristic activity
 #'   - `data_type`: Factor with levels "int", "float", "bool", "factor".
 #'     Used for coercion.
 #'   - `fill_value`: Value to use for missing data for [coords_t] coordinate points that
@@ -34,15 +34,44 @@ as_pred_meta_t <- function(x) {
       unit = character(0),
       data_type = factor(
         character(0),
-        levels = c("int", "float", "bool", "factor")
+        # leaving out POSIXct / Date for now; can be operationalized as int/float
+        levels = c("int", "float", "bool", "factor", "ordered")
       ),
-      fill_value = NA,
+      fill_value = NA_character_,
       factor_levels = list(character(0))
     )
   }
 
   data.table::setDT(x) |>
-    cast_dt_col("data_type", "factor", levels = c("int", "float", "bool", "factor"))
+    cast_dt_col("id_pred", "int") |>
+    cast_dt_col("name", "char") |>
+    cast_dt_col("pretty_name", "char") |>
+    cast_dt_col("description", "char") |>
+    cast_dt_col("orig_format", "char") |>
+    cast_dt_col("unit", "char") |>
+    cast_dt_col("data_type", "factor", levels = c("int", "float", "bool", "factor", "ordered")) |>
+    cast_dt_col("fill_value", "char")
+
+  x[,
+    sources := lapply(
+      sources,
+      function(src) {
+        # coerce to data.table with exactly url & md5sum
+        if (
+          inherits(src, "data.frame") &&
+            all(hasName(src, c("url", "md5sum")))
+        ) {
+          return(src[, c("url", "md5sum")])
+        }
+        src_dt <- data.table::rbindlist(src, use.names = TRUE)
+        if (length(src_dt) == 0L) {
+          # length 0 is a null data frame, e.g. if src is NULL or list()
+          src_dt <- data.table::data.table(url = character(), md5sum = character())
+        }
+        src_dt[, .(url, md5sum)]
+      }
+    )
+  ]
 
   as_parquet_db_t(
     x,
@@ -116,13 +145,13 @@ create_pred_meta_t <- function(pred_spec, starting_id = 1L) {
       lapply(pluck_wildcard(pred_spec, NA, "orig_format"), function(x) x %||% NA_character_)
     ),
     sources = lapply(
-      pred_spec,
-      # path is pred_spec > pred_name > sources > listelement > url/md5sum
-      function(pred) {
-        data.table::data.table(
-          url = unlist(pluck_wildcard(pred, "sources", NA, "url") %||% character()),
-          md5sum = unlist(pluck_wildcard(pred, "sources", NA, "md5sum") %||% character())
-        )
+      pluck_wildcard(pred_spec, NA, "sources"),
+      function(src) {
+        src_dt <- data.table::rbindlist(src, use.names = TRUE)
+        if (length(src_dt) == 0L) {
+          src_dt <- data.table::data.table(url = character(), md5sum = character())
+        }
+        src_dt[, .(url, md5sum)]
       }
     ),
     unit = unlist(
@@ -134,11 +163,11 @@ create_pred_meta_t <- function(pred_spec, starting_id = 1L) {
       }) |>
         unlist() |>
         factor(
-          levels = c("int", "float", "bool", "factor")
+          levels = c("int", "float", "bool", "factor", "ordered")
         )
     },
     fill_value = unlist(
-      lapply(pluck_wildcard(pred_spec, NA, "fill_value"), function(x) x %||% NA)
+      lapply(pluck_wildcard(pred_spec, NA, "fill_value"), function(x) x %||% NA_character_)
     ),
     factor_levels = lapply(
       pluck_wildcard(pred_spec, NA, "factor_levels"),
@@ -179,13 +208,17 @@ validate.pred_meta_t <- function(x, ...) {
     is.character(x[["unit"]]),
     is.factor(x[["data_type"]]),
     "data_type must be set" = !any(is.na(x[["data_type"]])),
-    "data_type can only be one of 'integer', 'double','factor', or 'boolean'" = setequal(
+    "data_type can only be one of 'int', 'float', 'bool', 'factor', 'ordered'" = setequal(
       levels(x[["data_type"]]),
-      c("int", "float", "bool", "factor")
+      c("int", "float", "bool", "factor", "ordered")
     ),
     is.list(x[["factor_levels"]]),
     "name cannot be empty" = !any(x[["name"]] == ""),
-    "single URL with multiple checksums present" = !anyDuplicated(sources_dt[["url"]])
+    "single URL with multiple checksums present" = !anyDuplicated(sources_dt[["url"]]),
+    "sources must hold exactly a url and an md5sum field" = setequal(
+      names(sources_dt),
+      c("url", "md5sum")
+    )
   )
 
   return(x)

diff --git a/R/sysdata.rda b/R/sysdata.rda
diff --git a/R/trans_models_t.R b/R/trans_models_t.R
@@ -420,7 +420,7 @@ fit_full_models <- function(
         worker_fun = fit_full_model_worker,
         parent_db = self,
         cluster = cluster,
-        learner = learner,
+        learner = learner
       ) |>
       data.table::rbindlist() |>
       as_trans_models_t()
@@ -485,7 +485,7 @@ fit_full_models <- function(
         items = _,
         worker_fun = fit_full_model_worker,
         parent_db = self,
-        cluster = cluster,
+        cluster = cluster
       ) |>
       data.table::rbindlist() |>
       as_trans_models_t()
@@ -494,7 +494,6 @@ fit_full_models <- function(
 
 
 #' @export
-#' @param ... ignored, kept for signature compatibility
 validate.trans_models_t <- function(x, ...) {
   NextMethod()
 

diff --git a/R/util_download.R b/R/util_download.R
@@ -8,7 +8,8 @@ NULL
 
 #' @describeIn util_download Download a set of files and check their integrity against
 #'    their md5 checksum
-#' @param df_in Data frame with 'url' and 'md5sum' columns
+#' @param df_in Data frame with 'url' and 'md5sum' columns. This can be
+#' retrieved from [pred_meta_t] or [intrv_meta_t].
 #' @param target_dir Target directory for downloads, defaults to option
 #'    `evoland.cachedir`
 #' @param overwrite Whether to overwrite existing files (default: FALSE)