Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/R-CMD-check-light.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ jobs:
R_KEEP_PKG_SOURCE: yes

steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v6

- uses: r-lib/actions/setup-r-dependencies@v2
with:
Expand Down
17 changes: 9 additions & 8 deletions .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,18 @@ jobs:
matrix:
config:
- { os: macos-latest, r: "release" }
- { os: windows-latest, r: "release" }
- { os: ubuntu-latest, r: "release" }
# r cmd build apparently fails to install the package - no errors or warnings
# only becomes evident when trying to build vignette
# pak not installing system dependencies?
# - { os: windows-latest, r: "release" }

env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
R_KEEP_PKG_SOURCE: yes

steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v6

- uses: r-lib/actions/setup-pandoc@v2

Expand All @@ -47,13 +50,11 @@ jobs:
- name: Cache macOS system dependencies
if: runner.os == 'macos'
id: cache-brew
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: |
/opt/homebrew/Cellar/gdal
/opt/homebrew/Cellar/proj
/usr/local/Cellar/gdal
/usr/local/Cellar/proj
/opt/homebrew/Cellar/
/usr/local/Cellar/
key: ${{ runner.os }}-brew-gdal-proj-${{ hashFiles('brew-versions.txt') }}

- name: Install or link macOS dependencies
Expand All @@ -74,7 +75,7 @@ jobs:
- name: Cache Ubuntu system dependencies
if: runner.os == 'linux'
id: cache-apt
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: /var/cache/apt/archives
key: ${{ runner.os }}-apt-gdal-proj-${{ hashFiles('apt-versions.txt') }}
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/format-suggest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
pull-requests: write

steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v6
with:
ref: ${{ github.event.pull_request.head.sha }}

Expand Down
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ URL: https://ethzplus.github.io/evoland-plus, https://github.com/ethzplus/evolan
BugReports: https://github.com/ethzplus/evoland-plus/issues
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.3
Depends:
R (>= 4.2)
Imports:
Expand Down Expand Up @@ -70,3 +69,4 @@ Collate:
'util_dinamica.R'
'util_download.R'
'util_terra.R'
Config/roxygen2/version: 8.0.0
25 changes: 25 additions & 0 deletions R/evoland_db.R
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,31 @@ evoland_db <- R6::R6Class(
create_method_binding(upsert_new_neighbors)
},

#' @description Add a predictor using the currently active id_run, see [add_predictor()]
#' @param pred_data_raw Data frame with columns `id_coord`, `id_period`, and `value`
#' @param name Unique short name
#' @param fill_value Value to substitute if a coordinate point in [coords_t] does
#' not have an explicit associated value
#' @param pretty_name char, Friendly name for use in reporting
#' @param description char, For use in reporting.
#' @param orig_format char, Format description of the underlying data (raster, vector…)
#' @param sources data.frame with url/md5sum columns, used for keeping track of
#' underlying raw data, see [download_and_verify()]
#' @param unit char, SI unit for physical predictors, or descriptions like
#' "bed nights/year" as a proxy for touristic activity
add_predictor = function(
pred_data_raw,
name,
fill_value,
pretty_name = name,
description = NA_character_,
orig_format = NA_character_,
sources = data.frame(url = character(0), md5sum = character(0)),
unit = NA_character_
) {
create_method_binding(add_predictor)
},

#' @description Get transitions along with their predictor data in a wide
#' data.table, see [trans_pred_data_v()]
#' @param id_trans Integer transition ID, see [trans_meta_t]
Expand Down
3 changes: 1 addition & 2 deletions R/evoland_db_views.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@
#' for a specific period.
#'
#' @name evoland_db_views
#' @aliases lulc_meta_long_v pred_sources_v trans_v coords_minimal
#' trans_rates_dinamica_v
#' @aliases lulc_meta_long_v pred_sources_v trans_v coords_minimal trans_rates_dinamica_v
#' @include evoland_db.R
NULL

Expand Down
93 changes: 90 additions & 3 deletions R/pred_data_t.R
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,8 @@ print.pred_data_t <- function(x, nrow = 10, ...) {
n_coords <- data.table::uniqueN(x[["id_coord"]])

cat(glue::glue(
"Raw Predictor Data Table\n",
"Raw Predictor Data Table (values as floats)\n",
"Recover original values through [pred_meta_t]\n",
"Observations: {nrow(x)}\n",
"Runs: {n_runs}, Periods: {n_periods}, Predictors: {n_preds}, Coordinates: {n_coords}\n\n"
))
Expand Down Expand Up @@ -213,23 +214,109 @@ set_pred_coltypes <- function(result, pred_meta_t) {
cast_type <- dtype <- as.character(meta_row$data_type)

# manually reconstructing factors: cast to int, then add attrs
cast_type <- ifelse(dtype == "factor", "int", cast_type)
cast_type <- ifelse(dtype %in% c("factor", "ordered"), "int", cast_type)

cast_dt_col(result, col, cast_type)
if (dtype == "factor") {
lvls <- meta_row$factor_levels[[1L]]
data.table::setattr(result[[col]], "levels", lvls)
data.table::setattr(result[[col]], "class", "factor")
} else if (dtype == "ordered") {
lvls <- meta_row$factor_levels[[1L]]
data.table::setattr(result[[col]], "levels", lvls)
data.table::setattr(result[[col]], "class", c("ordered", "factor"))
}

# if col is factor, fill_value being a character is safe
# dt set() can add a new level if it's not already present
fill_value <- meta_row$fill_value |> type.convert(as.is = TRUE)
if (!is.na(fill_value)) {
data.table::set(
result,
i = which(is.na(result[[col]])),
j = col,
value = meta_row$fill_value
value = fill_value
)
}
}
}

#' @describeIn pred_data_t Add a predictor to the database, given a data.table with
#' columns `id_coord`, `id_period`, and `value` (predictor value). Uses the current
#' `id_run`.
#' @param self an [evoland_db] instance
#' @param pred_data_raw data.table with columns id_coord, id_period, and value
#' (predictor value); the data type of the value is stored in [pred_meta_t]
#' @param name Character scalar, unique name of predictor. If already present in
#' `pred_meta_t`, this will simply update the information.
#' @param fill_value Value to use for coordinates registered in [coords_t] but not in
#' the provided `pred_data_raw`. e.g. where no known population is registered,
#' assume pop. 0. For a factor variable, this could be a base case, e.g. for
#' different nature reserve types, this could be the "not in a reserve" type.
#' @param pretty_name opt. Character scalar, friendly name for plots/output
#' @param description opt. Character scalar. Long description / operationalisation
#' @param orig_format opt. Character scalar. Original format description
#' @param sources opt. list of lists: each list containing one `url` and a
#' `md5sum` field, see [create_pred_meta_t()] and [download_and_verify()]
#' @param unit opt. Character scalar. SI unit for physical properties, or more complex
#' descriptors like "bed nights/year" as a proxy for touristic activity
add_predictor <- function(
self,
pred_data_raw,
name,
fill_value,
pretty_name = name,
description = NA_character_,
orig_format = NA_character_,
sources = list(),
unit = NA_character_
) {
id_pred <- self$column_max("pred_meta_t", "id_pred") + 1L
if (id_pred > 1L) {
# if id_pred == 1, this is the first entry in pred_meta_t
# if higher, we check if this predictor is already in DB
existing_pred <- self$fetch("pred_meta_t", where = glue::glue("name = '{name}'"))
Comment thread
mmyrte marked this conversation as resolved.
if (nrow(existing_pred) > 0L) {
# use pre-existing id_pred if already exists
id_pred <- existing_pred[["id_pred"]][1L]
}
}

new_meta_row <- data.table::data.table(
id_pred = id_pred,
name = name,
pretty_name = pretty_name,
description = description,
orig_format = orig_format,
sources = list(sources),
unit = unit,
data_type = switch(
class(pred_data_raw[["value"]])[[1]],
integer = "int",
numeric = "float",
logical = "bool",
factor = "factor",
ordered = "ordered",
stop("Unsupported data type for value column")
),
Comment thread
mmyrte marked this conversation as resolved.
fill_value = fill_value,
factor_levels = {
if (is.factor(pred_data_raw[["value"]])) {
list(levels(pred_data_raw[["value"]]))
} else {
list(character(0))
}
}
)

# upsert
self$pred_meta_t <- as_pred_meta_t(new_meta_row)
Comment thread
mmyrte marked this conversation as resolved.

# construct valid pred data
pred_data_to_add <- data.table::copy(pred_data_raw)
pred_data_to_add[, id_run := self$id_run]
pred_data_to_add[, id_pred := id_pred]

# upsert
self$pred_data_t <- as_pred_data_t(pred_data_to_add)
}
65 changes: 49 additions & 16 deletions R/pred_meta_t.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#' - `orig_format`: Original format description
#' - `sources`: Sources, list column of data.frames with cols `url` and `md5sum`
#' - `unit`: SI units for physical properties, or more complex descriptors
#' like "number of annual visitors"
#' like "bed nights/year" as a proxy for touristic activity
#' - `data_type`: Factor with levels "int", "float", "bool", "factor".
#' Used for coercion.
#' - `fill_value`: Value to use for missing data for [coords_t] coordinate points that
Expand All @@ -34,15 +34,44 @@ as_pred_meta_t <- function(x) {
unit = character(0),
data_type = factor(
character(0),
levels = c("int", "float", "bool", "factor")
# leaving out POSIXct / Date for now; can be operationalized as int/float
levels = c("int", "float", "bool", "factor", "ordered")
),
fill_value = NA,
fill_value = NA_character_,
factor_levels = list(character(0))
)
}

data.table::setDT(x) |>
cast_dt_col("data_type", "factor", levels = c("int", "float", "bool", "factor"))
cast_dt_col("id_pred", "int") |>
cast_dt_col("name", "char") |>
cast_dt_col("pretty_name", "char") |>
cast_dt_col("description", "char") |>
cast_dt_col("orig_format", "char") |>
cast_dt_col("unit", "char") |>
cast_dt_col("data_type", "factor", levels = c("int", "float", "bool", "factor", "ordered")) |>
cast_dt_col("fill_value", "char")

x[,
sources := lapply(
sources,
function(src) {
# coerce to data.table with exactly url & md5sum
if (
inherits(src, "data.frame") &&
all(hasName(src, c("url", "md5sum")))
) {
return(src[, c("url", "md5sum")])
}
src_dt <- data.table::rbindlist(src, use.names = TRUE)
if (length(src_dt) == 0L) {
# length 0 is a null data frame, e.g. if src is NULL or list()
src_dt <- data.table::data.table(url = character(), md5sum = character())
}
src_dt[, .(url, md5sum)]
}
)
]

as_parquet_db_t(
x,
Expand Down Expand Up @@ -116,13 +145,13 @@ create_pred_meta_t <- function(pred_spec, starting_id = 1L) {
lapply(pluck_wildcard(pred_spec, NA, "orig_format"), function(x) x %||% NA_character_)
),
sources = lapply(
pred_spec,
# path is pred_spec > pred_name > sources > listelement > url/md5sum
function(pred) {
data.table::data.table(
url = unlist(pluck_wildcard(pred, "sources", NA, "url") %||% character()),
md5sum = unlist(pluck_wildcard(pred, "sources", NA, "md5sum") %||% character())
)
pluck_wildcard(pred_spec, NA, "sources"),
function(src) {
src_dt <- data.table::rbindlist(src, use.names = TRUE)
if (length(src_dt) == 0L) {
src_dt <- data.table::data.table(url = character(), md5sum = character())
}
src_dt[, .(url, md5sum)]
}
),
unit = unlist(
Expand All @@ -134,11 +163,11 @@ create_pred_meta_t <- function(pred_spec, starting_id = 1L) {
}) |>
unlist() |>
factor(
levels = c("int", "float", "bool", "factor")
levels = c("int", "float", "bool", "factor", "ordered")
)
},
fill_value = unlist(
lapply(pluck_wildcard(pred_spec, NA, "fill_value"), function(x) x %||% NA)
lapply(pluck_wildcard(pred_spec, NA, "fill_value"), function(x) x %||% NA_character_)
),
factor_levels = lapply(
pluck_wildcard(pred_spec, NA, "factor_levels"),
Expand Down Expand Up @@ -179,13 +208,17 @@ validate.pred_meta_t <- function(x, ...) {
is.character(x[["unit"]]),
is.factor(x[["data_type"]]),
"data_type must be set" = !any(is.na(x[["data_type"]])),
"data_type can only be one of 'integer', 'double','factor', or 'boolean'" = setequal(
"data_type can only be one of 'int', 'float', 'bool', 'factor', 'ordered'" = setequal(
levels(x[["data_type"]]),
c("int", "float", "bool", "factor")
c("int", "float", "bool", "factor", "ordered")
),
is.list(x[["factor_levels"]]),
"name cannot be empty" = !any(x[["name"]] == ""),
"single URL with multiple checksums present" = !anyDuplicated(sources_dt[["url"]])
"single URL with multiple checksums present" = !anyDuplicated(sources_dt[["url"]]),
"sources must hold exactly a url and an md5sum field" = setequal(
names(sources_dt),
c("url", "md5sum")
)
)

return(x)
Expand Down
Binary file modified R/sysdata.rda
Binary file not shown.
5 changes: 2 additions & 3 deletions R/trans_models_t.R
Original file line number Diff line number Diff line change
Expand Up @@ -420,7 +420,7 @@ fit_full_models <- function(
worker_fun = fit_full_model_worker,
parent_db = self,
cluster = cluster,
learner = learner,
learner = learner
) |>
data.table::rbindlist() |>
as_trans_models_t()
Expand Down Expand Up @@ -485,7 +485,7 @@ fit_full_models <- function(
items = _,
worker_fun = fit_full_model_worker,
parent_db = self,
cluster = cluster,
cluster = cluster
) |>
data.table::rbindlist() |>
as_trans_models_t()
Expand All @@ -494,7 +494,6 @@ fit_full_models <- function(


#' @export
#' @param ... ignored, kept for signature compatibility
validate.trans_models_t <- function(x, ...) {
NextMethod()

Expand Down
3 changes: 2 additions & 1 deletion R/util_download.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ NULL

#' @describeIn util_download Download a set of files and check their integrity against
#' their md5 checksum
#' @param df_in Data frame with 'url' and 'md5sum' columns
#' @param df_in Data frame with 'url' and 'md5sum' columns. This can be
#' retrieved from [pred_meta_t] or [intrv_meta_t].
#' @param target_dir Target directory for downloads, defaults to option
#' `evoland.cachedir`
#' @param overwrite Whether to overwrite existing files (default: FALSE)
Expand Down
Loading