nash-0/43-tonnage-predictions.Rmd at main · VandyDataScience/nash-0 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
---
title: "43-tonnage-predictions"
output: html_notebook
---

In this notebook, we'll model the data using appropriate methods.

```{r Imports}
# sourced for loading of train/test sets
source(knitr::purl("30-feature-engineering.Rmd", output="tmp-43.R", quiet=TRUE))
fs::file_delete("tmp-43.R")
p_load(tidymodels, h2o, mlflow, kernlab, randomForest, keras)
```


``` {r driver}
create_model <- function(train, test, model_type, engine, outcome, log_outcome = TRUE,
                         model_recipe = NULL, test_hist = TRUE){
  #' Create a predictive model using tidymodels
  #'
  #' Train a predictive model. While this function is fairly general, the intent here is to create a model which predicts debris generated (`total_debris`) on a per-permit basis. The model is to be trained on permits that have associated values of `total_debris`. If `test_hist==TRUE`, the model will be tested against aggregated calendar-year totals of `total_debris` in Nashville.
  #'
  #' @param train (tibble) Training set
  #' @param test (tibble) Test set
  #' @param model_type (tidymodels model) A selection of models can be found at https://www.tidymodels.org/find/parsnip/
  #' @param engine (chr) tidymodels engine
  #' @param outcome (chr) Name of outcome variable (i.e., truth)
  #' @param log_outcome (bool, default = TRUE) If TRUE, model will be trained/tested using log10(outcome) instead of outcome.
  #' @param model_recipe (tidymodels recipe, default = NULL) Recipe to use in training the model. If NULL, a default recipe is obtained from `get_default_recipe`
  #' @param test_hist (bool, default = TRUE) If TRUE, the test set is historical data from Nashville. We will make per-permit predictions for Nashville and compare the results to aggregated calendar year values of `total_debris`. If FALSE, the test set consists of individual permits with values of `total_debris`.
  #'
  #' @return (named list) Contents: 1. "model" (output of `fit` from `tidymodels`), 2. "log_outcome", 3. "metrics" (output of `metrics`), 4. "train" (tibble with training observations and predictions), 5. "test" (tibble with testing observations and predictions; NULL if `test` is NULL)
  #' @export
  #'
  #' @examples create_model(train, model_type = linear_reg(), engine = "lm", outcome = "total_debris")

  # take log10 of outcome if requested; tidymodels does not have a nice way to do this
  if(log_outcome){
    train <- train %>%
      mutate(across(all_of(outcome), ~log10(.)))
    if(!is.null(test)){
      test <- test %>%
        mutate(across(all_of(outcome), ~log10(.)))
    }
  }

  # set up model recipe if not passed
  if(is.null(model_recipe)){
    model_recipe <- train %>%
      get_default_recipe(outcome)
  }
  model <- get_model(model_type, engine)

  model_workflow <-
    workflow() %>%
    add_model(model) %>%
    add_recipe(model_recipe)

  model_fit <-
    model_workflow %>%
    fit(data = train)

  tmp_model <- list("model" = model_fit,
                   "log_outcome" = log_outcome)

  train_pred <- get_prediction(tmp_model, train)

  # Perform test on historical data if desired
  if(test_hist){
    test_pred <- get_prediction(tmp_model, test) %>%
      mutate(cy = lubridate::year(date_issued)) %>%
      group_by(cy) %>%
      summarise(prediction = sum(prediction, na.rm = TRUE)) %>%
      filter(cy>2016) %>%
      filter(cy<2021) %>%
      left_join(load_nash_cy_debris(), by = c("cy" = "cy"))
  }
  else{
    test_pred <- get_prediction(tmp_model, test)
  }

  # fix values if prediction ws done in log-space
  if(log_outcome){
    train_pred <- train_pred %>%
      mutate(across(c({{outcome}}), ~ 10^(.)))
  }

  # Metrics are computed on unlogged values even if predictions are on logged
  # values. This is so the metric has a standard meaning across all models.
  train_metrics <- metrics(train_pred, truth = !!outcome, estimate = prediction)
  test_metrics <- metrics(test_pred, truth = !!outcome, estimate = prediction)

  if(test_hist){
  test_metrics <- test_metrics %>%
    mutate(.estimate=round(.estimate, digits = 3))
  }

  # do we want to save all of this?
  return(list("model" = model_fit,
              "log_outcome" = log_outcome,
              "train_metrics" = train_metrics,
              "test_metrics" = test_metrics,
              "train" = train_pred,
              "test" = test_pred))
}
```


```{r load cy debris for nashville}
load_nash_cy_debris <- function(){
  #' Load Nashville calendar-year values of total debris
  #'
  #' The decided-upon way to test the debris-prediction model is to make predictions on historical permits in Nashville and compare the aggregated calendar-year results to known values. This function loads those known values.
  #'
  #' @return (tibble) Contains calendar year and total debris for a few years in Nashville

  read_excel(expand_boxpath("Nashville/Recycling and Landfilling totals _ Nashville.xlsx"), sheet = 1, skip = 1) %>%
    clean_names() %>%
    filter(!is.na(.)) %>%
    rename(total_debris = total_debris_generated) %>%
    arrange(cy) %>%
    select(cy, total_debris)
}
```


``` {r model}
get_model <- function(model_type, engine){
  #' Get tidymodels model
  #'
  #' Puts together the model and engine type specified by the user
  #'
  #' @param model_type (tidymodels model) A selection of models can be found at https://www.tidymodels.org/find/parsnip/
  #' @param engine (chr) tidymodels engine
  #'
  #' @return tidymodels model with engine specified
  #' @export
  #'
  #' @examples get_model(linear_reg(), "lm")

  model_type %>%
    set_engine(engine)
}
```


``` {r split}
split_train_test <- function(df, seed = NULL, frac_train = 0.7){
  #' Split data into training and testing sets
  #'
  #' @param df (tibble) Observations
  #' @param seed (num, default = NULL) RNG seed
  #' @param frac_train (num, default = 0.7) Fraction of observations desired in training set
  #'
  #' @return (named list) Observations split into training and testing sets

  set.seed(seed)

  #Need to decide on proportion
  splits <- initial_split(df, prop = frac_train)

  train <- training(splits)
  test <- testing(splits)

  return(list("train" = train, "test" = test))
}
```


``` {r base recipe}
get_base_recipe <- function(outcome, train){
  #' Get base tidymodels recipe
  #'
  #' Set up a base tidymodels recipe establishing outcome variable, formula, and training set. Additional desired steps to the recipe can then be piped after calling this function.
  #'
  #' @param outcome (chr) Name of outcome variable (i.e., truth)
  #' @param train (tibble) Training set
  #'
  #' @return tidymodels recipe

  # Set up base recipe
  formula <-  as.formula(sprintf("%s ~ .", outcome))
  model_recipe <-
    recipe(formula, data = train)

  return(model_recipe)
}
```


``` {r default recipe}
get_default_recipe <- function(train, outcome){
  #' Get default recipe
  #'
  #' Set up a default recipe for predicting total_debris for permits.
  #'
  #' @param train (tibble) Training set
  #' @param outcome (chr) Name of outcome variable
  #'
  #' @return tidymodels recipe

  indicators = c("permit_number", "permit_subtype_description", "council_dist", "date_issued")
  log_pred = c("const_cost")

  model_recipe <- get_base_recipe(outcome, train) %>%
    update_role(all_of(indicators), new_role="ID") %>%
    step_impute_median(all_numeric_predictors()) %>%
    step_impute_mode(all_nominal_predictors()) %>%
    step_log(all_of(log_pred), base = 10, offset=0.01) %>%
    step_novel(all_nominal_predictors()) %>%
    step_dummy(all_nominal_predictors()) %>%
    step_zv(all_predictors())

  return(model_recipe)
}
```


``` {r plotting}
plot_pred_vs_true <- function(predictions, outcome, logspace = TRUE){
  #' Plot predicted vs true values
  #'
  #' @param predictions (tibble) Tibble containing both predictions and true outcome
  #' @param outcome (chr) Name of outcome variable (i.e., truth)
  #' @param logspace (bool, default = TRUE) If TRUE, plot x and y in log10-space
  #'
  #' @return scatterplot of predictions vs truth

  p <- predictions %>%
    ggplot(aes(x = {{outcome}}, y = prediction)) +
    geom_abline(col = "red", lty = 2) +
    geom_point(alpha = .4)

  if(logspace){
    p <- p +
      scale_x_log10() +
      scale_y_log10()
  }

  return(p)
}
```


``` {r testing}
get_prediction <- function(model, df){
  #' Get prediction
  #'
  #' Get predictions using a trained model. If predictions were made in log-space, they will be un-logged.
  #'
  #' @param model (named list) Trained model. Output of `create_model`
  #' @param df (tibble) Features on which we wish to make predictions.
  #'
  #' @return (tibble) Original tibble plus predictions

  # get predictions via model
  predictions <- predict(model$model, df) %>%
    bind_cols(df) %>%
    rename(prediction = .pred)

  # un-log if they were computed in log-space
  if(model$log_outcome){
    predictions <- predictions %>%
      mutate(prediction = 10^prediction)
  }

  return(predictions)
}
```


```{r get features from model}
get_features <- function(model, dtype = "all"){
  #' Get features from model
  #'
  #' Extract a vector containing all features (not outcome) used in model.
  #'
  #' @param model (named list) Model output by `create_model`
  #' @param dtype (chr, default = "all") Data type of desired features (e.g. "num"). If "all", all features are returned regardless of datatype.
  #'
  #' @return (vec) Vector of features used in model.

  var_info <- model$model$pre$actions$recipe$recipe$var_info

  features <- var_info %>%
    filter(role!="outcome")

  if(dtype!="all"){
    features <- features %>%
      filter(type==dtype)
  }

  return(features$variable)
}
```


```{r set up custom train and test sets}
train_test_historical <- function(train_city = "all", outcome = "total_debris", min_outcome = 1e-5, max_outcome = 40000, cols_to_keep=NULL){
  #' Set up custom training and testing sets
  #'
  #' Set up historical permits from Nashville as test set, San Fran and/or Austin as training set. For the Nashville permits, we do not have measurements of how much debris is generated for each permit, but we do have aggregated measurements of C&D debris for each calendar year. Thus, we can train on the permits (SF/Aus) which do have debris data, pass in the Nashville permits to make per-permit predictions, aggregate results by calendar year, and compare.
  #'
  #' @param train_city (chr, default = "all") Name of city to use for training.
  #' @param outcome (chr, default = "total_debris") Name of outcome variable. (changing is not recommended)
  #' @param min_outcome (chr, default = 1e-5) Inclusive minimum value of outcome variable. All observations in the training set with outcome variable less than or equal to this value will be removed.
  #' @param max_outcome (chr, default = 40000) Inclusive maximum value of outcome variable. All observations in the training set with outcome variable greater than or equal to this value will be removed.
  #' @param cols_to_keep (chr array, default = NULL) Names of columns to keep in both dataframes. If NULL, all columns are returned.
  #'
  #' @return (named list) Named list of tibbles "train","test" with training/testing sets.
  #' @export
  #'
  #' @examples train_test_historical(train_city = "all",
  #'                                 outcome = "total_debris",
  #'                                 max_outcome = 15000,
  #'                                 cols_to_keep = c("total_debris", "const_cost",
  #'                                                  "project_type", "comm_v_res"))


  # test <- read_feather(expand_boxpath("Nashville/adams_nashville_no_touching.feather"))
  test <- load_permits(city="nashville") %>%
    filter(generates_debris) %>%
    filter(comm_v_res!="other")

  # sub-optimal to call this again, but it's fine
  train <- load_permits(train_city, essentials_only = TRUE) %>%
    filter(city_name != "nashville") %>%
    select({{outcome}}, everything()) %>%
    filter(between(!!rlang::sym(outcome), min_outcome, max_outcome))

  if(!is.null(cols_to_keep)){
    if(any(!(cols_to_keep %in% colnames(train)))){
      stop("passed columns not found in structured dataset")
    }
    train <- train %>%
      select(all_of(cols_to_keep))
    test <- test %>%
      select(all_of(cols_to_keep))
  }

  return(list("train" = train, "test" = test))
}
```


Here's a function that takes a list of dataframes and returns all columns that have above a certain fraction of non-missing values across all dataframes.

```{r common low missing cols function}
common_cols_lowmiss <- function(dflist, thresh = 0.25){
  #' Common columns with low missing fraction
  #'
  #' From a list of dataframes/tibbles, get column names that have less than a particular fraction of missing values in all dataframes.
  #'
  #' @param dflist (list) list of dataframes or tibbles
  #' @param thresh (num, default = 0.25) maximum allowable fraction of missing values
  #'
  #' @return (chr array) Names of columns below desired fraction missing in all dataframes

  good_cnames <- NULL
  for(df_name in dflist){
    c <- colnames(df_name)
    mask <- colSums(is.na(df_name))/nrow(df_name) < thresh
    if(is.null(good_cnames)){
      good_cnames <- c[mask]
    }
    else{
      good_cnames <- good_cnames[good_cnames %in% c[mask]]
    }
  }

  return(good_cnames)
}
```