-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path34-features-load-clean-all.Rmd
More file actions
74 lines (44 loc) · 1.57 KB
/
34-features-load-clean-all.Rmd
File metadata and controls
74 lines (44 loc) · 1.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
---
title: "34-feature-engineering"
output: html_notebook
---
This notebook cleans and merges all city dataframes into one comprehensive dataframe.
```{r load}
# Convert the shared loading notebook to a script with knitr::purl() and
# source it, then delete the generated .R file so it is not left behind.
# NOTE(review): the packages and helpers used below (map, read_feather,
# expand_boxpath, str_c, ...) are presumably set up by that notebook -- confirm.
source(knitr::purl("10-load-data.Rmd"))
fs::file_delete("10-load-data.R")

# One folder name per city; `dfs` below follows this order.
mdl_list <- list("Austin, TX", "Nashville", "San Fran, CA")

# Read each city's ready_for_first_model.feather file into a list.
dfs <- mdl_list %>%
  map(~ read_feather(expand_boxpath(str_c(., "/ready_for_first_model.feather"))))

# `[[` extracts the data frame itself; `[` would return a one-element list
# wrapping it, and as.data.frame() would then convert that list instead.
austin_permits <- as.data.frame(dfs[[1]])
nash_permits <- as.data.frame(dfs[[2]])
sanfran_permits <- as.data.frame(dfs[[3]])
# Prepend a city id to every permit number so that permit numbers from
# different cities cannot collide once the datasets are merged.
#
# df:     data frame with a `permit_number` column.
# prefix: string placed in front of each permit number, joined with "-".
#
# Returns `df` with `permit_number` replaced by "<prefix>-<permit_number>".
set_permit_prefix <- function(df, prefix) {
  df %>%
    # `=` names the column being overwritten. With `<-` here, mutate() would
    # add a new column named after the whole expression and leave
    # `permit_number` unchanged.
    mutate(permit_number = str_c(prefix, "-", permit_number))
}
# Tag each city's permits with its own id. Every call takes that city's own
# data frame as input; passing `nash_permits` to all three would replace the
# Austin and San Francisco data with Nashville rows.
nash_permits <- set_permit_prefix(nash_permits, "NSH")
austin_permits <- set_permit_prefix(austin_permits, "AUS")
sanfran_permits <- set_permit_prefix(sanfran_permits, "SAN")
```
```{r merge_df}
# Stack the Nashville and Austin permits, then show the result and its
# last rows.
nash_austin_df <- nash_permits %>%
  bind_rows(austin_permits)
nash_austin_df
tail(nash_austin_df)

# Append the San Francisco permits to the Nashville/Austin data.
all_permits_df <- nash_austin_df %>%
  bind_rows(sanfran_permits)
all_permits_df
tail(all_permits_df)
nrow(all_permits_df)

# Final data frame cleaning: keep only permits with const_cost above 1.
# Based on Allie's recommendation to not count projects of 1 or 0 dollars.
all_permits_clean <- filter(all_permits_df, const_cost > 1)
all_permits_clean
nrow(all_permits_clean)
# Filtering out these low cost permits only reduces the dataset by 3%.
```
```{r write_new_df}
# Write the cleaned, merged permits to all_permits.feather at the location
# resolved by expand_boxpath().
all_permits_path <- expand_boxpath("all_permits.feather")
write_feather(all_permits_clean, all_permits_path)
```