-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.R
More file actions
89 lines (71 loc) · 1.67 KB
/
main.R
File metadata and controls
89 lines (71 loc) · 1.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# LOAD DATA
library(data.table)
library(here)
# Location of the raw data file, built by here() from the components
# "Data" and "cal_housing.data".
path <- here("Data", "cal_housing.data")

# Names given to the columns of the file, in the order they are read.
col_names <- c(
  "longitude", "latitude", "housingMedianAge",
  "totalRooms", "totalBedrooms", "population",
  "households", "medianIncome", "medianHouseValue"
)

# Read the file into a data.table, labelling its columns with `col_names`.
data <- fread(path, col.names = col_names)

# `path` is only used for the read above, so drop it from the workspace.
rm(path)
##-------------------------------------------------------------------------------------
# FEATURE ENGINEERING
# Rooms and bedrooms per person: divide both count columns by `population`,
# overwriting them by reference, then rename them to match their new meaning.
# This has to run before `population` itself is overwritten further down.
cols <- c("totalRooms", "totalBedrooms")
data[, (cols) := lapply(.SD, function(v) v / population), .SDcols = cols]
setnames(data, old = cols, new = c("avg_rooms", "avg_beds"))

# People per household: overwrite `population` with population / households
# and rename the column accordingly.
data[, population := population / households]
setnames(data, old = "population", new = "avg_population")
# Columns to replace by their natural logarithm, and the names they take
# afterwards. The two vectors are matched position by position.
cols <- c(
  "medianHouseValue", "housingMedianAge", "households",
  "avg_population", "avg_rooms", "avg_beds"
)
new_cols <- c(
  "log_house_value", "log_age", "log_households",
  "log_avg_population", "log_avg_rooms", "log_avg_beds"
)

# Overwrite each listed column with its log (by reference), then rename it so
# the column name says that the values are on the log scale.
data[, (cols) := lapply(.SD, log), .SDcols = cols]
setnames(data, old = cols, new = new_cols)
# Add the square and the cube of `medianIncome` as two new columns
# (by reference), giving polynomial income terms alongside the original.
data[
  ,
  c("med_income2", "med_income3") := list(medianIncome^2, medianIncome^3)
]
# Response vector: the `log_house_value` column of `data`.
y <- data[["log_house_value"]]

# Predictor table: every column of `data` except the response.
# Assigning a data.table to a new name only creates a second reference to the
# same table, so take an explicit copy before dropping the column by
# reference. `data` must keep `log_house_value`, which is read again below.
X <- copy(data)
X[, log_house_value := NULL]

# indices for train-test-split
# Fixed seed so the same rows are drawn on every run.
set.seed(101)

# Row indices of the training set: 75% of the rows (rounded down), drawn
# without replacement. seq_len() is used instead of 1:n so that the index
# vector is empty, rather than c(1, 0), if the table has no rows.
train <- sample(
  seq_len(nrow(data)),
  floor(0.75 * nrow(data))
)

# Response values of the rows NOT selected for training, kept as a
# one-column data.table.
prices_test <- data[-train, .(log_house_value)]

# Helper vectors from the feature-engineering steps are no longer needed.
rm(cols, new_cols)