# wine_project_r.R
### Logistic Regression Project
# Import packages and link them to project
library(ggplot2)
library(gridExtra)
library(tidyverse)
library(dplyr)
library(reshape2)
library(caret)
library(car)
library(MASS)
# STEP 1: Import and clean data
## The input file is semicolon-delimited, so sep = ";" is needed for
## read.csv() to split each row into separate columns.
wine <- read.csv("~/Desktop/Grad School/Quantitative Methods/Project/wine_csv.csv", sep = ";")
## Count NA values across the whole data frame. This must run after the
## data has been loaded, otherwise `wine` does not exist yet.
sum(is.na(wine))
# The original author recorded the result as: none.
# Plot every predictor against quality (one scatter plot per predictor, with
# a fitted linear trend line) to inspect visually how each one relates to the
# response.

# Scatter plot of a single predictor against wine quality.
#   data    - data frame holding the predictor column and a `quality` column
#   x_var   - name of the predictor column, given as a string
#   title   - plot title
#   x_label - x-axis label
# Returns the ggplot object. Called at top level, the returned plot is
# auto-printed exactly like a bare ggplot() expression.
plot_vs_quality <- function(data, x_var, title, x_label) {
  # .data[[x_var]] looks the column up by its string name inside aes().
  ggplot(data, aes(x = .data[[x_var]], y = .data$quality)) +
    geom_point() +
    geom_smooth(method = "lm") +
    labs(title = title, x = x_label, y = "Quality") +
    theme_classic()
}

plot_vs_quality(wine, "fixed.acidity",
                "Quality vs Fixed Acidity", "Fixed Acidity")
plot_vs_quality(wine, "volatile.acidity",
                "Quality vs Volatile Acidity", "Volatile Acidity")
plot_vs_quality(wine, "citric.acid",
                "Quality vs Citric Acid", "Citric Acid")
plot_vs_quality(wine, "residual.sugar",
                "Residual Sugar vs Quality", "Residual Sugar")
# x-axis label was 'pH' in the original; the plotted column is chlorides.
plot_vs_quality(wine, "chlorides",
                "Chlorides vs Quality", "Chlorides")
plot_vs_quality(wine, "free.sulfur.dioxide",
                "Free Sulfur Dioxide vs Quality", "Free Sulfur Dioxide")
# Title was 'Free Sulfur Dioxide vs Quality' in the original; the plotted
# column is total.sulfur.dioxide.
plot_vs_quality(wine, "total.sulfur.dioxide",
                "Total Sulfur Dioxide vs Quality", "Total Sulfur Dioxide")
# x-axis label was 'pH' in the original; the plotted column is density.
plot_vs_quality(wine, "density",
                "Density vs Quality", "Density")
plot_vs_quality(wine, "pH",
                "pH vs Quality", "pH")
plot_vs_quality(wine, "sulphates",
                "Sulphates vs Quality", "Sulphates")
plot_vs_quality(wine, "alcohol",
                "Alcohol vs Quality", "Alcohol")
# Pearson correlation of every column of `wine` with quality.
# Note: these are correlations with the response, so they show how strongly
# each variable is associated with quality; they do not by themselves measure
# collinearity between predictors (the VIF check covers that).
# Assumes every column of `wine` is numeric; cor() errors otherwise.
wineCol <- colnames(wine)
wine_pearCor <- vapply(
  wine[wineCol],
  function(column) cor(wine$quality, column, method = "pearson"),
  numeric(1),
  USE.NAMES = FALSE
)
# NOTE(review): the original script transposed `wineStats` without ever
# defining it. It is reconstructed here as the column names paired with their
# correlations, which is what the two otherwise-unused vectors above suggest
# was intended - confirm.
wineStats <- data.frame(variable = wineCol, pearson_cor = wine_pearCor)
# Transpose so that each variable becomes a column, for a wide printout.
wineStats2 <- t(wineStats)
wineStats2
# Check the Variance Inflation Factor (VIF)
# Start by splitting the data into a training set and a test set.
# The seed is fixed so the random partition is reproducible.
set.seed(123)
# p is the proportion of rows that go into the training set. The original
# used p = .08 (8% training / 92% test), which looks like a typo for the
# conventional 80/20 split used here.
training.samples <- wine$quality %>%
  createDataPartition(p = 0.8, list = FALSE)
train.data <- wine[training.samples, ]
test.data <- wine[-training.samples, ]
# Build the regression model: quality explained by all other columns
VIF_wineModel <- lm(quality ~ ., data = train.data)
# Make predictions on the held-out rows
predictions <- VIF_wineModel %>% predict(test.data)
# Model performance on the test set
data.frame(
  RMSE = RMSE(predictions, test.data$quality),
  R2 = R2(predictions, test.data$quality)
)
# Variance inflation factor of each predictor in the fitted model
wineModel1 <- as.data.frame(vif(VIF_wineModel))
wineModel1
# Original author's reading of the output: Fixed Acidity and Density have
# high VIF, meaning that we should remove them from our model.
# Use stepwise AIC selection (both directions), starting from the full model
# fitted on all rows, to improve the model further.
wineMLR <- lm(quality ~ ., data = wine)
wineModel <- stepAIC(wineMLR, direction = "both")
# Summary of the full (unstepped) model, kept for reference.
summary(wineMLR)
# Summary of the model chosen by stepAIC. The original stored this model but
# never inspected it.
summary(wineModel)
# p-value of each coefficient in the full model, as a one-column data frame
# whose row names are the coefficient names.
wineCoef <- data.frame(p_value = coef(summary(wineMLR))[, "Pr(>|t|)"])
wineCoef
head(wine)
# Drop the predictors excluded from the final model.
wineMLR_data <- subset(wine, select = -c(density, fixed.acidity, residual.sugar, citric.acid))
head(wineMLR_data)
# Run multiple linear regression on the reduced data and summarize. The fit
# is stored so it can be reused (e.g. for predictions or diagnostics).
wineFinal <- lm(quality ~ ., data = wineMLR_data)
summary(wineFinal)