-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlogisticRegression.R
More file actions
145 lines (108 loc) · 3.68 KB
/
logisticRegression.R
File metadata and controls
145 lines (108 loc) · 3.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# Logistic Regression ----
# Load the textbook R package (ISLR ships the `Default` credit data set).
# library() errors immediately if the package is missing, unlike require(),
# which merely returns FALSE and lets the script fail later.
library(ISLR)
?Default

# Load in the credit data and inspect its structure
head(Default)
# NOTE(review): attach() is an anti-pattern (masks columns, pollutes the
# search path) — kept only because later bare-name calls rely on it; prefer
# Default$col or data = Default everywhere.
attach(Default)
str(Default)
dim(Default)
summary(Default)
names(Default)
write.csv(Default, './data/cclogr.csv')

# How many people actually default?
tmp <- table(default)
tmp
(tmp[[2]] / tmp[[1]]) * 100        # defaulters per non-defaulter, as a %
tmp[[2]] / dim(Default)[1] * 100   # defaulters as a % of all customers
# Plots ----
library(ggplot2)
library(gridExtra)
## gridExtra pulls in the grid package as a dependency

# Scatter of income vs balance coloured by default status, plus per-class
# boxplots. Written with ggplot() + explicit `data = Default` because
# qplot() is deprecated (ggplot2 >= 3.4) and bare column names would
# otherwise depend on attach(). guides(fill = FALSE) is also deprecated;
# "none" is the supported spelling.
x <- ggplot(Default, aes(x = balance, y = income,
                         color = default, shape = default)) +
  geom_point() +
  scale_shape(solid = FALSE)
y <- ggplot(Default, aes(x = default, y = balance, fill = default)) +
  geom_boxplot() +
  guides(fill = "none")
z <- ggplot(Default, aes(x = default, y = income, fill = default)) +
  geom_boxplot() +
  guides(fill = "none")
x
grid.arrange(y, z, nrow = 1)
# Model: default ~ balance ----
logitb <- glm(default ~ balance, data = Default, family = 'binomial')
summary(logitb)
coef(logitb)
exp(coef(logitb))        # coefficients as odds ratios
range(Default$balance)

b1 <- logitb$coefficients[2]
exp(b1)^1     # odds multiplier for a 1-unit increase in balance
exp(b1)^100   # ... for a 100-unit increase
exp(b1)^1000  # ... for a 1000-unit increase
exp(confint(logitb))     # confidence interval on the odds-ratio scale

# Assessing impact of the predictor on the probability of the outcome.
# BUG FIX: the original called predict(logit, ...) but no object named
# `logit` exists at this point — the model fitted above is `logitb`.
predict(logitb,
        newdata = data.frame(balance = c(0, 100, 500, 1000, 1500, 1800, 2500)),
        type = 'response')

# dplyr is needed for top_n()/%>%/slice()/n() below; the original only
# loaded it much later in the script, so these calls would have failed.
library(dplyr)
head(Default)
top_n(Default, 5, balance)                    # 5 largest balances
Default %>% arrange(balance) %>% slice(1:5)   # 5 smallest balances
#top_n(Default, 5, desc(balance))
Default %>% arrange(balance) %>% slice(seq(1, n(), 1000))  # every 1000th row
# Model: default ~ student ----
str(Default)
# Fit once — the original fitted the identical model twice in this section.
logits <- glm(default ~ student, data = Default, family = 'binomial')
summary(logits)
# Manual dummy-variable construction kept for reference only; glm() already
# dummy-codes the factor `student` automatically.
#Default$studentD <- 0 # put all zeros
#Default$studentD[Default$student=="Yes"] <- 1 # replace 0 wherever yes is there
#head(Default)
#logit <- glm(default ~ studentD, data=Default, family='binomial')
#summary(logit)
#predict(logit, newdata = data.frame(studentD=c(0)), type=c('response'))
#predict(logit, newdata = data.frame(studentD=c(1)), type=c('response'))
# Predicted default probability for students vs non-students
predict(logits, newdata = data.frame(student = 'Yes'), type = 'response')
predict(logits, newdata = data.frame(student = 'No'), type = 'response')
# Multiple Logistic Regression ----
# Load dplyr before its first use in this section (the original loaded it
# two lines after the first %>% call).
library(dplyr)
logit <- glm(default ~ income + balance + student,
             family = 'binomial', data = Default)
summary(logit)
Default %>% group_by(student) %>% arrange(student, balance)

# Predict on every 500th row of the data
(ndata <- slice(Default, seq(1, n(), 500)))
head(ndata)
# Joint distribution of default status by student status, with margins
addmargins(prop.table(table(Default$default, Default$student)))
0.2817 / 0.9667; 0.0127 / 0.0333   # conditional default rates read off the table

options(digits = 10)
fitted.results <- predict(logit, newdata = ndata, type = 'response')
fitted.results
cbind(ndata, fitted.results)
# Classify with the conventional 0.5 cut-off (adds a new column)
ndata %>% mutate(predict = ifelse(fitted.results < 0.5, 0, 1))
# BUG FIX: the original repeated this with a 0.05 threshold — almost
# certainly a typo for the 0.5 cut-off used just above.
ifelse(fitted.results < 0.5, 0, 1)

# Predicted probability for an "average" student vs non-student
(ndata2 <- data.frame(student = c('Yes', 'No'),
                      balance = mean(Default$balance),
                      income = mean(Default$income)))
(fitted.results2 <- predict(logit, newdata = ndata2, type = 'response'))
# Accuracy of Model ----
library(caret)
set.seed(3456)
str(Default)

# Stratified 67/33 train/test split on the outcome variable
trainIndex <- createDataPartition(Default$default, p = .67,
                                  list = FALSE, times = 1)
Train <- Default[trainIndex, ]
Test  <- Default[-trainIndex, ]
head(Train)
head(Test)

# Logistic Regression Model
# BUG FIX: the original fitted on the full Default data, leaking the test
# rows into training; fit on Train only so the Test accuracy is honest.
model <- glm(default ~ student, data = Train, family = 'binomial')
Test$model_prob <- predict(model, Test, type = "response")
head(Test)
Test <- Test %>% mutate(default_pred = ifelse(model_prob > .5, 'Yes', 'No'))
head(Test)
Test <- Test %>% mutate(accurate = 1 * (default == default_pred))
sum(Test$accurate) / nrow(Test)   # overall accuracy on held-out data (~96%)
?createDataPartition