Skip to content

Commit 304bba4

Browse files
authored
Add files via upload
1 parent 1f9b9be commit 304bba4

6 files changed

Lines changed: 1829 additions & 0 deletions

2025_SISBID_PCA_Demo.Rmd

Lines changed: 239 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,239 @@
1+
---
2+
title: "2025 SISBID Dimension Reduction Demo"
3+
author: "Genevera I. Allen & Yufeng Liu"
4+
output:
5+
html_document: default
6+
pdf_document: default
7+
---
8+
9+
# PCA Demo Using Digits Data
10+
11+
Load Packages
12+
```{r, echo = TRUE}
13+
library(ggplot2)
14+
library(GGally)
15+
```
16+
17+
Load Digits Data
18+
19+
```{r, echo = TRUE}
20+
#code for digits - ALL
21+
rm(list=ls())
22+
load("UnsupL_SISBID_2025.Rdata")
23+
```
24+
25+
Create Subset of just 3's and 8's
26+
27+
```{r, echo = TRUE}
28+
dat38 = rbind(digits[which(rownames(digits)==3),],digits[which(rownames(digits)==8),])
29+
```
30+
31+
## Try Princomp
32+
33+
With Centering & Scaling
34+
35+
```{r, echo = TRUE}
36+
pc = princomp(dat38) #default - centers and scales
37+
biplot(pc,cex=.7)
38+
screeplot(pc)
39+
```
40+
41+
PC Scatterplot
42+
43+
```{r, echo = TRUE}
44+
PC1 <- as.matrix(x=pc$scores[,1])
45+
PC2 <- as.matrix(pc$scores[,2])
46+
plot(PC1,PC2,type="n",xlab="PC1",ylab="PC2")
47+
text(PC1,PC2,rownames(dat38),col=rownames(dat38))
48+
```
49+
50+
51+
Pairs Plot Using ggpairs
52+
53+
```{r, echo = TRUE}
54+
PC1 <- as.matrix(x=pc$scores[,1])
55+
PC2 <- as.matrix(pc$scores[,2])
56+
PC3 <- as.matrix(pc$scores[,3])
57+
PC4 <- as.matrix(pc$scores[,4])
58+
PC5<-as.matrix(pc$scores[,5])
59+
60+
pc.df.digits <- data.frame(digit_name = row.names(dat38), PC1, PC2,PC3, PC4, PC5)
61+
62+
ggpairs(pc.df.digits, mapping = aes(color = digit_name))
63+
```
64+
65+
PC Loadings
66+
67+
```{r, echo = TRUE}
68+
par(mfrow=c(3,5),mar=c(.1,.1,.1,.1))
69+
for(i in 1:15){
70+
imagedigit(pc$loadings[,i])
71+
}
72+
```
73+
74+
Variance explained
75+
76+
```{r, echo = TRUE}
77+
78+
varex = 100*pc$sdev^2/sum(pc$sdev^2)
79+
par(mfrow=c(2,1))
80+
screeplot(pc)
81+
plot(varex,type="l",ylab="% Variance Explained",xlab="Component")
82+
```
83+
84+
Cumulative variance explained
85+
86+
```{r, echo = TRUE}
87+
#cumulative variance explained
88+
cvarex = NULL
89+
for(i in 1:ncol(cdat)){
90+
cvarex[i] = sum(varex[1:i])
91+
}
92+
plot(cvarex,type="l",ylab="Cumulative Variance Explained",xlab="Component", main = "Principal Component vs. Variance Explained" )
93+
```
94+
95+
96+
## Compare to SVD
97+
98+
Without Centering & Scaling
99+
100+
```{r, echp=TRUE}
101+
svdd = svd(dat38)
102+
U = svdd$u
103+
V = svdd$v #PC loadings
104+
D = svdd$d
105+
Z = dat38%*%V #PCs
106+
```
107+
108+
PC Scatterplots
109+
110+
```{r, echo = TRUE}
111+
PC1 <- U[,1]
112+
PC2 <- U[,2]
113+
plot(PC1,PC2,type="n",xlab="PC1",ylab="PC2")
114+
text(PC1,PC2,rownames(dat38),col=rownames(dat38))
115+
```
116+
117+
Pairs Plot Using ggpairs
118+
119+
```{r, echo = TRUE}
120+
PC1 <- U[,1]
121+
PC2 <- U[,2]
122+
PC3 <- U[,3]
123+
PC4 <- U[,4]
124+
PC5 <- U[,5]
125+
126+
pc.df.digits <- data.frame(digit_name = row.names(dat38), PC1, PC2,PC3, PC4, PC5)
127+
128+
ggpairs(pc.df.digits, mapping = aes(color = digit_name))
129+
```
130+
131+
PC Loadings
132+
133+
```{r, echo = TRUE}
134+
par(mfrow=c(3,5),mar=c(.1,.1,.1,.1))
135+
for(i in 1:15){
136+
imagedigit(V[,i])
137+
}
138+
```
139+
140+
141+
Variance Explained
142+
```{r, echo = TRUE}
143+
#Variance Explained
144+
varex = 0
145+
cumvar = 0
146+
denom = sum(D^2)
147+
for(i in 1:256){
148+
varex[i] = D[i]^2/denom
149+
cumvar[i] = sum(D[1:i]^2)/denom
150+
}
151+
152+
```
153+
154+
155+
Screeplot
156+
```{r, echo = TRUE}
157+
158+
par(mfrow=c(1,2))
159+
plot(1:256,varex,type="l",lwd=2,xlab="PC",ylab="% Variance Explained")
160+
plot(1:256,cumvar,type="l",lwd=2,xlab="PC",ylab="Cummulative Variance Explained")
161+
```
162+
163+
164+
# PCA Demo Using College Data
165+
166+
Load in Packages
167+
```{r, echo = TRUE}
168+
library(ISLR)
169+
```
170+
171+
```{r, echo = TRUE}
172+
data(College)
173+
cdat = College[,2:18]
174+
dim(cdat)
175+
names(cdat)
176+
```
177+
178+
```{r, echo = TRUE}
179+
pc = princomp(cdat) #default - centers and scales
180+
181+
#Go back and display these plots side by side
182+
183+
biplot(pc,cex=.7)
184+
screeplot(pc)
185+
186+
```
187+
188+
scatter plots - patterns among observations
189+
190+
```{r, echo = TRUE}
191+
PC1 <- as.matrix(x=pc$scores[,1])
192+
PC2 <- as.matrix(pc$scores[,2])
193+
194+
PC <- data.frame(State = row.names(cdat), PC1, PC2)
195+
ggplot(PC, aes(PC1, PC2)) +
196+
geom_text(aes(label = State), size = 3) +
197+
xlab("PC1") +
198+
ylab("PC2") +
199+
ggtitle("First Two Principal Components of College Data")
200+
201+
```
202+
203+
Pairs Plot
204+
205+
```{r, echo = TRUE}
206+
comp_labels<-c("PC1","PC2","PC3","PC4", "PC5")
207+
pairs(pc$scores[,1:5], labels = comp_labels, main = "Pairs of PC's for College Data")
208+
```
209+
210+
Loadings - variables that contribute to these patterns
211+
212+
```{r, echo = TRUE}
213+
214+
par(mfrow=c(2,1))
215+
barplot(pc$loadings[,1],cex.names=.6,main="PC 1 Loadings")
216+
barplot(pc$loadings[,2],cex.names=.6,main="PC 2 Loadings")
217+
```
218+
219+
Variance explained
220+
221+
```{r, echo = TRUE}
222+
223+
varex = 100*pc$sdev^2/sum(pc$sdev^2)
224+
par(mfrow=c(2,1))
225+
screeplot(pc)
226+
plot(varex,type="l",ylab="% Variance Explained",xlab="Component")
227+
```
228+
229+
Cumulative variance explained
230+
231+
```{r, echo = TRUE}
232+
#cumulative variance explained
233+
cvarex = NULL
234+
for(i in 1:ncol(cdat)){
235+
cvarex[i] = sum(varex[1:i])
236+
}
237+
plot(cvarex,type="l",ylab="Cumulative Variance Explained",xlab="Component", main = "Principal Component V. Variance Explained" )
238+
```
239+

2025_SISBID_PCA_Demo.html

Lines changed: 579 additions & 0 deletions
Large diffs are not rendered by default.

2025_SISBID_PCA_Demo.pdf

2.92 MB
Binary file not shown.

0 commit comments

Comments
 (0)