Skip to content

Commit 9d5f328

Browse files
authored
Add files via upload
1 parent 8b03183 commit 9d5f328

3 files changed

Lines changed: 1089 additions & 0 deletions

File tree

2025_SISBID_Unsupervised_Lab.Rmd

Lines changed: 333 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,333 @@
1+
---
2+
title: "2025 SISBID Unsupervised Lab"
3+
author: "Genevera I. Allen & Yufeng Liu"
4+
output:
5+
html_document: default
6+
pdf_document: default
7+
---
8+
9+
# Data Description
10+
11+
## gdat is Gene Expression Data, n = 445 patients x p = 353 genes
12+
- Only 353 genes with somatic mutations from COSMIC are retained
13+
14+
## Data is Level III TCGA BRCA RNA-Sequencing gene expression data that have already been pre-processed according to the following steps:
15+
- Reads normalized by RPKM
16+
- Corrected for overdispersion by a log-transformation (1 + data)
17+
- Short gene name labels are given as the column names
18+
19+
## cdat is Clinical Data, n = 445 patients x q = 6 clinical features
20+
- Subtype - denotes 5 PAM50 subtypes including Basal-like, Luminal A, Luminal B, HER2-enriched, and Normal-like
21+
- ER-Status - estrogen-receptor status
22+
- PR-Status - progesterone-receptor status
23+
- HER2-Status - human epidermal growth factor receptor 2 status
24+
- Node - number of lymph nodes involved
25+
- Metastasis - indicator for whether the cancer has metastasized
26+
27+
# Problems
28+
29+
30+
## Problem 1 - Dimension reduction
31+
32+
1a - Apply PCA, NMF, ICA and MDS, UMAP, and tSNE to this dataset. Compare and contrast the results using these methods.
33+
1b - Relate the dimension reduction results with the clinical data. Is any clinical information reflected in the lower dimensional spaces?
34+
35+
1c - Overall, which dimension reduction method do you recommend for this data set and why?
36+
37+
38+
## Problem 2 - Clustering
39+
40+
2a - Apply various clustering algorithms such as K-means (explore different K), hierarchical clustering (explore different linkages), NMF, and biclustering. Compare the clustering results using these methods.
41+
42+
2b - Relate the clustering results with the clinical data. Can the clustering algorithm recover some of the clinical information such as cancer subtypes?
43+
44+
2c - (Optional) Validate your cluster findings.
45+
46+
2c - Overall, which clustering method(s) do you recommend for this data set and why?
47+
48+
49+
## Problem 3 - Multiple comparisons
50+
51+
3a - Identify important genes to differetiate ER postive and negative, PR postive and negative, HER2 postive and negative, metastasis status.
52+
53+
3b - Try different procedures to adjust for multiple comparisons.
54+
55+
3c - Examine the lists of genes identified using different methods for each clinical response. Which method is best? Why?
56+
57+
58+
## Problem 5 - Graphical models
59+
60+
5a - Use graphical models to explore interactions among genes. Are any of the well-connected genes related to patterns previously identified?
61+
62+
63+
## Problem 6 - Visulaization
64+
65+
66+
6a - Visualize this data using multiple approaches.
67+
68+
6b - Prepare the "best" visual summary of this data.
69+
70+
71+
72+
## Problem 7 - Exploratory Data Analysis Summary
73+
74+
75+
7a - What is the most interesting finding?
76+
77+
7b - Is this finding consistent and stable?
78+
79+
7c - Prepare a visual summary that best illustrates this interesting finding.
80+
81+
82+
83+
# R scripts to help out with the BRCA case study Lab
84+
## Don't peek at this if you want to practice coding on your own!!
85+
86+
Load Data
87+
```{r, echo = TRUE}
88+
load("UnsupL_SISBID_2025.Rdata")
89+
library(ggplot2)
90+
library(kknn)
91+
library(GGally)
92+
library(umap)
93+
library(Rtsne)
94+
library(igraph)
95+
library(huge)
96+
```
97+
98+
99+
Explore Data
100+
```{r, echo = TRUE}
101+
dim(gdat)
102+
dim(cdat)
103+
104+
# clinical data
105+
table(cdat$Subtype)
106+
table(cdat$ER)
107+
table(cdat$PR)
108+
table(cdat$HER2)
109+
table(cdat$Node)
110+
table(cdat$Metastasis)
111+
table(cdat$ER,cdat$PR)
112+
113+
```
114+
115+
116+
## Cluster Heatmap - biclustering
117+
```{r, echo = TRUE}
118+
#cluster heatmap - biclustering
119+
aa = grep("grey",colors())
120+
bb = grep("green",colors())
121+
cc = grep("red",colors())
122+
gcol2 = colors()[c(aa[1:2],bb[1:25],cc[1:50])]
123+
124+
```
125+
126+
Without scaling
127+
```{r, echo = TRUE}
128+
heatmap(gdat,col=gcol2,hclustfun=function(x)hclust(x,method="ward.D"))
129+
```
130+
131+
With scaling
132+
```{r, echo = TRUE}
133+
134+
heatmap(scale(gdat),col=gcol2,hclustfun=function(x)hclust(x,method="ward.D"))
135+
136+
Cols=function(vec){cols=rainbow(length(unique(vec)))
137+
return(cols[as.numeric(as.factor(vec))])}
138+
139+
heatmap(scale(gdat),col=gcol2,hclustfun=function(x)hclust(x,method="ward.D"),labRow=cdat$Subtype,RowSideColors=Cols(cdat$Subtype))
140+
```
141+
142+
## Dimension Reduction
143+
144+
PCA
145+
```{r, echo = TRUE}
146+
Cols=function(vec){cols=rainbow(length(unique(vec)))
147+
return(cols[as.numeric(as.factor(vec))])}
148+
149+
sv = svd(scale(gdat,center=TRUE,scale=FALSE))
150+
V = sv$v
151+
Z = gdat%*%V
152+
153+
K = 3
154+
pclabs = c("PC1","PC2","PC3","PC4")
155+
par(mfrow=c(1,K))
156+
for(i in 1:K){
157+
j = i+1
158+
plot(Z[,i],Z[,j],pch=16,xlab=pclabs[i],ylab=pclabs[j],col=Cols(cdat$Subtype))
159+
}
160+
legend(-45,0,pch=16,col=rainbow(5),levels(cdat$Subtype))
161+
```
162+
163+
Pairs Plot
164+
```{r, echo = TRUE}
165+
PC1<-as.matrix(Z[,1])
166+
PC2<-as.matrix(Z[,2])
167+
PC3<-as.matrix(Z[,3])
168+
PC4<-as.matrix(Z[,4])
169+
PC5<-as.matrix(Z[,5])
170+
171+
pc.df.cdat<-data.frame(Subtype = cdat$Subtype, PC1, PC2, PC3, PC4, PC5)
172+
ggpairs(pc.df.cdat, mapping = aes(color = Subtype))
173+
```
174+
175+
MDS
176+
```{r, echo = TRUE}
177+
178+
Dmat = dist(gdat,method="maximum")
179+
mdsres = cmdscale(Dmat,k=2)
180+
plot(mdsres[,1],mdsres[,2],pch=16,col=Cols(cdat$Subtype), main = "Dimension Reduction MDS")
181+
legend(-30,20,pch=16,col=rainbow(5),levels(cdat$Subtype))
182+
183+
```
184+
185+
186+
ICA
187+
```{r, echo = TRUE}
188+
189+
require("fastICA")
190+
191+
K = 4
192+
icafit = fastICA(gdat,n.comp=K)
193+
194+
kk = 3
195+
pclabs = c("ICA1","ICA2","ICA3","ICA4")
196+
par(mfrow=c(1,kk))
197+
for(i in 1:kk){
198+
j = i+1
199+
plot(icafit$A[i,],icafit$A[j,],pch=16,xlab=pclabs[i],ylab=pclabs[j],col=Cols(cdat$Subtype))
200+
}
201+
legend(-1,2.8,pch=16,col=rainbow(5),levels(cdat$Subtype))
202+
```
203+
204+
UMAP
205+
```{r, echo = TRUE}
206+
gdat.umap = umap(gdat)
207+
plot(gdat.umap$layout[,1], y =gdat.umap$layout[,2], type = "n", main = "UMAP", xlab = "UMAP1", ylab = "UMAP2")
208+
text(gdat.umap$layout[,1], y =gdat.umap$layout[,2], type = "n", cdat$Subtype, col=Cols(cdat$Subtype), cex = .7 )
209+
210+
```
211+
212+
213+
214+
## Clustering
215+
216+
K-means
217+
```{r, echo = TRUE}
218+
219+
K = 5
220+
km = kmeans(gdat,centers=K,nstart=25)
221+
table(km$cluster,cdat$Subtype)
222+
223+
```
224+
225+
226+
Plot Kmeans with labels
227+
```{r, echo = TRUE}
228+
plot(Z[,1],Z[,2],col=km$cluster, main = "Plot Kmeans Clusters ", xlab = "PC1", ylab = "PC2")
229+
text(Z[,1],Z[,2],cdat$Subtype,cex=.75,col=km$cluster)
230+
cens = km$centers
231+
points(cens%*%V[,1],cens%*%V[,2],col=1:K,pch=16,cex=3)
232+
```
233+
234+
235+
Hierarchical
236+
```{r, echo = TRUE}
237+
#which linakge is the best?
238+
#which distance metric is the best?
239+
240+
Dmat = dist(gdat)
241+
com.hc = hclust(Dmat,method="ward.D")
242+
243+
244+
plot(com.hc,labels=cdat$Subtype,cex=.5)
245+
246+
res.com = cutree(com.hc,5)
247+
table(res.com,cdat$Subtype)
248+
```
249+
250+
Consensus Clustering with Hierarchical
251+
```{r, echo = TRUE}
252+
#Note that ConsensusClusterPlus not available for R version 4.0.2
253+
#results = ConsensusClusterPlus(gdat,maxK=6,reps=500,pItem=0.8,pFeature=1,
254+
#clusterAlg="hc",distance="pearson",plot="png")
255+
```
256+
257+
Look at results for first 5 clusters
258+
```{r, echo = TRUE}
259+
#heatmap(results[[2]][["consensusMatrix"]][1:5,1:5])
260+
```
261+
262+
263+
Spectral Clustering
264+
```{r, echo = TRUE}
265+
K = 5
266+
s_gdat = specClust(gdat, centers=K, nn = 7, method = "symmetric", gmax=NULL)
267+
```
268+
269+
Visualize
270+
```{r, echo = TRUE}
271+
plot(Z[,1],Z[,2],col=s_gdat$cluster, main = "Visualize Spectral Clusters", xlab = "PC1", ylab = "PC2")
272+
text(Z[,1],Z[,2],cdat$Subtype,cex=.75,col=s_gdat$cluster)
273+
```
274+
275+
276+
277+
## Genes significantly associated with ER or PR Status, etc
278+
```{r, echo = TRUE}
279+
280+
x = gdat[cdat$ER=="Positive" | cdat$ER=="Negative",]
281+
y.er = cdat$ER[cdat$ER=="Positive" | cdat$ER=="Negative"]
282+
y.label = rep(1, length(y.er))
283+
y.label[y.er == "Positive"]=2
284+
285+
ps = NULL
286+
for(i in 1:ncol(gdat)) ps = c(ps,
287+
t.test(x[y.label==1,i],x[y.label==2,i])$p.value)
288+
fdrs.bh = p.adjust(ps, method="BH")
289+
290+
cat("Number of Tests significant with alpha=0.1 using Bonferroni correction:",
291+
sum(ps<0.1/length(y.label)), fill=TRUE)
292+
293+
cat("Number of Tests with FDR below 0.1:",
294+
sum(fdrs.bh<0.1), fill=TRUE)
295+
296+
plot(sort(ps,decreasing=FALSE),ylab="P-Values")
297+
#BH procedure
298+
abline(a=0, b=0.1/length(y.label),col="red")
299+
#Bonferroni
300+
abline(a=0.1/length(y.label), b=0,col="blue")
301+
```
302+
303+
304+
## Graphical models - how are genes related?
305+
306+
```{r}
307+
# use huge package
308+
neth = huge(gdat,method="mb")
309+
plot(neth)
310+
```
311+
312+
```{r}
313+
## stability selection with huge
314+
net.s <- huge.select(neth, criterion="stars")
315+
net.s
316+
plot(net.s)
317+
```
318+
319+
320+
```{r}
321+
#larger lambda
322+
mat <- neth$path[[2]]
323+
neti <- as.undirected(graph_from_adjacency_matrix(mat))
324+
plot(neti,vertex.label=colnames(gdat),vertex.size=2,vertex.label.cex=1.2,vertex.label.dist=1,layout=layout_with_kk)
325+
```
326+
327+
328+
```{r}
329+
#smaller lambda
330+
mat = neth$path[[6]]
331+
neti = as.undirected(graph_from_adjacency_matrix(mat))
332+
plot(neti,vertex.label=colnames(gdat),vertex.size=2,vertex.label.cex=1.2,vertex.label.dist=1,layout=layout_with_kk)
333+
```

2025_SISBID_Unsupervised_Lab.html

Lines changed: 756 additions & 0 deletions
Large diffs are not rendered by default.

2025_SISBID_Unsupervised_Lab.pdf

3.1 MB
Binary file not shown.

0 commit comments

Comments
 (0)