|
| 1 | +--- |
| 2 | +title: "2025 SISBID Unsupervised Lab" |
| 3 | +author: "Genevera I. Allen & Yufeng Liu" |
| 4 | +output: |
| 5 | + html_document: default |
| 6 | + pdf_document: default |
| 7 | +--- |
| 8 | + |
| 9 | +# Data Description |
| 10 | + |
| 11 | +## gdat is Gene Expression Data, n = 445 patients x p = 353 genes |
| 12 | +- Only 353 genes with somatic mutations from COSMIC are retained |
| 13 | + |
| 14 | +## Data is Level III TCGA BRCA RNA-Sequencing gene expression data that have already been pre-processed according to the following steps: |
| 15 | +- Reads normalized by RPKM |
| 16 | +- Corrected for overdispersion by a log-transformation (1 + data) |
| 17 | +- Short gene name labels are given as the column names |
| 18 | + |
| 19 | +## cdat is Clinical Data, n = 445 patients x q = 6 clinical features |
| 20 | +- Subtype - denotes 5 PAM50 subtypes including Basal-like, Luminal A, Luminal B, HER2-enriched, and Normal-like |
| 21 | +- ER-Status - estrogen-receptor status |
| 22 | +- PR-Status - progesterone-receptor status |
| 23 | +- HER2-Status - human epidermal growth factor receptor 2 status |
| 24 | +- Node - number of lymph nodes involved |
| 25 | +- Metastasis - indicator for whether the cancer has metastasized |
| 26 | + |
| 27 | +# Problems |
| 28 | + |
| 29 | + |
| 30 | +## Problem 1 - Dimension reduction |
| 31 | + |
| 32 | +1a - Apply PCA, NMF, ICA and MDS, UMAP, and tSNE to this dataset. Compare and contrast the results using these methods. |
| 33 | +1b - Relate the dimension reduction results with the clinical data. Is any clinical information reflected in the lower dimensional spaces? |
| 34 | + |
| 35 | +1c - Overall, which dimension reduction method do you recommend for this data set and why? |
| 36 | + |
| 37 | + |
| 38 | +## Problem 2 - Clustering |
| 39 | + |
| 40 | +2a - Apply various clustering algorithms such as K-means (explore different K), hierarchical clustering (explore different linkages), NMF, and biclustering. Compare the clustering results using these methods. |
| 41 | + |
| 42 | +2b - Relate the clustering results with the clinical data. Can the clustering algorithm recover some of the clinical information such as cancer subtypes? |
| 43 | + |
| 44 | +2c - (Optional) Validate your cluster findings. |
| 45 | + |
| 46 | +2c - Overall, which clustering method(s) do you recommend for this data set and why? |
| 47 | + |
| 48 | + |
| 49 | +## Problem 3 - Multiple comparisons |
| 50 | + |
| 51 | +3a - Identify important genes to differetiate ER postive and negative, PR postive and negative, HER2 postive and negative, metastasis status. |
| 52 | + |
| 53 | +3b - Try different procedures to adjust for multiple comparisons. |
| 54 | + |
| 55 | +3c - Examine the lists of genes identified using different methods for each clinical response. Which method is best? Why? |
| 56 | + |
| 57 | + |
| 58 | +## Problem 5 - Graphical models |
| 59 | + |
| 60 | +5a - Use graphical models to explore interactions among genes. Are any of the well-connected genes related to patterns previously identified? |
| 61 | + |
| 62 | + |
| 63 | +## Problem 6 - Visulaization |
| 64 | + |
| 65 | + |
| 66 | +6a - Visualize this data using multiple approaches. |
| 67 | + |
| 68 | +6b - Prepare the "best" visual summary of this data. |
| 69 | + |
| 70 | + |
| 71 | + |
| 72 | +## Problem 7 - Exploratory Data Analysis Summary |
| 73 | + |
| 74 | + |
| 75 | +7a - What is the most interesting finding? |
| 76 | + |
| 77 | +7b - Is this finding consistent and stable? |
| 78 | + |
| 79 | +7c - Prepare a visual summary that best illustrates this interesting finding. |
| 80 | + |
| 81 | + |
| 82 | + |
| 83 | +# R scripts to help out with the BRCA case study Lab |
| 84 | +## Don't peek at this if you want to practice coding on your own!! |
| 85 | + |
| 86 | +Load Data |
| 87 | +```{r, echo = TRUE} |
| 88 | +load("UnsupL_SISBID_2025.Rdata") |
| 89 | +library(ggplot2) |
| 90 | +library(kknn) |
| 91 | +library(GGally) |
| 92 | +library(umap) |
| 93 | +library(Rtsne) |
| 94 | +library(igraph) |
| 95 | +library(huge) |
| 96 | +``` |
| 97 | + |
| 98 | + |
| 99 | +Explore Data |
| 100 | +```{r, echo = TRUE} |
| 101 | +dim(gdat) |
| 102 | +dim(cdat) |
| 103 | +
|
| 104 | +# clinical data |
| 105 | +table(cdat$Subtype) |
| 106 | +table(cdat$ER) |
| 107 | +table(cdat$PR) |
| 108 | +table(cdat$HER2) |
| 109 | +table(cdat$Node) |
| 110 | +table(cdat$Metastasis) |
| 111 | +table(cdat$ER,cdat$PR) |
| 112 | +
|
| 113 | +``` |
| 114 | + |
| 115 | + |
| 116 | +## Cluster Heatmap - biclustering |
| 117 | +```{r, echo = TRUE} |
| 118 | +#cluster heatmap - biclustering |
| 119 | +aa = grep("grey",colors()) |
| 120 | +bb = grep("green",colors()) |
| 121 | +cc = grep("red",colors()) |
| 122 | +gcol2 = colors()[c(aa[1:2],bb[1:25],cc[1:50])] |
| 123 | +
|
| 124 | +``` |
| 125 | + |
| 126 | +Without scaling |
| 127 | +```{r, echo = TRUE} |
| 128 | +heatmap(gdat,col=gcol2,hclustfun=function(x)hclust(x,method="ward.D")) |
| 129 | +``` |
| 130 | + |
| 131 | +With scaling |
| 132 | +```{r, echo = TRUE} |
| 133 | +
|
| 134 | +heatmap(scale(gdat),col=gcol2,hclustfun=function(x)hclust(x,method="ward.D")) |
| 135 | +
|
| 136 | +Cols=function(vec){cols=rainbow(length(unique(vec))) |
| 137 | + return(cols[as.numeric(as.factor(vec))])} |
| 138 | +
|
| 139 | +heatmap(scale(gdat),col=gcol2,hclustfun=function(x)hclust(x,method="ward.D"),labRow=cdat$Subtype,RowSideColors=Cols(cdat$Subtype)) |
| 140 | +``` |
| 141 | + |
| 142 | +## Dimension Reduction |
| 143 | + |
| 144 | +PCA |
| 145 | +```{r, echo = TRUE} |
| 146 | +Cols=function(vec){cols=rainbow(length(unique(vec))) |
| 147 | + return(cols[as.numeric(as.factor(vec))])} |
| 148 | +
|
| 149 | +sv = svd(scale(gdat,center=TRUE,scale=FALSE)) |
| 150 | +V = sv$v |
| 151 | +Z = gdat%*%V |
| 152 | +
|
| 153 | +K = 3 |
| 154 | +pclabs = c("PC1","PC2","PC3","PC4") |
| 155 | +par(mfrow=c(1,K)) |
| 156 | +for(i in 1:K){ |
| 157 | + j = i+1 |
| 158 | + plot(Z[,i],Z[,j],pch=16,xlab=pclabs[i],ylab=pclabs[j],col=Cols(cdat$Subtype)) |
| 159 | +} |
| 160 | +legend(-45,0,pch=16,col=rainbow(5),levels(cdat$Subtype)) |
| 161 | +``` |
| 162 | + |
| 163 | +Pairs Plot |
| 164 | +```{r, echo = TRUE} |
| 165 | +PC1<-as.matrix(Z[,1]) |
| 166 | +PC2<-as.matrix(Z[,2]) |
| 167 | +PC3<-as.matrix(Z[,3]) |
| 168 | +PC4<-as.matrix(Z[,4]) |
| 169 | +PC5<-as.matrix(Z[,5]) |
| 170 | +
|
| 171 | +pc.df.cdat<-data.frame(Subtype = cdat$Subtype, PC1, PC2, PC3, PC4, PC5) |
| 172 | +ggpairs(pc.df.cdat, mapping = aes(color = Subtype)) |
| 173 | +``` |
| 174 | + |
| 175 | +MDS |
| 176 | +```{r, echo = TRUE} |
| 177 | +
|
| 178 | +Dmat = dist(gdat,method="maximum") |
| 179 | +mdsres = cmdscale(Dmat,k=2) |
| 180 | +plot(mdsres[,1],mdsres[,2],pch=16,col=Cols(cdat$Subtype), main = "Dimension Reduction MDS") |
| 181 | +legend(-30,20,pch=16,col=rainbow(5),levels(cdat$Subtype)) |
| 182 | +
|
| 183 | +``` |
| 184 | + |
| 185 | + |
| 186 | +ICA |
| 187 | +```{r, echo = TRUE} |
| 188 | +
|
| 189 | +require("fastICA") |
| 190 | +
|
| 191 | +K = 4 |
| 192 | +icafit = fastICA(gdat,n.comp=K) |
| 193 | +
|
| 194 | +kk = 3 |
| 195 | +pclabs = c("ICA1","ICA2","ICA3","ICA4") |
| 196 | +par(mfrow=c(1,kk)) |
| 197 | +for(i in 1:kk){ |
| 198 | + j = i+1 |
| 199 | + plot(icafit$A[i,],icafit$A[j,],pch=16,xlab=pclabs[i],ylab=pclabs[j],col=Cols(cdat$Subtype)) |
| 200 | +} |
| 201 | +legend(-1,2.8,pch=16,col=rainbow(5),levels(cdat$Subtype)) |
| 202 | +``` |
| 203 | + |
| 204 | +UMAP |
| 205 | +```{r, echo = TRUE} |
| 206 | +gdat.umap = umap(gdat) |
| 207 | +plot(gdat.umap$layout[,1], y =gdat.umap$layout[,2], type = "n", main = "UMAP", xlab = "UMAP1", ylab = "UMAP2") |
| 208 | +text(gdat.umap$layout[,1], y =gdat.umap$layout[,2], type = "n", cdat$Subtype, col=Cols(cdat$Subtype), cex = .7 ) |
| 209 | +
|
| 210 | +``` |
| 211 | + |
| 212 | + |
| 213 | + |
| 214 | +## Clustering |
| 215 | + |
| 216 | +K-means |
| 217 | +```{r, echo = TRUE} |
| 218 | +
|
| 219 | +K = 5 |
| 220 | +km = kmeans(gdat,centers=K,nstart=25) |
| 221 | +table(km$cluster,cdat$Subtype) |
| 222 | +
|
| 223 | +``` |
| 224 | + |
| 225 | + |
| 226 | +Plot Kmeans with labels |
| 227 | +```{r, echo = TRUE} |
| 228 | +plot(Z[,1],Z[,2],col=km$cluster, main = "Plot Kmeans Clusters ", xlab = "PC1", ylab = "PC2") |
| 229 | +text(Z[,1],Z[,2],cdat$Subtype,cex=.75,col=km$cluster) |
| 230 | +cens = km$centers |
| 231 | +points(cens%*%V[,1],cens%*%V[,2],col=1:K,pch=16,cex=3) |
| 232 | +``` |
| 233 | + |
| 234 | + |
| 235 | +Hierarchical |
| 236 | +```{r, echo = TRUE} |
| 237 | +#which linakge is the best? |
| 238 | +#which distance metric is the best? |
| 239 | +
|
| 240 | +Dmat = dist(gdat) |
| 241 | +com.hc = hclust(Dmat,method="ward.D") |
| 242 | +
|
| 243 | +
|
| 244 | +plot(com.hc,labels=cdat$Subtype,cex=.5) |
| 245 | +
|
| 246 | +res.com = cutree(com.hc,5) |
| 247 | +table(res.com,cdat$Subtype) |
| 248 | +``` |
| 249 | + |
| 250 | +Consensus Clustering with Hierarchical |
| 251 | +```{r, echo = TRUE} |
| 252 | +#Note that ConsensusClusterPlus not available for R version 4.0.2 |
| 253 | +#results = ConsensusClusterPlus(gdat,maxK=6,reps=500,pItem=0.8,pFeature=1, |
| 254 | +#clusterAlg="hc",distance="pearson",plot="png") |
| 255 | +``` |
| 256 | + |
| 257 | +Look at results for first 5 clusters |
| 258 | +```{r, echo = TRUE} |
| 259 | + #heatmap(results[[2]][["consensusMatrix"]][1:5,1:5]) |
| 260 | +``` |
| 261 | + |
| 262 | + |
| 263 | +Spectral Clustering |
| 264 | +```{r, echo = TRUE} |
| 265 | +K = 5 |
| 266 | +s_gdat = specClust(gdat, centers=K, nn = 7, method = "symmetric", gmax=NULL) |
| 267 | +``` |
| 268 | + |
| 269 | +Visualize |
| 270 | +```{r, echo = TRUE} |
| 271 | +plot(Z[,1],Z[,2],col=s_gdat$cluster, main = "Visualize Spectral Clusters", xlab = "PC1", ylab = "PC2") |
| 272 | +text(Z[,1],Z[,2],cdat$Subtype,cex=.75,col=s_gdat$cluster) |
| 273 | +``` |
| 274 | + |
| 275 | + |
| 276 | + |
| 277 | +## Genes significantly associated with ER or PR Status, etc |
| 278 | +```{r, echo = TRUE} |
| 279 | +
|
| 280 | +x = gdat[cdat$ER=="Positive" | cdat$ER=="Negative",] |
| 281 | +y.er = cdat$ER[cdat$ER=="Positive" | cdat$ER=="Negative"] |
| 282 | +y.label = rep(1, length(y.er)) |
| 283 | +y.label[y.er == "Positive"]=2 |
| 284 | +
|
| 285 | +ps = NULL |
| 286 | +for(i in 1:ncol(gdat)) ps = c(ps, |
| 287 | + t.test(x[y.label==1,i],x[y.label==2,i])$p.value) |
| 288 | +fdrs.bh = p.adjust(ps, method="BH") |
| 289 | +
|
| 290 | +cat("Number of Tests significant with alpha=0.1 using Bonferroni correction:", |
| 291 | +sum(ps<0.1/length(y.label)), fill=TRUE) |
| 292 | +
|
| 293 | +cat("Number of Tests with FDR below 0.1:", |
| 294 | +sum(fdrs.bh<0.1), fill=TRUE) |
| 295 | +
|
| 296 | +plot(sort(ps,decreasing=FALSE),ylab="P-Values") |
| 297 | +#BH procedure |
| 298 | +abline(a=0, b=0.1/length(y.label),col="red") |
| 299 | +#Bonferroni |
| 300 | +abline(a=0.1/length(y.label), b=0,col="blue") |
| 301 | +``` |
| 302 | + |
| 303 | + |
| 304 | +## Graphical models - how are genes related? |
| 305 | + |
| 306 | +```{r} |
| 307 | +# use huge package |
| 308 | +neth = huge(gdat,method="mb") |
| 309 | +plot(neth) |
| 310 | +``` |
| 311 | + |
| 312 | +```{r} |
| 313 | +## stability selection with huge |
| 314 | +net.s <- huge.select(neth, criterion="stars") |
| 315 | +net.s |
| 316 | +plot(net.s) |
| 317 | +``` |
| 318 | + |
| 319 | + |
| 320 | +```{r} |
| 321 | +#larger lambda |
| 322 | +mat <- neth$path[[2]] |
| 323 | +neti <- as.undirected(graph_from_adjacency_matrix(mat)) |
| 324 | +plot(neti,vertex.label=colnames(gdat),vertex.size=2,vertex.label.cex=1.2,vertex.label.dist=1,layout=layout_with_kk) |
| 325 | +``` |
| 326 | + |
| 327 | + |
| 328 | +```{r} |
| 329 | +#smaller lambda |
| 330 | +mat = neth$path[[6]] |
| 331 | +neti = as.undirected(graph_from_adjacency_matrix(mat)) |
| 332 | +plot(neti,vertex.label=colnames(gdat),vertex.size=2,vertex.label.cex=1.2,vertex.label.dist=1,layout=layout_with_kk) |
| 333 | +``` |
0 commit comments