|
| 1 | +--- |
| 2 | +title: "2025 SISBID High-Dimensional Hypothesis Testing Lab" |
| 3 | +author: "Genevera I. Allen and Yufeng Liu" |
| 4 | +output: |
| 5 | + html_document: default |
| 6 | + pdf_document: default |
| 7 | +always_allow_html: yes |
| 8 | +--- |
| 9 | + |
| 10 | +```{r setup, include=FALSE} |
| 11 | +knitr::opts_chunk$set(echo = TRUE) |
| 12 | +``` |
| 13 | + |
| 14 | +Load Packages |
| 15 | +```{r message= FALSE} |
| 16 | +library(sda) |
| 17 | +library(ggplot2) |
| 18 | +``` |
| 19 | + |
| 20 | +H_0 : feature is not associated with the response. |
| 21 | +## Data set 1 - Simulated Data |
| 22 | +Small simulated data set to demonstrate multiple testing when **all null hypthesis hold**. |
| 23 | +```{r} |
| 24 | +#simulate data |
| 25 | +x <- matrix(rnorm(1000*50),ncol=50) |
| 26 | +y <- sample(c(0,1),50,rep=TRUE) |
| 27 | +ps <- NULL |
| 28 | +for(i in 1:1000){ |
| 29 | + ps <- c(ps,t.test(x[i,y==0],x[i,y==1])$p.value) |
| 30 | +} |
| 31 | +cat("Around 5% of p-values are below 0.05:",mean(ps<.05),fill=TRUE) |
| 32 | +``` |
| 33 | + |
| 34 | +Benjamini-Hochberg Algorithm for FDR Control |
| 35 | +```{r} |
| 36 | +fdrs.bh <- p.adjust(ps, method="BH") |
| 37 | +BHData = data.frame(cbind(ps,fdrs.bh)) |
| 38 | +colnames(BHData) = c("OriginalP","BH.P") |
| 39 | +ggplot(BHData) + |
| 40 | + geom_point(mapping = aes(x = OriginalP, y = BH.P)) + |
| 41 | + ggtitle("Original p-value vs p-value after Benjamini-Hochberg correction") + |
| 42 | + theme(plot.title = element_text(hjust = 0.5)) + |
| 43 | + xlab("Orginal p-values") + ylab("p-values after Benjamini-Hochberg correction") |
| 44 | +``` |
| 45 | + |
| 46 | + |
| 47 | +```{r} |
| 48 | +BHData$index = 1:nrow(BHData) |
| 49 | +ggplot(BHData) + |
| 50 | + geom_point(mapping = aes(x = index, y = BH.P)) + |
| 51 | + ggtitle("p-value after Benjamini-Hochberg correction") + |
| 52 | + theme(plot.title = element_text(hjust = 0.5)) + |
| 53 | + xlab("Index") + ylab("p-values after Benjamini-Hochberg correction") |
| 54 | +``` |
| 55 | + |
| 56 | + |
| 57 | +## Data set 2 - Simulated Data |
| 58 | +Small simulated data set to demonstrate multiple testing when **not all null hypthesis hold**. |
| 59 | +```{r} |
| 60 | +#simulate data |
| 61 | +x <- matrix(rnorm(1000*50),ncol=50) |
| 62 | +y <- sample(c(0,1),50,rep=TRUE) |
| 63 | +x[1:100,y==0] <- x[1:100,y==0] + 1 |
| 64 | +ps <- NULL |
| 65 | +for(i in 1:1000) { |
| 66 | + ps <- c(ps,t.test(x[i,y==0],x[i,y==1])$p.value) |
| 67 | +} |
| 68 | +cat("Way more than 5% of p-values are below 0.05:",mean(ps<.05),fill=TRUE) |
| 69 | +``` |
| 70 | + |
| 71 | +```{r} |
| 72 | +fdrs.bh <- p.adjust(ps, method="BH") |
| 73 | +# plot |
| 74 | +BHData = data.frame(cbind(ps,fdrs.bh)) |
| 75 | +colnames(BHData) = c("OriginalP","BH.P") |
| 76 | +ggplot(BHData) + |
| 77 | + geom_point(mapping = aes(x = OriginalP, y = BH.P)) + |
| 78 | + ggtitle("Original p-value vs p-value after Benjamini-Hochberg correction") + |
| 79 | + theme(plot.title = element_text(hjust = 0.5)) + |
| 80 | + xlab("Orginal p-values") + ylab("p-values after Benjamini-Hochberg correction") |
| 81 | +
|
| 82 | +BHData$index = 1:nrow(BHData) |
| 83 | +ggplot(BHData) + |
| 84 | + geom_point(mapping = aes(x = index, y = BH.P)) + |
| 85 | + ggtitle("p-value after Benjamini-Hochberg correction") + |
| 86 | + theme(plot.title = element_text(hjust = 0.5)) + |
| 87 | + xlab("Index") + ylab("p-values after Benjamini-Hochberg correction") |
| 88 | +``` |
| 89 | + |
| 90 | + |
| 91 | +```{r} |
| 92 | +cat("Number of Tests with FDR below 0.4:",sum(fdrs.bh<0.4), fill=TRUE) |
| 93 | +cat("Compute the BH FDR Directly:",max(which(sort(ps,decreasing=FALSE) < .4*(1:1000)/1000)), |
| 94 | + fill=TRUE) |
| 95 | +BHData = BHData[order(ps,decreasing = FALSE),] |
| 96 | +BHData$index = 1:nrow(BHData) |
| 97 | +# plot |
| 98 | +ggplot(BHData) + |
| 99 | + geom_point(mapping = aes(x = index, y = OriginalP)) + |
| 100 | + ggtitle("Original p-values with different correction methods") + |
| 101 | + geom_abline(intercept = 0.4/1000,slope = 0,col= "red") + #Bonferroni |
| 102 | + geom_abline(intercept = 0 ,slope = 0.4/1000,col= "blue") + #BH procedure |
| 103 | + theme(plot.title = element_text(hjust = 0.5)) + |
| 104 | + xlab("Index") + ylab("Ordered p-values") |
| 105 | +
|
| 106 | +``` |
| 107 | + |
| 108 | +### Data set 3, Real Data: Prostate Data (Singh et al. 2002). This data set consists of gene expression levels for 6033 genes among 102 men. |
| 109 | +The dataset is available from the R package "sda" |
| 110 | +* Problem 1 - We wish to identify important genes to differetiate cancer or healthy patients. What kind of tests are reasonable? |
| 111 | +* Problem 2 - In order to adjust for multiple comparisons, which procedures should one use? |
| 112 | +* Problem 3 - Examine the list of genes identified. |
| 113 | +```{r} |
| 114 | +## import data |
| 115 | +data(singh2002) |
| 116 | +x = singh2002$x |
| 117 | +y = singh2002$y |
| 118 | +
|
| 119 | +n1 = sum(y == "healthy") |
| 120 | +n2 = length(y) - n1 |
| 121 | +``` |
| 122 | + |
| 123 | + |
| 124 | +```{r} |
| 125 | +ps<-NULL |
| 126 | +for(i in 1:ncol(x)) { |
| 127 | + ps <- c(ps, t.test(x[1:n1,i], x[(n1+1):(n1+n2),i])$p.value) |
| 128 | +} |
| 129 | +## ordered p-values names(ps)<-seq(1,ncol(x),1) |
| 130 | +p1 =sort (ps) |
| 131 | +
|
| 132 | +## plot ordered p-values |
| 133 | +plot(p1[1:100], pch=rep('*',100),ylim=c(0,0.003), ylab="ordered p-values") |
| 134 | +## rejection boundry of Benjamini-Hochberg's procedure |
| 135 | +abline(a=0, b=0.1/ncol(x), col="red") |
| 136 | +## rejection boundary of Bonferroni at 0.1 |
| 137 | +abline(a=0.1/ncol(x), b=0, col="blue",lty=5) |
| 138 | +
|
| 139 | +cat("Compute the no. rejection by Bonferroni:", |
| 140 | + max(which(sort(ps,decreasing=FALSE) < .1/ncol(x))), fill=TRUE) |
| 141 | +cat("Compute the BH FDR Directly:", |
| 142 | +max(which(sort(ps,decreasing=FALSE) < .1*(1:ncol(x))/ncol(x))), |
| 143 | + fill=TRUE) |
| 144 | +
|
| 145 | +arrows(x0 = 61, y0 = 0.00085, x1 = 58, y1 = p1[57], length = 0.1) |
| 146 | +text(63.5, 0.00085, labels="imax = 57", cex=.8, pos=4, col="black") |
| 147 | +legend("topleft",legend=c("BH's Procedure", "Bonferroni", "Ordered p-values"), |
| 148 | + lty=c(1, 5, NA), col=c("red","blue", "black"), pch = c(NA, NA, '*')) |
| 149 | +``` |
0 commit comments