1+ batch_downsample <- function (mergeddata ,empBayes = FALSE ,numberSamples = 100 ,annotation ,DownRange = seq(0.1 ,0.9 ,by = 0.05 ),allBroad ,allSanger ,OrigCorrection ,cmpAnnot ){
2+ nsamples <- nrow(annotation )
3+ annotAll <- rbind(annotation ,annotation )
4+ annotAll $ cn <- c(annotation $ Smn ,annotation $ Bmn )
5+ annotAll $ cn <- gsub(" " ," " ,annotAll $ cn )
6+
7+ rownames(annotAll )<- annotAll $ cn
8+ annottab <- table(annotation $ tissue )
9+ lineageFreq <- as.vector(annottab )/ nsamples
10+ names(lineageFreq )<- names(annottab )
11+ sampleprobs <- lineageFreq [annotation $ tissue ]
12+ names(sampleprobs )<- annotation $ model_id
13+ lineagelabels <- cmpAnnot [c(colnames(allBroad ),colnames(allSanger ))," tissue" ]
14+ names(lineagelabels )<- c(colnames(allBroad ),colnames(allSanger ))
15+ batchlabel <- c(rep(1 ,ncol(allBroad )),rep(2 ,ncol(allSanger )))
16+ names(batchlabel )<- names(lineagelabels )
17+ cor_res <- list ()
18+ euc_res <- list ()
19+ cluster_resLineage <- list ()
20+ cluster_resBatch <- list ()
21+ for (i in 1 : length(DownRange )){
22+ samples_gen <- sapply(1 : numberSamples ,function (y ) annotAll [annotAll $ model_id %in% sample(annotation $ model_id ,size = round(nsamples * DownRange [i ]),prob = sampleprobs )," cn" ])
23+ # samples_gen matrix nrow sample size, ncol numberSamples
24+ cor_res [[i ]]<- vector(" numeric" ,length = numberSamples )
25+ euc_res [[i ]]<- vector(" numeric" ,length = numberSamples )
26+ cluster_resLineage [[i ]]<- vector(" numeric" ,length = numberSamples )
27+ cluster_resBatch [[i ]]<- vector(" numeric" ,length = numberSamples )
28+ for (j in 1 : numberSamples ){
29+ result <- BatchCorrection(data1 = allBroad ,data2 = allSanger ,CombatRes = ComBatCP(mergeddata [,samples_gen [,j ]],batch = unlist(sapply(samples_gen [,j ],function (y )strsplit(y ," ---" )[[1 ]][2 ])),empBayes = empBayes ))
30+ perf_res <- compareCD(OrigCorrection ,result $ qNorm ,DownRange = DownRange )
31+ # cluster_resL<-ASWpc(result$qNorm,numberPCs = 20,lineagelabels)
32+
33+ cluster_resB <- ASWpc(result $ qNorm ,numberPCs = 20 ,lineagelabels = batchlabel )
34+ cor_res [[i ]][j ]<- perf_res $ corRes
35+ euc_res [[i ]][j ]<- perf_res $ eucRes
36+ # cluster_resLineage[[i]][j]<-cluster_resL
37+ cluster_resLineage [[i ]][j ]<- i + j
38+ cluster_resBatch [[i ]][j ]<- cluster_resB
39+ }
40+
41+
42+ }
43+ # ideal output is a list of matrices. Each matrix is one parameter with Sample value x DownRange
44+ return (list (corRes = cor_res ,eucRes = euc_res ,clusterL = cluster_resLineage ,clusterB = cluster_resBatch ))
45+ }
46+
47+ compareCD <- function (origCorrection ,sampleCorrection ,DownRange = seq(0.1 ,0.9 ,by = 0.05 ),numberPCs = 20 ,lineagelabels ){
48+ corRes <- list ()
49+ eucRes <- list ()
50+
51+ for (i in 1 : length(DownRange )){
52+ sampleRes <- sampleCorrection
53+ corRes [[i ]]<- unlist(sapply(colnames(sampleRes ),function (y ) cor(sampleRes [,y ],origCorrection [,y ],use = " pairwise" )))
54+ eucRes [[i ]]<- unlist(sapply(colnames(sampleRes ),function (y ) dist(t(na.omit(cbind(sampleRes [,y ],origCorrection [,y ]))))))
55+ }
56+ return (list (corRes = corRes ,eucRes = eucRes ))
57+ }
58+
59+
60+ batch_downsampleAMI <- function (mergeddata ,empBayes = FALSE ,numberSamples = 100 ,annotation ,DownRange = seq(0.1 ,0.9 ,by = 0.05 ),allBroad ,allSanger ,OrigCorrection ,cmpAnnot ){
61+ nsamples <- nrow(annotation )
62+ annotAll <- rbind(annotation ,annotation )
63+ annotAll $ cn <- c(annotation $ Smn ,annotation $ Bmn )
64+ annotAll $ cn <- gsub(" " ," " ,annotAll $ cn )
65+
66+ rownames(annotAll )<- annotAll $ cn
67+ annottab <- table(annotation $ tissue )
68+ lineageFreq <- as.vector(annottab )/ nsamples
69+ names(lineageFreq )<- names(annottab )
70+ sampleprobs <- lineageFreq [annotation $ tissue ]
71+ names(sampleprobs )<- annotation $ model_id
72+ lineagelabels <- cmpAnnot [c(colnames(allBroad ),colnames(allSanger ))," tissue" ]
73+ names(lineagelabels )<- c(colnames(allBroad ),colnames(allSanger ))
74+
75+ cluster_AMILineage <- list ()
76+
77+ for (i in 1 : length(DownRange )){
78+ samples_gen <- sapply(1 : numberSamples ,function (y ) annotAll [annotAll $ model_id %in% sample(annotation $ model_id ,size = round(nsamples * DownRange [i ]),prob = sampleprobs )," cn" ])
79+ # samples_gen matrix nrow sample size, ncol numberSamples
80+
81+ cluster_AMILineage [[i ]]<- vector(" numeric" ,length = numberSamples )
82+
83+ for (j in 1 : numberSamples ){
84+ result <- BatchCorrection(data1 = allBroad ,data2 = allSanger ,CombatRes = ComBatCP(mergeddata [,samples_gen [,j ]],batch = unlist(sapply(samples_gen [,j ],function (y )strsplit(y ," ---" )[[1 ]][2 ])),empBayes = empBayes ))
85+
86+ cluster_resL <- RepeatKmeansAMI(list (result $ qNorm ),genes = " var" ,nrepeat = 20 ,length(lineageFreq ),lineagelabels )
87+
88+ cluster_AMILineage [[i ]][j ]<- median(unlist(cluster_resL [[1 ]]$ ami ))
89+
90+ }
91+
92+
93+ }
94+ # ideal output is a list of matrices. Each matrix is one parameter with Sample value x DownRange
95+ return (cluster_AMILineage )
96+ }
97+
98+ ASWpc <- function (inputdata ,numberPCs = 20 ,lineagelabels ){
99+
100+ inputdata <- inputdata [,names(lineagelabels )[! is.na(lineagelabels )]]
101+ lineagelabels <- lineagelabels [! is.na(lineagelabels )]
102+ if (sum(is.na(inputdata ))!= 0 ){
103+ # Have NAs and need to impute missing values
104+ # data is genes x cell lines
105+ meanVals <- rowMeans(inputdata ,na.rm = TRUE )
106+ genesToimpute <- which(rowSums(is.na(inputdata ))!= 0 )
107+ for (i in 1 : length(genesToimpute )){
108+ selcl <- which(is.na(inputdata [genesToimpute [i ],]))
109+ inputdata [genesToimpute [i ],selcl ]<- meanVals [genesToimpute [i ]]
110+ }
111+ }
112+ estpca <- prcomp(t(inputdata ),scale. = TRUE )
113+
114+ subsetPCs <- estpca $ x [,1 : numberPCs ]
115+
116+ # do the clustering and silhouette values:
117+ distM <- dist(subsetPCs )
118+ res <- silhouetteScores(lineagelabels ,distM )
119+
120+ return (res )
121+ }
122+
123+ plotDS <- function (compareCD ,DownRange = seq(0.1 ,0.9 ,by = 0.05 ),nsamples = NULL ,plotname ,ylim = c(0.98 ,1 ),setNames = NULL ,listRes = FALSE ,noAdj = NULL ,ylab = " " ,plotcolours = NULL ,axisS = 10 ,labelS = 10 ,pwidth = 5 ,pheight = 5 ){
124+ xfactors <- paste0(" Sample" ,DownRange )
125+ if (! is.null(nsamples )){
126+ xfactors <- paste0(" N=" ,round(nsamples * DownRange ))
127+ }
128+ if (! listRes ){
129+ names(compareCD )<- xfactors
130+ corvals <- c()
131+ xfvals <- c()
132+ for (i in 1 : length(xfactors )){
133+ corvals <- c(corvals ,unlist(compareCD [[i ]]))
134+ xfvals <- c(xfvals ,rep(xfactors [i ],length(unlist(compareCD [[i ]]))))
135+
136+ }
137+ if (! is.null(noAdj )){
138+ # also add in the values with correlations with no ComBat adjustment
139+ xfno <- " N=0"
140+ corvals <- c(corvals ,unlist(noAdj ))
141+ xfvals <- c(xfvals ,rep(xfno ,length(unlist(noAdj ))))
142+ }
143+ plotdata <- data.frame (x = corvals ,value = xfvals )
144+ if (! is.null(noAdj )){
145+ plotdata $ value <- factor (plotdata $ value ,levels = c(xfno ,xfactors ))
146+ }else {
147+ plotdata $ value <- factor (plotdata $ value ,levels = xfactors )
148+ }
149+ plotDS <- ggplot(aes(x = value ,y = x ),data = plotdata )+ geom_boxplot()+ theme_bw()+ theme(axis.text.x = element_text(angle = 45 , vjust = 1 , hjust = 1 ),axis.title = element_text(size = axisS ),axis.text = element_text(size = labelS ))+ ylim(ylim )+ labs(x = " " ,y = ylab )
150+ if (! is.null(plotcolours )){
151+ plotDS <- plotDS + scale_fill_manual(plotcolours )
152+ }
153+ savepdf(plotname ,width = pwidth ,height = pheight )
154+ print(plotDS )
155+ dev.off()
156+ }else {
157+ # have a set of correlation downsamples e.g. different pre-processing methods
158+ nSets <- length(compareCD )
159+ corvals <- c()
160+ xfvals <- c()
161+ setvals <- c()
162+ for (i in 1 : nSets ){
163+ CD <- compareCD [[i ]]
164+ if (! is.null(noAdj )){
165+ NC <- noAdj [[i ]]
166+ }
167+ cvals <- c()
168+ xvals <- c()
169+ for (j in 1 : length(xfactors )){
170+ cvals <- c(cvals ,unlist(CD [[j ]]))
171+ xvals <- c(xvals ,rep(xfactors [j ],length(unlist(CD [[j ]]))))
172+
173+ }
174+ if (! is.null(noAdj )){
175+ cvals <- c(cvals ,NC )
176+ xvals <- c(xvals ,rep(" N=0" ,length(NC )))
177+ }
178+ setvals <- c(setvals ,rep(setNames [i ],length(cvals )))
179+ corvals <- c(corvals ,cvals )
180+ xfvals <- c(xfvals ,xvals )
181+
182+ }
183+ plotdata <- data.frame (x = corvals ,value = xfvals ,set = setvals )
184+ if (! is.null(noAdj )){
185+ plotdata $ value <- factor (plotdata $ value ,levels = c(" N=0" ,xfactors ))
186+ }else {
187+ plotdata $ value <- factor (plotdata $ value ,levels = xfactors )}
188+ plotDS <- ggplot(aes(x = value ,y = x ,fill = setvals ),data = plotdata )+ geom_boxplot()+ theme_bw()+ theme(axis.text.x = element_text(angle = 45 , vjust = 1 , hjust = 1 ),legend.position = " none" ,axis.title = element_text(size = axisS ),axis.text = element_text(size = labelS ))+ ylim(ylim )+ labs(x = " " ,y = ylab )
189+ if (! is.null(plotcolours )){
190+ plotDS <- plotDS + scale_fill_manual(values = plotcolours )
191+ }
192+ savepdf(plotname ,width = pwidth ,height = pheight )
193+ print(plotDS )
194+ dev.off()
195+ }
196+
197+ }
198+
199+
200+ plotASW <- function (ASWscores ,DownRange = seq(0.1 ,0.9 ,by = 0.05 ),nsamples = NULL ,plotname ,ylim = c(0.98 ,1 )){
201+ xfactors <- paste0(" Sample" ,DownRange )
202+ if (! is.null(nsamples )){
203+ xfactors <- paste(" N=" ,round(nsamples * DownRange ))
204+ }
205+ names(compareCD )<- xfactors
206+ corvals <- c()
207+ xfvals <- c()
208+ for (i in 1 : length(xfactors )){
209+ corvals <- c(corvals ,unlist(compareCD [[i ]]))
210+ xfvals <- c(xfvals ,rep(xfactors [i ],length(unlist(compareCD [[i ]]))))
211+
212+ }
213+ plotdata <- data.frame (x = corvals ,value = xfvals )
214+ plotdata $ value <- factor (plotdata $ value ,levels = xfactors )
215+ plotDS <- ggplot(aes(x = value ,y = x ),data = plotdata )+ geom_boxplot()+ theme_bw()+ theme(axis.text.x = element_text(angle = 45 , vjust = 1 , hjust = 1 ))+ ylim(ylim )
216+ pdf(plotname )
217+ print(plotDS )
218+ dev.off()
219+ }
0 commit comments