Skip to content

Commit a061b34

Browse files
authored
Upload downsampling scripts
1 parent 085be5c commit a061b34

3 files changed

Lines changed: 537 additions & 0 deletions

File tree

DownSampleFunctions.R

Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
1+
batch_downsample<-function(mergeddata,empBayes=FALSE,numberSamples=100,annotation,DownRange=seq(0.1,0.9,by=0.05),allBroad,allSanger,OrigCorrection,cmpAnnot){
2+
nsamples<-nrow(annotation)
3+
annotAll<-rbind(annotation,annotation)
4+
annotAll$cn<-c(annotation$Smn,annotation$Bmn)
5+
annotAll$cn<-gsub(" ","",annotAll$cn)
6+
7+
rownames(annotAll)<-annotAll$cn
8+
annottab<-table(annotation$tissue)
9+
lineageFreq<-as.vector(annottab)/nsamples
10+
names(lineageFreq)<-names(annottab)
11+
sampleprobs<-lineageFreq[annotation$tissue]
12+
names(sampleprobs)<-annotation$model_id
13+
lineagelabels<-cmpAnnot[c(colnames(allBroad),colnames(allSanger)),"tissue"]
14+
names(lineagelabels)<-c(colnames(allBroad),colnames(allSanger))
15+
batchlabel<-c(rep(1,ncol(allBroad)),rep(2,ncol(allSanger)))
16+
names(batchlabel)<-names(lineagelabels)
17+
cor_res<-list()
18+
euc_res<-list()
19+
cluster_resLineage<-list()
20+
cluster_resBatch<-list()
21+
for(i in 1:length(DownRange)){
22+
samples_gen<-sapply(1:numberSamples,function(y) annotAll[annotAll$model_id%in%sample(annotation$model_id,size=round(nsamples*DownRange[i]),prob=sampleprobs),"cn"])
23+
#samples_gen matrix nrow sample size, ncol numberSamples
24+
cor_res[[i]]<-vector("numeric",length = numberSamples)
25+
euc_res[[i]]<-vector("numeric",length = numberSamples)
26+
cluster_resLineage[[i]]<-vector("numeric",length = numberSamples)
27+
cluster_resBatch[[i]]<-vector("numeric",length = numberSamples)
28+
for(j in 1:numberSamples){
29+
result<-BatchCorrection(data1=allBroad,data2=allSanger,CombatRes=ComBatCP(mergeddata[,samples_gen[,j]],batch=unlist(sapply(samples_gen[,j],function(y)strsplit(y,"---")[[1]][2])),empBayes=empBayes))
30+
perf_res<-compareCD(OrigCorrection,result$qNorm,DownRange = DownRange)
31+
#cluster_resL<-ASWpc(result$qNorm,numberPCs = 20,lineagelabels)
32+
33+
cluster_resB<-ASWpc(result$qNorm,numberPCs = 20,lineagelabels=batchlabel)
34+
cor_res[[i]][j]<-perf_res$corRes
35+
euc_res[[i]][j]<-perf_res$eucRes
36+
#cluster_resLineage[[i]][j]<-cluster_resL
37+
cluster_resLineage[[i]][j]<-i+j
38+
cluster_resBatch[[i]][j]<-cluster_resB
39+
}
40+
41+
42+
}
43+
#ideal output is a list of matrices. Each matrix is one parameter with Sample value x DownRange
44+
return(list(corRes=cor_res,eucRes=euc_res,clusterL=cluster_resLineage,clusterB=cluster_resBatch))
45+
}
46+
47+
compareCD<-function(origCorrection,sampleCorrection,DownRange=seq(0.1,0.9,by=0.05),numberPCs=20,lineagelabels){
48+
corRes<-list()
49+
eucRes<-list()
50+
51+
for(i in 1:length(DownRange)){
52+
sampleRes<-sampleCorrection
53+
corRes[[i]]<-unlist(sapply(colnames(sampleRes),function(y) cor(sampleRes[,y],origCorrection[,y],use="pairwise")))
54+
eucRes[[i]]<-unlist(sapply(colnames(sampleRes),function(y) dist(t(na.omit(cbind(sampleRes[,y],origCorrection[,y]))))))
55+
}
56+
return(list(corRes=corRes,eucRes=eucRes))
57+
}
58+
59+
60+
batch_downsampleAMI<-function(mergeddata,empBayes=FALSE,numberSamples=100,annotation,DownRange=seq(0.1,0.9,by=0.05),allBroad,allSanger,OrigCorrection,cmpAnnot){
61+
nsamples<-nrow(annotation)
62+
annotAll<-rbind(annotation,annotation)
63+
annotAll$cn<-c(annotation$Smn,annotation$Bmn)
64+
annotAll$cn<-gsub(" ","",annotAll$cn)
65+
66+
rownames(annotAll)<-annotAll$cn
67+
annottab<-table(annotation$tissue)
68+
lineageFreq<-as.vector(annottab)/nsamples
69+
names(lineageFreq)<-names(annottab)
70+
sampleprobs<-lineageFreq[annotation$tissue]
71+
names(sampleprobs)<-annotation$model_id
72+
lineagelabels<-cmpAnnot[c(colnames(allBroad),colnames(allSanger)),"tissue"]
73+
names(lineagelabels)<-c(colnames(allBroad),colnames(allSanger))
74+
75+
cluster_AMILineage<-list()
76+
77+
for(i in 1:length(DownRange)){
78+
samples_gen<-sapply(1:numberSamples,function(y) annotAll[annotAll$model_id%in%sample(annotation$model_id,size=round(nsamples*DownRange[i]),prob=sampleprobs),"cn"])
79+
#samples_gen matrix nrow sample size, ncol numberSamples
80+
81+
cluster_AMILineage[[i]]<-vector("numeric",length = numberSamples)
82+
83+
for(j in 1:numberSamples){
84+
result<-BatchCorrection(data1=allBroad,data2=allSanger,CombatRes=ComBatCP(mergeddata[,samples_gen[,j]],batch=unlist(sapply(samples_gen[,j],function(y)strsplit(y,"---")[[1]][2])),empBayes=empBayes))
85+
86+
cluster_resL<-RepeatKmeansAMI(list(result$qNorm),genes="var",nrepeat=20,length(lineageFreq),lineagelabels)
87+
88+
cluster_AMILineage[[i]][j]<-median(unlist(cluster_resL[[1]]$ami))
89+
90+
}
91+
92+
93+
}
94+
#ideal output is a list of matrices. Each matrix is one parameter with Sample value x DownRange
95+
return(cluster_AMILineage)
96+
}
97+
98+
ASWpc<-function(inputdata,numberPCs=20,lineagelabels){
99+
100+
inputdata<-inputdata[,names(lineagelabels)[!is.na(lineagelabels)]]
101+
lineagelabels<-lineagelabels[!is.na(lineagelabels)]
102+
if(sum(is.na(inputdata))!=0){
103+
#Have NAs and need to impute missing values
104+
#data is genes x cell lines
105+
meanVals<-rowMeans(inputdata,na.rm=TRUE)
106+
genesToimpute<-which(rowSums(is.na(inputdata))!=0)
107+
for(i in 1:length(genesToimpute)){
108+
selcl<-which(is.na(inputdata[genesToimpute[i],]))
109+
inputdata[genesToimpute[i],selcl]<-meanVals[genesToimpute[i]]
110+
}
111+
}
112+
estpca<-prcomp(t(inputdata),scale.=TRUE)
113+
114+
subsetPCs <- estpca$x[,1:numberPCs]
115+
116+
#do the clustering and silhouette values:
117+
distM<-dist(subsetPCs)
118+
res<-silhouetteScores(lineagelabels,distM)
119+
120+
return(res)
121+
}
122+
123+
plotDS<-function(compareCD,DownRange=seq(0.1,0.9,by=0.05),nsamples=NULL,plotname,ylim=c(0.98,1),setNames=NULL,listRes=FALSE,noAdj=NULL,ylab="",plotcolours=NULL,axisS=10,labelS=10,pwidth=5,pheight=5){
124+
xfactors<-paste0("Sample",DownRange)
125+
if(!is.null(nsamples)){
126+
xfactors<-paste0("N=",round(nsamples*DownRange))
127+
}
128+
if(!listRes){
129+
names(compareCD)<-xfactors
130+
corvals<-c()
131+
xfvals<-c()
132+
for(i in 1:length(xfactors)){
133+
corvals<-c(corvals,unlist(compareCD[[i]]))
134+
xfvals<-c(xfvals,rep(xfactors[i],length(unlist(compareCD[[i]]))))
135+
136+
}
137+
if(!is.null(noAdj)){
138+
#also add in the values with correlations with no ComBat adjustment
139+
xfno<-"N=0"
140+
corvals<-c(corvals,unlist(noAdj))
141+
xfvals<-c(xfvals,rep(xfno,length(unlist(noAdj))))
142+
}
143+
plotdata<-data.frame(x=corvals,value=xfvals)
144+
if(!is.null(noAdj)){
145+
plotdata$value<-factor(plotdata$value,levels=c(xfno,xfactors))
146+
}else{
147+
plotdata$value<-factor(plotdata$value,levels=xfactors)
148+
}
149+
plotDS<-ggplot(aes(x=value,y=x),data=plotdata)+geom_boxplot()+theme_bw()+theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),axis.title = element_text(size = axisS),axis.text = element_text(size = labelS))+ylim(ylim)+labs(x="",y=ylab)
150+
if(!is.null(plotcolours)){
151+
plotDS<-plotDS+scale_fill_manual(plotcolours)
152+
}
153+
savepdf(plotname,width=pwidth,height=pheight)
154+
print(plotDS)
155+
dev.off()
156+
}else{
157+
#have a set of correlation downsamples e.g. different pre-processing methods
158+
nSets<-length(compareCD)
159+
corvals<-c()
160+
xfvals<-c()
161+
setvals<-c()
162+
for(i in 1:nSets){
163+
CD<-compareCD[[i]]
164+
if(!is.null(noAdj)){
165+
NC<-noAdj[[i]]
166+
}
167+
cvals<-c()
168+
xvals<-c()
169+
for(j in 1:length(xfactors)){
170+
cvals<-c(cvals,unlist(CD[[j]]))
171+
xvals<-c(xvals,rep(xfactors[j],length(unlist(CD[[j]]))))
172+
173+
}
174+
if(!is.null(noAdj)){
175+
cvals<-c(cvals,NC)
176+
xvals<-c(xvals,rep("N=0",length(NC)))
177+
}
178+
setvals<-c(setvals,rep(setNames[i],length(cvals)))
179+
corvals<-c(corvals,cvals)
180+
xfvals<-c(xfvals,xvals)
181+
182+
}
183+
plotdata<-data.frame(x=corvals,value=xfvals,set=setvals)
184+
if(!is.null(noAdj)){
185+
plotdata$value<-factor(plotdata$value,levels=c("N=0",xfactors))
186+
}else{
187+
plotdata$value<-factor(plotdata$value,levels=xfactors)}
188+
plotDS<-ggplot(aes(x=value,y=x,fill=setvals),data=plotdata)+geom_boxplot()+theme_bw()+theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),legend.position = "none",axis.title = element_text(size = axisS),axis.text = element_text(size = labelS))+ylim(ylim)+labs(x="",y=ylab)
189+
if(!is.null(plotcolours)){
190+
plotDS<-plotDS+scale_fill_manual(values=plotcolours)
191+
}
192+
savepdf(plotname,width=pwidth,height=pheight)
193+
print(plotDS)
194+
dev.off()
195+
}
196+
197+
}
198+
199+
200+
plotASW<-function(ASWscores,DownRange=seq(0.1,0.9,by=0.05),nsamples=NULL,plotname,ylim=c(0.98,1)){
201+
xfactors<-paste0("Sample",DownRange)
202+
if(!is.null(nsamples)){
203+
xfactors<-paste("N=",round(nsamples*DownRange))
204+
}
205+
names(compareCD)<-xfactors
206+
corvals<-c()
207+
xfvals<-c()
208+
for(i in 1:length(xfactors)){
209+
corvals<-c(corvals,unlist(compareCD[[i]]))
210+
xfvals<-c(xfvals,rep(xfactors[i],length(unlist(compareCD[[i]]))))
211+
212+
}
213+
plotdata<-data.frame(x=corvals,value=xfvals)
214+
plotdata$value<-factor(plotdata$value,levels=xfactors)
215+
plotDS<-ggplot(aes(x=value,y=x),data=plotdata)+geom_boxplot()+theme_bw()+theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))+ylim(ylim)
216+
pdf(plotname)
217+
print(plotDS)
218+
dev.off()
219+
}

DownSamplingFigures.R

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
library(here)
2+
source("Combat_HKfunctions.R")
3+
source("DownSampleFunctions.R")
4+
source("LineageFunctions.R")
5+
6+
library(ggfortify)
7+
library(RColorBrewer)
8+
library(colorspace)
9+
10+
seed <- 1809
11+
set.seed(seed)
12+
13+
dir.Results<-"./Results"
14+
dir.Input<-"/path/to/Figshare"
15+
16+
17+
loadObj<-function(objTS,filePrefix=dir.Results){
18+
load(file=paste0(filePrefix,'/',objTS,".Rdata"))
19+
return(objTS)
20+
}
21+
load(paste0(dir.Input,"/PipelineColours.Rdata"))
22+
23+
corCCR1<-loadObj("corCCR1")
24+
25+
DownSamples=seq(0.05,0.8,by=0.05)
26+
plotDS(corCCR1,nsamples=168,plotname=paste0(dir.Results,"/CompareDS_CCR_ComBat.pdf"),ylim=c(0.95,1),DownRange = DownSamples)
27+
28+
corCCRJ1<-loadObj("corCCRJ1")
29+
30+
plotDS(corCCRJ1,nsamples=168,plotname=paste0(dir.Results,"/CompareDS_CCRJ_ComBat.pdf"),ylim=c(0.95,1),DownRange = DownSamples)
31+
32+
33+
corCERES1<-loadObj("corCERES1")
34+
35+
plotDS(corCERES1,nsamples=168,plotname=paste0(dir.Results,"/CompareDS_CERES_ComBat.pdf"),ylim=c(0.95,1),DownRange = DownSamples)
36+
37+
allCor<-list(corCCR1,corCCRJ1,corCERES1)
38+
setnames<-c("CRISPRcleanR","CCR-JACKS","CERES")
39+
names(PipelineColours)<-setnames
40+
noCombat<-list()
41+
load(file=paste0(dir.Results,"/SQsangerCCRall.Rdata"))
42+
load(file=paste0(dir.Results,"/SQbroadCCRall.Rdata"))
43+
AllCCR<-cbind(BroadScreenA,SangerScreenA[rownames(BroadScreenA),])
44+
#all data corrected:
45+
CcrAllsq<-readRDS(file=paste0(dir.Results,"/CCR_SQ_Combat_All.Rds"))
46+
47+
noCombat[["CCR"]]<-sapply(colnames(AllCCR),function(x) cor(AllCCR[,x],CcrAllsq[,x],use="pairwise"))
48+
load(file=paste0(dir.Results,"/SQsangerJACKSall.Rdata"))
49+
load(file=paste0(dir.Results,"/SQbroadJACKSall.Rdata"))
50+
AllCCRJ<-cbind(JBroadScreenA,JSangerScreenA[rownames(JBroadScreenA),])
51+
CcrAllsqJ<-readRDS(file=paste0(dir.Results,"/CCR_SQ_Combat_All_JACKS.Rds"))
52+
53+
noCombat[["CCRJ"]]<-sapply(colnames(AllCCRJ),function(x) cor(AllCCRJ[,x],CcrAllsqJ[,x],use="pairwise"))
54+
55+
load(file=paste0(dir.Results,"/SQsangerCERESall.Rdata"))
56+
load(file=paste0(dir.Results,"/SQbroadCERESall.Rdata"))
57+
AllCERES<-cbind(BroadScreenCeresA,SangerScreenCeresA[rownames(BroadScreenCeresA),])
58+
#result with all data:
59+
CeresAllsq<-readRDS(file=paste0(dir.Results,"/CERES_SQ_Combat_All.Rds"))
60+
noCombat[["CERES"]]<-sapply(colnames(AllCERES),function(x) cor(AllCERES[,x],CeresAllsq[,x],use="pairwise"))
61+
62+
plotDS(allCor,nsamples=168,plotname=paste0(dir.Results,"/CompareDS_All_ComBat_withN0.pdf"),ylim=c(0.94,1),DownRange = DownSamples,setNames = setnames,listRes=TRUE,noAdj=noCombat,ylab="Pearson Correlation",plotcolours=PipelineColours,pwidth=15,pheight=12)
63+
plotDS(allCor,nsamples=168,plotname=paste0(dir.Results,"/CompareDS_All_ComBat.pdf"),ylim=c(0.98,1),DownRange = DownSamples,setNames = setnames,listRes=TRUE,ylab="Pearson Correlation",plotcolours=PipelineColours,pwidth=15,pheight=12)
64+
65+
66+
67+
noCombatBatch<-list()
68+
ll<-c(rep(1,ncol(BroadScreenA)),rep(2,ncol(SangerScreenA)))
69+
names(ll)<-colnames(AllCCR)
70+
noCombatBatch[["CCR"]]<-ASWpc(AllCCR,numberPCs = 20,lineagelabels=ll)
71+
noCombatBatch[["CCRJ"]]<-ASWpc(AllCCRJ,numberPCs = 20,lineagelabels=ll)
72+
noCombatBatch[["CERES"]]<-ASWpc(AllCERES,numberPCs = 20,lineagelabels=ll)
73+
74+
75+
76+
BatchCCR1<-loadObj("BatchCCR1")
77+
BatchCCRJ1<-loadObj("BatchCCRJ1")
78+
BatchCERES1<-loadObj("BatchCERES1")
79+
allBatch<-list(BatchCCR1,BatchCCRJ1,BatchCERES1)
80+
setnames<-c("CRISPRcleanR","CCR-JACKS","CERES")
81+
plotDS(allBatch,nsamples=168,plotname=paste0(dir.Results,"/CompareDS_All_ComBat_Batch.pdf"),ylim=c(0,0.3),DownRange = DownSamples,setNames = setnames,listRes=TRUE,ylab="Average Silhouette width",plotcolours=PipelineColours,pwidth=15,pheight=12)
82+
plotDS(allBatch,nsamples=168,plotname=paste0(dir.Results,"/CompareDS_All_ComBat_Batch_noCombat.pdf"),ylim=c(0,0.75),DownRange = DownSamples,setNames = setnames,listRes=TRUE,noAdj=noCombatBatch,ylab="Average Silhouette width",plotcolours=PipelineColours,pwidth=15,pheight=12)
83+
84+
85+
#add in results using mixture of cell lines as well:
86+
DownSamplesL=c(0.3,0.6,0.9)
87+
BatchCCR2<-loadObj("BatchCCR2")
88+
BatchCCRJ2<-loadObj("BatchCCRJ2")
89+
BatchCERES2<-loadObj("BatchCERES2")
90+
allBatch2<-list(BatchCCR2,BatchCCRJ2,BatchCERES2)
91+
setnames<-c("CRISPRcleanR","CCR-JACKS","CERES")
92+
plotDS(allBatch2,nsamples=28,plotname=paste0(dir.Results,"/CompareDS_Lung_ComBat_Batch.pdf"),ylim=c(0,0.3),DownRange = DownSamplesL,setNames = setnames,listRes=TRUE,plotcolours = PipelineColours,pwidth=10,pheight=10)
93+
94+
corCCR2<-loadObj("corCCR2")
95+
corCCRJ2<-loadObj("corCCRJ2")
96+
corCERES2<-loadObj("corCERES2")
97+
allCor2<-list(corCCR2,corCCRJ2,corCERES2)
98+
setnames<-c("CRISPRcleanR","CCR-JACKS","CERES")
99+
plotDS(allCor2,nsamples=28,plotname=paste0(dir.Results,"/CompareDS_Lung_ComBat_Cor.pdf"),ylim=c(0.98,1),DownRange = DownSamplesL,setNames = setnames,listRes=TRUE,plotcolours = PipelineColours,pwidth=10,pheight=10)
100+
101+
#Do a subset plot for all cell lines and then put them side by side
102+
allCorSub<-list(corCCR1[1:3],corCCRJ1[1:3],corCERES1[1:3])
103+
plotDS(allCorSub,nsamples=168,plotname=paste0(dir.Results,"/CompareDS_All_ComBat_sub.pdf"),ylim=c(0.98,1),DownRange = DownSamples[1:3],setNames = setnames,listRes=TRUE,ylab="Pearson Correlation",plotcolours=PipelineColours,pwidth=10,pheight=10)
104+
105+
allBatchSub<-list(BatchCCR1[1:3],BatchCCRJ1[1:3],BatchCERES1[1:3])
106+
plotDS(allBatchSub,nsamples=168,plotname=paste0(dir.Results,"/CompareDS_All_ComBat_Batch_sub.pdf"),ylim=c(0,0.3),DownRange = DownSamples[1:3],setNames = setnames,listRes=TRUE,ylab="Average Silhouette width",plotcolours=PipelineColours,pwidth=10,pheight=10)
107+
108+

0 commit comments

Comments
 (0)