-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathpreVerhaakCox.R
More file actions
86 lines (60 loc) · 2.92 KB
/
preVerhaakCox.R
File metadata and controls
86 lines (60 loc) · 2.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
####################################### As the Genes Do not contain information on survival times, we look for pathways ###################
########################################## As the PC doesnot contain much survival we use cox scores #######################################
library(globaltest)
library(org.Hs.eg.db)
library(GO.db)
library(KEGG.db)
exprs.norm <- ExpressionSet(Y.train.prelim)
xx <- as.list(org.Hs.egALIAS2EG)
# Remove pathway identifiers that do not map to any entrez gene id
xx <- xx[!is.na(xx)]
glob <- gtKEGG( Surv(exp(pheno.train[,3]), pheno.train[,2]), exprs.norm, annotation = 'org.Hs.eg.db', multtest = "BH",probe2entrez = xx)
pathway.names <- names(glob)[1:30]
yy <- as.list(KEGGPATHID2EXTID)
######### TO GET GENE NAMES of GENES WITHIN PATHWAYS ##############################3
# For the reverse map:
# Convert the object to a list
dd <- as.list(org.Hs.egPATH2EG)
# Remove pathway identifiers that do not map to any entrez gene id
dd <- dd[!is.na(dd)]
if(length(dd) > 0){
# The entrez gene identifiers for the first two elements of XX
dd[1:2]
# Get the first one
dd[[1]]
}
path2entrez <- dd
path2entrez.subset <- path2entrez[pathway.names]
library('biomaRt')
# listDatasets(ensembl)
ensembl = useMart('ensembl')
ensembl = useDataset("hsapiens_gene_ensembl",mart=ensembl)
names.genes <- list(0)
for( i in 1:length(pathway.names)){
names.genes[[i]] = getBM(attributes = c("entrezgene","hgnc_symbol"), filters = "entrezgene",values =path2entrez.subset[i], mart=ensembl)
}
### Data frame FROM the EXPRESSION #################
data.exprs <- as.data.frame(exprs.norm)
data.subsets <- list(0)
for ( i in 1:length(pathway.names) ){
temp.names <- names.genes[[i]][,2][c(names.genes[[i]][,2])%in% colnames(data.exprs)]
data.subsets[[i]] <- as.data.frame(data.exprs[, temp.names])
}
data.combined <- matrix(0, nrow = nrow(data.exprs), ncol =length(pathway.names) )
for ( i in 1:length(pathway.names)){
# data.combined[,i] <- as.vector(apply(data.subsets[[i]],1, mean))
pc <- prcomp(data.subsets[[i]])
data.combined[,i] <- predict(pc,newdata = data.subsets[[i]])[,1]
}
rownames(data.combined) <- rownames(data.exprs)
colnames(data.combined) <- pathway.names
############ A Small Vizualization ###################
pc <- prcomp(data.combined)
pc.pred <- predict(pc,newdata = data.combined)
plot(pc.pred[,1], pc.pred[,2], pch = 19, col = pheno.train[,4])
### checking the Heatmap #######################
### checking the pattern in training data #####################
hmcols<-colorRampPalette(c("green","black","red"))(512)
heatmap.2(x = t(data.combined), scale = "row",Rowv =TRUE ,Colv = FALSE, dendrogram = "row", ColSideColors =c("blue","red","green","purple")[c.true], labRow = colnames(data.combined), labCol = NA, main = ' \n Training Set \n ', col =hmcols, cexRow =0.40, trace ="none", key.title = "Color Key")
######### Saving the Top 30 pathways #################
save(data.combined,file = '30pathwayVerhaark.RData')