-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathAML_Complete_Code - Task 3.R
More file actions
332 lines (247 loc) · 11.3 KB
/
AML_Complete_Code - Task 3.R
File metadata and controls
332 lines (247 loc) · 11.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
############## DGE ANALYSIS AND FUNCTIONAL ENRICHMENT ##################
# START OF CODE
# Required packages
library(gplots)
library(TCGAbiolinks)
library(tidyverse)
library(SummarizedExperiment)
library(edgeR)
library(org.Hs.eg.db)
library(ggplot2)
# Metadata
Metadata_AML <- read.csv("https://raw.githubusercontent.com/Naren037/hackbio-cancer-internship/refs/heads/main/TASK%203/Datasets/AML%20Metadata.csv", row.names=1)
# Data Mining using TCGABiolinks
query_primary <-GDCquery(project = 'TARGET-AML',
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
experimental.strategy = "RNA-Seq",
workflow.type = "STAR - Counts",
sample.type = "Primary Blood Derived Cancer - Bone Marrow")
query_recurrent <-GDCquery(project = 'TARGET-AML',
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
experimental.strategy = "RNA-Seq",
workflow.type = "STAR - Counts",
sample.type = "Recurrent Blood Derived Cancer - Bone Marrow")
Output_query_primary <- getResults(query_primary)
Output_query_recurrent <- getResults(query_recurrent)
# Filtering for 20 cases in each criteria
AML_Recurrent <- as.data.frame(Output_query_recurrent[1:20, ])
AML_Primary <- as.data.frame(Output_query_primary[1:20, ])
AML_Primary_20cases <- AML_Primary$cases
query_primary_20cases <-GDCquery(project = 'TARGET-AML',
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
experimental.strategy = "RNA-Seq",
workflow.type = "STAR - Counts",
sample.type = "Primary Blood Derived Cancer - Bone Marrow",
barcode = AML_Primary_20cases)
GDCdownload(query_primary_20cases)
GDCprepare(query_primary_20cases, summarizedExperiment = T)
AML_Primary_prep <- GDCprepare(query_primary_20cases, summarizedExperiment = T)
AML_Primary_Dataset <- assay(AML_Primary_prep, 'unstranded')
AML_Primary_Dataset <- as.data.frame(AML_Primary_Dataset)
AML_Recurrent_20cases <- AML_Recurrent$cases
query_recurrent_20cases <-GDCquery(project = 'TARGET-AML',
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
experimental.strategy = "RNA-Seq",
workflow.type = "STAR - Counts",
sample.type = "Recurrent Blood Derived Cancer - Bone Marrow",
barcode = AML_Recurrent_20cases)
GDCdownload(query_recurrent_20cases)
GDCprepare(query_recurrent_20cases, summarizedExperiment = T)
AML_Recurrent_prep <- GDCprepare(query_recurrent_20cases, summarizedExperiment = T)
AML_Recurrent_Dataset <- assay(AML_Recurrent_prep, 'unstranded')
AML_Recurrent_Dataset <- as.data.frame(AML_Recurrent_Dataset)
# Pre-processing 1 - Adjusting NA values
AML_Primary_Dataset[is.na(AML_Primary_Dataset)] <- rowMeans(AML_Primary_Dataset, na.rm = TRUE)
AML_Recurrent_Dataset[is.na(AML_Recurrent_Dataset)] <- rowMeans(AML_Recurrent_Dataset, na.rm = TRUE)
# Pre-processing 2 - Normalization using Trimmed mean of M values -> Adjusts differences in sample variation and sequencing depths
combined_counts <- data.frame(AML_Primary_Dataset, AML_Recurrent_Dataset)
dge_combined <- DGEList(counts = combined_counts, group = NULL)
dge_combined <- calcNormFactors(dge_combined)
dge_combined_TMM <- cpm(dge_combined, log = F)
# Pre-processing 3 - Upper Quantile filter of genes -> Removes lowly expressed genes
dataFilt <- TCGAanalyze_Filtering(
tabDF = dge_combined_TMM,
method = "quantile",
qnt.cut = 0.75)
# Splitting the pre-processed data into primary and recurrent groups for DGE Analysis
dge_Primary_TMM <- dataFilt[,1:20] #dge_combined_TMM[,1:20]
dge_Recurrent_TMM <- dataFilt[,21:40] #dge_combined_TMM[,21:40]
# Differential Gene Expression Analysis using TCGAnalyze with extremely stringent cut-offs for Biomarker discovery
dataDEGs <- TCGAanalyze_DEA(
mat1 = dge_Primary_TMM,
mat2 = dge_Recurrent_TMM,
Cond1type = "Primary",
Cond2type = "Recurrent",
fdr.cut = 0.0005,
logFC.cut = 4,
method = "glmLRT")
plotdata <- dataDEGs # for further plots
# Retrieving gene names
ensembl_ids <- rownames(dataDEGs)
ensembl_ids_clean <- gsub("\\..*", "", ensembl_ids)
gene_symbols <- mapIds(
x = org.Hs.eg.db,
keys = ensembl_ids_clean,
column = "SYMBOL",
keytype = "ENSEMBL",
multiVals = "first")
dataDEGs <- data.frame(gene_symbols, dataDEGs)
dataDEGs$gene_name = NULL
dataDEGs$gene_type = NULL
print(gene_symbols)
# Heatmap of DEGs
heat.data <- dataFilt[rownames(plotdata),]
plotcolnames <- t(Metadata_AML$Subtype)
colnames(heat.data) <- plotcolnames
rownames(heat.data) <- dataDEGs$gene_symbols
heatmap.2(as.matrix(heat.data),
Rowv = F,
Colv = F,
dendrogram = "none",
scale = "row",
trace = "none",
col = bluered(300),
cexRow = 0.6,
cexCol = 0.7,
keysize = 1,
key.title = 'Expression',
margins = c(7,7),
main = 'Differentially Expressed Genes',
xlab = 'Sample Type',
ylab = 'Genes')
# Bubble plot - DGE Analysis
ggplot(dataDEGs, aes(x = logFC, y = gene_symbols, size = -log10(PValue))) +
geom_point(alpha=0.5)
# Exporting DGE analysis findings
write.csv(dataDEGs, "AML_DEGs.csv")
# Functional Enrichment
# Gene Ontology (GO) and Pathway enrichment by DEGs list
Genelist <- dataDEGs$gene_symbols
ansEA <- TCGAanalyze_EAcomplete(
TFname = "DEA genes Primary Vs Recurrent",
RegulonList = Genelist)
# Enrichment Analysis EA (TCGAVisualize)
# Gene Ontology (GO) and Pathway enrichment barPlot
TCGAvisualize_EAbarplot(
tf = rownames(ansEA$ResBP),
GOBPTab = head(ansEA$ResBP),
GOCCTab = ansEA$ResCC,
GOMFTab = ansEA$ResMF,
PathTab = ansEA$ResPat,
nRGTab = Genelist,
nBar = 10
)
# Print top 5 GO Biological processes
for (i in 1:5) { print(ansEA[["ResBP"]][[i]])}
# END OF CODE
############## MACHINE LEARNING ##################
# START OF CODE
library(caret)
library(pROC)
library(DALEX)
library(org.Hs.eg.db)
# Importing Data sets
Metadata_AML <- read.csv("https://raw.githubusercontent.com/Naren037/hackbio-cancer-internship/refs/heads/main/TASK%203/Datasets/AML%20Metadata.csv", row.names=1)
Rawdata_AML <- read.csv("https://raw.githubusercontent.com/Naren037/hackbio-cancer-internship/refs/heads/main/TASK%203/Datasets/AML%20Rawdata.csv", row.names=1)
# Pre-processing 1 - Selecting high variance genes
trans.data <- data.frame(t(Rawdata_AML))
SD = apply(trans.data, 2, sd)
Top_predictions = order(SD, decreasing = T)[1:1000]
trans.data = trans.data[, Top_predictions]
dim(trans.data)
# Adding metadata
rownames(Metadata_AML) <- Metadata_AML$CASE.ID
Metadata_AML$`CASE.ID` = NULL
trans.data <- data.frame(Metadata_AML,trans.data)
# Pre-processing 2 - Removing zero values and central values
zero_var <- preProcess(trans.data, method = 'nzv', uniqueCut = 15)
trans.data <- predict(zero_var, trans.data)
dim(trans.data)
# Pre-processing 3 - Centering to avoid feature scaling bias
center_var <- preProcess(trans.data, method = 'center')
trans.data <- predict(center_var, trans.data)
dim(trans.data)
# Pre-processing 4 - Removing highly correlated features
correlation <- preProcess(trans.data, method = 'corr', cutoff = 0.53) #ADJUST LATER
Processed_data <- predict(correlation, trans.data)
# Heat map of pre-processed data set
heatmap.2(as.matrix(Processed_data[,2:50]),
scale = "col",
trace = "none",
Rowv = F,
Colv = F,
dendrogram = "none",
col = bluered(200),
keysize = 1,
key.title = 'Expression',
main = "Pre-processed Expression Profile",
xlab = 'GENES',
ylab = 'SAMPLES',
cexRow = 0.5,
cexCol = 0.7,
margins = c(10,10))
# Machine Learning
# Repeat loop for 100% train accuracy and >90% test accuracy by shuffling the data set in every iteration until the desired accuracy is obtained
# NOTE: Try interrupting R and re-running the loop if it takes too long :-) it works!
repeat {
Processed_data_jumbled <- Processed_data[sample(nrow(Processed_data)), ]
# 70-30 split
datasplit <- createDataPartition(y = Processed_data_jumbled$Subtype, p = 0.7)[[1]]
train <- Processed_data[datasplit,]
test <- Processed_data[-datasplit,]
# Model training using KNN
#control <- trainControl(method = 'cv', number = 4)
control <- trainControl(method = 'repeatedcv', number = 4, repeats = 10) # extra - repeats
KNN <- train(Subtype~.,
data = train,
method = 'knn',
trControl = control,
preProcess = "scale", # extra
tuneGrid = data.frame(k=1:8))
KNN$bestTune
PredictedTrain <- predict(KNN, newdata = train)
PredictedTest <- predict(KNN, newdata = test)
# Interpretation
PredictedTrain <- as.factor(PredictedTrain)
train$Subtype <- as.factor(train$Subtype)
levels(PredictedTrain) <- levels(train$Subtype)
x <- confusionMatrix(PredictedTrain, train$Subtype)
PredictedTest <- as.factor(PredictedTest)
test$Subtype <- as.factor(test$Subtype)
levels(PredictedTest) <- levels(test$Subtype)
y <- confusionMatrix(PredictedTest, test$Subtype)
# Condition check
if (x[["overall"]][["Accuracy"]] == 1 & y[["overall"]][["Accuracy"]] > 0.9)
{break}
}
confusionMatrix(PredictedTrain, train$Subtype)
confusionMatrix(PredictedTest, test$Subtype)
# Variable importance
explainer.aml <- explain(KNN,
label = 'knn',
data = train,
y = as.numeric(train$Subtype))
importance.aml <- feature_importance(explainer.aml, n_sample = 40, type = 'difference')
head(importance.aml)
Top25_Genes <- print(as.data.frame(importance.aml[3:27, 1]))
colnames(Top25_Genes) <- c("GeneID")
rownames(Top25_Genes) <- Top25_Genes$GeneID
# Retrieving gene names
ensembl_ids <- rownames(Top25_Genes)
ensembl_ids_clean <- gsub("\\..*", "", ensembl_ids)
gene_symbols <- mapIds(
x = org.Hs.eg.db,
keys = ensembl_ids_clean,
column = "SYMBOL",
keytype = "ENSEMBL",
multiVals = "first")
# Printing and Exporting Top25 genes (Features)
Top25_Genes <- data.frame(gene_symbols, Top25_Genes)
Top25_Genes$GeneID = NULL
print(Top25_Genes$gene_symbols) # Top 25 genes
write.csv(Top25_Genes, "Top25_Genes_ML.csv")
# END OF CODE