
Commit 9053785

Merge pull request #2 from AI-Enabled-Software-Testing/feat/ml-models
Update models
2 parents da21bc6 + 81ba0a5

26 files changed: 4279 additions & 621 deletions

.gitignore

Lines changed: 2 additions & 1 deletion
```diff
@@ -3,4 +3,5 @@ __pycache__/
 .cache/
 .ruff_cache/
 *.pyc
-.aim/
+.aim/
+**/runs/
```

README.md

Lines changed: 31 additions & 8 deletions
```diff
@@ -5,11 +5,6 @@ This project aims to explore and analyze metaheuristic search-based algorithms f
 This is our [idea](./Project%20Proposal/Project%20Proposal%20-%20Fernando%20and%20Kelvin.pdf).
 
 ## Datasets
-* [MNIST](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_openml.html)
-  * Handwritten Digit Recognition
-  * Using scikit-learn's fetch_openml
-  * 28x28 Grayscale Images
-  * 10 Classes of digits (0-9)
 * [CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html)
   * Object Recognition
   * 32x32 Colored Images
@@ -18,8 +13,7 @@ This is our [idea](./Project%20Proposal/Project%20Proposal%20-%20Fernando%20and%
 
 ## Models in Consideration
 * **Tree-based Model**: Decision Tree
-* **Linear/Polynomial-based**: Linear Regression (optional)
-* **Permutation-based** (especially, neural networks): Multi-Layer Perceptron (MLP)
+* **(Pixels) Permutation-based** (especially, neural networks): Convolutional Neural Network (CNN)
 * **Kernel-based**: K-Nearest-Neighbor (KNN)
 
 ## Metaheuristic Guided Search
@@ -81,4 +75,33 @@ This is our [idea](./Project%20Proposal/Project%20Proposal%20-%20Fernando%20and%
 2. Run `data_process.py` to process the images in the datasets.
 3. Run `data_explorer.py` to view details of processed images from different API endpoints.
    * Note: You may need to use a client such as Postman to launch those API requests.
-   * Note: Refer to [`openapi.yaml`](openapi.yaml) for more detailed descriptions of those endpoints.
+   * Note: Refer to [`openapi.yaml`](openapi.yaml) for more detailed descriptions of those endpoints.
+### Quick Model Training
+You can quickly train a CNN model on CIFAR-10 using the provided training script:
+
+```bash
+# Basic training with default parameters (300 epochs)
+python scripts/train_cnn.py
+
+# Quick training with fewer epochs for testing
+python scripts/train_cnn.py --epochs 10 --batch-size 64
+
+# Custom training with specific hyperparameters
+python scripts/train_cnn.py --epochs 50 --batch-size 128 --lr 0.001 --model-path .cache/models/my_cnn.pth
+```
+
+**Available arguments:**
+- `--epochs`: Number of training epochs (default: 300)
+- `--batch-size`: Batch size for training (default: 128)
+- `--lr`: Learning rate (default: 0.0003)
+- `--model-path`: Path to save the trained model (default: .cache/models/cnn_cifar.pth)
+- `--device`: Force device selection (cuda/cpu; auto-detects if not specified)
+
+The script includes:
+- Automatic CIFAR-10 data loading and preprocessing
+- TensorBoard logging for training visualization
+- Early stopping and model checkpointing
+- CUDA support with automatic device detection
+
+### Model Training with a Customized Tuning Process
+* A proof-of-concept, end-to-end demo lives in the Jupyter notebook `notebooks/model_training_flow.ipynb`. It covers a shorter demo with less data, the data- and model-loading steps, an exhaustive tuning pass (without metaheuristics) on the validation set only, and training and evaluation with the best hyperparameter set found for each model.
```
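The training script itself (`scripts/train_cnn.py`) is not among the files shown on this page. As a hedged sketch, the documented flags and defaults could be wired with `argparse` roughly as below; only the flag names and defaults come from the README, the rest is illustrative:

```python
# Illustrative sketch only: scripts/train_cnn.py is not shown in this diff.
# Flag names and defaults come from the README above; everything else is assumed.
import argparse

import torch


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Train a CNN on CIFAR-10")
    parser.add_argument("--epochs", type=int, default=300, help="number of training epochs")
    parser.add_argument("--batch-size", type=int, default=128, help="training batch size")
    parser.add_argument("--lr", type=float, default=0.0003, help="learning rate")
    parser.add_argument(
        "--model-path",
        default=".cache/models/cnn_cifar.pth",
        help="where to save the trained model",
    )
    parser.add_argument(
        "--device",
        choices=["cuda", "cpu"],
        default=None,
        help="force a device; auto-detects when omitted",
    )
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    # Auto-detect CUDA unless the user forced a device.
    device = torch.device(args.device or ("cuda" if torch.cuda.is_available() else "cpu"))
    print(f"Training for {args.epochs} epochs on {device}")
```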

framework/data_utils.py

Lines changed: 11 additions & 14 deletions
```diff
@@ -1,12 +1,15 @@
 """Data loading and preprocessing utilities."""
+
 from pathlib import Path
-from typing import Tuple, List
+from typing import List, Tuple
 import numpy as np
 from datasets import load_from_disk
 from sklearn.model_selection import train_test_split
 from torch.utils.data import DataLoader
-from framework.datasets import CIFAR10Dataset
+
 from framework import utils
+from framework.datasets import CIFAR10Dataset
+
 
 def load_cifar10_data():
     """Load CIFAR-10 dataset (grayscale from processed datasets)."""
@@ -21,8 +24,7 @@ def load_cifar10_data():
         "uv run python -m scripts.data_process"
     )
 
-    ds_dict = load_from_disk(str(dataset_path))
-    return ds_dict
+    return load_from_disk(str(dataset_path))
 
 
 def prepare_data(ds_dict, split: str):
@@ -39,14 +41,10 @@ def split_train_val(
     images: List[np.ndarray],
     labels: np.ndarray,
     val_ratio: float = 0.2,
-    random_state: int = 42
+    random_state: int = 42,
 ) -> Tuple[List[np.ndarray], np.ndarray, List[np.ndarray], np.ndarray]:
     X_train, X_val, y_train, y_val = train_test_split(
-        images,
-        labels,
-        test_size=val_ratio,
-        stratify=labels,
-        random_state=random_state
+        images, labels, test_size=val_ratio, stratify=labels, random_state=random_state
     )
     return X_train, y_train, X_val, y_val
 
@@ -57,15 +55,15 @@ def create_dataloaders(
     X_val: List[np.ndarray],
     y_val: np.ndarray,
     batch_size: int,
-    num_workers: int = 2
+    num_workers: int = 2,
 ) -> Tuple[DataLoader, DataLoader]:
     train_dataset = CIFAR10Dataset(X_train, y_train)
     train_loader = DataLoader(
         train_dataset,
         batch_size=batch_size,
         shuffle=True,
         num_workers=num_workers,
-        pin_memory=utils.is_cuda(),
+        pin_memory=utils.is_cuda_available(),
     )
 
     val_dataset = CIFAR10Dataset(X_val, y_val)
@@ -74,8 +72,7 @@ def create_dataloaders(
         batch_size=batch_size,
         shuffle=False,
         num_workers=num_workers,
-        pin_memory=utils.is_cuda()
+        pin_memory=utils.is_cuda_available(),
     )
 
     return train_loader, val_loader
-
```
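Read together, the functions in this file form a small data pipeline: load the processed dataset from disk, pull out one split, carve off a stratified validation set, and wrap both halves in `DataLoader`s. A usage sketch based on the signatures visible in this diff; the `(images, labels)` return shape of `prepare_data` is an assumption:

```python
# Usage sketch for framework/data_utils.py, inferred from the signatures above.
# Assumption: prepare_data returns an (images, labels) pair for the given split.
from framework import data_utils

# load_cifar10_data errors out with a pointer to scripts.data_process
# if the processed dataset is missing on disk.
ds_dict = data_utils.load_cifar10_data()
images, labels = data_utils.prepare_data(ds_dict, split="train")

# Stratified 80/20 split (val_ratio defaults to 0.2, random_state to 42).
X_train, y_train, X_val, y_val = data_utils.split_train_val(images, labels)

# pin_memory is enabled automatically when CUDA is available.
train_loader, val_loader = data_utils.create_dataloaders(
    X_train, y_train, X_val, y_val, batch_size=128
)
```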
framework/datasets.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -18,4 +18,3 @@ def __getitem__(self, idx):
         image = torch.from_numpy(image).unsqueeze(0)
 
         return image, label
-
```
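Only the tail of `CIFAR10Dataset.__getitem__` appears in this diff (the deletion is just a trailing blank line). For orientation, a minimal sketch of a dataset class consistent with those two visible lines and with how `create_dataloaders` instantiates it; the constructor and the float conversion are assumptions:

```python
# Minimal sketch of CIFAR10Dataset, consistent with the __getitem__ lines above.
# Assumptions: list-of-arrays storage and float32 conversion.
from typing import List

import numpy as np
import torch
from torch.utils.data import Dataset


class CIFAR10Dataset(Dataset):
    def __init__(self, images: List[np.ndarray], labels: np.ndarray):
        self.images = images
        self.labels = labels

    def __len__(self) -> int:
        return len(self.images)

    def __getitem__(self, idx):
        image = self.images[idx].astype(np.float32)
        label = int(self.labels[idx])

        # Add a channel dimension: (H, W) -> (1, H, W), matching the
        # grayscale preprocessing mentioned in load_cifar10_data.
        image = torch.from_numpy(image).unsqueeze(0)

        return image, label
```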
framework/training.py

Lines changed: 46 additions & 60 deletions
```diff
@@ -1,4 +1,5 @@
 import torch
+import torch.nn as nn
 from tqdm import tqdm
 from typing import Tuple, Optional
 from pathlib import Path
@@ -13,64 +14,56 @@ class Checkpoint:
     def __init__(self, model_path: str):
         self.model_path = Path(model_path)
         self.best_val_acc = 0.0
-
+
     def save_if_better(
         self,
         model: Module,
         optimizer: Optimizer,
         epoch: int,
         val_acc: float,
         train_acc: float,
-        **kwargs
+        **kwargs,
     ) -> bool:
         """Save checkpoint if validation accuracy improved."""
         if val_acc > self.best_val_acc:
             self.best_val_acc = val_acc
             self.model_path.parent.mkdir(parents=True, exist_ok=True)
-
-            checkpoint = {
-                'epoch': epoch,
-                'model_state_dict': model.state_dict(),
-                'optimizer_state_dict': optimizer.state_dict(),
-                'val_acc': val_acc,
-                'train_acc': train_acc,
-                **kwargs
+
+            checkpoint_data = {
+                "model_state_dict": model.state_dict(),
+                "optimizer_state_dict": optimizer.state_dict(),
+                "epoch": epoch,
+                "val_acc": val_acc,
+                "train_acc": train_acc,
+                **kwargs,
             }
-
-            torch.save(checkpoint, str(self.model_path))
+            torch.save(checkpoint_data, self.model_path)
             return True
         return False
-
-    def load(self, model: Module, optimizer: Optimizer) -> dict:
-        """Load checkpoint from disk."""
-        checkpoint = torch.load(str(self.model_path), map_location='cpu')
-        model.load_state_dict(checkpoint['model_state_dict'])
-        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
-        self.best_val_acc = checkpoint.get('val_acc', 0.0)
-        return checkpoint
 
 
 class EarlyStopping:
-    def __init__(self, patience: int = 10, min_delta: float = 0.0) -> None:
+    def __init__(self, patience: int = 7, min_delta: float = 0.0):
         self.patience = patience
         self.min_delta = min_delta
-        self.best_loss = float('inf')
         self.counter = 0
-        self.best_acc = 0.0
-
-    def __call__(self, val_loss: float, val_acc: float) -> bool:
-        if val_loss < self.best_loss - self.min_delta:
+        self.best_loss: Optional[float] = None
+        self.best_acc: float = 0.0
+
+    def __call__(self, val_loss: float, val_acc: Optional[float] = None) -> bool:
+        """Returns True if training should stop."""
+        if self.best_loss is None or val_loss < self.best_loss - self.min_delta:
             self.best_loss = val_loss
-            self.best_acc = val_acc
+            if val_acc is not None:
+                self.best_acc = max(self.best_acc, val_acc)
             self.counter = 0
             return False
-        else:
-            self.counter += 1
-            if self.counter >= self.patience:
-                return True
-            return False
 
+        if val_acc is not None:
+            self.best_acc = max(self.best_acc, val_acc)
 
+        self.counter += 1
+        return self.counter >= self.patience
 
 
 def train_epoch(
@@ -79,18 +72,20 @@ def train_epoch(
     criterion: torch.nn.Module,
     optimizer: Optimizer,
     device: torch.device,
-    scheduler: LRScheduler,
+    scheduler: Optional[LRScheduler] = None,
     epoch: int = 0,
    grad_clip_norm: float = 1.0,
-    writer: Optional[SummaryWriter] = None
+    writer: Optional[SummaryWriter] = None,
 ) -> Tuple[float, float]:
     """Trains the model for one epoch and returns the epoch loss and accuracy."""
-    model.train()
+    nn.Module.train(model, mode=True)
     running_loss = 0.0
     correct = 0
     total = 0
 
-    for batch_idx, (images, labels) in enumerate(tqdm(train_loader, desc="Training", leave=False)):
+    for batch_idx, (images, labels) in enumerate(
+        tqdm(train_loader, desc="Training", leave=False)
+    ):
         images, labels = images.to(device), labels.to(device)
 
         optimizer.zero_grad()
@@ -99,40 +94,32 @@ def train_epoch(
         loss.backward()
         torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_norm)
         optimizer.step()
-        scheduler.step()
+        if scheduler is not None:
+            scheduler.step()
 
-        # Stats - compute once and reuse to avoid duplicate .item() calls
-        loss_value = loss.item()  # Single GPU->CPU sync
+        loss_value = loss.item()
         running_loss += loss_value
-
+
         _, predicted = torch.max(outputs.data, 1)
-        batch_correct = (predicted == labels).sum().item()  # Single GPU->CPU sync
+        batch_correct = (predicted == labels).sum().item()
         total += labels.size(0)
         correct += batch_correct
 
-        # Track metrics
         if batch_idx % 10 == 0 and writer is not None:
             batch_total = labels.size(0)
             batch_acc = 100 * batch_correct / batch_total
-            current_lr = optimizer.param_groups[0]['lr']
+            current_lr = optimizer.param_groups[0]["lr"]
             step = epoch * len(train_loader) + batch_idx
-            writer.add_scalar('train/batch_loss', loss_value, step)
-            writer.add_scalar('train/batch_accuracy', batch_acc, step)
-            writer.add_scalar('train/learning_rate', current_lr, step)
+            writer.add_scalar("train/batch_loss", loss_value, step)
+            writer.add_scalar("train/batch_accuracy", batch_acc, step)
+            writer.add_scalar("train/learning_rate", current_lr, step)
 
     epoch_loss = running_loss / len(train_loader)
     epoch_acc = correct / total
 
-    # Track epoch-level metrics
     if writer is not None:
-        writer.add_scalar('train/epoch_loss', epoch_loss, epoch)
-        writer.add_scalar('train/epoch_accuracy', epoch_acc * 100, epoch)
-
-    # Log parameter and gradient histograms (only every N epochs to reduce CPU overhead)
-    if writer is not None and (epoch % 10 == 0 or epoch == 1):  # Log every 10 epochs or first epoch
-        for name, param in model.named_parameters():
-            writer.add_histogram(f'train_params/{name}', param.data, epoch)
-            writer.add_histogram(f'train_grads/{name}', param.grad.data, epoch)
+        writer.add_scalar("train/epoch_loss", epoch_loss, epoch)
+        writer.add_scalar("train/epoch_accuracy", epoch_acc * 100, epoch)
 
     return epoch_loss, epoch_acc
 
@@ -143,7 +130,7 @@ def validate(
     criterion: torch.nn.Module,
     device: torch.device,
     epoch: int = 0,
-    writer: Optional[SummaryWriter] = None
+    writer: Optional[SummaryWriter] = None,
 ) -> Tuple[float, float]:
     """Validates the model and returns the epoch loss and accuracy."""
     model.eval()
@@ -157,8 +144,8 @@ def validate(
 
             outputs = model(images)
             loss = criterion(outputs, labels)
-
             running_loss += loss.item()
+
             _, predicted = torch.max(outputs.data, 1)
             total += labels.size(0)
             correct += (predicted == labels).sum().item()
@@ -167,8 +154,7 @@ def validate(
     epoch_acc = correct / total
 
     if writer is not None:
-        writer.add_scalar('val/epoch_loss', epoch_loss, epoch)
-        writer.add_scalar('val/epoch_accuracy', epoch_acc * 100, epoch)
+        writer.add_scalar("val/epoch_loss", epoch_loss, epoch)
+        writer.add_scalar("val/epoch_accuracy", epoch_acc * 100, epoch)
 
     return epoch_loss, epoch_acc
-
```
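To see how `Checkpoint`, `EarlyStopping`, `train_epoch`, and `validate` compose after this commit, here is a hedged sketch of a training loop. The leading parameters of `train_epoch` and `validate` (the model and a `DataLoader`) are inferred from their bodies, and the model and data below are tiny synthetic stand-ins:

```python
# Sketch of a training loop over the utilities above. Assumption: train_epoch
# and validate take (model, loader, ...) before the parameters shown in the diff.
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.tensorboard import SummaryWriter

from framework.training import Checkpoint, EarlyStopping, train_epoch, validate

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tiny synthetic stand-in for grayscale CIFAR-10 batches: (N, 1, 32, 32).
X = torch.randn(256, 1, 32, 32)
y = torch.randint(0, 10, (256,))
train_loader = DataLoader(TensorDataset(X[:200], y[:200]), batch_size=64, shuffle=True)
val_loader = DataLoader(TensorDataset(X[200:], y[200:]), batch_size=64)

model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(32 * 32, 10)).to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

writer = SummaryWriter()
checkpoint = Checkpoint(".cache/models/demo.pth")
early_stopping = EarlyStopping(patience=7)  # the new default from this commit

for epoch in range(50):
    # scheduler is optional as of this commit, so it can simply be omitted.
    train_loss, train_acc = train_epoch(
        model, train_loader, criterion, optimizer, device, epoch=epoch, writer=writer
    )
    val_loss, val_acc = validate(
        model, val_loader, criterion, device, epoch=epoch, writer=writer
    )

    # Persist weights whenever validation accuracy improves.
    checkpoint.save_if_better(model, optimizer, epoch, val_acc, train_acc)

    # Stop once val loss fails to improve by min_delta for `patience` epochs.
    if early_stopping(val_loss, val_acc):
        break

writer.close()
```

Note that the `Checkpoint.load` helper was removed in this commit, so restoring saved weights now means calling `torch.load` and `load_state_dict` directly.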