Skip to content

Commit 42e49fe

Browse files
committed
🎉 Initial commit
0 parents  commit 42e49fe

6 files changed

Lines changed: 772 additions & 0 deletions

File tree

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
data*
2+
__pycache__/
3+
.ipynb_checkpoints/

Makefile

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
SHELL := /bin/bash
PATH := /root/miniconda3/bin:$(PATH)

# The targets are script names, not files to be built — mark them phony so
# make always runs the recipe even though the files already exist.
.PHONY: mnist.py mnist_ddp.py mnist_hvd.py

# Single-process run.
mnist.py:
	torchrun $@

# Fix: 'export' is required for a target-specific variable to reach the
# recipe's *environment*; a plain assignment only defines a make variable
# that the spawned torchrun process never sees.
mnist_ddp.py: export OMP_NUM_THREADS = 1
mnist_ddp.py:
	torchrun --nproc_per_node=8 $@ --batch-size 64 --epochs 14

# Horovod run: 8 processes on localhost.
mnist_hvd.py:
	horovodrun -np 8 -H localhost:8 python $@ --batch-size 64

mnist.py

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
from __future__ import print_function
2+
import time
3+
import argparse
4+
import torch
5+
import torch.nn as nn
6+
import torch.nn.functional as F
7+
import torch.optim as optim
8+
from torchvision import datasets, transforms
9+
from torch.optim.lr_scheduler import StepLR
10+
11+
12+
class Net(nn.Module):
    """CNN for MNIST digit classification.

    Two 3x3 convolutions, one 2x2 max-pool, two dropout layers and two
    fully connected layers. ``forward`` returns per-class
    log-probabilities of shape ``[N, 10]``, suitable for ``F.nll_loss``.
    """

    def __init__(self):
        # Zero-argument super() — modern equivalent of super(Net, self);
        # the file already uses Python-3-only features (f-strings).
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        # 9216 = 64 channels * 12 * 12 spatial size after two valid 3x3
        # convs (28 -> 26 -> 24) and one 2x2 max-pool (24 -> 12).
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Map a ``[N, 1, 28, 28]`` image batch to ``[N, 10]`` log-probs."""
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.dropout2(x)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)
36+
37+
38+
def train(args, model, device, train_loader, optimizer, epoch):
    """Run one training epoch, logging every ``args.log_interval`` batches.

    With ``args.dry_run`` set, stops after the first logged batch.
    """
    model.train()
    num_batches = len(train_loader)
    num_samples = len(train_loader.dataset)
    for step, batch in enumerate(train_loader):
        inputs, labels = (t.to(device) for t in batch)
        optimizer.zero_grad()
        loss = F.nll_loss(model(inputs), labels)
        loss.backward()
        optimizer.step()
        if step % args.log_interval != 0:
            continue
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, step * len(inputs), num_samples,
            100. * step / num_batches, loss.item()))
        if args.dry_run:
            break
53+
54+
55+
def test(model, device, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print loss/accuracy."""
    model.eval()
    total_loss = 0.0
    num_correct = 0
    num_samples = len(test_loader.dataset)
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            log_probs = model(inputs)
            # Sum (not mean) per batch so the final division by the dataset
            # size yields the true average loss.
            total_loss += F.nll_loss(log_probs, labels, reduction='sum').item()
            # Predicted class = index of the max log-probability.
            predictions = log_probs.argmax(dim=1, keepdim=True)
            num_correct += predictions.eq(labels.view_as(predictions)).sum().item()

    avg_loss = total_loss / num_samples

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        avg_loss, num_correct, num_samples,
        100. * num_correct / num_samples))
72+
73+
74+
def main():
    """Parse CLI arguments, build the MNIST pipeline, train and evaluate."""
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=14, metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--no-mps', action='store_true', default=False,
                        help='disables macOS GPU training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    use_mps = not args.no_mps and torch.backends.mps.is_available()

    torch.manual_seed(args.seed)

    # Device preference: CUDA, then Apple MPS, then CPU.
    if use_cuda:
        device = torch.device("cuda")
    elif use_mps:
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    train_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.test_batch_size}
    if use_cuda:
        # pin_memory speeds up host-to-GPU copies; one worker feeds the GPU.
        cuda_kwargs = {'num_workers': 1, 'pin_memory': True, 'shuffle': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)

    # Standard MNIST normalization constants (dataset mean / std).
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    train_dataset = datasets.MNIST('../data', train=True, download=True,
                                   transform=transform)
    test_dataset = datasets.MNIST('../data', train=False,
                                  transform=transform)
    train_loader = torch.utils.data.DataLoader(train_dataset, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(test_dataset, **test_kwargs)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    # Decay the learning rate by `gamma` after every epoch.
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(model, device, test_loader)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
143+
144+
145+
if __name__ == '__main__':
    # Time the whole run: arg parsing, data download, training, evaluation.
    t0 = time.time()
    main()
    print(f'Total time elapsed: {time.time() - t0} seconds')

mnist_ddp.py

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
#########################################################################
2+
# Reference: https://blog.csdn.net/u010900574/article/details/122780585 #
3+
#########################################################################
4+
5+
from __future__ import print_function
6+
import os
7+
import time
8+
import argparse
9+
import torch
10+
import torch.nn as nn
11+
import torch.nn.functional as F
12+
import torch.optim as optim
13+
from torchvision import datasets, transforms
14+
from torch.optim.lr_scheduler import StepLR
15+
import torch.distributed as dist
16+
17+
def init_distributed_mode(args):
    """Initialize torch.distributed (DDP) state on ``args``.

    Reads rank / world size / local rank from the torchrun environment
    (RANK, WORLD_SIZE, LOCAL_RANK) or from SLURM, sets ``args.distributed``,
    pins this process to its local GPU and joins the NCCL process group.
    When no launcher environment is detected, sets
    ``args.distributed = False`` and returns without touching CUDA.
    """
    # Avoid OpenMP over-subscription when several ranks share one host.
    os.environ['OMP_NUM_THREADS'] = "1"
    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
        # torchrun / torch.distributed.launch environment.
        args.rank = int(os.environ["RANK"])
        args.world_size = int(os.environ["WORLD_SIZE"])
        args.gpu = int(os.environ["LOCAL_RANK"])
    elif "SLURM_PROCID" in os.environ:
        args.rank = int(os.environ["SLURM_PROCID"])
        # Fix: under SLURM the world size must come from the scheduler,
        # not silently stay at the CLI default (1).
        if "SLURM_NTASKS" in os.environ:
            args.world_size = int(os.environ["SLURM_NTASKS"])
        args.gpu = args.rank % torch.cuda.device_count()
    elif hasattr(args, "rank"):
        # Caller pre-populated rank/gpu/world_size; use them as-is.
        pass
    else:
        print("Not using distributed mode")
        args.distributed = False
        return

    args.distributed = True

    torch.cuda.set_device(args.gpu)
    args.dist_backend = "nccl"  # NCCL is the recommended backend for GPU training
    print(f"| distributed init (rank {args.rank}): {args.dist_url}, local rank:{args.gpu}, world size:{args.world_size}", flush=True)
    dist.init_process_group(
        backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank
    )
44+
45+
class Net(nn.Module):
    """Small MNIST CNN: two conv layers, pool, dropout, two FC layers.

    The forward pass yields ``[N, 10]`` class log-probabilities, intended
    for use with ``F.nll_loss``.
    """

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return ``[N, 10]`` log-probabilities for ``[N, 1, 28, 28]`` input."""
        h = F.relu(self.conv1(x))
        h = F.relu(self.conv2(h))
        h = self.dropout1(F.max_pool2d(h, 2))
        h = torch.flatten(h, 1)
        h = self.dropout2(F.relu(self.fc1(h)))
        return F.log_softmax(self.fc2(h), dim=1)
69+
70+
71+
def train(args, model, device, train_loader, optimizer, epoch):
    """Run one training epoch; rank 0 (or the single process) logs progress.

    Under DDP each rank's loader sees 1/world_size of the dataset, so the
    logged seen-sample count is scaled by the world size to reflect global
    progress. With ``args.dry_run`` set, stops after the first batch.
    """
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            # Deduplicated from two near-identical print branches: in
            # distributed mode only rank 0 logs, and the sample count is
            # scaled by the number of ranks.
            if args.distributed:
                should_log = dist.get_rank() == 0
                scale = dist.get_world_size()
            else:
                should_log = True
                scale = 1
            if should_log:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, scale * batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item()))
        if args.dry_run:
            break
93+
94+
95+
def test(model, device, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print loss/accuracy."""
    model.eval()
    total_loss = 0.0
    num_correct = 0
    num_samples = len(test_loader.dataset)
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            log_probs = model(inputs)
            # Sum (not mean) per batch so dividing by the dataset size at
            # the end gives the true average loss.
            total_loss += F.nll_loss(log_probs, labels, reduction='sum').item()
            # Predicted class = index of the max log-probability.
            predictions = log_probs.argmax(dim=1, keepdim=True)
            num_correct += predictions.eq(labels.view_as(predictions)).sum().item()

    avg_loss = total_loss / num_samples

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        avg_loss, num_correct, num_samples,
        100. * num_correct / num_samples))
112+
113+
114+
def main():
    """Parse CLI args, optionally set up DDP, then train/evaluate MNIST."""
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=14, metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')

    # Distributed-training options (consumed by init_distributed_mode).
    parser.add_argument('--local_rank', type=int, help='local rank, will passed by ddp')
    parser.add_argument("--world-size", default=1, type=int, help="number of distributed processes")
    parser.add_argument("--dist-url", default="env://", type=str, help="url used to set up distributed training")
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    init_distributed_mode(args)

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    train_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.test_batch_size}
    if use_cuda:
        # No 'shuffle' here: shuffling is handled by the samplers below
        # (DataLoader forbids combining shuffle= with a sampler).
        cuda_kwargs = {'num_workers': 1,
                       'pin_memory': True,
                       }
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)

    # Standard MNIST normalization constants (dataset mean / std).
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    train_dataset = datasets.MNIST('./data', train=True, download=True, transform=transform)
    val_dataset = datasets.MNIST('./data', train=False, transform=transform)
    # Under DDP each rank must see a distinct shard of the training data.
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset, shuffle=True)
    else:
        train_sampler = torch.utils.data.RandomSampler(train_dataset)
    test_sampler = torch.utils.data.SequentialSampler(val_dataset)

    train_loader = torch.utils.data.DataLoader(train_dataset, sampler=train_sampler, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(val_dataset, sampler=test_sampler, **test_kwargs)

    model = Net().to(device)
    # Keep an unwrapped reference for evaluation and checkpointing.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module

    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    # Decay the learning rate by `gamma` after every epoch.
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        if args.distributed:
            # Re-seed the distributed sampler so each epoch reshuffles.
            train_sampler.set_epoch(epoch)
        train(args, model, device, train_loader, optimizer, epoch)
        if args.distributed:
            # Only run validation on the rank-0 process, for simplicity;
            # we do not run validation on multiple GPUs.
            if dist.get_rank() == 0:
                test(model_without_ddp, device, test_loader)
        else:
            test(model, device, test_loader)
        scheduler.step()

    if args.save_model:
        # Bug fix: save the *unwrapped* module's state_dict so the
        # checkpoint has no 'module.' key prefix under DDP, and use one
        # filename in both modes (the original wrote the DDP-wrapped
        # state_dict and 'mnist_cnn_.pt' in the non-distributed path).
        # Only rank 0 saves in distributed mode.
        if not args.distributed or dist.get_rank() == 0:
            torch.save(model_without_ddp.state_dict(), "mnist_cnn.pt")
202+
203+
204+
if __name__ == '__main__':
    # Time the whole run: arg parsing, DDP setup, training, evaluation.
    t0 = time.time()
    main()
    print(f'Total time elapsed: {time.time() - t0} seconds')

0 commit comments

Comments
 (0)