import time

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
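
# A minimal model-parallel example: the network is split into two stages
# (ModelPart1 and ModelPart2) that are placed on separate GPUs when two are
# available. Activations are copied from device1 to device2 between stages,
# and autograd routes gradients back across the same boundary on backward().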

class ModelPart1(nn.Module):
    """First stage: conv + ReLU + max-pool (runs on device1)."""
    def __init__(self):
        super(ModelPart1, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        x = self.conv1(x)
        x = nn.functional.relu(x)
        x = self.pool(x)
        return x


class ModelPart2(nn.Module):
    """Second stage: conv + ReLU, then the classifier head (runs on device2)."""
    def __init__(self):
        super(ModelPart2, self).__init__()
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(32 * 16 * 16, 64)
        self.fc2 = nn.Linear(64, 10)

    def forward(self, x):
        x = self.conv2(x)
        x = nn.functional.relu(x)
        # Flatten per sample; x.size(0) handles a smaller final batch,
        # unlike a hard-coded batch size.
        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        x = nn.functional.relu(x)
        x = self.fc2(x)
        return x
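
# Shape check for the fc1 input size (CIFAR-10 images are 3x32x32):
#   ModelPart1: conv1 -> 16x32x32, pool -> 16x16x16
#   ModelPart2: conv2 -> 32x16x16, flatten -> 32 * 16 * 16 = 8192 features
# A quick optional sanity check on CPU (uncomment to run):
#   with torch.no_grad():
#       out = ModelPart2()(ModelPart1()(torch.zeros(1, 3, 32, 32)))
#       assert out.shape == (1, 10)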

device1 = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Use a second GPU when available; otherwise fall back to the same device.
if torch.cuda.device_count() > 1:
    device2 = torch.device("cuda:1")
else:
    device2 = device1
print(f"Using devices: {device1}, {device2}")
model_part1 = ModelPart1().to(device1)
model_part2 = ModelPart2().to(device2)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    list(model_part1.parameters()) + list(model_part2.parameters()),
    lr=0.001, momentum=0.9
)
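# A single optimizer can hold parameters that live on different devices:
# optimizer.step() updates each parameter in place on whichever device its
# gradient was produced, so no extra synchronization code is needed here.
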
train_loader = torch.utils.data.DataLoader(
    torchvision.datasets.CIFAR10(root='./data', train=True, download=True,
                                 transform=transforms.ToTensor()),
    batch_size=64, shuffle=True, num_workers=2, pin_memory=True)
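# pin_memory=True enables faster, asynchronous host-to-GPU copies; to take
# full advantage, the transfers in the loop below could pass
# non_blocking=True, e.g. inputs.to(device1, non_blocking=True).
# Left as-is to keep the example simple.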

time_start = time.time()
for epoch in range(10):
    running_loss = 0.0
    running_corrects = 0
    for inputs, labels in train_loader:
        # Inputs start on device1 (first stage); labels go straight to
        # device2, where the loss is computed against the final outputs.
        inputs = inputs.to(device1)
        labels = labels.to(device2)
        optimizer.zero_grad()
        # Run stage 1, copy the activations across the device boundary,
        # then run stage 2. Autograd records the copy, so backward()
        # propagates gradients through both stages automatically.
        intermediates = model_part1(inputs)
        intermediates = intermediates.to(device2)
        outputs = model_part2(intermediates)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        _, preds = torch.max(outputs, 1)
        running_loss += loss.item() * inputs.size(0)
        running_corrects += torch.sum(preds == labels)
    # Epoch statistics are averaged over the full training set.
    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_acc = running_corrects.float() / len(train_loader.dataset)
    print('Epoch [{}/{}], Loss: {:.4f}, Acc: {:.4f}'.format(epoch + 1, 10, epoch_loss, epoch_acc))
print(f"Total training time: {time.time() - time_start:.2f} seconds")