Skip to content

Commit

Permalink
update multiple gpu
Browse files Browse the repository at this point in the history
  • Loading branch information
um3 committed Jan 24, 2024
1 parent df95b92 commit bce32d1
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 9 deletions.
Binary file added .train.py.swp
Binary file not shown.
2 changes: 1 addition & 1 deletion test.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ def get_id(img_path):
#if opt.fp16:
# model_structure = network_to_half(model_structure)

if torch.cuda.get_device_capability()[0]>6: # should be >=7
if torch.cuda.get_device_capability()[0]>6 and len(opt.gpu_ids)==1: # should be >=7
print("Compiling model...")
# https://huggingface.co/docs/diffusers/main/en/optimization/torch2.0
torch.set_float32_matmul_precision('high')
Expand Down
17 changes: 9 additions & 8 deletions train.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,7 @@ def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
if epoch<opt.warm_epoch and phase == 'train':
warm_up = min(1.0, warm_up + 0.9 / warm_iteration)
loss = loss*warm_up
print(loss, warm_up)

if phase == 'train':
if fp16: # we use optimizer to backward loss
Expand Down Expand Up @@ -436,7 +437,7 @@ def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
time_elapsed // 60, time_elapsed % 60))
#print('Best val Acc: {:4f}'.format(best_acc))
#print('Best val Acc: {:4f}'.format(best_acc)

# load best model weights
model.load_state_dict(last_model_wts)
Expand Down Expand Up @@ -511,18 +512,12 @@ def save_network(network, epoch_label):
# model to gpu
model = model.cuda()

if torch.cuda.get_device_capability()[0]>6: # should be >=7
torch.set_float32_matmul_precision('high')
print("Compiling model... The first epoch may be slow, which is expected!")
# https://huggingface.co/docs/diffusers/main/en/optimization/torch2.0
model = torch.compile(model, mode="reduce-overhead", dynamic = True) # pytorch 2.0

optim_name = optim.SGD #apex.optimizers.FusedSGD
if opt.FSGD: # apex is needed
optim_name = FusedSGD

if len(opt.gpu_ids)>1:
model = torch.nn.DataParallel(model, device_ids=opt.gpu_ids).cuda()
model = torch.nn.DataParallel(model, device_ids=opt.gpu_ids)
if not opt.PCB:
ignored_params = list(map(id, model.module.classifier.parameters() ))
base_params = filter(lambda p: id(p) not in ignored_params, model.module.parameters())
Expand Down Expand Up @@ -604,6 +599,12 @@ def save_network(network, epoch_label):
#optimizer_ft = FP16_Optimizer(optimizer_ft, static_loss_scale = 128.0)
model, optimizer_ft = amp.initialize(model, optimizer_ft, opt_level = "O1")

if torch.cuda.get_device_capability()[0]>6 and len(opt.gpu_ids)==1: # should be >=7 and one gpu
torch.set_float32_matmul_precision('high')
print("Compiling model... The first epoch may be slow, which is expected!")
# https://huggingface.co/docs/diffusers/main/en/optimization/torch2.0
model = torch.compile(model, mode="reduce-overhead", dynamic = True) # pytorch 2.0

model = train_model(model, criterion, optimizer_ft, exp_lr_scheduler,
num_epochs=opt.total_epoch)

0 comments on commit bce32d1

Please sign in to comment.