Commit 5f7b3f3

muellerzr authored and gojiteji committed
Update all no_trainer with skip_first_batches (huggingface#23664)
1 parent 884b246 commit 5f7b3f3

12 files changed: +115 −98 lines
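The hunks below all make the same change: instead of iterating over the dataloader and `continue`-ing past already-seen batches when resuming from a checkpoint, each script now asks Accelerate to skip them up front with `accelerator.skip_first_batches`. A minimal sketch of the pattern, assuming a recent version of accelerate; the model, dataset, and the `starting_epoch`/`resume_step` values are hypothetical stand-ins for what the real scripts derive from the checkpoint name:

import torch
from torch.utils.data import DataLoader, TensorDataset
from accelerate import Accelerator

accelerator = Accelerator()
model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
dataset = TensorDataset(torch.randn(32, 4), torch.randint(0, 2, (32,)))
train_dataloader = DataLoader(dataset, batch_size=4)
model, optimizer, train_dataloader = accelerator.prepare(model, optimizer, train_dataloader)

# Pretend a parsed checkpoint name told us training stopped 3 batches into
# epoch 0 (hypothetical values for illustration).
starting_epoch, resume_step, num_train_epochs = 0, 3, 2

for epoch in range(starting_epoch, num_train_epochs):
    model.train()
    if epoch == starting_epoch and resume_step is not None:
        # Skip the already-trained batches at the dataloader level instead of
        # looping over them and calling `continue`
        active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
    else:
        active_dataloader = train_dataloader
    for step, (inputs, labels) in enumerate(active_dataloader):
        loss = torch.nn.functional.cross_entropy(model(inputs), labels)
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()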

examples/pytorch/image-classification/run_image_classification_no_trainer.py

Lines changed: 11 additions & 7 deletions
@@ -451,22 +451,26 @@ def collate_fn(examples):
         if "epoch" in training_difference:
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
+            completed_steps = starting_epoch * num_update_steps_per_epoch
         else:
             resume_step = int(training_difference.replace("step_", ""))
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
+            completed_steps = resume_step
+
+        # update the progress_bar if load from checkpoint
+        progress_bar.update(completed_steps)
 
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
             total_loss = 0
-        for step, batch in enumerate(train_dataloader):
-            # We need to skip steps until we reach the resumed step
-            if args.resume_from_checkpoint and epoch == starting_epoch:
-                if resume_step is not None and step < resume_step:
-                    completed_steps += 1
-                    continue
-
+        if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
+            # We skip the first `n` batches in the dataloader when resuming from a checkpoint
+            active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
+        else:
+            active_dataloader = train_dataloader
+        for step, batch in enumerate(active_dataloader):
             with accelerator.accumulate(model):
                 outputs = model(**batch)
                 loss = outputs.loss
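Two things change in each script. First, `completed_steps` is computed once, right after the checkpoint name is parsed, and `progress_bar.update(completed_steps)` restores the progress bar in a single call instead of one update per skipped batch. Second, the per-batch skip loop is replaced by `accelerator.skip_first_batches`, which for typical map-style datasets skips at the sampler level rather than loading and discarding each batch.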

examples/pytorch/image-pretraining/run_mim_no_trainer.py

Lines changed: 9 additions & 11 deletions
@@ -660,29 +660,27 @@ def preprocess_images(examples):
         if "epoch" in training_difference:
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
+            completed_steps = starting_epoch * num_update_steps_per_epoch
         else:
             # need to multiply `gradient_accumulation_steps` to reflect real steps
             resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
+            completed_steps = resume_step
 
         # update the progress_bar if load from checkpoint
-        progress_bar.update(starting_epoch * num_update_steps_per_epoch)
-        completed_steps = starting_epoch * num_update_steps_per_epoch
+        progress_bar.update(completed_steps)
 
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
             total_loss = 0
-        for step, batch in enumerate(train_dataloader):
-            # We need to skip steps until we reach the resumed step
-            if args.resume_from_checkpoint and epoch == starting_epoch:
-                if resume_step is not None and step < resume_step:
-                    if step % args.gradient_accumulation_steps == 0:
-                        progress_bar.update(1)
-                    completed_steps += 1
-                    continue
-
+        if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
+            # We skip the first `n` batches in the dataloader when resuming from a checkpoint
+            active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
+        else:
+            active_dataloader = train_dataloader
+        for step, batch in enumerate(active_dataloader):
             with accelerator.accumulate(model):
                 outputs = model(**batch)
                 loss = outputs.loss
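Note the extra bookkeeping in the scripts that use gradient accumulation (this one, run_clm_no_trainer.py, and run_mlm_no_trainer.py): checkpoints are named by optimizer step, while `skip_first_batches` counts dataloader batches. With hypothetical numbers, a checkpoint named `step_500` under `gradient_accumulation_steps = 4` gives `resume_step = 500 * 4 = 2000` batches; if `len(train_dataloader)` is 1500, then `starting_epoch = 2000 // 1500 = 1` and `resume_step = 2000 - 1 * 1500 = 500`, i.e. training resumes 500 batches into epoch 1.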

examples/pytorch/language-modeling/run_clm_no_trainer.py

Lines changed: 9 additions & 11 deletions
@@ -566,29 +566,27 @@ def group_texts(examples):
         if "epoch" in training_difference:
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
+            completed_steps = starting_epoch * num_update_steps_per_epoch
         else:
             # need to multiply `gradient_accumulation_steps` to reflect real steps
             resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
+            completed_steps = resume_step
 
         # update the progress_bar if load from checkpoint
-        progress_bar.update(starting_epoch * num_update_steps_per_epoch)
-        completed_steps = starting_epoch * num_update_steps_per_epoch
+        progress_bar.update(completed_steps)
 
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
             total_loss = 0
-        for step, batch in enumerate(train_dataloader):
-            # We need to skip steps until we reach the resumed step
-            if args.resume_from_checkpoint and epoch == starting_epoch:
-                if resume_step is not None and step < resume_step:
-                    if step % args.gradient_accumulation_steps == 0:
-                        progress_bar.update(1)
-                    completed_steps += 1
-                    continue
-
+        if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
+            # We skip the first `n` batches in the dataloader when resuming from a checkpoint
+            active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
+        else:
+            active_dataloader = train_dataloader
+        for step, batch in enumerate(active_dataloader):
             with accelerator.accumulate(model):
                 outputs = model(**batch)
                 loss = outputs.loss

examples/pytorch/language-modeling/run_mlm_no_trainer.py

Lines changed: 9 additions & 11 deletions
@@ -610,29 +610,27 @@ def group_texts(examples):
         if "epoch" in training_difference:
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
+            completed_steps = starting_epoch * num_update_steps_per_epoch
         else:
             # need to multiply `gradient_accumulation_steps` to reflect real steps
             resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
+            completed_steps = resume_step
 
         # update the progress_bar if load from checkpoint
-        progress_bar.update(starting_epoch * num_update_steps_per_epoch)
-        completed_steps = starting_epoch * num_update_steps_per_epoch
+        progress_bar.update(completed_steps)
 
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
             total_loss = 0
-        for step, batch in enumerate(train_dataloader):
-            # We need to skip steps until we reach the resumed step
-            if args.resume_from_checkpoint and epoch == starting_epoch:
-                if resume_step is not None and step < resume_step:
-                    if step % args.gradient_accumulation_steps == 0:
-                        progress_bar.update(1)
-                    completed_steps += 1
-                    continue
-
+        if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
+            # We skip the first `n` batches in the dataloader when resuming from a checkpoint
+            active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
+        else:
+            active_dataloader = train_dataloader
+        for step, batch in enumerate(active_dataloader):
             with accelerator.accumulate(model):
                 outputs = model(**batch)
                 loss = outputs.loss

examples/pytorch/multiple-choice/run_swag_no_trainer.py

Lines changed: 11 additions & 7 deletions
@@ -557,22 +557,26 @@ def preprocess_function(examples):
         if "epoch" in training_difference:
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
+            completed_steps = starting_epoch * num_update_steps_per_epoch
         else:
             resume_step = int(training_difference.replace("step_", ""))
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
+            completed_steps = resume_step
+
+        # update the progress_bar if load from checkpoint
+        progress_bar.update(completed_steps)
 
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
             total_loss = 0
-        for step, batch in enumerate(train_dataloader):
-            # We need to skip steps until we reach the resumed step
-            if args.resume_from_checkpoint and epoch == starting_epoch:
-                if resume_step is not None and step < resume_step:
-                    completed_steps += 1
-                    continue
-
+        if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
+            # We skip the first `n` batches in the dataloader when resuming from a checkpoint
+            active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
+        else:
+            active_dataloader = train_dataloader
+        for step, batch in enumerate(active_dataloader):
             with accelerator.accumulate(model):
                 outputs = model(**batch)
                 loss = outputs.loss

examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py

Lines changed: 11 additions & 7 deletions
@@ -809,22 +809,26 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
         if "epoch" in training_difference:
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
+            completed_steps = starting_epoch * num_update_steps_per_epoch
         else:
             resume_step = int(training_difference.replace("step_", ""))
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
+            completed_steps = resume_step
+
+        # update the progress_bar if load from checkpoint
+        progress_bar.update(completed_steps)
 
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
             total_loss = 0
-        for step, batch in enumerate(train_dataloader):
-            # We need to skip steps until we reach the resumed step
-            if args.resume_from_checkpoint and epoch == starting_epoch:
-                if resume_step is not None and step < resume_step:
-                    completed_steps += 1
-                    continue
-
+        if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
+            # We skip the first `n` batches in the dataloader when resuming from a checkpoint
+            active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
+        else:
+            active_dataloader = train_dataloader
+        for step, batch in enumerate(active_dataloader):
             with accelerator.accumulate(model):
                 outputs = model(**batch)
                 loss = outputs.loss

examples/pytorch/question-answering/run_qa_no_trainer.py

Lines changed: 11 additions & 7 deletions
@@ -825,22 +825,26 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
         if "epoch" in training_difference:
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
+            completed_steps = starting_epoch * num_update_steps_per_epoch
         else:
             resume_step = int(training_difference.replace("step_", ""))
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
+            completed_steps = resume_step
+
+        # update the progress_bar if load from checkpoint
+        progress_bar.update(completed_steps)
 
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
             total_loss = 0
-        for step, batch in enumerate(train_dataloader):
-            # We need to skip steps until we reach the resumed step
-            if args.resume_from_checkpoint and epoch == starting_epoch:
-                if resume_step is not None and step < resume_step:
-                    completed_steps += 1
-                    continue
-
+        if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
+            # We skip the first `n` batches in the dataloader when resuming from a checkpoint
+            active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
+        else:
+            active_dataloader = train_dataloader
+        for step, batch in enumerate(active_dataloader):
             with accelerator.accumulate(model):
                 outputs = model(**batch)
                 loss = outputs.loss

examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py

Lines changed: 12 additions & 8 deletions
@@ -554,22 +554,26 @@ def preprocess_val(example_batch):
         if "epoch" in training_difference:
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
+            completed_steps = starting_epoch * num_update_steps_per_epoch
         else:
             resume_step = int(training_difference.replace("step_", ""))
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
+            completed_steps = resume_step
+
+        # update the progress_bar if load from checkpoint
+        progress_bar.update(completed_steps)
 
     for epoch in range(starting_epoch, args.num_train_epochs):
+        model.train()
         if args.with_tracking:
             total_loss = 0
-        model.train()
-        for step, batch in enumerate(train_dataloader):
-            # We need to skip steps until we reach the resumed step
-            if args.resume_from_checkpoint and epoch == starting_epoch:
-                if resume_step is not None and step < resume_step:
-                    completed_steps += 1
-                    continue
-
+        if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
+            # We skip the first `n` batches in the dataloader when resuming from a checkpoint
+            active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
+        else:
+            active_dataloader = train_dataloader
+        for step, batch in enumerate(active_dataloader):
             with accelerator.accumulate(model):
                 outputs = model(**batch)
                 loss = outputs.loss
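This hunk also moves `model.train()` to the top of the epoch loop, before the tracking setup and dataloader choice, which matches the ordering used by the other example scripts.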

examples/pytorch/summarization/run_summarization_no_trainer.py

Lines changed: 11 additions & 7 deletions
@@ -626,22 +626,26 @@ def postprocess_text(preds, labels):
         if "epoch" in training_difference:
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
+            completed_steps = starting_epoch * num_update_steps_per_epoch
         else:
             resume_step = int(training_difference.replace("step_", ""))
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
+            completed_steps = resume_step
+
+        # update the progress_bar if load from checkpoint
+        progress_bar.update(completed_steps)
 
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
             total_loss = 0
-        for step, batch in enumerate(train_dataloader):
-            # We need to skip steps until we reach the resumed step
-            if args.resume_from_checkpoint and epoch == starting_epoch:
-                if resume_step is not None and step < resume_step:
-                    completed_steps += 1
-                    continue
-
+        if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
+            # We skip the first `n` batches in the dataloader when resuming from a checkpoint
+            active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
+        else:
+            active_dataloader = train_dataloader
+        for step, batch in enumerate(active_dataloader):
             with accelerator.accumulate(model):
                 outputs = model(**batch)
                 loss = outputs.loss

examples/pytorch/text-classification/run_glue_no_trainer.py

Lines changed: 6 additions & 6 deletions
@@ -510,12 +510,12 @@ def preprocess_function(examples):
         model.train()
         if args.with_tracking:
             total_loss = 0
-        for step, batch in enumerate(train_dataloader):
-            # We need to skip steps until we reach the resumed step
-            if args.resume_from_checkpoint and epoch == starting_epoch:
-                if resume_step is not None and step < resume_step:
-                    completed_steps += 1
-                    continue
+        if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
+            # We skip the first `n` batches in the dataloader when resuming from a checkpoint
+            active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
+        else:
+            active_dataloader = train_dataloader
+        for step, batch in enumerate(active_dataloader):
             outputs = model(**batch)
             loss = outputs.loss
             # We keep track of the loss at each epoch
