
Commit 919dd05

Merge branch 'bug-6623' into cleanup-result-on-attributes
2 parents: 265d06d + 108fbd3

File tree

16 files changed: +69 -330 lines

CHANGELOG.md
Lines changed: 3 additions & 0 deletions

@@ -153,6 +153,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Removed deprecated `LightningModule` `hparams` setter ([#6207](https://github.com/PyTorchLightning/pytorch-lightning/pull/6207))
 
 
+- Removed legacy code to include `step` dictionary returns in `callback_metrics` ([#6682](https://github.com/PyTorchLightning/pytorch-lightning/pull/6682))
+
+
 - Removed `optimizer_idx` argument from `training_step` in manual optimization ([#6093](https://github.com/PyTorchLightning/pytorch-lightning/pull/6093))
 
 
docs/source/ecosystem/asr_nlp_tts.rst
Lines changed: 5 additions & 5 deletions

@@ -270,12 +270,12 @@ with PyTorch Lightning since every NeMo model is a Lightning Module.
             log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len
         )
         wer_num, wer_denom = self._wer(predictions, transcript, transcript_len)
-        tensorboard_logs = {
+        self.log_dict({
             'train_loss': loss_value,
             'training_batch_wer': wer_num / wer_denom,
             'learning_rate': self._optimizer.param_groups[0]['lr'],
-        }
-        return {'loss': loss_value, 'log': tensorboard_logs}
+        })
+        return loss_value
 
 Neural Types in NeMo ASR
 ------------------------
@@ -539,8 +539,8 @@ since every NeMo model is a Lightning Module.
         logits = self(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask)
 
         loss = self.loss(logits=logits, labels=labels, loss_mask=loss_mask)
-        tensorboard_logs = {'train_loss': loss, 'lr': self._optimizer.param_groups[0]['lr']}
-        return {'loss': loss, 'log': tensorboard_logs}
+        self.log_dict({'train_loss': loss, 'lr': self._optimizer.param_groups[0]['lr']})
+        return loss
         ...
 
 Neural Types in NeMo NLP

docs/source/ecosystem/bolts.rst
Lines changed: 2 additions & 2 deletions

@@ -68,8 +68,8 @@ you can trust the implementations and use them to bootstrap your research much f
 
         loss = self.criterion(logits.view(-1, logits.size(-1)), x.view(-1).long())
 
-        logs = {"loss": loss}
-        return {"loss": loss, "log": logs}
+        self.log("loss", loss)
+        return loss
 
 ----------
 
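The documentation updates above all follow the same migration: drop the dictionary return with a 'log' key and log metrics explicitly with self.log / self.log_dict. A minimal before/after sketch for a hypothetical LightningModule (illustrative only, not code from this repository):

    import torch
    from torch.nn import functional as F
    import pytorch_lightning as pl


    class LitClassifier(pl.LightningModule):
        """Hypothetical module illustrating the logging migration."""

        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(32, 2)

        def training_step(self, batch, batch_idx):
            x, y = batch
            loss = F.cross_entropy(self.layer(x), y)

            # Legacy style removed by this commit: extra keys and the 'log'
            # dict were harvested into trainer.callback_metrics.
            # return {'loss': loss, 'log': {'train_loss': loss}}

            # Current style: metrics reach loggers, the progress bar and
            # callbacks only through self.log / self.log_dict.
            self.log('train_loss', loss, prog_bar=True)
            return loss

        def configure_optimizers(self):
            return torch.optim.SGD(self.parameters(), lr=0.1)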

pytorch_lightning/callbacks/model_checkpoint.py
Lines changed: 1 addition & 1 deletion

@@ -590,7 +590,7 @@ def _validate_monitor_key(self, trainer):
         m = (
             f"ModelCheckpoint(monitor='{self.monitor}') not found in the returned metrics:"
             f" {list(metrics.keys())}. "
-            f"HINT: Did you call self.log('{self.monitor}', tensor) in the LightningModule?"
+            f"HINT: Did you call self.log('{self.monitor}', value) in the LightningModule?"
         )
         raise MisconfigurationException(m)
 
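The reworded hint reflects that a monitored key only exists if it was logged from the LightningModule. A short illustrative configuration (the 'val_loss' key name is a hypothetical example):

    from pytorch_lightning.callbacks import ModelCheckpoint

    # 'val_loss' must be produced via self.log('val_loss', ...) inside the
    # LightningModule (e.g. in validation_step); otherwise ModelCheckpoint
    # raises the MisconfigurationException constructed above.
    checkpoint_callback = ModelCheckpoint(monitor='val_loss', mode='min')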

pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py
Lines changed: 0 additions & 1 deletion

@@ -346,7 +346,6 @@ def update_logger_connector(self) -> Tuple[Dict, Dict]:
 
         # update callback_metrics
         logger_connector._callback_metrics.update(callback_metrics)
-        logger_connector._callback_metrics.pop("epoch", None)
 
         batch_pbar_metrics.pop("debug_epoch", None)
         return batch_pbar_metrics, batch_log_metrics

pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
Lines changed: 11 additions & 39 deletions

@@ -78,7 +78,7 @@ def progress_bar_metrics(self, progress_bar_metrics: Dict) -> None:
 
     @property
     def cached_results(self) -> Union[EpochResultStore, None]:
-        return self._cached_results.get(self.trainer._running_stage)  # type: ignore
+        return self._cached_results.get(self.trainer._running_stage)
 
     def get_metrics(self, key: str) -> Dict:
         metrics_holder: MetricsHolder = getattr(self, f"_{key}")
@@ -121,8 +121,6 @@ def cache_logged_metrics(self):
     def on_trainer_init(self, logger, flush_logs_every_n_steps: int, log_every_n_steps: int, move_metrics_to_cpu: bool):
         # logging
         self.configure_logger(logger)
-        # todo: IDE is complaining, these shall be initialized in the Trainer init at leas as placeholders
-        # and assign here the desired value
         self.trainer.flush_logs_every_n_steps = flush_logs_every_n_steps
         self.trainer.log_every_n_steps = log_every_n_steps
         self.trainer.move_metrics_to_cpu = move_metrics_to_cpu
@@ -185,9 +183,6 @@ def cache_training_step_metrics(self, opt_closure_result):
         batch_log_metrics = opt_closure_result.training_step_output.log_metrics
         logged_metrics_tmp.update(batch_log_metrics)
 
-        callback_metrics = opt_closure_result.training_step_output.callback_metrics
-        callback_metrics_tmp.update(callback_metrics)
-
         batch_pbar_metrics = opt_closure_result.training_step_output.pbar_on_batch_end
         pbar_metrics_tmp.update(batch_pbar_metrics)
 
@@ -210,9 +205,6 @@ def log_metrics(self, metrics, grad_norm_dic, step=None):
             metrics (dict): Metric values
             grad_norm_dic (dict): Gradient norms
             step (int): Step for which metrics should be logged. Default value corresponds to `self.global_step`
-            log_train_step_metrics (bool): Used to track if `log_metrics` function is being called in during training
-                steps. In training steps, we will log metrics on step: `total_nb_idx` (for accumulated gradients)
-                and global_step for the rest.
         """
         # add gpu memory
         if self.trainer._device_type == DeviceType.GPU and self.log_gpu_memory:
@@ -339,9 +331,9 @@ def _track_callback_metrics(self, eval_results):
         if self.trainer.state in (TrainerState.TESTING, TrainerState.VALIDATING):
             self.trainer.logger_connector.evaluation_callback_metrics.update(flat)
 
-    def __process_eval_epoch_end_results_and_log_legacy_update(self, prog_bar_metrics, log_metrics, callback_metrics):
+    def __process_eval_epoch_end_results_and_log_legacy_update(self, prog_bar_metrics, log_metrics):
         # eval loop returns all metrics
-        dataloader_result_metrics = {**prog_bar_metrics, **log_metrics, **callback_metrics}
+        dataloader_result_metrics = {**prog_bar_metrics, **log_metrics}
 
         # add metrics to prog bar
         self.trainer.logger_connector.add_progress_bar_metrics(prog_bar_metrics)
@@ -350,13 +342,6 @@ def __process_eval_epoch_end_results_and_log_legacy_update(self, prog_bar_metric
         if len(log_metrics) > 0:
             self.trainer.logger_connector.log_metrics(log_metrics, {})
 
-        # track metrics for callbacks (all prog bar, logged and callback metrics)
-        callback_metrics.update(log_metrics)
-        callback_metrics.update(prog_bar_metrics)
-        self.trainer.logger_connector.callback_metrics.update(callback_metrics)
-        if self.trainer.state in (TrainerState.TESTING, TrainerState.VALIDATING):
-            self.trainer.logger_connector.evaluation_callback_metrics.update(callback_metrics)
-
         if len(dataloader_result_metrics) > 0:
             self.eval_loop_results.append(dataloader_result_metrics)
 
@@ -371,20 +356,16 @@ def __process_eval_epoch_end_results_and_log_legacy(self, eval_results):
             eval_results = [eval_results]
 
         num_loaders: int = self.trainer.evaluation_loop.num_dataloaders
-        prog_bar_metrics, log_metrics, callback_metrics = {}, {}, {}
+        prog_bar_metrics, log_metrics = {}, {}
 
         for result_idx, result in enumerate(eval_results):
-            _, prog_bar_metrics, log_metrics, callback_metrics = self.trainer.process_dict_result(result)
+            _, prog_bar_metrics, log_metrics = self.trainer.process_dict_result(result)
 
             if num_loaders > 1:
-                self.__process_eval_epoch_end_results_and_log_legacy_update(
-                    prog_bar_metrics, log_metrics, callback_metrics
-                )
+                self.__process_eval_epoch_end_results_and_log_legacy_update(prog_bar_metrics, log_metrics)
 
         if num_loaders == 1:
-            self.__process_eval_epoch_end_results_and_log_legacy_update(
-                prog_bar_metrics, log_metrics, callback_metrics
-            )
+            self.__process_eval_epoch_end_results_and_log_legacy_update(prog_bar_metrics, log_metrics)
 
     def on_train_epoch_end(self):
         # inform cached logger connector epoch finished
@@ -397,8 +378,6 @@ def log_train_epoch_end_metrics(self, epoch_output, num_optimizers):
 
         model = self.trainer.lightning_module
 
-        epoch_callback_metrics = {}
-
         # ------------------------
         # determine if using a result obj
         # ------------------------
@@ -426,10 +405,9 @@ def log_train_epoch_end_metrics(self, epoch_output, num_optimizers):
 
         # TODO: deprecate 1.0
         else:
-            out = self.__run_legacy_training_epoch_end(
-                num_optimizers, epoch_output, model, is_result_obj, epoch_callback_metrics
+            epoch_log_metrics, epoch_progress_bar_metrics = self.__run_legacy_training_epoch_end(
+                num_optimizers, epoch_output, model, is_result_obj
             )
-            epoch_log_metrics, epoch_progress_bar_metrics, epoch_callback_metrics = out
 
         # it will perform reduction over epoch and return log metrics
         cached_epoch_log_metrics = self.cached_results.get_epoch_log_metrics()
@@ -447,9 +425,6 @@ def log_train_epoch_end_metrics(self, epoch_output, num_optimizers):
             self.log_metrics(epoch_log_metrics, {})
             self._callback_metrics.update(epoch_log_metrics)
 
-        # add metrics to callbacks
-        self._callback_metrics.update(epoch_callback_metrics)
-
         # add metrics to progress_bar and callbacks
         if len(epoch_progress_bar_metrics) > 0:
             self.add_progress_bar_metrics(epoch_progress_bar_metrics)
@@ -481,9 +456,7 @@ def training_epoch_end(self, model, epoch_output, num_optimizers):
         # capture logging
         self.trainer.logger_connector.cache_logged_metrics()
 
-    def __run_legacy_training_epoch_end(
-        self, num_optimizers, epoch_output, model, is_result_obj, epoch_callback_metrics
-    ):
+    def __run_legacy_training_epoch_end(self, num_optimizers, epoch_output, model, is_result_obj):
 
         epoch_log_metrics = {}
         epoch_progress_bar_metrics = {}
@@ -514,15 +487,14 @@ def __run_legacy_training_epoch_end(
             _processed_outputs = self.trainer.process_dict_result(epoch_output)
             epoch_progress_bar_metrics = _processed_outputs[1]
             epoch_log_metrics = _processed_outputs[2]
-            epoch_callback_metrics = _processed_outputs[3]
 
         # --------------------------
         # Structured Result (auto epoch end)
         # --------------------------
         elif is_result_obj:
             epoch_log_metrics, epoch_progress_bar_metrics = self.__auto_reduce_results_on_epoch_end(epoch_output)
 
-        return epoch_log_metrics, epoch_progress_bar_metrics, epoch_callback_metrics
+        return epoch_log_metrics, epoch_progress_bar_metrics
 
     def __auto_reduce_results_on_epoch_end(self, epoch_output):
         epoch_log_metrics = {}

pytorch_lightning/trainer/logging.py
Lines changed: 9 additions & 23 deletions

@@ -14,12 +14,12 @@
 
 import inspect
 from abc import ABC
-from typing import Mapping
 
 import torch
 
 from pytorch_lightning.utilities import DistributedType
 from pytorch_lightning.utilities.distributed import rank_zero_warn
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.memory import recursive_detach
 
 
@@ -32,8 +32,14 @@ class TrainerLoggingMixin(ABC):
 
     def metrics_to_scalars(self, metrics):
         new_metrics = {}
+        # TODO: this is duplicated in MetricsHolder. should be unified
         for k, v in metrics.items():
             if isinstance(v, torch.Tensor):
+                if v.numel() != 1:
+                    raise MisconfigurationException(
+                        f"The metric `{k}` does not contain a single element"
+                        f" thus it cannot be converted to float. Found `{v}`"
+                    )
                 v = v.item()
 
             if isinstance(v, dict):
@@ -71,22 +77,7 @@ def process_dict_result(self, output, train=False):
         if isinstance(output, torch.Tensor):
             progress_bar_metrics = {}
             log_metrics = {}
-            callback_metrics = {}
-            return output, progress_bar_metrics, log_metrics, callback_metrics
-
-        # ---------------
-        # EXTRACT CALLBACK KEYS
-        # ---------------
-        # all keys not progress_bar or log are candidates for callbacks
-        callback_metrics = {}
-        if isinstance(output, Mapping):
-            for k, v in output.items():
-                if k not in ['progress_bar', 'log']:
-                    callback_metrics[k] = v
-
-        if train and self._distrib_type in (DistributedType.DP, DistributedType.DDP2):
-            num_gpus = self.num_gpus
-            callback_metrics = self.reduce_distributed_output(callback_metrics, num_gpus)
+            return output, progress_bar_metrics, log_metrics
 
         # ---------------
         # EXTRACT PROGRESS BAR KEYS
@@ -143,17 +134,12 @@ def process_dict_result(self, output, train=False):
         if self._distrib_type in (DistributedType.DP, DistributedType.DDP2):
             loss = self.reduce_distributed_output(loss, self.num_gpus)
 
-        # use every metric passed in as a candidate for callback
-        callback_metrics.update(progress_bar_metrics)
-        callback_metrics.update(log_metrics)
-
         # detach all metrics for callbacks to prevent memory leaks
        # no .item() because it will slow things down
-        callback_metrics = recursive_detach(callback_metrics)
         progress_bar_metrics = recursive_detach(progress_bar_metrics)
         log_metrics = recursive_detach(log_metrics)
 
-        return loss, progress_bar_metrics, log_metrics, callback_metrics
+        return loss, progress_bar_metrics, log_metrics
 
     def reduce_distributed_output(self, output, num_gpus):
         if num_gpus <= 1:
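The new guard in metrics_to_scalars rejects tensors with more than one element instead of failing inside `.item()`. A standalone sketch of the behaviour it enforces (simplified; it raises a plain ValueError rather than MisconfigurationException and is not the trainer code itself):

    import torch


    def to_scalars(metrics: dict) -> dict:
        """Simplified stand-in for the check added to metrics_to_scalars."""
        converted = {}
        for name, value in metrics.items():
            if isinstance(value, torch.Tensor):
                if value.numel() != 1:
                    raise ValueError(
                        f"The metric `{name}` does not contain a single element"
                        f" thus it cannot be converted to float. Found `{value}`"
                    )
                value = value.item()
            converted[name] = value
        return converted


    to_scalars({'acc': torch.tensor(0.91)})        # -> {'acc': 0.91}
    to_scalars({'acc': torch.tensor([0.9, 0.8])})  # raises ValueError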

pytorch_lightning/trainer/trainer.py
Lines changed: 0 additions & 9 deletions

@@ -823,15 +823,6 @@ def run_sanity_check(self, ref_model):
         # run eval step
         _, eval_results = self.run_evaluation()
 
-        # allow no returns from eval
-        if eval_results is not None and len(eval_results) > 0:
-            # when we get a list back, used only the last item
-            if isinstance(eval_results, list):
-                eval_results = eval_results[-1]
-
-            _, _, _, callback_metrics = self.process_dict_result(eval_results)
-            self.logger_connector.callback_metrics = callback_metrics
-
         self.on_sanity_check_end()
 
         self._running_stage = stage

pytorch_lightning/trainer/training_loop.py
Lines changed: 0 additions & 1 deletion

@@ -341,7 +341,6 @@ def _process_training_step_output(self, training_step_output, split_batch):
             batch_loss=training_step_output[0],
             pbar_on_batch_end=training_step_output[1],
             log_metrics=training_step_output[2],
-            callback_metrics=training_step_output[3],
         )
         # if the user decides to finally reduce things in epoch_end, save raw output without graphs
         if isinstance(training_step_output_for_epoch_end, torch.Tensor):

tests/base/model_valid_epoch_ends.py
Lines changed: 2 additions & 3 deletions

@@ -43,9 +43,8 @@ def _mean(res, key):
         val_loss_mean = val_loss_mean.item()
         val_acc_mean = val_acc_mean.item()
 
-        metrics_dict = {'early_stop_on': val_loss_mean, 'val_acc': val_acc_mean}
-        results = {'progress_bar': metrics_dict, 'log': metrics_dict}
-        return results
+        self.log('early_stop_on', val_loss_mean, prog_bar=True)
+        self.log('val_acc', val_acc_mean, prog_bar=True)
 
     def validation_epoch_end__multiple_dataloaders(self, outputs):
         """
