diff --git a/.gitignore b/.gitignore index 7c699310bf33a17ed7bf80b56ed9d492b2595ca7..e804e434ede13728880664421113a066cba9d9c4 100644 --- a/.gitignore +++ b/.gitignore @@ -68,6 +68,9 @@ instance/ # Scrapy stuff: .scrapy +# Weights & Biases stuff: +wandb + # PyBuilder target/ diff --git a/dan/ocr/manager/training.py b/dan/ocr/manager/training.py index 94bd6bdd6f921207044899da030919a1a628ced9..0c18b2ae23a32b7f3ee5c778a3eeaba55691b9c2 100644 --- a/dan/ocr/manager/training.py +++ b/dan/ocr/manager/training.py @@ -16,6 +16,7 @@ import numpy as np import torch import torch.distributed as dist import torch.multiprocessing as mp +import wandb import yaml from torch.cuda.amp import GradScaler, autocast from torch.nn import CrossEntropyLoss @@ -640,6 +641,14 @@ class GenericTrainingManager: display_values = None # perform epochs for num_epoch in range(self.latest_epoch + 1, nb_epochs): + # Whether we will evaluate this epoch + evaluate_epoch = ( + self.params["training"]["validation"]["eval_on_valid"] + and num_epoch + % self.params["training"]["validation"]["eval_on_valid_interval"] + == 0 + ) + # set models trainable for model_name in self.models: self.models[model_name].train() @@ -707,7 +716,7 @@ class GenericTrainingManager: ) if self.is_master: - # log metrics in tensorboard file + # Log metrics in tensorboard file for key in display_values: self.writer.add_scalar( "train/{}_{}".format( @@ -717,20 +726,28 @@ class GenericTrainingManager: num_epoch, ) + # Log "Weights & Biases" metrics + if self.params.get("wandb"): + wandb.log( + data={ + f"train_{key}": value + for key, value in display_values.items() + }, + step=num_epoch, + commit=not evaluate_epoch, # Do not commit now if data will be updated with the evaluation + ) + # evaluate and compute metrics for valid sets - if ( - self.params["training"]["validation"]["eval_on_valid"] - and num_epoch - % self.params["training"]["validation"]["eval_on_valid_interval"] - == 0 - ): + if evaluate_epoch: for valid_set_name in 
self.dataset.valid_loaders: # evaluate set and compute metrics eval_values = self.validate( valid_set_name, mlflow_logging=mlflow_logging ) - # log valid metrics in tensorboard file + + # Log metrics if self.is_master: + # Log metrics in tensorboard file for key in eval_values: self.writer.add_scalar( "valid/{}_{}".format(valid_set_name, key), @@ -743,6 +760,17 @@ class GenericTrainingManager: self.save_model(epoch=num_epoch, name="best") self.best = eval_values["cer"] + # Log "Weights & Biases" metrics + if self.params.get("wandb"): + wandb.log( + data={ + f"val_{key}": value + for key, value in eval_values.items() + }, + step=num_epoch, + commit=True, + ) + # save model weights if self.is_master: self.save_model(epoch=num_epoch, name="last") diff --git a/docs/usage/train/config.md b/docs/usage/train/config.md index fc6765ddea282d4db30f248a11e14cb669f687d9..5b4ecebd55d9a29f1e6844a4779fa44c47d359e2 100644 --- a/docs/usage/train/config.md +++ b/docs/usage/train/config.md @@ -274,9 +274,9 @@ $ pip install .[mlflow] ## Weights & Biases logging -To log your experiment on [Weights & Biases](https://wandb.ai/), you need to: +To log your run on [Weights & Biases](https://wandb.ai/) (W&B), you need to: -- [login to Weights & Biases](https://docs.wandb.ai/ref/cli/wandb-login) via +- [login to W&B](https://docs.wandb.ai/ref/cli/wandb-login) via ```shell wandb login @@ -284,6 +284,18 @@ wandb login - update the following arguments: -| Name | Description | Type | Default | -| ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------ | ------- | -| `wandb` | Key and values to use to initialise your experiment on Weights & Biases. See the full list of available keys on [the official documentation](https://docs.wandb.ai/ref/python/init). 
| `dict` | | +| Name | Description | Type | Default | +| ------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------ | ------- | +| `wandb` | Key and values to use to initialise your experiment on W&B. See the full list of available keys on [the official documentation](https://docs.wandb.ai/ref/python/init).  | `dict` | | + +!!! warning +    To resume your DAN training on W&B, you need to set the `wandb.id` key to the ID of your W&B run and set the `wandb.resume` key to `must`. Otherwise, W&B will create a new run. + +Using W&B during DAN training will allow you to follow the DAN training with a W&B run. This run will automatically record: + +- a **configuration** using the DAN training configuration. Any `wandb.config.*` keys and values found in the DAN training configuration will be added to the W&B run configuration. + +!!! warning +    If a `wandb.config.*` key exists in the DAN training configuration (e.g. `dataset`, `model`, `training`...), then the W&B run configuration will record the `wandb.config.*` value instead of using the value of the DAN training configuration. + +- **metrics** listed in the `training.metrics` key of the DAN training configuration. To edit the metrics to log to W&B, see [the dedicated section](#metrics).