Skip to content
Snippets Groups Projects
Commit 28785c80 authored by Yoann Schneider's avatar Yoann Schneider :tennis:
Browse files

Merge branch 'wandb-log-images' into 'main'

W&B: Log images

See merge request !444
parents a816c15c b9df8068
No related branches found
No related tags found
1 merge request!444W&B: Log images
......@@ -97,7 +97,7 @@
"set_name_focus_metric": "$dataset_name-val",
"font": "fonts/LinuxLibertine.ttf",
"maximum_font_size": 32,
"n_tensorboard_images": 5
"nb_logged_images": 5
},
"output_folder": "$dataset_path/output",
"max_nb_epochs": 800,
......
......@@ -87,7 +87,7 @@
"set_name_focus_metric": "training-val",
"font": "fonts/LinuxLibertine.ttf",
"maximum_font_size": 32,
"n_tensorboard_images": 1
"nb_logged_images": 1
},
"output_folder": "dan_trained_model",
"gradient_clipping": {},
......
......@@ -63,8 +63,8 @@ class GenericTrainingManager:
self.maximum_font_size = self.params["training"]["validation"][
"maximum_font_size"
]
self.n_tensorboard_images = self.params["training"]["validation"][
"n_tensorboard_images"
self.nb_logged_images = self.params["training"]["validation"][
"nb_logged_images"
]
self.optimizers = dict()
......@@ -734,7 +734,9 @@ class GenericTrainingManager:
for key, value in display_values.items()
},
step=num_epoch,
commit=not evaluate_epoch, # Do not commit now if data will be updated with the evaluation
# With "Weights & Biases" we can only publish once per step
# Do not commit now if data will be updated with the evaluation
commit=not evaluate_epoch,
)
# evaluate and compute metrics for valid sets
......@@ -768,6 +770,8 @@ class GenericTrainingManager:
for key, value in display_values.items()
},
step=self.latest_epoch,
# With "Weights & Biases" we can only publish once per step
# Publish now because no data will be added for this step
commit=True,
)
......@@ -816,7 +820,7 @@ class GenericTrainingManager:
pbar.set_postfix(values=str(display_values))
pbar.update(len(batch_data["names"]) * self.nb_workers)
if ind_batch < self.n_tensorboard_images:
if ind_batch < self.nb_logged_images:
image = loader.dataset.get_sample_img(ind_batch)
result = create_image(
image,
......@@ -835,6 +839,20 @@ class GenericTrainingManager:
self.latest_epoch,
)
# Log "Weights & Biases" metrics
if self.params.get("wandb", {}).get("images"):
wandb.log(
data={
f"valid/image_{batch_data['names'][0]}": wandb.Image(
result
)
},
step=self.latest_epoch,
# With "Weights & Biases" we can only publish once per step
# Do not commit now because data will be updated with the evaluation metrics
commit=False,
)
# log metrics in MLflow
logging_metrics(
display_values,
......
......@@ -124,9 +124,10 @@ def run(config: dict):
# Path should already exist when "Weights & Biases" is instantiated
Path(os.environ["WANDB_DIR"]).mkdir(parents=True, exist_ok=True)
wandb_config = config["wandb"].pop("config", {})
wandb_params = config["wandb"].get("init", {})
wandb_config = wandb_params.pop("config", {})
wandb.init(
**config["wandb"],
**wandb_params,
config={**config, **wandb_config},
)
......
......@@ -106,9 +106,9 @@ To train on several GPUs, simply set the `training.device.use_ddp` parameter to
| `training.validation.eval_on_valid` | Whether to evaluate and log metrics on the validation set during training. | `bool` | `True` |
| `training.validation.eval_on_valid_interval` | Interval (in epochs) to evaluate during training. | `int` | `5` |
| `training.validation.set_name_focus_metric` | Dataset to focus on to select best weights. | `str` | |
| `training.validation.font` | Path to the font used in the image in the tensorboard. | `str` | `fonts/LinuxLibertine.ttf` |
| `training.validation.maximum_font_size` | Maximum size used for the font of the image in the tensorboard. | `int` | |
| `training.validation.n_tensorboard_images` | Number of images in Tensorboard during validation. | `int` | `5` |
| `training.validation.font` | Path to the font used in the image to log. | `str` | `fonts/LinuxLibertine.ttf` |
| `training.validation.maximum_font_size` | Maximum size used for the font of the image to log. | `int` | |
| `training.validation.nb_logged_images` | Number of images to log during validation. | `int` | `5` |
During the validation stage, the batch size is set to 1. This avoids problems associated with image sizes that can be very different inside batches and lead to significant padding, resulting in performance degradations.
......@@ -284,33 +284,37 @@ wandb login
- update the following arguments:
| Name | Description | Type | Default |
| ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------ | ------- |
| `wandb` | Keys and values to use to initialise your experiment on W&B. See the full list of available keys on [the official documentation](https://docs.wandb.ai/ref/python/init). | `dict` | |
| Name | Description | Type | Default |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------ | ------- |
| `wandb.init` | Keys and values to use to initialise your experiment on W&B. See the full list of available keys on [the official documentation](https://docs.wandb.ai/ref/python/init). | `dict` | |
| `wandb.images` | Whether to log validation images during evaluation with their predicted transcription. | `bool` | `False` |
Using W&B during DAN training will allow you to follow the DAN training with a W&B run. This run will automatically record:
- a **configuration** using the DAN training configuration. Any `wandb.config.*` keys and values found in the DAN training configuration will be added to the W&B run configuration.
- a **configuration** using the DAN training configuration. Any `wandb.init.config.*` keys and values found in the DAN training configuration will be added to the W&B run configuration.
!!! warning
If a `wandb.config.*` key exists in the DAN training configuration (e.g `dataset`, `model`, `training`...) then the W&B run configuration will record the `wandb.config.*` value instead of using the value of the DAN training configuration.
If a `wandb.init.config.*` key exists in the DAN training configuration (e.g `dataset`, `model`, `training`...) then the W&B run configuration will record the `wandb.init.config.*` value instead of using the value of the DAN training configuration.
- **metrics** listed in the `training.metrics` key of the DAN training configuration. To edit the metrics to log to W&B see [the dedicated section](#metrics).
- **images** according to the `wandb.images` and `training.validation.*` keys of the DAN training configuration. To edit the images to log to W&B see [the dedicated section](#validation).
### Resume run
To be sure that your DAN training will only produce one W&B run even if your DAN training has been resumed, we strongly recommend defining these two keys **before** starting your DAN training:
- `wandb.id` with a unique ID that has never been used on your W&B project. We recommend you to generate a random 8-character word composed of letters and numbers using [the Short Unique ID (UUID) Generating Library](https://shortunique.id/).
- `wandb.resume` with the value `auto`.
- `wandb.init.id` with a unique ID that has never been used on your W&B project. We recommend you to generate a random 8-character word composed of letters and numbers using [the Short Unique ID (UUID) Generating Library](https://shortunique.id/).
- `wandb.init.resume` with the value `auto`.
The final configuration should look like:
```json
{
"wandb": {
"id": "<unique_ID>",
"resume": "auto"
"init": {
"id": "<unique_ID>",
"resume": "auto"
}
}
}
```
......@@ -319,14 +323,16 @@ Otherwise, W&B will create a new run for each DAN training session, even if the
### Offline mode
If you do not have Internet access during the DAN training, you can set the `wandb.mode` key to `offline` to use W&B's offline mode. W&B will create a `wandb` folder in the `training.output_folder` defined in the DAN training configuration. To use another location, see [the dedicated section](#training-parameters).
If you do not have Internet access during the DAN training, you can set the `wandb.init.mode` key to `offline` to use W&B's offline mode. W&B will create a `wandb` folder in the `training.output_folder` defined in the DAN training configuration. To use another location, see [the dedicated section](#training-parameters).
The final configuration should look like:
```json
{
"wandb": {
"mode": "offline"
"init": {
"mode": "offline"
}
}
}
```
......
......@@ -54,7 +54,7 @@ Alternatively, you can find them in the `Time series` tab.
### Predictions on the validation set
Five validation images are also displayed at each epoch, along with their predicted transcription and CER and WER.
To log more or less images, update the `training.validation.n_tensorboard_images` parameter in the [configuration file](config.md). The font and its size can also be changed.
To log more or fewer images, update the `training.validation.nb_logged_images` parameter in the [configuration file](config.md). The font and its size can also be changed.
To visualize them, go in the `Image` tab in Tensorboard.
......
......@@ -109,5 +109,6 @@ known-third-party = [
"torch",
"torchvision",
"tqdm",
"wandb",
"yaml"
]
......@@ -213,7 +213,7 @@ def test_evaluate(
evaluate_config["training"]["validation"]["font"] = "fonts/LinuxLibertine.ttf"
evaluate_config["training"]["validation"]["maximum_font_size"] = 32
evaluate_config["training"]["validation"]["n_tensorboard_images"] = 5
evaluate_config["training"]["validation"]["nb_logged_images"] = 5
evaluate.run(evaluate_config, evaluate.NERVAL_THRESHOLD, output_json=output_json)
......@@ -384,7 +384,7 @@ def test_evaluate_language_model(
evaluate_config["training"]["validation"]["font"] = "fonts/LinuxLibertine.ttf"
evaluate_config["training"]["validation"]["maximum_font_size"] = 32
evaluate_config["training"]["validation"]["n_tensorboard_images"] = 5
evaluate_config["training"]["validation"]["nb_logged_images"] = 5
evaluate.run(evaluate_config, evaluate.NERVAL_THRESHOLD, output_json=None)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment