diff --git a/dan/ocr/train.py b/dan/ocr/train.py index 6f64ef5bce9a3254d8f28ae4f5a5e0b29a2dda68..59c6ac7f6b2430bed1f6889f1429afc87e40e8de 100644 --- a/dan/ocr/train.py +++ b/dan/ocr/train.py @@ -4,8 +4,10 @@ # -*- coding: utf-8 -*- import json import logging +import os import random from copy import deepcopy +from pathlib import Path import numpy as np import torch @@ -107,21 +109,27 @@ def run(config: dict): """ Main program, training a new model, using a valid configuration """ + names = list(config["dataset"]["datasets"].keys()) + # We should only have one dataset + assert len(names) == 1, f"Found {len(names)} datasets but only one is expected" + + dataset_name = names.pop() + update_config(config) + # Start "Weights & Biases" as soon as possible if config.get("wandb"): + # Store "Weights & Biases" files in the output folder by default + if not os.getenv("WANDB_DIR"): + os.environ["WANDB_DIR"] = str(config["training"]["output_folder"]) + # Path should already exist when "Weights & Biases" is instantiated + Path(os.environ["WANDB_DIR"]).mkdir(parents=True, exist_ok=True) + wandb_config = config["wandb"].pop("config", {}) wandb.init( **config["wandb"], config={**config, **wandb_config}, ) - names = list(config["dataset"]["datasets"].keys()) - # We should only have one dataset - assert len(names) == 1, f"Found {len(names)} datasets but only one is expected" - - dataset_name = names.pop() - update_config(config) - if config.get("mlflow") and not MLFLOW_AVAILABLE: logger.error( "Cannot log to MLflow. Please install the `mlflow` extra requirements." 
diff --git a/docs/usage/train/config.md b/docs/usage/train/config.md index 5b4ecebd55d9a29f1e6844a4779fa44c47d359e2..5ab6b56907bfe5cf0e6c6cf0b2cf96f9791e2a5a 100644 --- a/docs/usage/train/config.md +++ b/docs/usage/train/config.md @@ -284,12 +284,9 @@ wandb login - update the following arguments: -| Name | Description | Type | Default | -| ------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------ | ------- | -| `wandb` | Key and values to use to initialise your experiment on W&B. See the full list of available keys on [the official documentation](https://docs.wandb.ai/ref/python/init). | `dict` | | - -!!! warning - To resume your DAN training on W&B you need to set the `wandb.id` key to the ID of your W&B run and set the `wandb.resume` key to `must`. Otherwise, W&B will create a new run. +| Name | Description | Type | Default | +| ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------ | ------- | +| `wandb` | Keys and values to use to initialise your experiment on W&B. See the full list of available keys on [the official documentation](https://docs.wandb.ai/ref/python/init). | `dict` | | Using W&B during DAN training will allow you to follow the DAN training with a W&B run. This run will automatically record: @@ -299,3 +296,45 @@ Using W&B during DAN training will allow you to follow the DAN training with a W If a `wandb.config.*` key exists in the DAN training configuration (e.g `dataset`, `model`, `training`...) then the W&B run configuration will record the `wandb.config.*` value instead of using the value of the DAN training configuration. - **metrics** listed in the `training.metrics` key of the DAN training configuration. To edit the metrics to log to W&B see [the dedicated section](#metrics). 
+
+### Resume run
+
+To be sure that your DAN training will only produce one W&B run even if your DAN training has been resumed, we strongly recommend defining these two keys **before** starting your DAN training:
+
+- `wandb.id` with a unique ID that has never been used on your W&B project. We recommend generating a random 8-character word composed of letters and numbers using [the Short Unique ID (UUID) Generating Library](https://shortunique.id/).
+- `wandb.resume` with the value `auto`.
+
+The final configuration should look like:
+
+```json
+{
+  "wandb": {
+    "id": "<unique_ID>",
+    "resume": "auto"
+  }
+}
+```
+
+Otherwise, W&B will create a new run for each DAN training session, even if the DAN training has been resumed.
+
+### Offline mode
+
+If you do not have Internet access during the DAN training, you can set the `wandb.mode` key to `offline` to use W&B's offline mode. W&B will create a `wandb` folder in the `training.output_folder` defined in the DAN training configuration. To use another location, see [the dedicated section](#training-parameters).
+
+The final configuration should look like:
+
+```json
+{
+  "wandb": {
+    "mode": "offline"
+  }
+}
+```
+
+Once your DAN training is complete, you can publish your W&B run with the [`wandb sync`](https://docs.wandb.ai/ref/cli/wandb-sync) command and **the `--append` parameter**:
+
+```shell
+wandb sync --project <wandb_project> --sync-all --append
+```
+
+As in online mode, we recommend setting up run resumption for your W&B runs (see [the dedicated section](#resume-run)).