Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found
Select Git revision
Loading items

Target

Select target project
0 results Searching
Select Git revision
Loading items
Show changes

Commits on Source 4

8 files
+ 128
24
Compare changes
  • Side-by-side
  • Inline

Files

Original line number Diff line number Diff line
@@ -84,7 +84,10 @@
        "validation": {
            "eval_on_valid": true,
            "eval_on_valid_interval": 2,
            "set_name_focus_metric": "training-val"
            "set_name_focus_metric": "training-val",
            "font": "fonts/LinuxLibertine.ttf",
            "maximum_font_size": 32,
            "nb_logged_images": 5
        },
        "output_folder": "tests/data/evaluate",
        "gradient_clipping": {},
Original line number Diff line number Diff line
@@ -9,7 +9,7 @@ Analyze dataset and display statistics in markdown format.
from pathlib import Path

from dan.datasets.analyze.statistics import run
from dan.utils import read_yaml
from dan.utils import read_json, read_yaml


def add_analyze_parser(subcommands) -> None:
@@ -37,5 +37,11 @@ def add_analyze_parser(subcommands) -> None:
        help="The statistics will be saved to this file in Markdown format.",
        required=True,
    )
    parser.add_argument(
        "--wandb",
        dest="wandb_params",
        type=read_json,
        help="Keys and values to use to initialise your experiment on W&B.",
    )

    parser.set_defaults(func=run)
Original line number Diff line number Diff line
@@ -13,6 +13,7 @@ import numpy as np
from mdutils.mdutils import MdUtils
from prettytable import MARKDOWN, PrettyTable

from dan.ocr import wandb
from dan.utils import read_json

logger = logging.getLogger(__name__)
@@ -184,8 +185,33 @@ class Statistics:
        self.document.create_md_file()


def run(labels: Path, tokens: Dict | None, output: Path) -> None:
def run(
    labels: Path,
    tokens: Dict | None,
    output: Path,
    wandb_params: dict | None,
) -> None:
    """
    Compute dataset statistics, save them as a Markdown file and publish
    that file on "Weights & Biases".

    :param labels: Path to the labels file to analyze.
    :param tokens: Parsed tokens mapping, or None when not provided.
    :param output: Path where the Markdown statistics file is written.
    :param wandb_params: Keys and values forwarded to ``wandb.init``.
        When falsy, ``wandb.init`` returns early and nothing is published.
    """
    Statistics(filename=str(output)).run(labels_path=labels, tokens=tokens)

    # Publish file on "Weights & Biases"
    # The analyze command's arguments are recorded in the run config under
    # the dedicated "analyze" section so they never clash with the
    # training/evaluation sections of a resumed run.
    wandb.init(
        wandb_params,
        config={
            wandb.Config.ANALYZE.value: {
                "wandb": wandb_params,
                "labels": labels,
                "tokens": tokens,
                "output": output,
            }
        },
        # W&B files are stored next to the generated statistics file
        output_folder=output.parent,
    )
    # Upload the generated Markdown file as a run-scoped artifact
    artifact = wandb.artifact(
        name=f"run-{wandb.run_id()}-statistics",
        type="markdown",
        description="Statistics metrics",
    )
    wandb.log_artifact(artifact, local_path=output, name=output.name)
Original line number Diff line number Diff line
@@ -276,7 +276,11 @@ def run(config: dict, nerval_threshold: float, output_json: Path | None):
    update_config(config)

    # Start "Weights & Biases" as soon as possible
    wandb.init(config)
    wandb.init(
        wandb_params=config.get("wandb", {}).get("init", {}),
        config={wandb.Config.EVALUATION.value: config},
        output_folder=config["training"]["output_folder"],
    )

    mlflow_logging = bool(config.get("mlflow"))

Original line number Diff line number Diff line
@@ -115,7 +115,11 @@ def run(config: dict):
    update_config(config)

    # Start "Weights & Biases" as soon as possible
    wandb.init(config)
    wandb.init(
        wandb_params=config.get("wandb", {}).get("init", {}),
        config={wandb.Config.TRAINING.value: config},
        output_folder=config["training"]["output_folder"],
    )

    if config.get("mlflow") and not MLFLOW_AVAILABLE:
        logger.error(
+25 −9
Original line number Diff line number Diff line
import functools
import os
from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING

@@ -12,6 +13,15 @@ if TYPE_CHECKING:
WANDB_AVAILABLE = False


class Config(str, Enum):
    """Top-level sections under which each DAN command logs its configuration on W&B."""

    ANALYZE = "analyze"
    TRAINING = "training"
    EVALUATION = "evaluation"

    def __str__(self) -> str:
        # Display members as their bare value (e.g. "analyze"),
        # not the default "Config.ANALYZE" representation.
        return self.value


def wandb_required(func):
    """
    Always check that "Weights & Biases" is available before executing the function.
@@ -26,22 +36,28 @@ def wandb_required(func):
    return wrapper


def init(config: dict) -> None:
    if not config.get("wandb"):
def init(wandb_params: dict | None, config: dict, output_folder: Path) -> None:
    if not wandb_params:
        return

    # Store "Weights & Biases" files in the output folder by default
    if not os.getenv("WANDB_DIR"):
        os.environ["WANDB_DIR"] = str(config["training"]["output_folder"])
        os.environ["WANDB_DIR"] = str(output_folder)
    # Path should already exist when "Weights & Biases" is instantiated
    Path(os.environ["WANDB_DIR"]).mkdir(parents=True, exist_ok=True)

    wandb_params = config["wandb"].get("init", {})
    wandb_config = wandb_params.pop("config", {})
    wandb.init(
        **wandb_params,
        config={**config, **wandb_config},
    )
    # Update config to log
    wandb_config = wandb_params.get("config", {})

    forbidden_keys = set(wandb_config.keys()).intersection(list(map(str, Config)))
    assert (
        not forbidden_keys
    ), f"Keys {list(forbidden_keys)} are forbidden in W&B config"

    wandb_params = wandb_params.copy()  # Avoid recursions
    wandb_params["config"] = {**config, **wandb_config}

    wandb.init(**wandb_params)

    global WANDB_AVAILABLE
    WANDB_AVAILABLE = True
Original line number Diff line number Diff line
@@ -5,10 +5,59 @@
Use the `teklia-dan dataset analyze` command to analyze a dataset. This will display statistics in [Markdown](https://www.markdownguide.org/) format.

| Parameter       | Description                                                                                                                                                              | Type           | Default |
| --------------- | -------------------------------- | -------------- | ------- |
| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -------------- | ------- |
| `--labels`      | Path to the `labels.json` file.                                                                                                                                          | `pathlib.Path` |         |
| `--tokens`      | Path to the `tokens.yml` file.                                                                                                                                           | `pathlib.Path` |         |
| `--output-file` | Where the summary will be saved.                                                                                                                                         | `pathlib.Path` |         |
| `--wandb`       | Keys and values to use to initialise your experiment on W&B. See the full list of available keys on [the official documentation](https://docs.wandb.ai/ref/python/init). | `dict`         |         |

## Weights & Biases logging

To log your statistics file on [Weights & Biases](https://wandb.ai/) (W&B), you need to:

- [login to W&B](https://docs.wandb.ai/ref/cli/wandb-login) via

```shell
wandb login
```

### Resume run

To be sure that your statistics file is linked to your DAN training, we strongly recommend you to either reuse [your `wandb.init` parameter of your DAN training configuration](../train/config.md#weights-biases-logging) or define these two keys:

- `id` with a unique ID that has never been used on your W&B project. We recommend you to generate a random 8-character word composed of letters and numbers using [the Short Unique ID (UUID) Generating Library](https://shortunique.id/).
- `resume` with the value `auto`.

The final configuration should look like:

```json
{
  "id": "<unique_ID>",
  "resume": "auto"
}
```

Otherwise, W&B will create a new run when you publish your statistics file.

### Offline mode

If you do not have Internet access during the file generation, you can set the `mode` key to `offline` to use W&B's offline mode. W&B will create a `wandb` folder next to the `--output-file` defined in the command.

The final configuration should look like:

```json
{
  "mode": "offline"
}
```

Once your statistics file is complete, you can publish your W&B run with the [`wandb sync`](https://docs.wandb.ai/ref/cli/wandb-sync) command and **the `--append` parameter**:

```shell
wandb sync --project <wandb_project> --sync-all --append
```

As in online mode, we recommend you to set up a resume of your W&B runs (see [the dedicated section](#resume-run)).

## Examples

Original line number Diff line number Diff line
@@ -293,16 +293,12 @@ wandb login
Using W&B during DAN training will allow you to follow the DAN training with a W&B run. This run will automatically record:

- a **configuration** using the DAN training configuration. Any `wandb.init.config.*` keys and values found in the DAN training configuration will be added to the W&B run configuration.

!!! warning
    If a `wandb.init.config.*` key exists in the DAN training configuration (e.g `dataset`, `model`, `training`...) then the W&B run configuration will record the `wandb.init.config.*` value instead of using the value of the DAN training configuration.

- **metrics** listed in the `training.metrics` key of the DAN training configuration. To edit the metrics to log to W&B see [the dedicated section](#metrics).
- **images** according to the `wandb.images` and `training.validation.*` keys of the DAN training configuration. To edit the images to log to W&B see [the dedicated section](#validation).

### Resume run

To be sure that your DAN training will only produce one W&B run even if your DAN training has been resumed, we strongly recommend you to define these two keys **before** starting your DAN training:
To be sure that your DAN training will only produce one W&B run even if your DAN training has been resumed, we strongly recommend you to either reuse [your `--wandb` parameter of your `analyze` command](../datasets/analyze.md#weights-biases-logging) or define these two keys **before** starting your DAN training:

- `wandb.init.id` with a unique ID that has never been used on your W&B project. We recommend you to generate a random 8-character word composed of letters and numbers using [the Short Unique ID (UUID) Generating Library](https://shortunique.id/).
- `wandb.init.resume` with the value `auto`.