Commit 4e504aea authored by Yoann Schneider

Merge branch 'wandb-publish-statistics-file' into 'main'

W&B integration: Publish statistics file

See merge request !458
parents 886b4cc7 b1f6104d
@@ -84,7 +84,10 @@
    "validation": {
        "eval_on_valid": true,
        "eval_on_valid_interval": 2,
        "set_name_focus_metric": "training-val"
        "set_name_focus_metric": "training-val",
        "font": "fonts/LinuxLibertine.ttf",
        "maximum_font_size": 32,
        "nb_logged_images": 5
    },
    "output_folder": "tests/data/evaluate",
    "gradient_clipping": {},
@@ -9,7 +9,7 @@ Analyze dataset and display statistics in markdown format.
from pathlib import Path
from dan.datasets.analyze.statistics import run
from dan.utils import read_yaml
from dan.utils import read_json, read_yaml
def add_analyze_parser(subcommands) -> None:
@@ -37,5 +37,11 @@ def add_analyze_parser(subcommands) -> None:
help="The statistics will be saved to this file in Markdown format.",
required=True,
)
parser.add_argument(
"--wandb",
dest="wandb_params",
type=read_json,
help="Keys and values to use to initialise your experiment on W&B.",
)
parser.set_defaults(func=run)
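For context, the new `--wandb` option is parsed by `dan.utils.read_json` and forwarded to `run` through `parser.set_defaults(func=run)`. The self-contained sketch below illustrates that dispatch pattern; `json.loads` is only a stand-in for `read_json`, whose exact behaviour is not shown in this diff, and the sample value is purely illustrative.

```python
import argparse
import json


def run(wandb_params: dict | None = None, **kwargs) -> None:
    # Stand-in for dan.datasets.analyze.statistics.run
    print("W&B parameters:", wandb_params)


parser = argparse.ArgumentParser(prog="teklia-dan")
subcommands = parser.add_subparsers()
analyze = subcommands.add_parser("analyze")
analyze.add_argument(
    "--wandb",
    dest="wandb_params",
    type=json.loads,  # stand-in for dan.utils.read_json
    help="Keys and values to use to initialise your experiment on W&B.",
)
analyze.set_defaults(func=run)

# Simulate: teklia-dan dataset analyze --wandb '{"mode": "offline"}'
# (the intermediate "dataset" subcommand is collapsed for brevity)
args = vars(parser.parse_args(["analyze", "--wandb", '{"mode": "offline"}']))
args.pop("func")(**args)
```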
@@ -13,6 +13,7 @@ import numpy as np
from mdutils.mdutils import MdUtils
from prettytable import MARKDOWN, PrettyTable
from dan.ocr import wandb
from dan.utils import read_json
logger = logging.getLogger(__name__)
@@ -184,8 +185,33 @@ class Statistics:
        self.document.create_md_file()


def run(labels: Path, tokens: Dict | None, output: Path) -> None:
def run(
    labels: Path,
    tokens: Dict | None,
    output: Path,
    wandb_params: dict | None,
) -> None:
    """
    Compute and save dataset statistics.
    """
    Statistics(filename=str(output)).run(labels_path=labels, tokens=tokens)

    # Publish file on "Weights & Biases"
    wandb.init(
        wandb_params,
        config={
            wandb.Config.ANALYZE.value: {
                "wandb": wandb_params,
                "labels": labels,
                "tokens": tokens,
                "output": output,
            }
        },
        output_folder=output.parent,
    )
    artifact = wandb.artifact(
        name=f"run-{wandb.run_id()}-statistics",
        type="markdown",
        description="Statistics metrics",
    )
    wandb.log_artifact(artifact, local_path=output, name=output.name)
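The `wandb.init`, `wandb.artifact`, `wandb.run_id` and `wandb.log_artifact` calls above go through the project's `dan.ocr.wandb` wrapper, whose implementation is not part of this diff. As an assumption about how they map onto the official W&B SDK, a standalone equivalent could look roughly like this (file path and parameters are illustrative):

```python
import wandb  # official Weights & Biases SDK

# Illustrative values; in DAN they come from --wandb and --output-file
wandb_params = {"project": "dan-statistics", "id": "a3k9x0q7", "resume": "auto"}
output = "statistics.md"

# Initialise (or resume) the run with the user-provided parameters
run = wandb.init(**wandb_params)

# Wrap the Markdown statistics file in an artifact attached to this run
artifact = wandb.Artifact(
    name=f"run-{run.id}-statistics",
    type="markdown",
    description="Statistics metrics",
)
artifact.add_file(local_path=output, name="statistics.md")
run.log_artifact(artifact)
run.finish()
```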
@@ -14,6 +14,7 @@ WANDB_AVAILABLE = False
class Config(str, Enum):
    ANALYZE = "analyze"
    TRAINING = "training"
    EVALUATION = "evaluation"
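Since `Config` mixes in `str`, its members behave like plain strings, which is what lets `wandb.Config.ANALYZE.value` be used directly as a key of the `config` dictionary passed to `wandb.init` above. A minimal sketch, with illustrative values:

```python
from enum import Enum


class Config(str, Enum):
    ANALYZE = "analyze"
    TRAINING = "training"
    EVALUATION = "evaluation"


# The str mixin makes members compare and serialise as plain strings
assert Config.ANALYZE == "analyze"
config = {Config.ANALYZE.value: {"labels": "labels.json", "tokens": "tokens.yml"}}
print(config)  # {'analyze': {'labels': 'labels.json', 'tokens': 'tokens.yml'}}
```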
@@ -4,11 +4,60 @@
Use the `teklia-dan dataset analyze` command to analyze a dataset. This will display statistics in [Markdown](https://www.markdownguide.org/) format.
| Parameter | Description | Type | Default |
| --------------- | -------------------------------- | -------------- | ------- |
| `--labels` | Path to the `labels.json` file. | `pathlib.Path` | |
| `--tokens` | Path to the `tokens.yml` file. | `pathlib.Path` | |
| `--output-file` | Where the summary will be saved. | `pathlib.Path` | |
| Parameter | Description | Type | Default |
| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -------------- | ------- |
| `--labels` | Path to the `labels.json` file. | `pathlib.Path` | |
| `--tokens` | Path to the `tokens.yml` file. | `pathlib.Path` | |
| `--output-file` | Where the summary will be saved. | `pathlib.Path` | |
| `--wandb` | Keys and values to use to initialise your experiment on W&B. See the full list of available keys on [the official documentation](https://docs.wandb.ai/ref/python/init). | `dict` | |
## Weights & Biases logging
To log your statistics file on [Weights & Biases](https://wandb.ai/) (W&B), you need to:
- [login to W&B](https://docs.wandb.ai/ref/cli/wandb-login) via
```shell
wandb login
```
### Resume run
To be sure that your statistics file is linked to your DAN training, we strongly recommend that you either reuse [the `wandb.init` parameters of your DAN training configuration](../train/config.md#weights-biases-logging) or define these two keys:
- `id` with a unique ID that has never been used on your W&B project. We recommend generating a random 8-character string of letters and digits, for example with [the Short Unique ID (UUID) Generating Library](https://shortunique.id/) (a Python alternative is sketched at the end of this section).
- `resume` with the value `auto`.
The final configuration should look like:
```json
{
"id": "<unique_ID>",
"resume": "auto"
}
```
Otherwise, W&B will create a new run when you'll publish your statistics file.
### Offline mode
If you do not have Internet access while generating the file, you can set the `mode` key to `offline` to use W&B's offline mode. W&B will create a `wandb` folder next to the `--output-file` path given in the command.
The final configuration should look like:
```json
{
"mode": "offline"
}
```
Once your statistics file is complete, you can publish your W&B run with the [`wandb sync`](https://docs.wandb.ai/ref/cli/wandb-sync) command and **the `--append` parameter**:
```shell
wandb sync --project <wandb_project> --sync-all --append
```
As in online mode, we recommend setting up resumption of your W&B runs (see [the dedicated section](#resume-run)).
## Examples
@@ -298,7 +298,7 @@ Using W&B during DAN training will allow you to follow the DAN training with a W
### Resume run
To be sure that your DAN training will only produce one W&B run even if your DAN training has been resumed, we strongly recommend you to define these two keys **before** starting your DAN training:
To be sure that your DAN training will only produce one W&B run even if your DAN training has been resumed, we strongly recommend that you either reuse [the `--wandb` parameter of your `analyze` command](../datasets/analyze.md#weights-biases-logging) or define these two keys **before** starting your DAN training:
- `wandb.init.id` with a unique ID that has never been used on your W&B project. We recommend generating a random 8-character string of letters and digits, for example with [the Short Unique ID (UUID) Generating Library](https://shortunique.id/).
- `wandb.init.resume` with the value `auto`.