Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found
Select Git revision
Loading items

Target

Select target project
0 results Searching
Select Git revision
Loading items
Show changes

Commits on Source 4

8 files
+ 128
24
Compare changes
  • Side-by-side
  • Inline

Files

Original line number Diff line number Diff line
@@ -84,7 +84,10 @@
        "validation": {
            "eval_on_valid": true,
            "eval_on_valid_interval": 2,
            "set_name_focus_metric": "training-val"
            "set_name_focus_metric": "training-val",
            "font": "fonts/LinuxLibertine.ttf",
            "maximum_font_size": 32,
            "nb_logged_images": 5
        },
        "output_folder": "tests/data/evaluate",
        "gradient_clipping": {},
Original line number Diff line number Diff line
@@ -9,7 +9,7 @@ Analyze dataset and display statistics in markdown format.
from pathlib import Path

from dan.datasets.analyze.statistics import run
from dan.utils import read_yaml
from dan.utils import read_json, read_yaml


def add_analyze_parser(subcommands) -> None:
@@ -37,5 +37,11 @@ def add_analyze_parser(subcommands) -> None:
        help="The statistics will be saved to this file in Markdown format.",
        required=True,
    )
    parser.add_argument(
        "--wandb",
        dest="wandb_params",
        type=read_json,
        help="Keys and values to use to initialise your experiment on W&B.",
    )

    parser.set_defaults(func=run)
Original line number Diff line number Diff line
@@ -13,6 +13,7 @@ import numpy as np
from mdutils.mdutils import MdUtils
from prettytable import MARKDOWN, PrettyTable

from dan.ocr import wandb
from dan.utils import read_json

logger = logging.getLogger(__name__)
@@ -184,8 +185,33 @@ class Statistics:
        self.document.create_md_file()


def run(labels: Path, tokens: Dict | None, output: Path) -> None:
def run(
    labels: Path,
    tokens: Dict | None,
    output: Path,
    wandb_params: dict | None,
) -> None:
    """
    Compute dataset statistics, save them as a Markdown file and publish
    that file on "Weights & Biases".

    :param labels: Path to the labels file to analyze.
    :param tokens: Parsed tokens mapping, or None when not provided.
    :param output: Path where the Markdown statistics file is written.
    :param wandb_params: Keys and values forwarded to ``wandb.init``.
        When falsy, ``wandb.init`` returns early and nothing is published.
    """
    Statistics(filename=str(output)).run(labels_path=labels, tokens=tokens)

    # Publish file on "Weights & Biases"
    # The analyze command's arguments are recorded in the run config under
    # the dedicated "analyze" section so they never clash with the
    # training/evaluation sections of a resumed run.
    wandb.init(
        wandb_params,
        config={
            wandb.Config.ANALYZE.value: {
                "wandb": wandb_params,
                "labels": labels,
                "tokens": tokens,
                "output": output,
            }
        },
        # W&B files are stored next to the generated statistics file
        output_folder=output.parent,
    )
    # Upload the generated Markdown file as a run-scoped artifact
    artifact = wandb.artifact(
        name=f"run-{wandb.run_id()}-statistics",
        type="markdown",
        description="Statistics metrics",
    )
    wandb.log_artifact(artifact, local_path=output, name=output.name)
Original line number Diff line number Diff line
@@ -276,7 +276,11 @@ def run(config: dict, nerval_threshold: float, output_json: Path | None):
    update_config(config)

    # Start "Weights & Biases" as soon as possible
    wandb.init(config)
    wandb.init(
        wandb_params=config.get("wandb", {}).get("init", {}),
        config={wandb.Config.EVALUATION.value: config},
        output_folder=config["training"]["output_folder"],
    )

    mlflow_logging = bool(config.get("mlflow"))

Original line number Diff line number Diff line
@@ -115,7 +115,11 @@ def run(config: dict):
    update_config(config)

    # Start "Weights & Biases" as soon as possible
    wandb.init(config)
    wandb.init(
        wandb_params=config.get("wandb", {}).get("init", {}),
        config={wandb.Config.TRAINING.value: config},
        output_folder=config["training"]["output_folder"],
    )

    if config.get("mlflow") and not MLFLOW_AVAILABLE:
        logger.error(
+25 −9
Original line number Diff line number Diff line
import functools
import os
from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING

@@ -12,6 +13,15 @@ if TYPE_CHECKING:
WANDB_AVAILABLE = False


class Config(str, Enum):
    """Top-level sections under which each DAN command logs its configuration on W&B."""

    ANALYZE = "analyze"
    TRAINING = "training"
    EVALUATION = "evaluation"

    def __str__(self) -> str:
        # Display members as their bare value (e.g. "analyze"),
        # not the default "Config.ANALYZE" representation.
        return self.value


def wandb_required(func):
    """
    Always check that "Weights & Biases" is available before executing the function.
@@ -26,22 +36,28 @@ def wandb_required(func):
    return wrapper


def init(config: dict) -> None:
    if not config.get("wandb"):
def init(wandb_params: dict | None, config: dict, output_folder: Path) -> None:
    if not wandb_params:
        return

    # Store "Weights & Biases" files in the output folder by default
    if not os.getenv("WANDB_DIR"):
        os.environ["WANDB_DIR"] = str(config["training"]["output_folder"])
        os.environ["WANDB_DIR"] = str(output_folder)
    # Path should already exist when "Weights & Biases" is instantiated
    Path(os.environ["WANDB_DIR"]).mkdir(parents=True, exist_ok=True)

    wandb_params = config["wandb"].get("init", {})
    wandb_config = wandb_params.pop("config", {})
    wandb.init(
        **wandb_params,
        config={**config, **wandb_config},
    )
    # Update config to log
    wandb_config = wandb_params.get("config", {})

    forbidden_keys = set(wandb_config.keys()).intersection(list(map(str, Config)))
    assert (
        not forbidden_keys
    ), f"Keys {list(forbidden_keys)} are forbidden in W&B config"

    wandb_params = wandb_params.copy()  # Avoid recursions
    wandb_params["config"] = {**config, **wandb_config}

    wandb.init(**wandb_params)

    global WANDB_AVAILABLE
    WANDB_AVAILABLE = True
Original line number Diff line number Diff line
@@ -5,10 +5,59 @@
Use the `teklia-dan dataset analyze` command to analyze a dataset. This will display statistics in [Markdown](https://www.markdownguide.org/) format.

| Parameter       | Description                                                                                                                                                              | Type           | Default |
| --------------- | -------------------------------- | -------------- | ------- |
| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -------------- | ------- |
| `--labels`      | Path to the `labels.json` file.                                                                                                                                          | `pathlib.Path` |         |
| `--tokens`      | Path to the `tokens.yml` file.                                                                                                                                           | `pathlib.Path` |         |
| `--output-file` | Where the summary will be saved.                                                                                                                                         | `pathlib.Path` |         |
| `--wandb`       | Keys and values to use to initialise your experiment on W&B. See the full list of available keys on [the official documentation](https://docs.wandb.ai/ref/python/init). | `dict`         |         |

## Weights & Biases logging

To log your statistics file on [Weights & Biases](https://wandb.ai/) (W&B), you need to:

- [login to W&B](https://docs.wandb.ai/ref/cli/wandb-login) via

```shell
wandb login
```

### Resume run

To be sure that your statistics file is linked to your DAN training, we strongly recommend you to either reuse [your `wandb.init` parameter of your DAN training configuration](../train/config.md#weights-biases-logging) or define these two keys:

- `id` with a unique ID that has never been used on your W&B project. We recommend you to generate a random 8-character word composed of letters and numbers using [the Short Unique ID (UUID) Generating Library](https://shortunique.id/).
- `resume` with the value `auto`.

The final configuration should look like:

```json
{
  "id": "<unique_ID>",
  "resume": "auto"
}
```

Otherwise, W&B will create a new run when you publish your statistics file.

### Offline mode

If you do not have Internet access during the file generation, you can set the `mode` key to `offline` to use W&B's offline mode. W&B will create a `wandb` folder next to the `--output-file` defined in the command.

The final configuration should look like:

```json
{
  "mode": "offline"
}
```

Once your statistics file is complete, you can publish your W&B run with the [`wandb sync`](https://docs.wandb.ai/ref/cli/wandb-sync) command and **the `--append` parameter**:

```shell
wandb sync --project <wandb_project> --sync-all --append
```

As in online mode, we recommend you to set up a resume of your W&B runs (see [the dedicated section](#resume-run)).

## Examples

Original line number Diff line number Diff line
@@ -293,16 +293,12 @@ wandb login
Using W&B during DAN training will allow you to follow the DAN training with a W&B run. This run will automatically record:

- a **configuration** using the DAN training configuration. Any `wandb.init.config.*` keys and values found in the DAN training configuration will be added to the W&B run configuration.

!!! warning
    If a `wandb.init.config.*` key exists in the DAN training configuration (e.g `dataset`, `model`, `training`...) then the W&B run configuration will record the `wandb.init.config.*` value instead of using the value of the DAN training configuration.

- **metrics** listed in the `training.metrics` key of the DAN training configuration. To edit the metrics to log to W&B see [the dedicated section](#metrics).
- **images** according to the `wandb.images` and `training.validation.*` keys of the DAN training configuration. To edit the images to log to W&B see [the dedicated section](#validation).

### Resume run

To be sure that your DAN training will only produce one W&B run even if your DAN training has been resumed, we strongly recommend you to define these two keys **before** starting your DAN training:
To be sure that your DAN training will only produce one W&B run even if your DAN training has been resumed, we strongly recommend you to either reuse [your `--wandb` parameter of your `analyze` command](../datasets/analyze.md#weights-biases-logging) or define these two keys **before** starting your DAN training:

- `wandb.init.id` with a unique ID that has never been used on your W&B project. We recommend you to generate a random 8-character word composed of letters and numbers using [the Short Unique ID (UUID) Generating Library](https://shortunique.id/).
- `wandb.init.resume` with the value `auto`.