Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • atr/dan
1 result
Show changes
Commits on Source (4)
......@@ -84,7 +84,10 @@
"validation": {
"eval_on_valid": true,
"eval_on_valid_interval": 2,
"set_name_focus_metric": "training-val"
"set_name_focus_metric": "training-val",
"font": "fonts/LinuxLibertine.ttf",
"maximum_font_size": 32,
"nb_logged_images": 5
},
"output_folder": "tests/data/evaluate",
"gradient_clipping": {},
......
......@@ -9,7 +9,7 @@ Analyze dataset and display statistics in markdown format.
from pathlib import Path
from dan.datasets.analyze.statistics import run
from dan.utils import read_yaml
from dan.utils import read_json, read_yaml
def add_analyze_parser(subcommands) -> None:
......@@ -37,5 +37,11 @@ def add_analyze_parser(subcommands) -> None:
help="The statistics will be saved to this file in Markdown format.",
required=True,
)
parser.add_argument(
"--wandb",
dest="wandb_params",
type=read_json,
help="Keys and values to use to initialise your experiment on W&B.",
)
parser.set_defaults(func=run)
......@@ -13,6 +13,7 @@ import numpy as np
from mdutils.mdutils import MdUtils
from prettytable import MARKDOWN, PrettyTable
from dan.ocr import wandb
from dan.utils import read_json
logger = logging.getLogger(__name__)
......@@ -184,8 +185,33 @@ class Statistics:
self.document.create_md_file()
def run(
    labels: Path,
    tokens: Dict | None,
    output: Path,
    wandb_params: dict | None,
) -> None:
    """
    Compute dataset statistics, save them to `output` in Markdown format,
    and publish the file as a "Weights & Biases" artifact.

    :param labels: Path to the `labels.json` file of the dataset.
    :param tokens: Parsed tokens mapping, or None when no tokens file was given.
    :param output: Path where the Markdown summary is written.
    :param wandb_params: Keys/values forwarded to `wandb.init`. When falsy,
        `wandb.init` returns early and W&B stays disabled.
    """
    Statistics(filename=str(output)).run(labels_path=labels, tokens=tokens)

    # Publish file on "Weights & Biases"
    wandb.init(
        wandb_params,
        config={
            wandb.Config.ANALYZE.value: {
                "wandb": wandb_params,
                "labels": labels,
                "tokens": tokens,
                "output": output,
            }
        },
        output_folder=output.parent,
    )
    # NOTE(review): artifact/log_artifact are presumably no-ops when W&B was not
    # initialised (guarded by wandb_required in the wandb module) — confirm.
    artifact = wandb.artifact(
        name=f"run-{wandb.run_id()}-statistics",
        type="markdown",
        description="Statistics metrics",
    )
    wandb.log_artifact(artifact, local_path=output, name=output.name)
......@@ -276,7 +276,11 @@ def run(config: dict, nerval_threshold: float, output_json: Path | None):
update_config(config)
# Start "Weights & Biases" as soon as possible
wandb.init(config)
wandb.init(
wandb_params=config.get("wandb", {}).get("init", {}),
config={wandb.Config.EVALUATION.value: config},
output_folder=config["training"]["output_folder"],
)
mlflow_logging = bool(config.get("mlflow"))
......
......@@ -115,7 +115,11 @@ def run(config: dict):
update_config(config)
# Start "Weights & Biases" as soon as possible
wandb.init(config)
wandb.init(
wandb_params=config.get("wandb", {}).get("init", {}),
config={wandb.Config.TRAINING.value: config},
output_folder=config["training"]["output_folder"],
)
if config.get("mlflow") and not MLFLOW_AVAILABLE:
logger.error(
......
import functools
import os
from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING
......@@ -12,6 +13,15 @@ if TYPE_CHECKING:
WANDB_AVAILABLE = False
class Config(str, Enum):
    """
    Reserved top-level namespaces under which each DAN command logs its
    configuration to "Weights & Biases".
    """

    ANALYZE = "analyze"
    TRAINING = "training"
    EVALUATION = "evaluation"

    def __str__(self):
        # Make str(Config.X) yield the bare value (e.g. "analyze"), not "Config.ANALYZE"
        return self.value
def wandb_required(func):
"""
Always check that "Weights & Biases" is available before executing the function.
......@@ -26,22 +36,28 @@ def wandb_required(func):
return wrapper
def init(config: dict) -> None:
if not config.get("wandb"):
def init(wandb_params: dict | None, config: dict, output_folder: Path) -> None:
if not wandb_params:
return
# Store "Weights & Biases" files in the output folder by default
if not os.getenv("WANDB_DIR"):
os.environ["WANDB_DIR"] = str(config["training"]["output_folder"])
os.environ["WANDB_DIR"] = str(output_folder)
# Path should already exist when "Weights & Biases" is instantiated
Path(os.environ["WANDB_DIR"]).mkdir(parents=True, exist_ok=True)
wandb_params = config["wandb"].get("init", {})
wandb_config = wandb_params.pop("config", {})
wandb.init(
**wandb_params,
config={**config, **wandb_config},
)
# Update config to log
wandb_config = wandb_params.get("config", {})
forbidden_keys = set(wandb_config.keys()).intersection(list(map(str, Config)))
assert (
not forbidden_keys
), f"Keys {list(forbidden_keys)} are forbidden in W&B config"
wandb_params = wandb_params.copy() # Avoid recursions
wandb_params["config"] = {**config, **wandb_config}
wandb.init(**wandb_params)
global WANDB_AVAILABLE
WANDB_AVAILABLE = True
......
......@@ -4,11 +4,60 @@
Use the `teklia-dan dataset analyze` command to analyze a dataset. This will display statistics in [Markdown](https://www.markdownguide.org/) format.
| Parameter | Description | Type | Default |
| --------------- | -------------------------------- | -------------- | ------- |
| `--labels` | Path to the `labels.json` file. | `pathlib.Path` | |
| `--tokens` | Path to the `tokens.yml` file. | `pathlib.Path` | |
| `--output-file` | Where the summary will be saved. | `pathlib.Path` | |
| Parameter | Description | Type | Default |
| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -------------- | ------- |
| `--labels` | Path to the `labels.json` file. | `pathlib.Path` | |
| `--tokens` | Path to the `tokens.yml` file. | `pathlib.Path` | |
| `--output-file` | Where the summary will be saved. | `pathlib.Path` | |
| `--wandb` | Keys and values to use to initialise your experiment on W&B. See the full list of available keys on [the official documentation](https://docs.wandb.ai/ref/python/init). | `dict` | |
## Weights & Biases logging
To log your statistics file on [Weights & Biases](https://wandb.ai/) (W&B), you need to:
- [login to W&B](https://docs.wandb.ai/ref/cli/wandb-login) via
```shell
wandb login
```
### Resume run
To be sure that your statistics file is linked to your DAN training, we strongly recommend that you either reuse [your `wandb.init` parameter of your DAN training configuration](../train/config.md#weights-biases-logging) or define these two keys:
- `id` with a unique ID that has never been used on your W&B project. We recommend generating a random 8-character word composed of letters and numbers using [the Short Unique ID (UUID) Generating Library](https://shortunique.id/).
- `resume` with the value `auto`.
The final configuration should look like:
```json
{
"id": "<unique_ID>",
"resume": "auto"
}
```
Otherwise, W&B will create a new run when you publish your statistics file.
### Offline mode
If you do not have Internet access during the file generation, you can set the `mode` key to `offline` to use W&B's offline mode. W&B will create a `wandb` folder next to the `--output-file` defined in the command.
The final configuration should look like:
```json
{
"mode": "offline"
}
```
Once your statistics file is complete, you can publish your W&B run with the [`wandb sync`](https://docs.wandb.ai/ref/cli/wandb-sync) command and **the `--append` parameter**:
```shell
wandb sync --project <wandb_project> --sync-all --append
```
As in online mode, we recommend that you set up resuming of your W&B runs (see [the dedicated section](#resume-run)).
## Examples
......
......@@ -293,16 +293,12 @@ wandb login
Using W&B during DAN training will allow you to follow the DAN training with a W&B run. This run will automatically record:
- a **configuration** using the DAN training configuration. Any `wandb.init.config.*` keys and values found in the DAN training configuration will be added to the W&B run configuration.
!!! warning
If a `wandb.init.config.*` key exists in the DAN training configuration (e.g `dataset`, `model`, `training`...) then the W&B run configuration will record the `wandb.init.config.*` value instead of using the value of the DAN training configuration.
- **metrics** listed in the `training.metrics` key of the DAN training configuration. To edit the metrics to log to W&B see [the dedicated section](#metrics).
- **images** according to the `wandb.images` and `training.validation.*` keys of the DAN training configuration. To edit the images to log to W&B see [the dedicated section](#validation).
### Resume run
To be sure that your DAN training will only produce one W&B run even if your DAN training has been resumed, we strongly recommend you to define these two keys **before** starting your DAN training:
To be sure that your DAN training will only produce one W&B run even if your DAN training has been resumed, we strongly recommend that you either reuse [your `--wandb` parameter of your `analyze` command](../datasets/analyze.md#weights-biases-logging) or define these two keys **before** starting your DAN training:
- `wandb.init.id` with a unique ID that has never been used on your W&B project. We recommend generating a random 8-character word composed of letters and numbers using [the Short Unique ID (UUID) Generating Library](https://shortunique.id/).
- `wandb.init.resume` with the value `auto`.
......