diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000000000000000000000000000000000000..7d6f8a61515402d96d9d86de1546165d8d529a87
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "nerval"]
+	path = nerval
+	url = ../../ner/nerval.git
diff --git a/Dockerfile b/Dockerfile
index 5e12de4fdfa493b77667ff763b19c35a1a7dc9b2..7d50ce7bc4a70b549a6830ebeb334681e8e1d6a7 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -7,7 +7,12 @@ RUN apt-get -y update && \
 
 WORKDIR /src
 
-# Install DAN as a package
+# Copy submodule data
+COPY nerval nerval
+
+# Copy DAN data
 COPY dan dan
 COPY requirements.txt *-requirements.txt setup.py VERSION README.md ./
+
+# Install DAN as a package
 RUN pip install . --no-cache-dir
diff --git a/dan/ocr/evaluate.py b/dan/ocr/evaluate.py
index 7d514a40714df6c6eb846f7e7463a570affc6c29..079b0e5d11455426132c410adf0f5383e9329e1f 100644
--- a/dan/ocr/evaluate.py
+++ b/dan/ocr/evaluate.py
@@ -5,17 +5,42 @@ Evaluate a trained DAN model.
 
 import logging
 import random
+from argparse import ArgumentTypeError
+from pathlib import Path
+from typing import Dict, List
 
 import numpy as np
 import torch
 import torch.multiprocessing as mp
 
+from dan.bio import convert
+from dan.ocr.manager.metrics import Inference
 from dan.ocr.manager.training import Manager
 from dan.ocr.utils import add_metrics_table_row, create_metrics_table, update_config
-from dan.utils import read_json
+from dan.utils import parse_tokens, read_json
+from nerval.evaluate import evaluate
+from nerval.parse import parse_bio
+from nerval.utils import print_results
 
 logger = logging.getLogger(__name__)
 
+NERVAL_THRESHOLD = 0.30
+
+
+def parse_threshold(value: str) -> float:
+    """
+    Check that the string passed as parameter is a correct floating point number between 0 and 1
+    """
+    try:
+        value = float(value)
+    except ValueError:
+        raise ArgumentTypeError("Must be a floating point number.")
+
+    if value < 0 or value > 1:
+        raise ArgumentTypeError("Must be between 0 and 1.")
+
+    return value
+
 
 def add_evaluate_parser(subcommands) -> None:
     parser = subcommands.add_parser(
@@ -31,10 +56,55 @@ def add_evaluate_parser(subcommands) -> None:
         help="Configuration file.",
     )
 
+    parser.add_argument(
+        "--nerval-threshold",
+        help="Distance threshold for the match between gold and predicted entity during Nerval evaluation.",
+        default=NERVAL_THRESHOLD,
+        type=parse_threshold,
+    )
+
     parser.set_defaults(func=run)
 
 
-def eval(rank, config, mlflow_logging):
+def eval_nerval(
+    all_inferences: Dict[str, List[Inference]],
+    tokens: Path,
+    threshold: float,
+):
+    print("\n#### Nerval evaluation")
+
+    def inferences_to_parsed_bio(attr: str):
+        bio_values = []
+        for inference in inferences:
+            value = getattr(inference, attr)
+            bio_value = convert(value, ner_tokens=tokens)
+            bio_values.extend(bio_value.split("\n"))
+
+        # Parse this BIO format
+        return parse_bio(bio_values)
+
+    # Evaluate with Nerval
+    tokens = parse_tokens(tokens)
+    for split_name, inferences in all_inferences.items():
+        ground_truths = inferences_to_parsed_bio("ground_truth")
+        predictions = inferences_to_parsed_bio("prediction")
+
+        if not (ground_truths and predictions):
+            continue
+
+        scores = {
+            key: {
+                k: round(value * 100, 2) if k in ["P", "R", "F1"] else value
+                for k, value in values.items()
+            }
+            for key, values in evaluate(ground_truths, predictions, threshold).items()
+        }
+
+        print(f"\n##### {split_name}\n")
+        print_results(scores)
+
+
+def eval(rank, config: dict, nerval_threshold: float, mlflow_logging: bool):
     torch.manual_seed(0)
     torch.cuda.manual_seed(0)
     np.random.seed(0)
@@ -62,10 +132,12 @@ def eval(rank, config, mlflow_logging):
         metric_names.append("ner")
 
     metrics_table = create_metrics_table(metric_names)
+    all_inferences = {}
+
     for dataset_name in config["dataset"]["datasets"]:
         for set_name in ["train", "val", "test"]:
             logger.info(f"Evaluating on set `{set_name}`")
-            metrics = model.evaluate(
+            metrics, inferences = model.evaluate(
                 "{}-{}".format(dataset_name, set_name),
                 [
                     (dataset_name, set_name),
@@ -75,11 +147,20 @@ def eval(rank, config, mlflow_logging):
             )
             add_metrics_table_row(metrics_table, set_name, metrics)
 
+            all_inferences[set_name] = inferences
 
+    print("\n#### DAN evaluation\n")
     print(metrics_table)
 
+    if "ner" in metric_names:
+        eval_nerval(
+            all_inferences,
+            tokens=config["dataset"]["tokens"],
+            threshold=nerval_threshold,
+        )
+
 
-def run(config: dict):
+def run(config: dict, nerval_threshold: float):
     update_config(config)
 
     mlflow_logging = bool(config.get("mlflow"))
@@ -94,8 +175,8 @@ def run(config: dict):
     ):
         mp.spawn(
             eval,
-            args=(config, mlflow_logging),
+            args=(config, nerval_threshold, mlflow_logging),
             nprocs=config["training"]["device"]["nb_gpu"],
         )
     else:
-        eval(0, config, mlflow_logging)
+        eval(0, config, nerval_threshold, mlflow_logging)
diff --git a/dan/ocr/manager/metrics.py b/dan/ocr/manager/metrics.py
index 102fddafbaa334845dc9251c630df18a6ae353dc..988c90d073c95cb80a682cf3a916972da540233f 100644
--- a/dan/ocr/manager/metrics.py
+++ b/dan/ocr/manager/metrics.py
@@ -3,7 +3,7 @@ import re
 from collections import defaultdict
 from operator import attrgetter
 from pathlib import Path
-from typing import Dict, List
+from typing import Dict, List, NamedTuple
 
 import editdistance
 import numpy as np
@@ -23,6 +23,16 @@ REGEX_ONLY_ONE_SPACE = re.compile(r"\s+")
 METRICS_KEYWORD = {"cer": "chars", "wer": "words", "ner": "tokens"}
 
 
+class Inference(NamedTuple):
+    """
+    Store a prediction with its ground truth to avoid
+    inferring again when we need to compute new metrics
+    """
+
+    ground_truth: str
+    prediction: str
+
+
 class MetricManager:
     def __init__(self, metric_names: List[str], dataset_name: str, tokens: Path | None):
         self.dataset_name: str = dataset_name
diff --git a/dan/ocr/manager/training.py b/dan/ocr/manager/training.py
index ba5160fa1d3ad9ca1037077292b3530861520231..ad35f823e312c51a21975229fff0da38bac0545d 100644
--- a/dan/ocr/manager/training.py
+++ b/dan/ocr/manager/training.py
@@ -6,7 +6,7 @@ from copy import deepcopy
 from enum import Enum
 from pathlib import Path
 from time import time
-from typing import Dict
+from typing import Dict, List, Tuple
 
 import numpy as np
 import torch
@@ -20,7 +20,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.tensorboard import SummaryWriter
 from tqdm import tqdm
 
-from dan.ocr.manager.metrics import MetricManager
+from dan.ocr.manager.metrics import Inference, MetricManager
 from dan.ocr.manager.ocr import OCRDatasetManager
 from dan.ocr.mlflow import MLFLOW_AVAILABLE, logging_metrics, logging_tags_metrics
 from dan.ocr.schedulers import DropoutScheduler
@@ -750,7 +750,7 @@ class GenericTrainingManager:
 
     def evaluate(
         self, custom_name, sets_list, metric_names, mlflow_logging=False
-    ) -> Dict[str, int | float]:
+    ) -> Tuple[Dict[str, int | float], List[Inference]]:
         """
         Main loop for evaluation
         """
@@ -768,6 +768,9 @@ class GenericTrainingManager:
             tokens=self.tokens,
         )
 
+        # Keep inferences in memory to evaluate with Nerval
+        inferences = []
+
         with tqdm(total=len(loader.dataset)) as pbar:
             pbar.set_description("Evaluation")
             with torch.no_grad():
@@ -792,6 +795,10 @@ class GenericTrainingManager:
                     pbar.set_postfix(values=str(display_values))
                     pbar.update(len(batch_data["names"]) * self.nb_workers)
 
+                    inferences.extend(
+                        map(Inference, batch_values["str_y"], batch_values["str_x"])
+                    )
+
             # log metrics in MLflow
             logging_name = custom_name.split("-")[1]
             logging_tags_metrics(
@@ -810,7 +817,7 @@ class GenericTrainingManager:
                 # Log mlflow artifacts
                 mlflow.log_artifact(path, "predictions")
 
-        return metrics
+        return metrics, inferences
 
     def output_pred(self, name):
         path = self.paths["results"] / "predict_{}_{}.yaml".format(
diff --git a/docs/usage/datasets/analyze.md b/docs/usage/datasets/analyze.md
index fec46d59b16ae9441ac06521df3d81f60e080bf8..799774e8fbb23e9de00c78308d3cdcdffaeb05c1 100644
--- a/docs/usage/datasets/analyze.md
+++ b/docs/usage/datasets/analyze.md
@@ -4,8 +4,6 @@ Use the `teklia-dan dataset analyze` command to analyze a dataset.
 
 This will display statistics in [Markdown](https://www.markdownguide.org/) format.
 
-The available arguments are:
-
 | Parameter | Description | Type | Default |
 | --------------- | -------------------------------- | -------------- | ------- |
 | `--labels` | Path to the `labels.json` file. | `pathlib.Path` | |
diff --git a/docs/usage/evaluate/index.md b/docs/usage/evaluate/index.md
index f1cc273be210825c853219f5182625f0dc19f21a..2153ebd776c0767b8e55edf8ffaa9cec2422f307 100644
--- a/docs/usage/evaluate/index.md
+++ b/docs/usage/evaluate/index.md
@@ -1,18 +1,70 @@
 # Evaluation
 
+## Description
+
 Use the `teklia-dan evaluate` command to evaluate a trained DAN model.
 
 To evaluate DAN on your dataset:
 
 1. Create a JSON configuration file. You can base the configuration file off the training one. Refer to the [dedicated page](../train/config.md) for a description of parameters.
 1. Run `teklia-dan evaluate --config path/to/your/config.json`.
-1. Evaluation results for every split are available in the `results` subfolder of the output folder indicated in your configuration.
-1. A metrics Markdown table, providing results for each evaluated split, is also printed in the console (see table example below).
 
-### Example output - Metrics Markdown table
+This will, for each evaluated split:
+
+1. Create a YAML file with the evaluation results in the `results` subfolder of the `training.output_folder` indicated in your configuration.
+1. Print in the console a metrics Markdown table (see [table example below](#htr-evaluation)).
+1. Print in the console a [Nerval](https://gitlab.teklia.com/ner/nerval) metrics Markdown table, if the `dataset.tokens` parameter in your configuration is defined (see [table example below](#htr-and-ner-evaluation)).
+
+| Parameter | Description | Type | Default |
+| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------- | ------- |
+| `--config` | Path to the configuration file. | `pathlib.Path` | |
+| `--nerval-threshold` | Distance threshold for the match between gold and predicted entity during Nerval evaluation. `0` would impose perfect matches, `1` would allow completely different strings to be considered as a match. | `float` | `0.3` |
+
+## Example output
+
+### HTR evaluation
+
+```
+#### DAN evaluation
+
+| Split | CER (HTR-NER) | CER (HTR) | WER (HTR-NER) | WER (HTR) | WER (HTR no punct) |
+| :---: | :-----------: | :-------: | :-----------: | :-------: | :----------------: |
+| train | x | x | x | x | x |
+| val | x | x | x | x | x |
+| test | x | x | x | x | x |
+```
+
+### HTR and NER evaluation
+
+```
+#### DAN evaluation
 
 | Split | CER (HTR-NER) | CER (HTR) | WER (HTR-NER) | WER (HTR) | WER (HTR no punct) | NER |
 | :---: | :-----------: | :-------: | :-----------: | :-------: | :----------------: | :-: |
 | train | x | x | x | x | x | x |
 | val | x | x | x | x | x | x |
 | test | x | x | x | x | x | x |
+
+#### Nerval evaluation
+
+##### train
+
+| tag | predicted | matched | Precision | Recall | F1 | Support |
+| :-----: | :-------: | :-----: | :-------: | :----: | :-: | :-----: |
+| Surname | x | x | x | x | x | x |
+| All | x | x | x | x | x | x |
+
+##### val
+
+| tag | predicted | matched | Precision | Recall | F1 | Support |
+| :-----: | :-------: | :-----: | :-------: | :----: | :-: | :-----: |
+| Surname | x | x | x | x | x | x |
+| All | x | x | x | x | x | x |
+
+##### test
+
+| tag | predicted | matched | Precision | Recall | F1 | Support |
+| :-----: | :-------: | :-----: | :-------: | :----: | :-: | :-----: |
+| Surname | x | x | x | x | x | x |
+| All | x | x | x | x | x | x |
+```
diff --git a/docs/usage/predict/index.md b/docs/usage/predict/index.md
index b96149dbafaba253ec694f2d16cf16629aa6fc67..823399de80299417f9e9fa28a66ed0aaf67fe771 100644
--- a/docs/usage/predict/index.md
+++ b/docs/usage/predict/index.md
@@ -1,9 +1,9 @@
 # Prediction
 
-Use the `teklia-dan predict` command to apply a trained DAN model on an image.
-
 ## Description
 
+Use the `teklia-dan predict` command to apply a trained DAN model on an image.
+
 | Parameter | Description | Type | Default |
 | --------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- | -------------- | ------------- |
 | `--image-dir` | Path to the folder where the images to predict are stored. Must not be provided with `--image`. | `pathlib.Path` | |
diff --git a/nerval b/nerval
new file mode 160000
index 0000000000000000000000000000000000000000..525c1a9e6d5a33075669085148247e2604dd092f
--- /dev/null
+++ b/nerval
@@ -0,0 +1 @@
+Subproject commit 525c1a9e6d5a33075669085148247e2604dd092f
diff --git a/requirements.txt b/requirements.txt
index 445189ae68ef6febce26b477a5360a308208a7e3..d065f01db77260ee824b2c39ea3e778332bb93a8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
+-e ./nerval
 albumentations==1.3.1
 arkindex-export==0.1.9
 boto3==1.26.124
-editdistance==0.6.2
 flashlight-text==0.0.4
 imageio==2.26.1
 imagesize==1.4.1
@@ -9,7 +9,6 @@ lxml==4.9.3
 mdutils==1.6.0
 nltk==3.8.1
 numpy==1.24.3
-prettytable==3.8.0
 PyYAML==6.0
 scipy==1.10.1
 sentencepiece==0.1.99
diff --git a/tests/data/evaluate/metrics_table.md b/tests/data/evaluate/metrics_table.md
index d67456d844071ebb8a9332638a2f527d93c5e405..107bef4188e44a6f8e5509c74b7ceb8d4ded4625 100644
--- a/tests/data/evaluate/metrics_table.md
+++ b/tests/data/evaluate/metrics_table.md
@@ -1,5 +1,49 @@
+#### DAN evaluation
+
 | Split | CER (HTR-NER) | CER (HTR) | WER (HTR-NER) | WER (HTR) | WER (HTR no punct) | NER |
 |:-----:|:-------------:|:---------:|:-------------:|:---------:|:------------------:|:----:|
 | train | 18.89 | 21.05 | 26.67 | 26.67 | 26.67 | 7.14 |
 | val | 8.82 | 11.54 | 50.0 | 50.0 | 50.0 | 0.0 |
 | test | 2.78 | 3.33 | 14.29 | 14.29 | 14.29 | 0.0 |
+
+#### Nerval evaluation
+
+##### train
+
+| tag | predicted | matched | Precision | Recall | F1 | Support |
+|:---------:|:---------:|:-------:|:---------:|:------:|:-----:|:-------:|
+| Surname | 2 | 2 | 100.0 | 100.0 | 100.0 | 2 |
+| Patron | 2 | 0 | 0.0 | 0.0 | 0 | 1 |
+| Operai | 2 | 2 | 100.0 | 100.0 | 100.0 | 2 |
+| Louche | 2 | 1 | 50.0 | 50.0 | 50.0 | 2 |
+| Koala | 2 | 2 | 100.0 | 100.0 | 100.0 | 2 |
+| Firstname | 2 | 2 | 100.0 | 100.0 | 100.0 | 2 |
+| Chalumeau | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
+| Batiment | 2 | 2 | 100.0 | 100.0 | 100.0 | 2 |
+| All | 15 | 12 | 80.0 | 85.71 | 82.76 | 14 |
+
+##### val
+
+| tag | predicted | matched | Precision | Recall | F1 | Support |
+|:---------:|:---------:|:-------:|:---------:|:------:|:-----:|:-------:|
+| Surname | 1 | 0 | 0.0 | 0.0 | 0 | 1 |
+| Patron | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
+| Operai | 1 | 0 | 0.0 | 0.0 | 0 | 1 |
+| Louche | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
+| Koala | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
+| Firstname | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
+| Chalumeau | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
+| Batiment | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
+| All | 8 | 6 | 75.0 | 75.0 | 75.0 | 8 |
+
+##### test
+
+| tag | predicted | matched | Precision | Recall | F1 | Support |
+|:---------:|:---------:|:-------:|:---------:|:------:|:-----:|:-------:|
+| Surname | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
+| Louche | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
+| Koala | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
+| Firstname | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
+| Chalumeau | 1 | 0 | 0.0 | 0.0 | 0 | 1 |
+| Batiment | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
+| All | 6 | 5 | 83.33 | 83.33 | 83.33 | 6 |
diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py
index 98fbd93898f09b75377b3b8e9cd21ddbc1b7521b..0bdf51968cf985a7c1fa1dcac00aedbe4b474091 100644
--- a/tests/test_evaluate.py
+++ b/tests/test_evaluate.py
@@ -103,7 +103,7 @@ def test_evaluate(capsys, training_res, val_res, test_res, evaluate_config):
     # Use the tmp_path as base folder
     evaluate_config["training"]["output_folder"] = FIXTURES / "evaluate"
 
-    evaluate.run(evaluate_config)
+    evaluate.run(evaluate_config, evaluate.NERVAL_THRESHOLD)
 
     # Check that the evaluation results are correct
     for split_name, expected_res in zip(
@@ -129,7 +129,7 @@ def test_evaluate(capsys, training_res, val_res, test_res, evaluate_config):
 
     # Check the metrics Markdown table
     captured_std = capsys.readouterr()
-    last_printed_lines = captured_std.out.split("\n")[-6:]
+    last_printed_lines = captured_std.out.split("\n")[10:]
     assert (
         "\n".join(last_printed_lines)
         == Path(FIXTURES / "evaluate" / "metrics_table.md").read_text()
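For readers of the patch, here is a minimal, hypothetical sketch of the Nerval scoring flow that the new `eval_nerval` helper implements. The tokens file path and the example transcriptions are placeholders, while the `convert`, `parse_bio`, `evaluate` and `print_results` calls are the ones used in the patch itself.

```python
# Sketch only: "tokens.yml" and the two transcriptions below are hypothetical;
# the library calls mirror the eval_nerval helper added by this patch.
from pathlib import Path

from dan.bio import convert
from dan.ocr.manager.metrics import Inference
from dan.utils import parse_tokens
from nerval.evaluate import evaluate
from nerval.parse import parse_bio
from nerval.utils import print_results

# One ground-truth/prediction pair, as kept in memory by model.evaluate()
inferences = [Inference(ground_truth="ⓢDubois ⓕMarie", prediction="ⓢDubois ⓕMaria")]

# NER tokens definition used during training (hypothetical path)
tokens = parse_tokens(Path("tokens.yml"))


def to_parsed_bio(attr: str):
    # Convert each transcription to BIO lines, then let Nerval parse them
    bio_values = []
    for inference in inferences:
        bio_values.extend(convert(getattr(inference, attr), ner_tokens=tokens).split("\n"))
    return parse_bio(bio_values)


ground_truths = to_parsed_bio("ground_truth")
predictions = to_parsed_bio("prediction")

# 0.30 is the default distance threshold exposed as --nerval-threshold
print_results(evaluate(ground_truths, predictions, 0.30))
```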