Commit 38c3bded authored by Yoann Schneider

Merge branch 'nerval-evaluate' into 'main'

Evaluate predictions with nerval

Closes #231

See merge request !339
parents 98f6b78c 95907769
[submodule "nerval"]
path = nerval
url = ../../ner/nerval.git
......@@ -7,7 +7,12 @@ RUN apt-get -y update && \
WORKDIR /src
# Install DAN as a package
# Copy submodule data
COPY nerval nerval
# Copy DAN data
COPY dan dan
COPY requirements.txt *-requirements.txt setup.py VERSION README.md ./
# Install DAN as a package
RUN pip install . --no-cache-dir
......@@ -5,17 +5,42 @@ Evaluate a trained DAN model.
import logging
import random
from argparse import ArgumentTypeError
from pathlib import Path
from typing import Dict, List
import numpy as np
import torch
import torch.multiprocessing as mp
from dan.bio import convert
from dan.ocr.manager.metrics import Inference
from dan.ocr.manager.training import Manager
from dan.ocr.utils import add_metrics_table_row, create_metrics_table, update_config
from dan.utils import read_json
from dan.utils import parse_tokens, read_json
from nerval.evaluate import evaluate
from nerval.parse import parse_bio
from nerval.utils import print_results
logger = logging.getLogger(__name__)
NERVAL_THRESHOLD = 0.30
def parse_threshold(value: str) -> float:
"""
Check that the string passed as parameter is a correct floating point number between 0 and 1
"""
try:
value = float(value)
except ValueError:
raise ArgumentTypeError("Must be a floating point number.")
if value < 0 or value > 1:
raise ArgumentTypeError("Must be between 0 and 1.")
return value
def add_evaluate_parser(subcommands) -> None:
parser = subcommands.add_parser(
......@@ -31,10 +56,55 @@ def add_evaluate_parser(subcommands) -> None:
help="Configuration file.",
)
parser.add_argument(
"--nerval-threshold",
help="Distance threshold for the match between gold and predicted entity during Nerval evaluation.",
default=NERVAL_THRESHOLD,
type=parse_threshold,
)
parser.set_defaults(func=run)
def eval(rank, config, mlflow_logging):
def eval_nerval(
all_inferences: Dict[str, List[Inference]],
tokens: Path,
threshold: float,
):
print("\n#### Nerval evaluation")
def inferences_to_parsed_bio(attr: str):
bio_values = []
for inference in inferences:
value = getattr(inference, attr)
bio_value = convert(value, ner_tokens=tokens)
bio_values.extend(bio_value.split("\n"))
# Parse this BIO format
return parse_bio(bio_values)
# Evaluate with Nerval
tokens = parse_tokens(tokens)
for split_name, inferences in all_inferences.items():
ground_truths = inferences_to_parsed_bio("ground_truth")
predictions = inferences_to_parsed_bio("prediction")
if not (ground_truths and predictions):
continue
scores = {
key: {
k: round(value * 100, 2) if k in ["P", "R", "F1"] else value
for k, value in values.items()
}
for key, values in evaluate(ground_truths, predictions, threshold).items()
}
print(f"\n##### {split_name}\n")
print_results(scores)
def eval(rank, config: dict, nerval_threshold: float, mlflow_logging: bool):
torch.manual_seed(0)
torch.cuda.manual_seed(0)
np.random.seed(0)
......@@ -62,10 +132,12 @@ def eval(rank, config, mlflow_logging):
metric_names.append("ner")
metrics_table = create_metrics_table(metric_names)
all_inferences = {}
for dataset_name in config["dataset"]["datasets"]:
for set_name in ["train", "val", "test"]:
logger.info(f"Evaluating on set `{set_name}`")
metrics = model.evaluate(
metrics, inferences = model.evaluate(
"{}-{}".format(dataset_name, set_name),
[
(dataset_name, set_name),
......@@ -75,11 +147,20 @@ def eval(rank, config, mlflow_logging):
)
add_metrics_table_row(metrics_table, set_name, metrics)
all_inferences[set_name] = inferences
print("\n#### DAN evaluation\n")
print(metrics_table)
if "ner" in metric_names:
eval_nerval(
all_inferences,
tokens=config["dataset"]["tokens"],
threshold=nerval_threshold,
)
def run(config: dict):
def run(config: dict, nerval_threshold: float):
update_config(config)
mlflow_logging = bool(config.get("mlflow"))
......@@ -94,8 +175,8 @@ def run(config: dict):
):
mp.spawn(
eval,
args=(config, mlflow_logging),
args=(config, nerval_threshold, mlflow_logging),
nprocs=config["training"]["device"]["nb_gpu"],
)
else:
eval(0, config, mlflow_logging)
eval(0, config, nerval_threshold, mlflow_logging)
......@@ -3,7 +3,7 @@ import re
from collections import defaultdict
from operator import attrgetter
from pathlib import Path
from typing import Dict, List
from typing import Dict, List, NamedTuple
import editdistance
import numpy as np
......@@ -23,6 +23,16 @@ REGEX_ONLY_ONE_SPACE = re.compile(r"\s+")
METRICS_KEYWORD = {"cer": "chars", "wer": "words", "ner": "tokens"}
class Inference(NamedTuple):
"""
Store a prediction with its ground truth to avoid
inferring again when we need to compute new metrics
"""
ground_truth: str
prediction: str
class MetricManager:
def __init__(self, metric_names: List[str], dataset_name: str, tokens: Path | None):
self.dataset_name: str = dataset_name
......
......@@ -6,7 +6,7 @@ from copy import deepcopy
from enum import Enum
from pathlib import Path
from time import time
from typing import Dict
from typing import Dict, List, Tuple
import numpy as np
import torch
......@@ -20,7 +20,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from dan.ocr.manager.metrics import MetricManager
from dan.ocr.manager.metrics import Inference, MetricManager
from dan.ocr.manager.ocr import OCRDatasetManager
from dan.ocr.mlflow import MLFLOW_AVAILABLE, logging_metrics, logging_tags_metrics
from dan.ocr.schedulers import DropoutScheduler
......@@ -750,7 +750,7 @@ class GenericTrainingManager:
def evaluate(
self, custom_name, sets_list, metric_names, mlflow_logging=False
) -> Dict[str, int | float]:
) -> Tuple[Dict[str, int | float], List[Inference]]:
"""
Main loop for evaluation
"""
......@@ -768,6 +768,9 @@ class GenericTrainingManager:
tokens=self.tokens,
)
# Keep inferences in memory to evaluate with Nerval
inferences = []
with tqdm(total=len(loader.dataset)) as pbar:
pbar.set_description("Evaluation")
with torch.no_grad():
......@@ -792,6 +795,10 @@ class GenericTrainingManager:
pbar.set_postfix(values=str(display_values))
pbar.update(len(batch_data["names"]) * self.nb_workers)
inferences.extend(
map(Inference, batch_values["str_y"], batch_values["str_x"])
)
# log metrics in MLflow
logging_name = custom_name.split("-")[1]
logging_tags_metrics(
......@@ -810,7 +817,7 @@ class GenericTrainingManager:
# Log mlflow artifacts
mlflow.log_artifact(path, "predictions")
return metrics
return metrics, inferences
def output_pred(self, name):
path = self.paths["results"] / "predict_{}_{}.yaml".format(
......
......@@ -4,8 +4,6 @@
Use the `teklia-dan dataset analyze` command to analyze a dataset. This will display statistics in [Markdown](https://www.markdownguide.org/) format.
The available arguments are:
| Parameter | Description | Type | Default |
| --------------- | -------------------------------- | -------------- | ------- |
| `--labels` | Path to the `labels.json` file. | `pathlib.Path` | |
......
# Evaluation
## Description
Use the `teklia-dan evaluate` command to evaluate a trained DAN model.
To evaluate DAN on your dataset:
1. Create a JSON configuration file. You can base the configuration file off the training one. Refer to the [dedicated page](../train/config.md) for a description of parameters.
1. Run `teklia-dan evaluate --config path/to/your/config.json`.
1. Evaluation results for every split are available in the `results` subfolder of the output folder indicated in your configuration.
1. A metrics Markdown table, providing results for each evaluated split, is also printed in the console (see table example below).
### Example output - Metrics Markdown table
This will, for each evaluated split:
1. Create a YAML file with the evaluation results in the `results` subfolder of the `training.output_folder` indicated in your configuration.
1. Print in the console a metrics Markdown table (see [table example below](#htr-evaluation)).
1. Print in the console a [Nerval](https://gitlab.teklia.com/ner/nerval) metrics Markdown table, if the `dataset.tokens` parameter in your configuration is defined (see [table example below](#htr-and-ner-evaluation)).
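The evaluation can also be launched from Python rather than through the CLI. The sketch below assumes the module is importable as `dan.ocr.evaluate` and relies only on the entry point described above; the configuration path is a placeholder:

```python
# Minimal sketch: run the evaluation programmatically with the default
# Nerval threshold, the same way the `teklia-dan evaluate` command does.
from dan.ocr.evaluate import NERVAL_THRESHOLD, run
from dan.utils import read_json

config = read_json("path/to/your/config.json")  # same JSON file as `--config`
run(config, NERVAL_THRESHOLD)  # prints the DAN table, then the Nerval tables if `dataset.tokens` is set
```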
| Parameter | Description | Type | Default |
| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------- | ------- |
| `--config` | Path to the configuration file. | `pathlib.Path` | |
| `--nerval-threshold` | Distance threshold for the match between gold and predicted entity during Nerval evaluation. `0` would impose perfect matches, `1` would allow completely different strings to be considered as a match. | `float` | `0.3` |
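The threshold can be read as a normalized string distance between a gold entity and a predicted one: the pair counts as a match when that distance stays at or below the threshold. The sketch below only illustrates this intuition with the `editdistance` package; it is not Nerval's actual matching code:

```python
# Illustration only: approximate the "distance threshold" idea with a
# normalized Levenshtein distance (not Nerval's implementation).
import editdistance


def is_match(gold: str, predicted: str, threshold: float = 0.30) -> bool:
    distance = editdistance.eval(gold, predicted)
    return distance / max(len(gold), 1) <= threshold


print(is_match("Dupont", "Dupond"))  # True: 1 edit over 6 characters, ~0.17
print(is_match("Dupont", "Martin"))  # False: almost every character differs
```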
## Example output
### HTR evaluation
```
#### DAN evaluation
| Split | CER (HTR-NER) | CER (HTR) | WER (HTR-NER) | WER (HTR) | WER (HTR no punct) |
| :---: | :-----------: | :-------: | :-----------: | :-------: | :----------------: |
| train | x | x | x | x | x |
| val | x | x | x | x | x |
| test | x | x | x | x | x |
```
### HTR and NER evaluation
```
#### DAN evaluation
| Split | CER (HTR-NER) | CER (HTR) | WER (HTR-NER) | WER (HTR) | WER (HTR no punct) | NER |
| :---: | :-----------: | :-------: | :-----------: | :-------: | :----------------: | :-: |
| train | x | x | x | x | x | x |
| val | x | x | x | x | x | x |
| test | x | x | x | x | x | x |
#### Nerval evaluation
##### train
| tag | predicted | matched | Precision | Recall | F1 | Support |
| :-----: | :-------: | :-----: | :-------: | :----: | :-: | :-----: |
| Surname | x | x | x | x | x | x |
| All | x | x | x | x | x | x |
##### val
| tag | predicted | matched | Precision | Recall | F1 | Support |
| :-----: | :-------: | :-----: | :-------: | :----: | :-: | :-----: |
| Surname | x | x | x | x | x | x |
| All | x | x | x | x | x | x |
##### test
| tag | predicted | matched | Precision | Recall | F1 | Support |
| :-----: | :-------: | :-----: | :-------: | :----: | :-: | :-----: |
| Surname | x | x | x | x | x | x |
| All | x | x | x | x | x | x |
```
# Prediction
Use the `teklia-dan predict` command to apply a trained DAN model on an image.
## Description
Use the `teklia-dan predict` command to apply a trained DAN model on an image.
| Parameter | Description | Type | Default |
| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------- | -------------- | ------------- |
| `--image-dir` | Path to the folder where the images to predict are stored. Must not be provided with `--image`. | `pathlib.Path` | |
......
Subproject commit 525c1a9e6d5a33075669085148247e2604dd092f
-e ./nerval
albumentations==1.3.1
arkindex-export==0.1.9
boto3==1.26.124
editdistance==0.6.2
flashlight-text==0.0.4
imageio==2.26.1
imagesize==1.4.1
......@@ -9,7 +9,6 @@ lxml==4.9.3
mdutils==1.6.0
nltk==3.8.1
numpy==1.24.3
prettytable==3.8.0
PyYAML==6.0
scipy==1.10.1
sentencepiece==0.1.99
......
#### DAN evaluation
| Split | CER (HTR-NER) | CER (HTR) | WER (HTR-NER) | WER (HTR) | WER (HTR no punct) | NER |
|:-----:|:-------------:|:---------:|:-------------:|:---------:|:------------------:|:----:|
| train | 18.89 | 21.05 | 26.67 | 26.67 | 26.67 | 7.14 |
| val | 8.82 | 11.54 | 50.0 | 50.0 | 50.0 | 0.0 |
| test | 2.78 | 3.33 | 14.29 | 14.29 | 14.29 | 0.0 |
#### Nerval evaluation
##### train
| tag | predicted | matched | Precision | Recall | F1 | Support |
|:---------:|:---------:|:-------:|:---------:|:------:|:-----:|:-------:|
| Surname | 2 | 2 | 100.0 | 100.0 | 100.0 | 2 |
| Patron | 2 | 0 | 0.0 | 0.0 | 0 | 1 |
| Operai | 2 | 2 | 100.0 | 100.0 | 100.0 | 2 |
| Louche | 2 | 1 | 50.0 | 50.0 | 50.0 | 2 |
| Koala | 2 | 2 | 100.0 | 100.0 | 100.0 | 2 |
| Firstname | 2 | 2 | 100.0 | 100.0 | 100.0 | 2 |
| Chalumeau | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| Batiment | 2 | 2 | 100.0 | 100.0 | 100.0 | 2 |
| All | 15 | 12 | 80.0 | 85.71 | 82.76 | 14 |
##### val
| tag | predicted | matched | Precision | Recall | F1 | Support |
|:---------:|:---------:|:-------:|:---------:|:------:|:-----:|:-------:|
| Surname | 1 | 0 | 0.0 | 0.0 | 0 | 1 |
| Patron | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| Operai | 1 | 0 | 0.0 | 0.0 | 0 | 1 |
| Louche | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| Koala | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| Firstname | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| Chalumeau | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| Batiment | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| All | 8 | 6 | 75.0 | 75.0 | 75.0 | 8 |
##### test
| tag | predicted | matched | Precision | Recall | F1 | Support |
|:---------:|:---------:|:-------:|:---------:|:------:|:-----:|:-------:|
| Surname | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| Louche | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| Koala | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| Firstname | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| Chalumeau | 1 | 0 | 0.0 | 0.0 | 0 | 1 |
| Batiment | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| All | 6 | 5 | 83.33 | 83.33 | 83.33 | 6 |
......@@ -103,7 +103,7 @@ def test_evaluate(capsys, training_res, val_res, test_res, evaluate_config):
# Use the tmp_path as base folder
evaluate_config["training"]["output_folder"] = FIXTURES / "evaluate"
evaluate.run(evaluate_config)
evaluate.run(evaluate_config, evaluate.NERVAL_THRESHOLD)
# Check that the evaluation results are correct
for split_name, expected_res in zip(
......@@ -129,7 +129,7 @@ def test_evaluate(capsys, training_res, val_res, test_res, evaluate_config):
# Check the metrics Markdown table
captured_std = capsys.readouterr()
last_printed_lines = captured_std.out.split("\n")[-6:]
last_printed_lines = captured_std.out.split("\n")[10:]
assert (
"\n".join(last_printed_lines)
== Path(FIXTURES / "evaluate" / "metrics_table.md").read_text()
......