Commit 38c3bded authored by Yoann Schneider

Merge branch 'nerval-evaluate' into 'main'

Evaluate predictions with nerval

Closes #231

See merge request !339
parents 98f6b78c 95907769
[submodule "nerval"]
path = nerval
url = ../../ner/nerval.git
......@@ -7,7 +7,12 @@ RUN apt-get -y update && \
WORKDIR /src
# Install DAN as a package
# Copy submodule data
COPY nerval nerval
# Copy DAN data
COPY dan dan
COPY requirements.txt *-requirements.txt setup.py VERSION README.md ./
# Install DAN as a package
RUN pip install . --no-cache-dir
......@@ -5,17 +5,42 @@ Evaluate a trained DAN model.
import logging
import random
from argparse import ArgumentTypeError
from pathlib import Path
from typing import Dict, List
import numpy as np
import torch
import torch.multiprocessing as mp
from dan.bio import convert
from dan.ocr.manager.metrics import Inference
from dan.ocr.manager.training import Manager
from dan.ocr.utils import add_metrics_table_row, create_metrics_table, update_config
from dan.utils import read_json
from dan.utils import parse_tokens, read_json
from nerval.evaluate import evaluate
from nerval.parse import parse_bio
from nerval.utils import print_results
logger = logging.getLogger(__name__)
NERVAL_THRESHOLD = 0.30
def parse_threshold(value: str) -> float:
"""
Check that the string passed as parameter is a correct floating point number between 0 and 1
"""
try:
value = float(value)
except ValueError:
raise ArgumentTypeError("Must be a floating point number.")
if value < 0 or value > 1:
raise ArgumentTypeError("Must be between 0 and 1.")
return value
def add_evaluate_parser(subcommands) -> None:
parser = subcommands.add_parser(
......@@ -31,10 +56,55 @@ def add_evaluate_parser(subcommands) -> None:
help="Configuration file.",
)
parser.add_argument(
"--nerval-threshold",
help="Distance threshold for the match between gold and predicted entity during Nerval evaluation.",
default=NERVAL_THRESHOLD,
type=parse_threshold,
)
parser.set_defaults(func=run)
def eval(rank, config, mlflow_logging):
def eval_nerval(
all_inferences: Dict[str, List[Inference]],
tokens: Path,
threshold: float,
):
print("\n#### Nerval evaluation")
def inferences_to_parsed_bio(attr: str):
bio_values = []
for inference in inferences:
value = getattr(inference, attr)
bio_value = convert(value, ner_tokens=tokens)
bio_values.extend(bio_value.split("\n"))
# Parse this BIO format
return parse_bio(bio_values)
# Evaluate with Nerval
tokens = parse_tokens(tokens)
for split_name, inferences in all_inferences.items():
ground_truths = inferences_to_parsed_bio("ground_truth")
predictions = inferences_to_parsed_bio("prediction")
if not (ground_truths and predictions):
continue
scores = {
key: {
k: round(value * 100, 2) if k in ["P", "R", "F1"] else value
for k, value in values.items()
}
for key, values in evaluate(ground_truths, predictions, threshold).items()
}
print(f"\n##### {split_name}\n")
print_results(scores)
def eval(rank, config: dict, nerval_threshold: float, mlflow_logging: bool):
torch.manual_seed(0)
torch.cuda.manual_seed(0)
np.random.seed(0)
......@@ -62,10 +132,12 @@ def eval(rank, config, mlflow_logging):
metric_names.append("ner")
metrics_table = create_metrics_table(metric_names)
all_inferences = {}
for dataset_name in config["dataset"]["datasets"]:
for set_name in ["train", "val", "test"]:
logger.info(f"Evaluating on set `{set_name}`")
metrics = model.evaluate(
metrics, inferences = model.evaluate(
"{}-{}".format(dataset_name, set_name),
[
(dataset_name, set_name),
......@@ -75,11 +147,20 @@ def eval(rank, config, mlflow_logging):
)
add_metrics_table_row(metrics_table, set_name, metrics)
all_inferences[set_name] = inferences
print("\n#### DAN evaluation\n")
print(metrics_table)
if "ner" in metric_names:
eval_nerval(
all_inferences,
tokens=config["dataset"]["tokens"],
threshold=nerval_threshold,
)
def run(config: dict):
def run(config: dict, nerval_threshold: float):
update_config(config)
mlflow_logging = bool(config.get("mlflow"))
......@@ -94,8 +175,8 @@ def run(config: dict):
):
mp.spawn(
eval,
args=(config, mlflow_logging),
args=(config, nerval_threshold, mlflow_logging),
nprocs=config["training"]["device"]["nb_gpu"],
)
else:
eval(0, config, mlflow_logging)
eval(0, config, nerval_threshold, mlflow_logging)
......@@ -3,7 +3,7 @@ import re
from collections import defaultdict
from operator import attrgetter
from pathlib import Path
from typing import Dict, List
from typing import Dict, List, NamedTuple
import editdistance
import numpy as np
......@@ -23,6 +23,16 @@ REGEX_ONLY_ONE_SPACE = re.compile(r"\s+")
METRICS_KEYWORD = {"cer": "chars", "wer": "words", "ner": "tokens"}
class Inference(NamedTuple):
"""
Store a prediction with its ground truth to avoid
inferring again when we need to compute new metrics
"""
ground_truth: str
prediction: str
class MetricManager:
def __init__(self, metric_names: List[str], dataset_name: str, tokens: Path | None):
self.dataset_name: str = dataset_name
......
......@@ -6,7 +6,7 @@ from copy import deepcopy
from enum import Enum
from pathlib import Path
from time import time
from typing import Dict
from typing import Dict, List, Tuple
import numpy as np
import torch
......@@ -20,7 +20,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from dan.ocr.manager.metrics import MetricManager
from dan.ocr.manager.metrics import Inference, MetricManager
from dan.ocr.manager.ocr import OCRDatasetManager
from dan.ocr.mlflow import MLFLOW_AVAILABLE, logging_metrics, logging_tags_metrics
from dan.ocr.schedulers import DropoutScheduler
......@@ -750,7 +750,7 @@ class GenericTrainingManager:
def evaluate(
self, custom_name, sets_list, metric_names, mlflow_logging=False
) -> Dict[str, int | float]:
) -> Tuple[Dict[str, int | float], List[Inference]]:
"""
Main loop for evaluation
"""
......@@ -768,6 +768,9 @@ class GenericTrainingManager:
tokens=self.tokens,
)
# Keep inferences in memory to evaluate with Nerval
inferences = []
with tqdm(total=len(loader.dataset)) as pbar:
pbar.set_description("Evaluation")
with torch.no_grad():
......@@ -792,6 +795,10 @@ class GenericTrainingManager:
pbar.set_postfix(values=str(display_values))
pbar.update(len(batch_data["names"]) * self.nb_workers)
inferences.extend(
map(Inference, batch_values["str_y"], batch_values["str_x"])
)
# log metrics in MLflow
logging_name = custom_name.split("-")[1]
logging_tags_metrics(
......@@ -810,7 +817,7 @@ class GenericTrainingManager:
# Log mlflow artifacts
mlflow.log_artifact(path, "predictions")
return metrics
return metrics, inferences
def output_pred(self, name):
path = self.paths["results"] / "predict_{}_{}.yaml".format(
......
......@@ -4,8 +4,6 @@
Use the `teklia-dan dataset analyze` command to analyze a dataset. This will display statistics in [Markdown](https://www.markdownguide.org/) format.
The available arguments are:
| Parameter | Description | Type | Default |
| --------------- | -------------------------------- | -------------- | ------- |
| `--labels` | Path to the `labels.json` file. | `pathlib.Path` | |
......
# Evaluation
## Description
Use the `teklia-dan evaluate` command to evaluate a trained DAN model.
To evaluate DAN on your dataset:
1. Create a JSON configuration file. You can base the configuration file off the training one. Refer to the [dedicated page](../train/config.md) for a description of parameters.
1. Run `teklia-dan evaluate --config path/to/your/config.json`.
1. Evaluation results for every split are available in the `results` subfolder of the output folder indicated in your configuration.
1. A metrics Markdown table, providing results for each evaluated split, is also printed in the console (see table example below).
### Example output - Metrics Markdown table
This will, for each evaluated split:
1. Create a YAML file with the evaluation results in the `results` subfolder of the `training.output_folder` indicated in your configuration.
1. Print in the console a metrics Markdown table (see [table example below](#htr-evaluation)).
1. Print in the console a [Nerval](https://gitlab.teklia.com/ner/nerval) metrics Markdown table, if the `dataset.tokens` parameter in your configuration is defined (see [table example below](#htr-and-ner-evaluation)).
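The evaluation can also be launched from Python rather than through the CLI. The sketch below assumes the module is importable as `dan.ocr.evaluate` and relies only on the entry point described above; the configuration path is a placeholder:

```python
# Minimal sketch: run the evaluation programmatically with the default
# Nerval threshold, the same way the `teklia-dan evaluate` command does.
from dan.ocr.evaluate import NERVAL_THRESHOLD, run
from dan.utils import read_json

config = read_json("path/to/your/config.json")  # same JSON file as `--config`
run(config, NERVAL_THRESHOLD)  # prints the DAN table, then the Nerval tables if `dataset.tokens` is set
```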
| Parameter | Description | Type | Default |
| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------- | ------- |
| `--config` | Path to the configuration file. | `pathlib.Path` | |
| `--nerval-threshold` | Distance threshold for the match between gold and predicted entity during Nerval evaluation. `0` would impose perfect matches, `1` would allow completely different strings to be considered as a match. | `float` | `0.3` |
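The threshold can be read as a normalized string distance between a gold entity and a predicted one: the pair counts as a match when that distance stays at or below the threshold. The sketch below only illustrates this intuition with the `editdistance` package; it is not Nerval's actual matching code:

```python
# Illustration only: approximate the "distance threshold" idea with a
# normalized Levenshtein distance (not Nerval's implementation).
import editdistance


def is_match(gold: str, predicted: str, threshold: float = 0.30) -> bool:
    distance = editdistance.eval(gold, predicted)
    return distance / max(len(gold), 1) <= threshold


print(is_match("Dupont", "Dupond"))  # True: 1 edit over 6 characters, ~0.17
print(is_match("Dupont", "Martin"))  # False: almost every character differs
```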
## Example output
### HTR evaluation
```
#### DAN evaluation
| Split | CER (HTR-NER) | CER (HTR) | WER (HTR-NER) | WER (HTR) | WER (HTR no punct) |
| :---: | :-----------: | :-------: | :-----------: | :-------: | :----------------: |
| train | x | x | x | x | x |
| val | x | x | x | x | x |
| test | x | x | x | x | x |
```
### HTR and NER evaluation
```
#### DAN evaluation
| Split | CER (HTR-NER) | CER (HTR) | WER (HTR-NER) | WER (HTR) | WER (HTR no punct) | NER |
| :---: | :-----------: | :-------: | :-----------: | :-------: | :----------------: | :-: |
| train | x | x | x | x | x | x |
| val | x | x | x | x | x | x |
| test | x | x | x | x | x | x |
#### Nerval evaluation
##### train
| tag | predicted | matched | Precision | Recall | F1 | Support |
| :-----: | :-------: | :-----: | :-------: | :----: | :-: | :-----: |
| Surname | x | x | x | x | x | x |
| All | x | x | x | x | x | x |
##### val
| tag | predicted | matched | Precision | Recall | F1 | Support |
| :-----: | :-------: | :-----: | :-------: | :----: | :-: | :-----: |
| Surname | x | x | x | x | x | x |
| All | x | x | x | x | x | x |
##### test
| tag | predicted | matched | Precision | Recall | F1 | Support |
| :-----: | :-------: | :-----: | :-------: | :----: | :-: | :-----: |
| Surname | x | x | x | x | x | x |
| All | x | x | x | x | x | x |
```
# Prediction
Use the `teklia-dan predict` command to apply a trained DAN model on an image.
## Description
Use the `teklia-dan predict` command to apply a trained DAN model on an image.
| Parameter | Description | Type | Default |
| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------- | -------------- | ------------- |
| `--image-dir` | Path to the folder where the images to predict are stored. Must not be provided with `--image`. | `pathlib.Path` | |
......
Subproject commit 525c1a9e6d5a33075669085148247e2604dd092f
-e ./nerval
albumentations==1.3.1
arkindex-export==0.1.9
boto3==1.26.124
editdistance==0.6.2
flashlight-text==0.0.4
imageio==2.26.1
imagesize==1.4.1
......@@ -9,7 +9,6 @@ lxml==4.9.3
mdutils==1.6.0
nltk==3.8.1
numpy==1.24.3
prettytable==3.8.0
PyYAML==6.0
scipy==1.10.1
sentencepiece==0.1.99
......
#### DAN evaluation
| Split | CER (HTR-NER) | CER (HTR) | WER (HTR-NER) | WER (HTR) | WER (HTR no punct) | NER |
|:-----:|:-------------:|:---------:|:-------------:|:---------:|:------------------:|:----:|
| train | 18.89 | 21.05 | 26.67 | 26.67 | 26.67 | 7.14 |
| val | 8.82 | 11.54 | 50.0 | 50.0 | 50.0 | 0.0 |
| test | 2.78 | 3.33 | 14.29 | 14.29 | 14.29 | 0.0 |
#### Nerval evaluation
##### train
| tag | predicted | matched | Precision | Recall | F1 | Support |
|:---------:|:---------:|:-------:|:---------:|:------:|:-----:|:-------:|
| Surname | 2 | 2 | 100.0 | 100.0 | 100.0 | 2 |
| Patron | 2 | 0 | 0.0 | 0.0 | 0 | 1 |
| Operai | 2 | 2 | 100.0 | 100.0 | 100.0 | 2 |
| Louche | 2 | 1 | 50.0 | 50.0 | 50.0 | 2 |
| Koala | 2 | 2 | 100.0 | 100.0 | 100.0 | 2 |
| Firstname | 2 | 2 | 100.0 | 100.0 | 100.0 | 2 |
| Chalumeau | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| Batiment | 2 | 2 | 100.0 | 100.0 | 100.0 | 2 |
| All | 15 | 12 | 80.0 | 85.71 | 82.76 | 14 |
##### val
| tag | predicted | matched | Precision | Recall | F1 | Support |
|:---------:|:---------:|:-------:|:---------:|:------:|:-----:|:-------:|
| Surname | 1 | 0 | 0.0 | 0.0 | 0 | 1 |
| Patron | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| Operai | 1 | 0 | 0.0 | 0.0 | 0 | 1 |
| Louche | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| Koala | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| Firstname | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| Chalumeau | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| Batiment | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| All | 8 | 6 | 75.0 | 75.0 | 75.0 | 8 |
##### test
| tag | predicted | matched | Precision | Recall | F1 | Support |
|:---------:|:---------:|:-------:|:---------:|:------:|:-----:|:-------:|
| Surname | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| Louche | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| Koala | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| Firstname | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| Chalumeau | 1 | 0 | 0.0 | 0.0 | 0 | 1 |
| Batiment | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| All | 6 | 5 | 83.33 | 83.33 | 83.33 | 6 |
......@@ -103,7 +103,7 @@ def test_evaluate(capsys, training_res, val_res, test_res, evaluate_config):
# Use the tmp_path as base folder
evaluate_config["training"]["output_folder"] = FIXTURES / "evaluate"
evaluate.run(evaluate_config)
evaluate.run(evaluate_config, evaluate.NERVAL_THRESHOLD)
# Check that the evaluation results are correct
for split_name, expected_res in zip(
......@@ -129,7 +129,7 @@ def test_evaluate(capsys, training_res, val_res, test_res, evaluate_config):
# Check the metrics Markdown table
captured_std = capsys.readouterr()
last_printed_lines = captured_std.out.split("\n")[-6:]
last_printed_lines = captured_std.out.split("\n")[10:]
assert (
"\n".join(last_printed_lines)
== Path(FIXTURES / "evaluate" / "metrics_table.md").read_text()
......