diff --git a/dan/ocr/evaluate.py b/dan/ocr/evaluate.py
index 7c100d730401056cd2fa0c4f6ec2137d80414094..9a5805b9943811b0aed880a1a2ead99bf60f3648 100644
--- a/dan/ocr/evaluate.py
+++ b/dan/ocr/evaluate.py
@@ -6,6 +6,7 @@
 Evaluate a trained DAN model.
 """
 
+import json
 import logging
 import random
 from argparse import ArgumentTypeError
@@ -73,6 +74,13 @@ def add_evaluate_parser(subcommands) -> None:
         type=parse_threshold,
     )
 
+    parser.add_argument(
+        "--output-json",
+        help="Where to save evaluation results in JSON format.",
+        default=None,
+        type=Path,
+    )
+
     parser.set_defaults(func=run)
 
 
@@ -161,7 +169,13 @@ def eval_nerval(
     print_results(scores)
 
 
-def eval(rank, config: dict, nerval_threshold: float, mlflow_logging: bool):
+def eval(
+    rank,
+    config: dict,
+    nerval_threshold: float,
+    output_json: Path | None,
+    mlflow_logging: bool,
+):
     torch.manual_seed(0)
     torch.cuda.manual_seed(0)
     np.random.seed(0)
@@ -218,8 +232,12 @@ def eval(rank, config: dict, nerval_threshold: float, mlflow_logging: bool):
 
     print_worst_predictions(all_inferences)
 
+    # Save to JSON
+    if output_json is not None:
+        output_json.write_text(json.dumps(all_inferences, indent=2))
+
 
-def run(config: dict, nerval_threshold: float):
+def run(config: dict, nerval_threshold: float, output_json: Path | None):
     update_config(config)
 
     mlflow_logging = bool(config.get("mlflow"))
@@ -234,8 +252,8 @@ def run(config: dict, nerval_threshold: float):
     ):
         mp.spawn(
             eval,
-            args=(config, nerval_threshold, mlflow_logging),
+            args=(config, nerval_threshold, output_json, mlflow_logging),
             nprocs=config["training"]["device"]["nb_gpu"],
         )
     else:
-        eval(0, config, nerval_threshold, mlflow_logging)
+        eval(0, config, nerval_threshold, output_json, mlflow_logging)
diff --git a/docs/usage/evaluate/index.md b/docs/usage/evaluate/index.md
index 94d57d18c812acbe184c721c0de3e52ad196d8a2..3f77356b3248ca1b288a9a994e8261d622b0d10a 100644
--- a/docs/usage/evaluate/index.md
+++ b/docs/usage/evaluate/index.md
@@ -23,6 +23,7 @@ This will, for each evaluated split:
 | -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------- | ------- |
 | `--config`           | Path to the configuration file.                                                                                                                                                                            | `pathlib.Path` |         |
 | `--nerval-threshold` | Distance threshold for the match between gold and predicted entity during Nerval evaluation. `0` would impose perfect matches, `1` would allow completely different strings to be considered as a match.   | `float`        | `0.3`   |
+| `--output-json`      | Where to save evaluation results in JSON format.                                                                                                                                                           | `pathlib.Path` | `None`  |
 
 ## Examples
 
diff --git a/tests/data/evaluate/inference.json b/tests/data/evaluate/inference.json
new file mode 100644
index 0000000000000000000000000000000000000000..6aed3409128701223d850f182504b950550f5f64
--- /dev/null
+++ b/tests/data/evaluate/inference.json
@@ -0,0 +1,36 @@
+{
+  "train": [
+    [
+      "0a56e8b3-95cd-4fa5-a17b-5b0ff9e6ea84.png",
+      "\u24c8Bellisson \u24bbGeorges \u24b791 \u24c1P \u24b8M \u24c0Ch \u24c4Plombier \u24c512241",
+      "\u24c8Bellisson \u24bbGeorges \u24b791 \u24c1P \u24b8M \u24c0Ch \u24c4Plombier \u24c5Patron?12241",
+      "",
+      0.125
+    ],
+    [
+      "0dfe8bcd-ed0b-453e-bf19-cc697012296e.png",
+      "\u24c8Templi\u00e9 \u24bbMarcelle \u24b793 \u24c1J \u24c0ch \u24c4E dachyle",
+      "\u24c8Templi\u00e9 \u24bbMarcelle \u24b793 \u24c1S \u24c0ch \u24c4E dactylo \u24c518376",
+      "",
+      0.4286
+    ]
+  ],
+  "val": [
+    [
+      "2c242f5c-e979-43c4-b6f2-a6d4815b651d.png",
+      "\u24c8A \u24bbCharles \u24b711 \u24c1P \u24b8C \u24c0F \u24c4A \u24c514331",
+      "\u24c8d \u24bbCharles \u24b711 \u24c1P \u24b8C \u24c0F \u24c4d \u24c514 31",
+      "",
+      0.5
+    ]
+  ],
+  "test": [
+    [
+      "ffdec445-7f14-4f5f-be44-68d0844d0df1.png",
+      "\u24c8Naudin \u24bbMarie \u24b753 \u24c1S \u24b8V \u24c0Belle m\u00e8re",
+      "\u24c8Naudin \u24bbMarie \u24b753 \u24c1S \u24b8v \u24c0Belle m\u00e8re",
+      "",
+      0.1429
+    ]
+  ]
+}
diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py
index 26b3c1a43b4f9457fa7c05155ed452534428a502..77a9c806dc0a9b5eb4855d11f4f066d9b18ce881 100644
--- a/tests/test_evaluate.py
+++ b/tests/test_evaluate.py
@@ -3,6 +3,7 @@
 # -*- coding: utf-8 -*-
 
+import json
 import shutil
 from pathlib import Path
 
@@ -199,11 +200,23 @@ def test_eval_nerval(capsys, evaluate_config):
         ),
     ),
 )
-def test_evaluate(capsys, training_res, val_res, test_res, evaluate_config):
+@pytest.mark.parametrize("is_output_json", ((True, False)))
+def test_evaluate(
+    capsys, training_res, val_res, test_res, is_output_json, evaluate_config, tmp_path
+):
+    evaluate_path = FIXTURES / "evaluate"
+
     # Use the tmp_path as base folder
-    evaluate_config["training"]["output_folder"] = FIXTURES / "evaluate"
+    evaluate_config["training"]["output_folder"] = evaluate_path
 
-    evaluate.run(evaluate_config, evaluate.NERVAL_THRESHOLD)
+    output_json = tmp_path / "inference.json" if is_output_json else None
+
+    evaluate.run(evaluate_config, evaluate.NERVAL_THRESHOLD, output_json=output_json)
+
+    if is_output_json:
+        assert json.loads(output_json.read_text()) == json.loads(
+            (evaluate_path / "inference.json").read_text()
+        )
 
     # Check that the evaluation results are correct
     for split_name, expected_res in zip(
@@ -365,7 +378,7 @@ def test_evaluate_language_model(
         "weight": language_model_weight,
     }
 
-    evaluate.run(evaluate_config, evaluate.NERVAL_THRESHOLD)
+    evaluate.run(evaluate_config, evaluate.NERVAL_THRESHOLD, output_json=None)
 
     # Check that the evaluation results are correct
     for split_name, expected_res in [
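For reference, the file written by `--output-json` maps each split name to a list of `[image, ground_truth, prediction, "", score]` entries, as shown in the `tests/data/evaluate/inference.json` fixture above. Below is a minimal sketch of how such a file could be consumed downstream; the script name, the hard-coded path, and the assumption that the last element is an error score where higher is worse are illustrative only and not part of this changeset.

```python
# inspect_inference.py -- hypothetical consumer of the --output-json file;
# not part of this diff. Entry layout follows tests/data/evaluate/inference.json:
# each split maps to a list of [image, ground_truth, prediction, "", score].
import json
from pathlib import Path


def worst_prediction_per_split(inference_path: Path) -> None:
    inferences = json.loads(inference_path.read_text())
    for split, entries in inferences.items():
        # Assumption: the last element of each entry is an error score,
        # higher meaning worse, mirroring what print_worst_predictions
        # reports during evaluation.
        image, ground_truth, prediction, _, score = max(
            entries, key=lambda entry: entry[-1]
        )
        print(f"[{split}] {image} (score={score})")
        print(f"  expected:  {ground_truth}")
        print(f"  predicted: {prediction}")


if __name__ == "__main__":
    worst_prediction_per_split(Path("inference.json"))
```

Run it next to an `inference.json` produced by the evaluate command with `--output-json inference.json`; since the new option defaults to `None`, nothing is written unless the flag is passed, which is what the `is_output_json` parametrization in `test_evaluate` exercises.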