Commit 20161125 authored by Yoann Schneider

Merge branch 'save-detailed-results-evaluation' into 'main'

Add an option to save all results in the `evaluate` command

Closes #277

See merge request !404
parents 823416d4 c5ede812
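In short, this MR threads a new optional `--output-json` path from the CLI down to `eval()`, which dumps all inference results to that file. A hypothetical invocation, assuming the `teklia-dan` entry point and a local `config.json` (only the two flags are confirmed by this diff):

```sh
teklia-dan evaluate --config config.json --nerval-threshold 0.3 --output-json results.json
```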
@@ -6,6 +6,7 @@
 Evaluate a trained DAN model.
 """
 
+import json
 import logging
 import random
 from argparse import ArgumentTypeError
@@ -73,6 +74,13 @@ def add_evaluate_parser(subcommands) -> None:
         type=parse_threshold,
     )
+    parser.add_argument(
+        "--output-json",
+        help="Where to save evaluation results in JSON format.",
+        default=None,
+        type=Path,
+    )
     parser.set_defaults(func=run)
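For reference, a self-contained sketch (not the project's parser) of how `type=Path` with `default=None` behaves: argparse converts the supplied string to a `pathlib.Path`, and the attribute stays `None` when the flag is omitted, which is what later disables saving.

```python
import argparse
from pathlib import Path

parser = argparse.ArgumentParser()
parser.add_argument("--output-json", type=Path, default=None)

print(parser.parse_args([]).output_json)  # None: saving is skipped
print(parser.parse_args(["--output-json", "out.json"]).output_json)  # Path("out.json")
```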
@@ -161,7 +169,13 @@ def eval_nerval(
     print_results(scores)
 
 
-def eval(rank, config: dict, nerval_threshold: float, mlflow_logging: bool):
+def eval(
+    rank,
+    config: dict,
+    nerval_threshold: float,
+    output_json: Path | None,
+    mlflow_logging: bool,
+):
     torch.manual_seed(0)
     torch.cuda.manual_seed(0)
     np.random.seed(0)
@@ -218,8 +232,12 @@ def eval(rank, config: dict, nerval_threshold: float, mlflow_logging: bool):
     print_worst_predictions(all_inferences)
 
+    # Save to JSON
+    if output_json is not None:
+        output_json.write_text(json.dumps(all_inferences, indent=2))
+
 
-def run(config: dict, nerval_threshold: float):
+def run(config: dict, nerval_threshold: float, output_json: Path | None):
     update_config(config)
 
     mlflow_logging = bool(config.get("mlflow"))
@@ -234,8 +252,8 @@ def run(config: dict, nerval_threshold: float):
     ):
         mp.spawn(
             eval,
-            args=(config, nerval_threshold, mlflow_logging),
+            args=(config, nerval_threshold, output_json, mlflow_logging),
             nprocs=config["training"]["device"]["nb_gpu"],
         )
     else:
-        eval(0, config, nerval_threshold, mlflow_logging)
+        eval(0, config, nerval_threshold, output_json, mlflow_logging)
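Note why `output_json` is inserted before `mlflow_logging` in both call sites: `mp.spawn` prepends the process rank to `args`, so the tuple must match `eval`'s positional order exactly. A minimal standalone sketch of that behavior (`worker` and `message` are illustrative names, not from the project):

```python
import torch.multiprocessing as mp

def worker(rank, message):
    # mp.spawn passes the process index as the first positional argument,
    # followed by the contents of the `args` tuple.
    print(f"rank={rank}: {message}")

if __name__ == "__main__":
    # Spawns 2 processes; each receives its rank plus the extra args.
    mp.spawn(worker, args=("hello",), nprocs=2)
```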
@@ -23,6 +23,7 @@ This will, for each evaluated split:
 | -------------------- | ------------------------------------------------------------ | -------------- | ------- |
 | `--config`           | Path to the configuration file.                               | `pathlib.Path` |         |
 | `--nerval-threshold` | Distance threshold for the match between gold and predicted entity during Nerval evaluation. `0` would impose perfect matches, `1` would allow completely different strings to be considered as a match. | `float` | `0.3` |
+| `--output-json`      | Where to save evaluation results in JSON format.              | `pathlib.Path` | `None`  |
 
 ## Examples
[New file: the `inference.json` fixture with the expected evaluation output, compared against in the tests below]
{
"train": [
[
"0a56e8b3-95cd-4fa5-a17b-5b0ff9e6ea84.png",
"\u24c8Bellisson \u24bbGeorges \u24b791 \u24c1P \u24b8M \u24c0Ch \u24c4Plombier \u24c512241",
"\u24c8Bellisson \u24bbGeorges \u24b791 \u24c1P \u24b8M \u24c0Ch \u24c4Plombier \u24c5Patron?12241",
"",
0.125
],
[
"0dfe8bcd-ed0b-453e-bf19-cc697012296e.png",
"\u24c8Templi\u00e9 \u24bbMarcelle \u24b793 \u24c1J \u24c0ch \u24c4E dachyle",
"\u24c8Templi\u00e9 \u24bbMarcelle \u24b793 \u24c1S \u24c0ch \u24c4E dactylo \u24c518376",
"",
0.4286
]
],
"val": [
[
"2c242f5c-e979-43c4-b6f2-a6d4815b651d.png",
"\u24c8A \u24bbCharles \u24b711 \u24c1P \u24b8C \u24c0F \u24c4A \u24c514331",
"\u24c8d \u24bbCharles \u24b711 \u24c1P \u24b8C \u24c0F \u24c4d \u24c514 31",
"",
0.5
]
],
"test": [
[
"ffdec445-7f14-4f5f-be44-68d0844d0df1.png",
"\u24c8Naudin \u24bbMarie \u24b753 \u24c1S \u24b8V \u24c0Belle m\u00e8re",
"\u24c8Naudin \u24bbMarie \u24b753 \u24c1S \u24b8v \u24c0Belle m\u00e8re",
"",
0.1429
]
]
}
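Each record above appears to be `[image_name, ground_truth, prediction, <empty string>, score]`; judging by the values, the score is most likely a character error rate, and this diff does not show what the fourth field holds. A hedged sketch of consuming the saved file (`results.json` is the hypothetical path from the invocation above):

```python
import json
from pathlib import Path

results = json.loads(Path("results.json").read_text())

# Report the mean per-sample score (likely CER) for each split.
for split, records in results.items():
    scores = [record[-1] for record in records]
    print(f"{split}: {sum(scores) / len(scores):.4f} over {len(scores)} samples")
```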
@@ -3,6 +3,7 @@
 # -*- coding: utf-8 -*-
 
+import json
 import shutil
 from pathlib import Path
@@ -199,11 +200,23 @@ def test_eval_nerval(capsys, evaluate_config):
         ),
     ),
 )
-def test_evaluate(capsys, training_res, val_res, test_res, evaluate_config):
-    evaluate_config["training"]["output_folder"] = FIXTURES / "evaluate"
+@pytest.mark.parametrize("is_output_json", ((True, False)))
+def test_evaluate(
+    capsys, training_res, val_res, test_res, is_output_json, evaluate_config, tmp_path
+):
+    evaluate_path = FIXTURES / "evaluate"
+
+    # Use the tmp_path as base folder
+    evaluate_config["training"]["output_folder"] = evaluate_path
 
-    evaluate.run(evaluate_config, evaluate.NERVAL_THRESHOLD)
+    output_json = tmp_path / "inference.json" if is_output_json else None
+    evaluate.run(evaluate_config, evaluate.NERVAL_THRESHOLD, output_json=output_json)
+
+    if is_output_json:
+        assert json.loads(output_json.read_text()) == json.loads(
+            (evaluate_path / "inference.json").read_text()
+        )
 
     # Check that the evaluation results are correct
     for split_name, expected_res in zip(
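The parametrization runs `test_evaluate` twice, once writing JSON under pytest's per-test `tmp_path` and once with saving disabled. A stripped-down sketch of the same pattern (the test body here is illustrative, not the project's):

```python
import json
import pytest

@pytest.mark.parametrize("is_output_json", (True, False))
def test_optional_json_output(is_output_json, tmp_path):
    output_json = tmp_path / "inference.json" if is_output_json else None
    if output_json is not None:
        output_json.write_text(json.dumps({"val": []}, indent=2))
        # Compare parsed JSON rather than raw text to ignore formatting.
        assert json.loads(output_json.read_text()) == {"val": []}
```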
@@ -365,7 +378,7 @@ def test_evaluate_language_model(
         "weight": language_model_weight,
     }
 
-    evaluate.run(evaluate_config, evaluate.NERVAL_THRESHOLD)
+    evaluate.run(evaluate_config, evaluate.NERVAL_THRESHOLD, output_json=None)
 
     # Check that the evaluation results are correct
     for split_name, expected_res in [