Commit ed481c6f authored by Yoann Schneider

Merge branch 'eval-support-empty-strings' into 'main'

Support empty strings during evaluation

Closes #261

See merge request !390
parents bada606d 72c96bc6
```diff
@@ -43,7 +43,7 @@ repos:
     rev: 0.7.17
     hooks:
       - id: mdformat
-        exclude: tests/data/analyze|tests/data/evaluate/metrics_table.md
+        exclude: tests/data/analyze|tests/data/evaluate/.*.md
         # Optionally add plugins
         additional_dependencies:
           - mdformat-mkdocs[recommended]
```
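The broadened `exclude` pattern keeps mdformat away from every Markdown fixture under `tests/data/evaluate/`, not just `metrics_table.md`. Pre-commit interprets `exclude` as a Python regular expression matched against each file path, so the effect can be sanity-checked directly (a quick sketch; the new fixture names are taken from the tests further down):

```python
import re

# Pre-commit matches the `exclude` regex against each candidate file path.
pattern = re.compile(r"tests/data/analyze|tests/data/evaluate/.*.md")

assert pattern.search("tests/data/evaluate/metrics_table.md")       # previously excluded
assert pattern.search("tests/data/evaluate/worst_predictions.md")   # new fixture
assert pattern.search("tests/data/evaluate/eval_nerval.md")         # new fixture
assert not pattern.search("tests/data/evaluate/predictions.json")   # non-Markdown files still formatted... or rather, still in scope
```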
```diff
@@ -31,6 +31,8 @@ logger = logging.getLogger(__name__)
 NERVAL_THRESHOLD = 0.30
 NB_WORST_PREDICTIONS = 5
+
+EMPTY_STRING = "∅"
 
 
 def parse_threshold(value: str) -> float:
     """
@@ -87,14 +89,24 @@ def print_worst_predictions(all_inferences: Dict[str, List[Inference]]):
         reverse=True,
     )[:NB_WORST_PREDICTIONS]
     for inference in worst_inferences:
+        if not inference.ground_truth:
+            logger.warning(
+                f"Ground truth is empty for {inference.image}. `{EMPTY_STRING}` will be displayed"
+            )
+        if not inference.prediction:
+            logger.warning(
+                f"Prediction is empty for {inference.image}. `{EMPTY_STRING}` will be displayed"
+            )
         alignment = getNiceAlignment(
             align(
-                inference.ground_truth,
-                inference.prediction,
+                inference.ground_truth or EMPTY_STRING,
+                inference.prediction or EMPTY_STRING,
                 task="path",
             ),
-            inference.ground_truth,
-            inference.prediction,
+            inference.ground_truth or EMPTY_STRING,
+            inference.prediction or EMPTY_STRING,
         )
         alignment_str = f'{alignment["query_aligned"]}\n{alignment["matched_aligned"]}\n{alignment["target_aligned"]}'
         table.add_row([inference.image, round(inference.wer * 100, 2), alignment_str])
```
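Falling back to `EMPTY_STRING` matters because the alignment gives the table something visible to align against when one side is empty. A minimal reproduction of the empty-prediction case (assuming `align` and `getNiceAlignment` come from edlib, as their signatures suggest; expected output taken from the fixture below):

```python
from edlib import align, getNiceAlignment

EMPTY_STRING = "∅"

ground_truth = "Some text"
prediction = ""  # empty prediction, as in the new test case

result = align(
    ground_truth or EMPTY_STRING,
    prediction or EMPTY_STRING,
    task="path",  # keep the alignment path so it can be pretty-printed
)
alignment = getNiceAlignment(
    result,
    ground_truth or EMPTY_STRING,
    prediction or EMPTY_STRING,
)
print(alignment["query_aligned"])    # Some text
print(alignment["matched_aligned"])  # .--------
print(alignment["target_aligned"])   # ∅--------
```

The `ffdec445-…` row in the fixture is the degenerate case where both sides collapse to `∅` and the alignment is a single match.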
The new expected-output fixtures in `tests/data/evaluate/`:
#### Nerval evaluation
##### test
| tag | predicted | matched | Precision | Recall | F1 | Support |
|:---------:|:---------:|:-------:|:---------:|:------:|:-----:|:-------:|
| Surname | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| Patron | 1 | 0 | 0.0 | 0.0 | 0 | 1 |
| Operai | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| Louche | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| Koala | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| Firstname | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| Chalumeau | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| Batiment | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
| All | 8 | 7 | 87.5 | 87.5 | 87.5 | 8 |
#### 5 worst prediction(s)
| Image name | WER | Alignment between ground truth - prediction |
|:----------------------------------------:|:----:|:---------------------------------------------------------:|
| 0dfe8bcd-ed0b-453e-bf19-cc697012296e.png | 100 | Some text |
| | | .-------- |
| | | ∅-------- |
| 2c242f5c-e979-43c4-b6f2-a6d4815b651d.png | 100 | ∅-------- |
| | | .-------- |
| | | Some text |
| 0a56e8b3-95cd-4fa5-a17b-5b0ff9e6ea84.png | 12.5 | ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier Ⓟ-------12241 |
| | | |||||||||||||||||||||||||||||||||||||||||||||-------||||| |
| | | ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier ⓅPatron?12241 |
| ffdec445-7f14-4f5f-be44-68d0844d0df1.png | 0 | ∅ |
| | | | |
| | | ∅ |
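The WER column is easy to verify by hand: the Bellisson line substitutes one word (`Ⓟ12241` → `ⓅPatron?12241`) out of eight, i.e. 1/8 = 12.5%. A throwaway word-level checker, not the project's implementation (which is not part of this diff), assuming a non-empty reference:

```python
def word_error_rate(reference: str, hypothesis: str) -> float:
    """Word-level Levenshtein distance divided by the reference length (naive sketch)."""
    ref, hyp = reference.split(), hypothesis.split()
    dist = list(range(len(hyp) + 1))  # DP row: distance to each hypothesis prefix
    for i, r in enumerate(ref, start=1):
        prev, dist[0] = dist[0], i
        for j, h in enumerate(hyp, start=1):
            prev, dist[j] = dist[j], min(
                dist[j] + 1,      # delete a reference word
                dist[j - 1] + 1,  # insert a hypothesis word
                prev + (r != h),  # substitute (free if the words match)
            )
    return dist[-1] / len(ref)  # assumes the reference is non-empty


gt = "ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier Ⓟ12241"
pred = "ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier ⓅPatron?12241"
print(round(word_error_rate(gt, pred) * 100, 2))  # 12.5
```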
```diff
@@ -50,6 +50,100 @@ def test_add_metrics_table_row():
     assert metrics_table.rows == [["train", 130.23, 100, ""]]
 
 
+def test_print_worst_predictions(capsys):
+    evaluate.print_worst_predictions(
+        all_inferences={
+            "test": [
+                Inference(
+                    image="0a56e8b3-95cd-4fa5-a17b-5b0ff9e6ea84.png",
+                    ground_truth="ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier Ⓟ12241",
+                    prediction="ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier ⓅPatron?12241",
+                    lm_prediction="ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier ⓅPatron?12241",
+                    wer=0.125,
+                ),
+                # Test with empty strings
+                Inference(
+                    image="0dfe8bcd-ed0b-453e-bf19-cc697012296e.png",
+                    ground_truth="Some text",
+                    prediction="",
+                    lm_prediction="",
+                    wer=1,
+                ),
+                Inference(
+                    image="2c242f5c-e979-43c4-b6f2-a6d4815b651d.png",
+                    ground_truth="",
+                    prediction="Some text",
+                    lm_prediction="",
+                    wer=1,
+                ),
+                Inference(
+                    image="ffdec445-7f14-4f5f-be44-68d0844d0df1.png",
+                    ground_truth="",
+                    prediction="",
+                    lm_prediction="Some text",
+                    wer=0,
+                ),
+            ]
+        }
+    )
+
+    # Check the metrics Markdown table
+    captured_std = capsys.readouterr()
+    last_printed_lines = captured_std.out.split("\n")
+    assert (
+        "\n".join(last_printed_lines)
+        == Path(FIXTURES / "evaluate" / "worst_predictions.md").read_text()
+    )
+
+
+def test_eval_nerval(capsys, evaluate_config):
+    evaluate.eval_nerval(
+        all_inferences={
+            "test": [
+                Inference(
+                    image="0a56e8b3-95cd-4fa5-a17b-5b0ff9e6ea84.png",
+                    ground_truth="ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier Ⓟ12241",
+                    prediction="ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier ⓅPatron?12241",
+                    lm_prediction="ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier ⓅPatron?12241",
+                    wer=0.125,
+                ),
+                # Test with empty strings
+                Inference(
+                    image="0dfe8bcd-ed0b-453e-bf19-cc697012296e.png",
+                    ground_truth="Some text",
+                    prediction="",
+                    lm_prediction="",
+                    wer=1,
+                ),
+                Inference(
+                    image="2c242f5c-e979-43c4-b6f2-a6d4815b651d.png",
+                    ground_truth="",
+                    prediction="Some text",
+                    lm_prediction="",
+                    wer=1,
+                ),
+                Inference(
+                    image="ffdec445-7f14-4f5f-be44-68d0844d0df1.png",
+                    ground_truth="",
+                    prediction="",
+                    lm_prediction="Some text",
+                    wer=0,
+                ),
+            ]
+        },
+        tokens=evaluate_config["dataset"]["tokens"],
+        threshold=evaluate.NERVAL_THRESHOLD,
+    )
+
+    # Check the metrics Markdown table
+    captured_std = capsys.readouterr()
+    last_printed_lines = captured_std.out.split("\n")
+    assert (
+        "\n".join(last_printed_lines)
+        == Path(FIXTURES / "evaluate" / "eval_nerval.md").read_text()
+    )
+
+
 @pytest.mark.parametrize(
     "training_res, val_res, test_res",
     (
```
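For readers without the codebase at hand, the `Inference` records built above only need a handful of fields. A hypothetical stand-in with the shape these tests rely on (the real definition lives in the project's evaluation module and may differ):

```python
from typing import NamedTuple


class Inference(NamedTuple):
    # Hypothetical stand-in; mirrors only the fields exercised by the tests above.
    image: str          # image file name shown in the worst-predictions table
    ground_truth: str   # may be empty, hence this merge request
    prediction: str     # may be empty, hence this merge request
    lm_prediction: str  # presumably the prediction rescored with a language model
    wer: float          # word error rate, between 0.0 and 1.0
```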