From 1ff1ca528e6b16c6fcb6b37a35463942c203dcec Mon Sep 17 00:00:00 2001
From: Yoann Schneider <yschneider@teklia.com>
Date: Thu, 11 Apr 2024 15:34:12 +0000
Subject: [PATCH] Sort rows alphabetically, place ALL at the very bottom

---
 nerval/__init__.py            |  2 ++
 nerval/evaluate.py            | 13 +++++++------
 nerval/parse.py               |  8 +++++---
 nerval/utils.py               | 27 ++++++++++++++++++++-------
 tests/test_compute_matches.py | 12 ++++++------
 tests/test_compute_scores.py  | 10 +++++-----
 tests/test_parse_bio.py       |  8 ++++----
 tests/test_run.py             |  6 +++---
 8 files changed, 52 insertions(+), 34 deletions(-)

diff --git a/nerval/__init__.py b/nerval/__init__.py
index 22615ee..c198911 100644
--- a/nerval/__init__.py
+++ b/nerval/__init__.py
@@ -5,3 +5,5 @@ logging.basicConfig(
     format="%(asctime)s %(levelname)s/%(name)s: %(message)s",
 )
 logger = logging.getLogger(__name__)
+
+ALL_ENTITIES = "ALL"
diff --git a/nerval/evaluate.py b/nerval/evaluate.py
index ce27e99..1c94728 100644
--- a/nerval/evaluate.py
+++ b/nerval/evaluate.py
@@ -6,6 +6,7 @@ from typing import List
 import editdistance
 import edlib
 
+from nerval import ALL_ENTITIES
 from nerval.parse import (
     BEGINNING_POS,
     NOT_ENTITY_TAG,
@@ -56,14 +57,14 @@ def compute_matches(
     labels_annot : list of strings,   example : ['B-P','I-P','I-P','I-P','I-P','I-P','I-P','I-P','I-P','O', ...]
     labels_predict : list of string , example : ['B-P','I-P','I-P','I-P','I-P','I-P','I-P','I-P','I-P','O', ...]
 
-    Output : {TAG1 : nb_entity_matched, ...}, example : {'All': 1, 'OCC': 0, 'PER': 1}
+    Output : {TAG1 : nb_entity_matched, ...}, example : {'ALL': 1, 'OCC': 0, 'PER': 1}
     """
     assert annotation, "Annotation is empty"
     assert prediction, "Prediction is empty"
     assert labels_annot, "Annotation labels are empty"
     assert labels_predict, "Prediction labels are empty"
 
-    entity_count = {"All": 0}
+    entity_count = {ALL_ENTITIES: 0}
     last_tag = NOT_ENTITY_TAG
 
     # Track indexes of characters found for continuation of nested entities
@@ -172,7 +173,7 @@ def compute_matches(
                     else 0
                 )
                 entity_count[last_tag] = entity_count.get(last_tag, 0) + score
-                entity_count["All"] += score
+                entity_count[ALL_ENTITIES] += score
                 current_ref = []
                 current_compar = []
 
@@ -382,9 +383,9 @@ def run_multiple(file_csv: Path, folder: Path, threshold: int, verbose: bool):
 
         count += 1
         scores = run(annot, predict, threshold, verbose)
-        precision += scores["All"]["P"]
-        recall += scores["All"]["R"]
-        f1 += scores["All"]["F1"]
+        precision += scores[ALL_ENTITIES]["P"]
+        recall += scores[ALL_ENTITIES]["R"]
+        f1 += scores[ALL_ENTITIES]["F1"]
 
     if not count:
         raise Exception("No file were counted")
diff --git a/nerval/parse.py b/nerval/parse.py
index 04173bf..31cc3c3 100644
--- a/nerval/parse.py
+++ b/nerval/parse.py
@@ -1,6 +1,8 @@
 import re
 from typing import List
 
+from nerval import ALL_ENTITIES
+
 NOT_ENTITY_TAG = "O"
 BEGINNING_POS = ["B", "S", "U"]
 
@@ -57,7 +59,7 @@ def parse_bio(lines: List[str]) -> dict:
     """
     words = []
     labels = []
-    entity_count = {"All": 0}
+    entity_count = {ALL_ENTITIES: 0}
     last_tag = None
 
     if "§" in " ".join(lines):
@@ -140,7 +142,7 @@ def parse_bio(lines: List[str]) -> dict:
         # Count nb entity for each type
         if get_position_label(label) in BEGINNING_POS:
             entity_count[tag] = entity_count.get(tag, 0) + 1
-            entity_count["All"] += 1
+            entity_count[ALL_ENTITIES] += 1
 
         last_tag = tag
 
@@ -156,7 +158,7 @@ def parse_bio(lines: List[str]) -> dict:
             result["labels"]
         ), f'Found {len(result["words"])} word(s) for {len(result["labels"])} label(s)'
         for tag in result["entity_count"]:
-            if tag != "All":
+            if tag != ALL_ENTITIES:
                 assert (
                     result["labels"].count(f"B-{tag}") == result["entity_count"][tag]
                 ), f'Found {result["entity_count"][tag]} entities for {result["labels"].count(f"B-{tag}")} label(s) for entity {tag}'
diff --git a/nerval/utils.py b/nerval/utils.py
index da6a26e..fbfa5d1 100644
--- a/nerval/utils.py
+++ b/nerval/utils.py
@@ -1,5 +1,7 @@
 from prettytable import MARKDOWN, PrettyTable
 
+from nerval import ALL_ENTITIES
+
 
 def print_markdown_table(header: list[str], rows: list[list]) -> None:
     """Prints a Markdown table filled with the provided header and rows."""
@@ -10,6 +12,17 @@ def print_markdown_table(header: list[str], rows: list[list]) -> None:
     table.align = "r"
     # First column should be left aligned still
     table.align[header[0]] = "l"
+
+    def _special_sort(row: list[str]) -> str:
+        if row[0] == ALL_ENTITIES:
+            # Empty key sorts the ALL row first; it is moved to the bottom below
+            return ""
+        return row[0]
+
+    rows.sort(key=_special_sort)
+    # Place ALL_ENTITIES row at the end
+    rows.append(rows.pop(0))
+
     table.add_rows(rows)
     print(table)
 
@@ -45,13 +58,13 @@ def print_results(scores: dict) -> None:
 
 def print_result_compact(scores: dict) -> None:
     result = [
-        "All",
-        scores["All"]["predicted"],
-        scores["All"]["matched"],
-        round(scores["All"]["P"], 3),
-        round(scores["All"]["R"], 3),
-        round(scores["All"]["F1"], 3),
-        scores["All"]["Support"],
+        ALL_ENTITIES,
+        scores[ALL_ENTITIES]["predicted"],
+        scores[ALL_ENTITIES]["matched"],
+        round(scores[ALL_ENTITIES]["P"], 3),
+        round(scores[ALL_ENTITIES]["R"], 3),
+        round(scores[ALL_ENTITIES]["F1"], 3),
+        scores[ALL_ENTITIES]["Support"],
     ]
     print_markdown_table(
         ["tag", "predicted", "matched", "Precision", "Recall", "F1", "Support"],
diff --git a/tests/test_compute_matches.py b/tests/test_compute_matches.py
index 6e72e40..c230f13 100644
--- a/tests/test_compute_matches.py
+++ b/tests/test_compute_matches.py
@@ -1,6 +1,6 @@
 import pytest
 
-from nerval import evaluate
+from nerval import ALL_ENTITIES, evaluate
 
 THRESHOLD = 0.30
 
@@ -370,7 +370,7 @@ fake_predict_tags_bk_boundary_2 = [
                 fake_predict_tags_aligned,
                 THRESHOLD,
             ),
-            {"All": 1, "PER": 1, "LOC": 0, "DAT": 0},
+            {ALL_ENTITIES: 1, "PER": 1, "LOC": 0, "DAT": 0},
         ),
         (
             (
@@ -380,7 +380,7 @@ fake_predict_tags_bk_boundary_2 = [
                 fake_tags_aligned_nested_perfect,
                 THRESHOLD,
             ),
-            {"All": 3, "PER": 1, "LOC": 2},
+            {ALL_ENTITIES: 3, "PER": 1, "LOC": 2},
         ),
         (
             (
@@ -390,7 +390,7 @@ fake_predict_tags_bk_boundary_2 = [
                 fake_tags_aligned_nested_false,
                 THRESHOLD,
             ),
-            {"All": 2, "PER": 1, "LOC": 1},
+            {ALL_ENTITIES: 2, "PER": 1, "LOC": 1},
         ),
         (
             (
@@ -400,7 +400,7 @@ fake_predict_tags_bk_boundary_2 = [
                 fake_predict_tags_bk_boundary,
                 THRESHOLD,
             ),
-            {"All": 0, "PER": 0},
+            {ALL_ENTITIES: 0, "PER": 0},
         ),
         (
             (
@@ -410,7 +410,7 @@ fake_predict_tags_bk_boundary_2 = [
                 fake_predict_tags_bk_boundary_2,
                 THRESHOLD,
             ),
-            {"All": 1, "PER": 1},
+            {ALL_ENTITIES: 1, "PER": 1},
         ),
     ],
 )
diff --git a/tests/test_compute_scores.py b/tests/test_compute_scores.py
index 239754f..1d3d320 100644
--- a/tests/test_compute_scores.py
+++ b/tests/test_compute_scores.py
@@ -1,15 +1,15 @@
 import pytest
 
-from nerval import evaluate
+from nerval import ALL_ENTITIES, evaluate
 
 
 @pytest.mark.parametrize(
     ("annot", "predict", "matches"),
     [
         (
-            {"All": 3, "DAT": 1, "LOC": 1, "PER": 1},
-            {"All": 3, "DAT": 1, "***": 1, "PER": 1},
-            {"All": 1, "PER": 1, "LOC": 0, "DAT": 0},
+            {ALL_ENTITIES: 3, "DAT": 1, "LOC": 1, "PER": 1},
+            {ALL_ENTITIES: 3, "DAT": 1, "***": 1, "PER": 1},
+            {ALL_ENTITIES: 1, "PER": 1, "LOC": 0, "DAT": 0},
         ),
     ],
 )
@@ -31,7 +31,7 @@ def test_compute_scores(annot, predict, matches):
             "matched": 0,
             "Support": 1,
         },
-        "All": {
+        ALL_ENTITIES: {
             "P": 0.3333333333333333,
             "R": 0.3333333333333333,
             "F1": 0.3333333333333333,
diff --git a/tests/test_parse_bio.py b/tests/test_parse_bio.py
index e50f6af..3266cea 100644
--- a/tests/test_parse_bio.py
+++ b/tests/test_parse_bio.py
@@ -2,11 +2,11 @@ import re
 
 import pytest
 
-from nerval import evaluate
+from nerval import ALL_ENTITIES, evaluate
 from nerval.parse import get_type_label, parse_line
 
 expected_parsed_annot = {
-    "entity_count": {"All": 3, "DAT": 1, "LOC": 1, "PER": 1},
+    "entity_count": {ALL_ENTITIES: 3, "DAT": 1, "LOC": 1, "PER": 1},
     "labels": [
         "B-PER",
         "I-PER",
@@ -57,7 +57,7 @@ expected_parsed_annot = {
 }
 
 expected_parsed_predict = {
-    "entity_count": {"All": 3, "DAT": 1, "***": 1, "PER": 1},
+    "entity_count": {ALL_ENTITIES: 3, "DAT": 1, "***": 1, "PER": 1},
     "labels": [
         "B-PER",
         "I-PER",
@@ -108,7 +108,7 @@ expected_parsed_predict = {
 }
 
 expected_parsed_end_of_file = {
-    "entity_count": {"All": 3, "LOC": 2, "PER": 1},
+    "entity_count": {ALL_ENTITIES: 3, "LOC": 2, "PER": 1},
     "labels": [
         "B-PER",
         "I-PER",
diff --git a/tests/test_run.py b/tests/test_run.py
index 44f4e26..2e0c08f 100644
--- a/tests/test_run.py
+++ b/tests/test_run.py
@@ -3,7 +3,7 @@ from pathlib import Path
 
 import pytest
 
-from nerval import evaluate
+from nerval import ALL_ENTITIES, evaluate
 
 
 @pytest.mark.parametrize(
@@ -29,7 +29,7 @@ from nerval import evaluate
                     "matched": 0,
                     "Support": 1,
                 },
-                "All": {
+                ALL_ENTITIES: {
                     "P": 0.3333333333333333,
                     "R": 0.3333333333333333,
                     "F1": 0.3333333333333333,
@@ -59,7 +59,7 @@ from nerval import evaluate
             pytest.lazy_fixture("nested_bio"),
             pytest.lazy_fixture("nested_bio"),
             {
-                "All": {
+                ALL_ENTITIES: {
                     "P": 1.0,
                     "R": 1.0,
                     "F1": 1.0,
-- 
GitLab