From 1ff1ca528e6b16c6fcb6b37a35463942c203dcec Mon Sep 17 00:00:00 2001 From: Yoann Schneider <yschneider@teklia.com> Date: Thu, 11 Apr 2024 15:34:12 +0000 Subject: [PATCH] Sort rows alphabetically, place ALL at the very bottom --- nerval/__init__.py | 2 ++ nerval/evaluate.py | 13 +++++++------ nerval/parse.py | 8 +++++--- nerval/utils.py | 27 ++++++++++++++++++++------- tests/test_compute_matches.py | 12 ++++++------ tests/test_compute_scores.py | 10 +++++----- tests/test_parse_bio.py | 8 ++++---- tests/test_run.py | 6 +++--- 8 files changed, 52 insertions(+), 34 deletions(-) diff --git a/nerval/__init__.py b/nerval/__init__.py index 22615ee..c198911 100644 --- a/nerval/__init__.py +++ b/nerval/__init__.py @@ -5,3 +5,5 @@ logging.basicConfig( format="%(asctime)s %(levelname)s/%(name)s: %(message)s", ) logger = logging.getLogger(__name__) + +ALL_ENTITIES = "ALL" diff --git a/nerval/evaluate.py b/nerval/evaluate.py index ce27e99..1c94728 100644 --- a/nerval/evaluate.py +++ b/nerval/evaluate.py @@ -6,6 +6,7 @@ from typing import List import editdistance import edlib +from nerval import ALL_ENTITIES from nerval.parse import ( BEGINNING_POS, NOT_ENTITY_TAG, @@ -56,14 +57,14 @@ def compute_matches( labels_annot : list of strings, example : ['B-P','I-P','I-P','I-P','I-P','I-P','I-P','I-P','I-P','O', ...] labels_predict : list of string , example : ['B-P','I-P','I-P','I-P','I-P','I-P','I-P','I-P','I-P','O', ...] - Output : {TAG1 : nb_entity_matched, ...}, example : {'All': 1, 'OCC': 0, 'PER': 1} + Output : {TAG1 : nb_entity_matched, ...}, example : {'ALL': 1, 'OCC': 0, 'PER': 1} """ assert annotation, "Annotation is empty" assert prediction, "Prediction is empty" assert labels_annot, "Annotation labels are empty" assert labels_predict, "Prediction labels are empty" - entity_count = {"All": 0} + entity_count = {ALL_ENTITIES: 0} last_tag = NOT_ENTITY_TAG # Track indexes of characters found for continuation of nested entities @@ -172,7 +173,7 @@ def compute_matches( else 0 ) entity_count[last_tag] = entity_count.get(last_tag, 0) + score - entity_count["All"] += score + entity_count[ALL_ENTITIES] += score current_ref = [] current_compar = [] @@ -382,9 +383,9 @@ def run_multiple(file_csv: Path, folder: Path, threshold: int, verbose: bool): count += 1 scores = run(annot, predict, threshold, verbose) - precision += scores["All"]["P"] - recall += scores["All"]["R"] - f1 += scores["All"]["F1"] + precision += scores[ALL_ENTITIES]["P"] + recall += scores[ALL_ENTITIES]["R"] + f1 += scores[ALL_ENTITIES]["F1"] if not count: raise Exception("No file were counted") diff --git a/nerval/parse.py b/nerval/parse.py index 04173bf..31cc3c3 100644 --- a/nerval/parse.py +++ b/nerval/parse.py @@ -1,6 +1,8 @@ import re from typing import List +from nerval import ALL_ENTITIES + NOT_ENTITY_TAG = "O" BEGINNING_POS = ["B", "S", "U"] @@ -57,7 +59,7 @@ def parse_bio(lines: List[str]) -> dict: """ words = [] labels = [] - entity_count = {"All": 0} + entity_count = {ALL_ENTITIES: 0} last_tag = None if "§" in " ".join(lines): @@ -140,7 +142,7 @@ def parse_bio(lines: List[str]) -> dict: # Count nb entity for each type if get_position_label(label) in BEGINNING_POS: entity_count[tag] = entity_count.get(tag, 0) + 1 - entity_count["All"] += 1 + entity_count[ALL_ENTITIES] += 1 last_tag = tag @@ -156,7 +158,7 @@ def parse_bio(lines: List[str]) -> dict: result["labels"] ), f'Found {len(result["words"])} word(s) for {len(result["labels"])} label(s)' for tag in result["entity_count"]: - if tag != "All": + if tag != ALL_ENTITIES: assert ( result["labels"].count(f"B-{tag}") == result["entity_count"][tag] ), f'Found {result["entity_count"][tag]} entities for {result["labels"].count(f"B-{tag}")} label(s) for entity {tag}' diff --git a/nerval/utils.py b/nerval/utils.py index da6a26e..fbfa5d1 100644 --- a/nerval/utils.py +++ b/nerval/utils.py @@ -1,5 +1,7 @@ from prettytable import MARKDOWN, PrettyTable +from nerval import ALL_ENTITIES + def print_markdown_table(header: list[str], rows: list[list]) -> None: """Prints a Markdown table filled with the provided header and rows.""" @@ -10,6 +12,17 @@ def print_markdown_table(header: list[str], rows: list[list]) -> None: table.align = "r" # First column should be left aligned still table.align[header[0]] = "l" + + def _special_sort(row: list[str]) -> str: + if row[0] == ALL_ENTITIES: + # Place the line for all entities at the very top + return "" + return row[0] + + rows.sort(key=_special_sort) + # Place ALL_ENTITIES row at the end + rows.append(rows.pop(0)) + table.add_rows(rows) print(table) @@ -45,13 +58,13 @@ def print_results(scores: dict) -> None: def print_result_compact(scores: dict) -> None: result = [ - "All", - scores["All"]["predicted"], - scores["All"]["matched"], - round(scores["All"]["P"], 3), - round(scores["All"]["R"], 3), - round(scores["All"]["F1"], 3), - scores["All"]["Support"], + ALL_ENTITIES, + scores[ALL_ENTITIES]["predicted"], + scores[ALL_ENTITIES]["matched"], + round(scores[ALL_ENTITIES]["P"], 3), + round(scores[ALL_ENTITIES]["R"], 3), + round(scores[ALL_ENTITIES]["F1"], 3), + scores[ALL_ENTITIES]["Support"], ] print_markdown_table( ["tag", "predicted", "matched", "Precision", "Recall", "F1", "Support"], diff --git a/tests/test_compute_matches.py b/tests/test_compute_matches.py index 6e72e40..c230f13 100644 --- a/tests/test_compute_matches.py +++ b/tests/test_compute_matches.py @@ -1,6 +1,6 @@ import pytest -from nerval import evaluate +from nerval import ALL_ENTITIES, evaluate THRESHOLD = 0.30 @@ -370,7 +370,7 @@ fake_predict_tags_bk_boundary_2 = [ fake_predict_tags_aligned, THRESHOLD, ), - {"All": 1, "PER": 1, "LOC": 0, "DAT": 0}, + {ALL_ENTITIES: 1, "PER": 1, "LOC": 0, "DAT": 0}, ), ( ( @@ -380,7 +380,7 @@ fake_predict_tags_bk_boundary_2 = [ fake_tags_aligned_nested_perfect, THRESHOLD, ), - {"All": 3, "PER": 1, "LOC": 2}, + {ALL_ENTITIES: 3, "PER": 1, "LOC": 2}, ), ( ( @@ -390,7 +390,7 @@ fake_predict_tags_bk_boundary_2 = [ fake_tags_aligned_nested_false, THRESHOLD, ), - {"All": 2, "PER": 1, "LOC": 1}, + {ALL_ENTITIES: 2, "PER": 1, "LOC": 1}, ), ( ( @@ -400,7 +400,7 @@ fake_predict_tags_bk_boundary_2 = [ fake_predict_tags_bk_boundary, THRESHOLD, ), - {"All": 0, "PER": 0}, + {ALL_ENTITIES: 0, "PER": 0}, ), ( ( @@ -410,7 +410,7 @@ fake_predict_tags_bk_boundary_2 = [ fake_predict_tags_bk_boundary_2, THRESHOLD, ), - {"All": 1, "PER": 1}, + {ALL_ENTITIES: 1, "PER": 1}, ), ], ) diff --git a/tests/test_compute_scores.py b/tests/test_compute_scores.py index 239754f..1d3d320 100644 --- a/tests/test_compute_scores.py +++ b/tests/test_compute_scores.py @@ -1,15 +1,15 @@ import pytest -from nerval import evaluate +from nerval import ALL_ENTITIES, evaluate @pytest.mark.parametrize( ("annot", "predict", "matches"), [ ( - {"All": 3, "DAT": 1, "LOC": 1, "PER": 1}, - {"All": 3, "DAT": 1, "***": 1, "PER": 1}, - {"All": 1, "PER": 1, "LOC": 0, "DAT": 0}, + {ALL_ENTITIES: 3, "DAT": 1, "LOC": 1, "PER": 1}, + {ALL_ENTITIES: 3, "DAT": 1, "***": 1, "PER": 1}, + {ALL_ENTITIES: 1, "PER": 1, "LOC": 0, "DAT": 0}, ), ], ) @@ -31,7 +31,7 @@ def test_compute_scores(annot, predict, matches): "matched": 0, "Support": 1, }, - "All": { + ALL_ENTITIES: { "P": 0.3333333333333333, "R": 0.3333333333333333, "F1": 0.3333333333333333, diff --git a/tests/test_parse_bio.py b/tests/test_parse_bio.py index e50f6af..3266cea 100644 --- a/tests/test_parse_bio.py +++ b/tests/test_parse_bio.py @@ -2,11 +2,11 @@ import re import pytest -from nerval import evaluate +from nerval import ALL_ENTITIES, evaluate from nerval.parse import get_type_label, parse_line expected_parsed_annot = { - "entity_count": {"All": 3, "DAT": 1, "LOC": 1, "PER": 1}, + "entity_count": {ALL_ENTITIES: 3, "DAT": 1, "LOC": 1, "PER": 1}, "labels": [ "B-PER", "I-PER", @@ -57,7 +57,7 @@ expected_parsed_annot = { } expected_parsed_predict = { - "entity_count": {"All": 3, "DAT": 1, "***": 1, "PER": 1}, + "entity_count": {ALL_ENTITIES: 3, "DAT": 1, "***": 1, "PER": 1}, "labels": [ "B-PER", "I-PER", @@ -108,7 +108,7 @@ expected_parsed_predict = { } expected_parsed_end_of_file = { - "entity_count": {"All": 3, "LOC": 2, "PER": 1}, + "entity_count": {ALL_ENTITIES: 3, "LOC": 2, "PER": 1}, "labels": [ "B-PER", "I-PER", diff --git a/tests/test_run.py b/tests/test_run.py index 44f4e26..2e0c08f 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -3,7 +3,7 @@ from pathlib import Path import pytest -from nerval import evaluate +from nerval import ALL_ENTITIES, evaluate @pytest.mark.parametrize( @@ -29,7 +29,7 @@ from nerval import evaluate "matched": 0, "Support": 1, }, - "All": { + ALL_ENTITIES: { "P": 0.3333333333333333, "R": 0.3333333333333333, "F1": 0.3333333333333333, @@ -59,7 +59,7 @@ from nerval import evaluate pytest.lazy_fixture("nested_bio"), pytest.lazy_fixture("nested_bio"), { - "All": { + ALL_ENTITIES: { "P": 1.0, "R": 1.0, "F1": 1.0, -- GitLab