Merge branch 'correct_entities_beginning' into 'master'

Correct the search for the beginning of the predicted entity. See merge request teklia/nerval!5

Merge branch 'correct_entities_beginning' into 'master'
5af16c98 · kermorvant · a1613265 · 0a216a31 · 5af16c98 · 5af16c98
Commit 5af16c98 authored 3 years ago by kermorvant
--- a/nerval/evaluate.py
+++ b/nerval/evaluate.py
@@ -273,7 +273,17 @@ def compute_matches(
                        and j not in visited_predict
                    ):
                        j -= 1
-                    current_compar += prediction[j + 1 : i]
+
+                    if (
+                        "B" in labels_predict[j]
+                        and get_type_label(labels_predict[j]) == tag_ref
+                        and j not in visited_predict
+                    ):
+                        start = j
+                    else:
+                        start = j + 1
+
+                    current_compar += prediction[start:i]

                found_aligned_beginning = True
                current_compar.append(prediction[i])

--- a/tests/test_compute_matches.py
+++ b/tests/test_compute_matches.py
@@ -143,6 +143,78 @@ expected_matches = {"All": 1, "PER": 1, "LOC": 0, "DAT": 0}
 expected_matches_nested_perfect = {"All": 3, "PER": 1, "LOC": 2}
 expected_matches_nested_false = {"All": 2, "PER": 1, "LOC": 1}

+fake_annot_backtrack_boundary = "The red dragon"
+
+fake_annot_tags_bk_boundary = [
+    "O",
+    "O",
+    "O",
+    "O",
+    "B-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+]
+
+fake_predict_tags_bk_boundary = [
+    "B-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+]
+
+expected_matches_bk_boundary = {"All": 0, "PER": 0}
+
+fake_annot_backtrack_boundary_2 = "A red dragon"
+
+fake_annot_tags_bk_boundary_2 = [
+    "O",
+    "O",
+    "B-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+]
+
+fake_predict_tags_bk_boundary_2 = [
+    "B-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+    "I-PER",
+]
+
+expected_matches_bk_boundary_2 = {"All": 1, "PER": 1}
+

 @pytest.mark.parametrize(
    "test_input, expected",
@@ -174,6 +246,24 @@ expected_matches_nested_false = {"All": 2, "PER": 1, "LOC": 1}
            ),
            expected_matches_nested_false,
        ),
+        (
+            (
+                fake_annot_backtrack_boundary,
+                fake_annot_backtrack_boundary,
+                fake_annot_tags_bk_boundary,
+                fake_predict_tags_bk_boundary,
+            ),
+            expected_matches_bk_boundary,
+        ),
+        (
+            (
+                fake_annot_backtrack_boundary_2,
+                fake_annot_backtrack_boundary_2,
+                fake_annot_tags_bk_boundary_2,
+                fake_predict_tags_bk_boundary_2,
+            ),
+            expected_matches_bk_boundary_2,
+        ),
    ],
 )
 def test_compute_matches(test_input, expected):