Merge branch 'fix_indexerror_in_parsing' into 'master'

Fix IndexError in parsing. See merge request teklia/nerval!9

Merge branch 'fix_indexerror_in_parsing' into 'master'
e2b6bf95 · kermorvant · fd3db043 · bb3fcc72 · e2b6bf95 · e2b6bf95
Commit e2b6bf95 authored 3 years ago by kermorvant
--- a/nerval/evaluate.py
+++ b/nerval/evaluate.py
@@ -18,16 +18,16 @@ BEGINNING_POS = ["B", "S", "U"]
 def get_type_label(label: str) -> str:
    """Return the type (tag) of a label

-    Input format: "[BIELUS]-type"
+    Input format: "[BIESLU]-type"
    """
    try:
        tag = (
            NOT_ENTITY_TAG
            if label == NOT_ENTITY_TAG
-            else re.match(r"[BIELUS]-(.{3,4})", label)[1]
+            else re.match(r"[BIESLU]-(.{3,4})", label)[1]
        )
    except TypeError:
-        raise (Exception(f"The label {label} is not valid in BIOES/BILOU format."))
+        raise (Exception(f"The label {label} is not valid in BIOES/BIOLU format."))

    return tag

@@ -35,16 +35,16 @@ def get_type_label(label: str) -> str:
 def get_position_label(label: str) -> str:
    """Return the position of a label

-    Input format: "[BIELUS]-type"
+    Input format: "[BIESLU]-type"
    """
    try:
        pos = (
            NOT_ENTITY_TAG
            if label == NOT_ENTITY_TAG
-            else re.match(r"([BIELUS])-.{3,4}", label)[1]
+            else re.match(r"([BIESLU])-.{3,4}", label)[1]
        )
    except TypeError:
-        raise (Exception(f"The label {label} is not valid in BIOES/BILOU format."))
+        raise (Exception(f"The label {label} is not valid in BIOES/BIOLU format."))

    return pos

@@ -116,7 +116,8 @@ def parse_bio(path: str) -> dict:
                    and get_type_label(future_label) != last_tag
                ):
                    index += 1
-                    future_label = lines[index].split()[1]
+                    if index < len(lines):
+                        future_label = lines[index].split()[1]

                # Check for continuation of the original entity
                if (

--- a/tests/bioues.bio
+++ b/tests/bioeslu.bio
--- a/tests/end_of_file.bio
+++ b/tests/end_of_file.bio
+Louis B-PER
+par I-PER
+la I-PER
+grâce I-PER
+de I-PER
+Dieu I-PER
+roy I-PER
+de I-PER
+France B-LOC
+et I-PER
+de I-PER
+Navarre B-LOC
--- a/tests/test_compute_matches.py
+++ b/tests/test_compute_matches.py
@@ -33,7 +33,7 @@ fake_tags_aligned_nested_perfect = [
    'I-PER', 'I-PER',
    'I-PER',
    'I-PER', 'I-PER',
-    'I-PER',
+    'O',
    'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC',
    'O',
    'O'
@@ -62,7 +62,7 @@ fake_tags_aligned_nested_false = [
    'I-PER', 'I-PER',
    'I-PER',
    'I-PER', 'I-PER',
-    'I-PER',
+    'O',
    'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC',
    'O',
    'O'

--- a/tests/test_parse_bio.py
+++ b/tests/test_parse_bio.py
@@ -8,7 +8,8 @@ EMPTY_BIO = "tests/test_empty.bio"
 BAD_BIO = "tests/test_bad.bio"
 FAKE_ANNOT_BIO = "tests/test_annot.bio"
 FAKE_PREDICT_BIO = "tests/test_predict.bio"
-BIOUES_BIO = "tests/bioues.bio"
+BIOESLU_BIO = "tests/bioeslu.bio"
+END_OF_FILE_BIO = "tests/end_of_file.bio"


 expected_parsed_annot = {
@@ -113,6 +114,67 @@ expected_parsed_predict = {
    "words": "G*rard de *N*erval bo*rn in Paris in 1833 *.",
 }

+expected_parsed_end_of_file = {
+    "entity_count": {"All": 3, "LOC": 2, "PER": 1},
+    "labels": [
+        "B-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "B-LOC",
+        "I-LOC",
+        "I-LOC",
+        "I-LOC",
+        "I-LOC",
+        "I-LOC",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "O",
+        "B-LOC",
+        "I-LOC",
+        "I-LOC",
+        "I-LOC",
+        "I-LOC",
+        "I-LOC",
+        "I-LOC",
+    ],
+    "words": "Louis par la grâce de Dieu roy de France et de Navarre",
+}
+

 @pytest.mark.parametrize(
    "test_input, expected",
@@ -120,7 +182,8 @@ expected_parsed_predict = {
        (FAKE_ANNOT_BIO, expected_parsed_annot),
        (FAKE_PREDICT_BIO, expected_parsed_predict),
        (EMPTY_BIO, None),
-        (BIOUES_BIO, expected_parsed_annot),
+        (BIOESLU_BIO, expected_parsed_annot),
+        (END_OF_FILE_BIO, expected_parsed_end_of_file),
    ],
 )
 def test_parse_bio(test_input, expected):