From bb3fcc721cb17b6c7e8de46b8cd11a2db68b48fe Mon Sep 17 00:00:00 2001
From: Blanche Miret <bmiret@teklia.com>
Date: Wed, 2 Jun 2021 11:22:26 +0000
Subject: [PATCH] Fix IndexError in parsing

---
 nerval/evaluate.py                | 15 +++----
 tests/{bioues.bio => bioeslu.bio} |  0
 tests/end_of_file.bio             | 12 ++++++
 tests/test_compute_matches.py     |  4 +-
 tests/test_parse_bio.py           | 67 ++++++++++++++++++++++++++++++-
 5 files changed, 87 insertions(+), 11 deletions(-)
 rename tests/{bioues.bio => bioeslu.bio} (100%)
 create mode 100644 tests/end_of_file.bio

diff --git a/nerval/evaluate.py b/nerval/evaluate.py
index 3621865..100a6c0 100644
--- a/nerval/evaluate.py
+++ b/nerval/evaluate.py
@@ -18,16 +18,16 @@ BEGINNING_POS = ["B", "S", "U"]
 def get_type_label(label: str) -> str:
     """Return the type (tag) of a label
 
-    Input format: "[BIELUS]-type"
+    Input format: "[BIESLU]-type"
     """
     try:
         tag = (
             NOT_ENTITY_TAG
             if label == NOT_ENTITY_TAG
-            else re.match(r"[BIELUS]-(.{3,4})", label)[1]
+            else re.match(r"[BIESLU]-(.{3,4})", label)[1]
         )
     except TypeError:
-        raise (Exception(f"The label {label} is not valid in BIOES/BILOU format."))
+        raise (Exception(f"The label {label} is not valid in BIOES/BIOLU format."))
 
     return tag
 
@@ -35,16 +35,16 @@ def get_type_label(label: str) -> str:
 def get_position_label(label: str) -> str:
     """Return the position of a label
 
-    Input format: "[BIELUS]-type"
+    Input format: "[BIESLU]-type"
     """
     try:
         pos = (
             NOT_ENTITY_TAG
             if label == NOT_ENTITY_TAG
-            else re.match(r"([BIELUS])-.{3,4}", label)[1]
+            else re.match(r"([BIESLU])-.{3,4}", label)[1]
         )
     except TypeError:
-        raise (Exception(f"The label {label} is not valid in BIOES/BILOU format."))
+        raise (Exception(f"The label {label} is not valid in BIOES/BIOLU format."))
 
     return pos
 
@@ -116,7 +116,8 @@ def parse_bio(path: str) -> dict:
                     and get_type_label(future_label) != last_tag
                 ):
                     index += 1
-                    future_label = lines[index].split()[1]
+                    if index < len(lines):
+                        future_label = lines[index].split()[1]
 
                 # Check for continuation of the original entity
                 if (
diff --git a/tests/bioues.bio b/tests/bioeslu.bio
similarity index 100%
rename from tests/bioues.bio
rename to tests/bioeslu.bio
diff --git a/tests/end_of_file.bio b/tests/end_of_file.bio
new file mode 100644
index 0000000..56f8e52
--- /dev/null
+++ b/tests/end_of_file.bio
@@ -0,0 +1,12 @@
+Louis B-PER
+par I-PER
+la I-PER
+grâce I-PER
+de I-PER
+Dieu I-PER
+roy I-PER
+de I-PER
+France B-LOC
+et I-PER
+de I-PER
+Navarre B-LOC
diff --git a/tests/test_compute_matches.py b/tests/test_compute_matches.py
index d2f0a0b..f472e84 100644
--- a/tests/test_compute_matches.py
+++ b/tests/test_compute_matches.py
@@ -33,7 +33,7 @@ fake_tags_aligned_nested_perfect = [
     'I-PER', 'I-PER',
     'I-PER',
     'I-PER', 'I-PER',
-    'I-PER',
+    'O',
     'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC',
     'O',
     'O'
@@ -62,7 +62,7 @@ fake_tags_aligned_nested_false = [
     'I-PER', 'I-PER',
     'I-PER',
     'I-PER', 'I-PER',
-    'I-PER',
+    'O',
     'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC',
     'O',
     'O'
diff --git a/tests/test_parse_bio.py b/tests/test_parse_bio.py
index 7084c5e..23df00b 100644
--- a/tests/test_parse_bio.py
+++ b/tests/test_parse_bio.py
@@ -8,7 +8,8 @@ EMPTY_BIO = "tests/test_empty.bio"
 BAD_BIO = "tests/test_bad.bio"
 FAKE_ANNOT_BIO = "tests/test_annot.bio"
 FAKE_PREDICT_BIO = "tests/test_predict.bio"
-BIOUES_BIO = "tests/bioues.bio"
+BIOESLU_BIO = "tests/bioeslu.bio"
+END_OF_FILE_BIO = "tests/end_of_file.bio"
 
 
 expected_parsed_annot = {
@@ -113,6 +114,67 @@ expected_parsed_predict = {
     "words": "G*rard de *N*erval bo*rn in Paris in 1833 *.",
 }
 
+expected_parsed_end_of_file = {
+    "entity_count": {"All": 3, "LOC": 2, "PER": 1},
+    "labels": [
+        "B-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "B-LOC",
+        "I-LOC",
+        "I-LOC",
+        "I-LOC",
+        "I-LOC",
+        "I-LOC",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "O",
+        "B-LOC",
+        "I-LOC",
+        "I-LOC",
+        "I-LOC",
+        "I-LOC",
+        "I-LOC",
+        "I-LOC",
+    ],
+    "words": "Louis par la grâce de Dieu roy de France et de Navarre",
+}
+
 
 @pytest.mark.parametrize(
     "test_input, expected",
@@ -120,7 +182,8 @@ expected_parsed_predict = {
         (FAKE_ANNOT_BIO, expected_parsed_annot),
         (FAKE_PREDICT_BIO, expected_parsed_predict),
         (EMPTY_BIO, None),
-        (BIOUES_BIO, expected_parsed_annot),
+        (BIOESLU_BIO, expected_parsed_annot),
+        (END_OF_FILE_BIO, expected_parsed_end_of_file),
     ],
 )
 def test_parse_bio(test_input, expected):
-- 
GitLab