From d77e9801adfe549ef6057d8f0c2240377934c51c Mon Sep 17 00:00:00 2001
From: Blanche Miret <bmiret@teklia.com>
Date: Wed, 2 Jun 2021 07:33:54 +0000
Subject: [PATCH] Handle all beginning labels

---
 nerval/evaluate.py      |  47 ++++++++++----
 tests/bioues.bio        |  10 +++
 tests/test_parse_bio.py | 141 +++++++++++++++++++++++++++-------------
 3 files changed, 143 insertions(+), 55 deletions(-)
 create mode 100644 tests/bioues.bio

diff --git a/nerval/evaluate.py b/nerval/evaluate.py
index 8b23527..3621865 100644
--- a/nerval/evaluate.py
+++ b/nerval/evaluate.py
@@ -12,6 +12,7 @@ import termtables as tt
 NOT_ENTITY_TAG = "O"
 
 THRESHOLD = 0.30
+BEGINNING_POS = ["B", "S", "U"]
 
 
 def get_type_label(label: str) -> str:
@@ -31,6 +32,23 @@ def get_type_label(label: str) -> str:
     return tag
 
 
+def get_position_label(label: str) -> str:
+    """Return the position prefix of a label ("O" for the non-entity tag).
+
+    Input format: "[BIELUS]-type"; any other label raises Exception.
+    """
+    if label == NOT_ENTITY_TAG:
+        return NOT_ENTITY_TAG
+
+    # re.match returns None when the label does not follow the expected format
+    match = re.match(r"([BIELUS])-.{3,4}", label)
+    if match is None:
+        raise Exception(f"The label {label} is not valid in BIOES/BILOU format.")
+
+    # Group 1 is the single position letter captured before the dash
+    return match[1]
+
+
 def parse_bio(path: str) -> dict:
     """Parse a BIO file to get text content, character-level NE labels and entity types count.
 
@@ -75,13 +93,17 @@ def parse_bio(path: str) -> dict:
         if index != 0:
 
             # If new word has same tag as previous, not new entity and in entity, continue entity
-            if last_tag == tag and "B" not in label and tag != NOT_ENTITY_TAG:
+            if (
+                last_tag == tag
+                and get_position_label(label) not in BEGINNING_POS
+                and tag != NOT_ENTITY_TAG
+            ):
                 labels.append(f"I-{last_tag}")
 
             # If new word begins a new entity of different type, check for nested entity to correctly tag the space
             elif (
                 last_tag != tag
-                and "B" in label
+                and get_position_label(label) in BEGINNING_POS
                 and tag != NOT_ENTITY_TAG
                 and last_tag != NOT_ENTITY_TAG
             ):
@@ -99,7 +121,7 @@ def parse_bio(path: str) -> dict:
                 # Check for continuation of the original entity
                 if (
                     index < len(lines)
-                    and "B" not in future_label
+                    and get_position_label(future_label) not in BEGINNING_POS
                     and get_type_label(future_label) == last_tag
                 ):
                     labels.append(f"I-{last_tag}")
@@ -117,13 +139,13 @@ def parse_bio(path: str) -> dict:
                 in_nested_entity = False
 
         # Add a tag for each letter in the word
-        if "B" in label:
+        if get_position_label(label) in BEGINNING_POS:
             labels += [f"B-{tag}"] + [f"I-{tag}"] * (len(word) - 1)
         else:
             labels += [label] * len(word)
 
         # Count nb entity for each type
-        if "B" in label:
+        if get_position_label(label) in BEGINNING_POS:
             entity_count[tag] = entity_count.get(tag, 0) + 1
             entity_count["All"] += 1
 
@@ -171,7 +193,7 @@ def look_for_further_entity_part(index, tag, characters, labels):
             index += 1
         while (
             index < len(characters)
-            and "B" not in labels[index]
+            and get_position_label(labels[index]) not in BEGINNING_POS
             and get_type_label(labels[index]) == tag
         ):
             visited.append(index)
@@ -254,7 +276,7 @@ def compute_matches(
         else:
 
             # If beginning new entity
-            if "B" in label_ref:
+            if get_position_label(label_ref) in BEGINNING_POS:
                 current_ref, current_compar = [], []
                 last_tag = tag_ref
                 found_aligned_beginning = False
@@ -269,18 +291,21 @@ def compute_matches(
                     continue
 
                 # If just beginning new entity, backtrack tags on prediction string
-                if len(current_ref) == 1 and "B" not in labels_predict[i]:
+                if (
+                    len(current_ref) == 1
+                    and get_position_label(labels_predict[i]) not in BEGINNING_POS
+                ):
                     j = i - 1
                     while (
                         j >= 0
                         and get_type_label(labels_predict[j]) == tag_ref
-                        and "B" not in labels_predict[j]
+                        and get_position_label(labels_predict[j]) not in BEGINNING_POS
                         and j not in visited_predict
                     ):
                         j -= 1
 
                     if (
-                        "B" in labels_predict[j]
+                        get_position_label(labels_predict[j]) in BEGINNING_POS
                         and get_type_label(labels_predict[j]) == tag_ref
                         and j not in visited_predict
                     ):
@@ -372,7 +397,7 @@ def get_labels_aligned(original: str, aligned: str, labels_original: list) -> li
         elif not char == original[index_original]:
             new_label = (
                 last_label
-                if "B" not in last_label
+                if get_position_label(last_label) not in BEGINNING_POS
                 else f"I-{get_type_label(last_label)}"
             )
 
diff --git a/tests/bioues.bio b/tests/bioues.bio
new file mode 100644
index 0000000..6faa87d
--- /dev/null
+++ b/tests/bioues.bio
@@ -0,0 +1,10 @@
+Gérard B-PER
+de I-PER
+Nerval I-PER
+was O
+born O
+in O
+Paris U-LOC
+in O
+1808 S-DAT
+. O
diff --git a/tests/test_parse_bio.py b/tests/test_parse_bio.py
index 2572dc8..7084c5e 100644
--- a/tests/test_parse_bio.py
+++ b/tests/test_parse_bio.py
@@ -8,58 +8,110 @@ EMPTY_BIO = "tests/test_empty.bio"
 BAD_BIO = "tests/test_bad.bio"
 FAKE_ANNOT_BIO = "tests/test_annot.bio"
 FAKE_PREDICT_BIO = "tests/test_predict.bio"
+BIOUES_BIO = "tests/bioues.bio"
+
 
-# fmt: off
 expected_parsed_annot = {
-    'entity_count': {'All': 3, 'DAT': 1, 'LOC': 1, 'PER': 1},
-    'labels': [
-        'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER',
-        'I-PER',
-        'I-PER', 'I-PER',
-        'I-PER',
-        'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER',
-        'O',
-        'O', 'O', 'O',
-        'O',
-        'O', 'O', 'O', 'O',
-        'O',
-        'O', 'O',
-        'O',
-        'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC',
-        'O',
-        'O', 'O',
-        'O',
-        'B-DAT', 'I-DAT', 'I-DAT', 'I-DAT',
-        'O',
-        'O'
+    "entity_count": {"All": 3, "DAT": 1, "LOC": 1, "PER": 1},
+    "labels": [
+        "B-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "B-LOC",
+        "I-LOC",
+        "I-LOC",
+        "I-LOC",
+        "I-LOC",
+        "O",
+        "O",
+        "O",
+        "O",
+        "B-DAT",
+        "I-DAT",
+        "I-DAT",
+        "I-DAT",
+        "O",
+        "O",
     ],
-    'words': 'Gérard de Nerval was born in Paris in 1808 .'
+    "words": "Gérard de Nerval was born in Paris in 1808 .",
 }
 
 expected_parsed_predict = {
-    'entity_count': {'All': 3, 'DAT': 1, '***': 1, 'PER': 1},
-    'labels': [
-        'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER',
-        'I-PER',
-        'I-PER', 'I-PER',
-        'I-PER',
-        'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER',
-        'O',
-        'O', 'O', 'O', 'O', 'O',
-        'O',
-        'O', 'O',
-        'O',
-        'B-***', 'I-***', 'I-***', 'I-***', 'I-***',
-        'O',
-        'O', 'O',
-        'O',
-        'B-DAT', 'I-DAT', 'I-DAT', 'I-DAT',
-        'O',
-        'O', 'O'
+    "entity_count": {"All": 3, "DAT": 1, "***": 1, "PER": 1},
+    "labels": [
+        "B-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "B-***",
+        "I-***",
+        "I-***",
+        "I-***",
+        "I-***",
+        "O",
+        "O",
+        "O",
+        "O",
+        "B-DAT",
+        "I-DAT",
+        "I-DAT",
+        "I-DAT",
+        "O",
+        "O",
+        "O",
     ],
-    'words': 'G*rard de *N*erval bo*rn in Paris in 1833 *.'
+    "words": "G*rard de *N*erval bo*rn in Paris in 1833 *.",
 }
-# fmt: on
 
 
 @pytest.mark.parametrize(
@@ -68,6 +120,7 @@ expected_parsed_predict = {
         (FAKE_ANNOT_BIO, expected_parsed_annot),
         (FAKE_PREDICT_BIO, expected_parsed_predict),
         (EMPTY_BIO, None),
+        (BIOUES_BIO, expected_parsed_annot),
     ],
 )
 def test_parse_bio(test_input, expected):
-- 
GitLab