Handle all beginning labels

d77e9801 · Blanche Miret · kermorvant · 3b8f584e · d77e9801 · d77e9801
Commit d77e9801, authored 3 years ago by Blanche Miret; committed by kermorvant 3 years ago.
--- a/nerval/evaluate.py
+++ b/nerval/evaluate.py
@@ -12,6 +12,7 @@ import termtables as tt
 NOT_ENTITY_TAG = "O"

 THRESHOLD = 0.30
+BEGINNING_POS = ["B", "S", "U"]


 def get_type_label(label: str) -> str:
@@ -31,6 +32,23 @@ def get_type_label(label: str) -> str:
    return tag


+def get_position_label(label: str) -> str:
+    """Return the position prefix of a label ("O" for the non-entity tag).
+
+    Input format: "O" or "[BIELUS]-type"; any other label raises Exception.
+    """
+    if label == NOT_ENTITY_TAG:
+        return NOT_ENTITY_TAG
+    try:
+        # re.match yields None on a malformed label, so [1] raises TypeError.
+        return re.match(r"([BIELUS])-.{3,4}", label)[1]
+    except TypeError as err:
+        # Chain the cause so the original failure stays in the traceback.
+        raise Exception(
+            f"The label {label} is not valid in BIOES/BILOU format."
+        ) from err
+
+
 def parse_bio(path: str) -> dict:
    """Parse a BIO file to get text content, character-level NE labels and entity types count.

@@ -75,13 +93,17 @@ def parse_bio(path: str) -> dict:
        if index != 0:

            # If new word has same tag as previous, not new entity and in entity, continue entity
-            if last_tag == tag and "B" not in label and tag != NOT_ENTITY_TAG:
+            if (
+                last_tag == tag
+                and get_position_label(label) not in BEGINNING_POS
+                and tag != NOT_ENTITY_TAG
+            ):
                labels.append(f"I-{last_tag}")

            # If new word begins a new entity of different type, check for nested entity to correctly tag the space
            elif (
                last_tag != tag
-                and "B" in label
+                and get_position_label(label) in BEGINNING_POS
                and tag != NOT_ENTITY_TAG
                and last_tag != NOT_ENTITY_TAG
            ):
@@ -99,7 +121,7 @@ def parse_bio(path: str) -> dict:
                # Check for continuation of the original entity
                if (
                    index < len(lines)
-                    and "B" not in future_label
+                    and get_position_label(future_label) not in BEGINNING_POS
                    and get_type_label(future_label) == last_tag
                ):
                    labels.append(f"I-{last_tag}")
@@ -117,13 +139,13 @@ def parse_bio(path: str) -> dict:
                in_nested_entity = False

        # Add a tag for each letter in the word
-        if "B" in label:
+        if get_position_label(label) in BEGINNING_POS:
            labels += [f"B-{tag}"] + [f"I-{tag}"] * (len(word) - 1)
        else:
            labels += [label] * len(word)

        # Count nb entity for each type
-        if "B" in label:
+        if get_position_label(label) in BEGINNING_POS:
            entity_count[tag] = entity_count.get(tag, 0) + 1
            entity_count["All"] += 1

@@ -171,7 +193,7 @@ def look_for_further_entity_part(index, tag, characters, labels):
            index += 1
        while (
            index < len(characters)
-            and "B" not in labels[index]
+            and get_position_label(labels[index]) not in BEGINNING_POS
            and get_type_label(labels[index]) == tag
        ):
            visited.append(index)
@@ -254,7 +276,7 @@ def compute_matches(
        else:

            # If beginning new entity
-            if "B" in label_ref:
+            if get_position_label(label_ref) in BEGINNING_POS:
                current_ref, current_compar = [], []
                last_tag = tag_ref
                found_aligned_beginning = False
@@ -269,18 +291,21 @@ def compute_matches(
                    continue

                # If just beginning new entity, backtrack tags on prediction string
-                if len(current_ref) == 1 and "B" not in labels_predict[i]:
+                if (
+                    len(current_ref) == 1
+                    and get_position_label(labels_predict[i]) not in BEGINNING_POS
+                ):
                    j = i - 1
                    while (
                        j >= 0
                        and get_type_label(labels_predict[j]) == tag_ref
-                        and "B" not in labels_predict[j]
+                        and get_position_label(labels_predict[j]) not in BEGINNING_POS
                        and j not in visited_predict
                    ):
                        j -= 1

                    if (
-                        "B" in labels_predict[j]
+                        get_position_label(labels_predict[j]) in BEGINNING_POS
                        and get_type_label(labels_predict[j]) == tag_ref
                        and j not in visited_predict
                    ):
@@ -372,7 +397,7 @@ def get_labels_aligned(original: str, aligned: str, labels_original: list) -> li
        elif not char == original[index_original]:
            new_label = (
                last_label
-                if "B" not in last_label
+                if get_position_label(last_label) not in BEGINNING_POS
                else f"I-{get_type_label(last_label)}"
            )


--- a/tests/bioues.bio
+++ b/tests/bioues.bio
+Gérard B-PER
+de I-PER
+Nerval I-PER
+was O
+born O
+in O
+Paris U-LOC
+in O
+1808 S-DAT
+. O
--- a/tests/test_parse_bio.py
+++ b/tests/test_parse_bio.py
@@ -8,58 +8,110 @@ EMPTY_BIO = "tests/test_empty.bio"
 BAD_BIO = "tests/test_bad.bio"
 FAKE_ANNOT_BIO = "tests/test_annot.bio"
 FAKE_PREDICT_BIO = "tests/test_predict.bio"
+BIOUES_BIO = "tests/bioues.bio"
+

-# fmt: off
 expected_parsed_annot = {
-    'entity_count': {'All': 3, 'DAT': 1, 'LOC': 1, 'PER': 1},
-    'labels': [
-        'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER',
-        'I-PER',
-        'I-PER', 'I-PER',
-        'I-PER',
-        'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER',
-        'O',
-        'O', 'O', 'O',
-        'O',
-        'O', 'O', 'O', 'O',
-        'O',
-        'O', 'O',
-        'O',
-        'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC',
-        'O',
-        'O', 'O',
-        'O',
-        'B-DAT', 'I-DAT', 'I-DAT', 'I-DAT',
-        'O',
-        'O'
+    "entity_count": {"All": 3, "DAT": 1, "LOC": 1, "PER": 1},
+    "labels": [
+        "B-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "B-LOC",
+        "I-LOC",
+        "I-LOC",
+        "I-LOC",
+        "I-LOC",
+        "O",
+        "O",
+        "O",
+        "O",
+        "B-DAT",
+        "I-DAT",
+        "I-DAT",
+        "I-DAT",
+        "O",
+        "O",
    ],
-    'words': 'Gérard de Nerval was born in Paris in 1808 .'
+    "words": "Gérard de Nerval was born in Paris in 1808 .",
 }

 expected_parsed_predict = {
-    'entity_count': {'All': 3, 'DAT': 1, '***': 1, 'PER': 1},
-    'labels': [
-        'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER',
-        'I-PER',
-        'I-PER', 'I-PER',
-        'I-PER',
-        'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER',
-        'O',
-        'O', 'O', 'O', 'O', 'O',
-        'O',
-        'O', 'O',
-        'O',
-        'B-***', 'I-***', 'I-***', 'I-***', 'I-***',
-        'O',
-        'O', 'O',
-        'O',
-        'B-DAT', 'I-DAT', 'I-DAT', 'I-DAT',
-        'O',
-        'O', 'O'
+    "entity_count": {"All": 3, "DAT": 1, "***": 1, "PER": 1},
+    "labels": [
+        "B-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "I-PER",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "O",
+        "B-***",
+        "I-***",
+        "I-***",
+        "I-***",
+        "I-***",
+        "O",
+        "O",
+        "O",
+        "O",
+        "B-DAT",
+        "I-DAT",
+        "I-DAT",
+        "I-DAT",
+        "O",
+        "O",
+        "O",
    ],
-    'words': 'G*rard de *N*erval bo*rn in Paris in 1833 *.'
+    "words": "G*rard de *N*erval bo*rn in Paris in 1833 *.",
 }
-# fmt: on


 @pytest.mark.parametrize(
@@ -68,6 +120,7 @@ expected_parsed_predict = {
        (FAKE_ANNOT_BIO, expected_parsed_annot),
        (FAKE_PREDICT_BIO, expected_parsed_predict),
        (EMPTY_BIO, None),
+        (BIOUES_BIO, expected_parsed_annot),
    ],
 )
 def test_parse_bio(test_input, expected):