Thibault Lavigne · 60326c5d
--- a/nerval/evaluate.py

+ 36

− 11
+++ b/nerval/evaluate.py

+ 36

− 11
 @@ -12,6 +12,7 @@ import termtables as tt
 NOT_ENTITY_TAG = "O"

 THRESHOLD = 0.30
+BEGINNING_POS = ["B", "S", "U"]  # Position letters treated as the start of an entity


 def get_type_label(label: str) -> str:
 @@ -31,6 +32,23 @@ def get_type_label(label: str) -> str:
    return tag


+def get_position_label(label: str) -> str:
+    """Return the position part of a label.
+
+    Input format: "[BIELUS]-type", or NOT_ENTITY_TAG on its own.
+
+    The position is the single letter (one of B, I, E, L, U, S) found before
+    the dash. The dash must be followed by at least three characters, so a
+    label such as "B-AB" is rejected.
+
+    Returns NOT_ENTITY_TAG unchanged when the label is NOT_ENTITY_TAG.
+    Raises an Exception when the label does not follow the expected format.
+    """
+    if label == NOT_ENTITY_TAG:
+        return NOT_ENTITY_TAG
+
+    try:
+        match = re.match(r"([BIELUS])-.{3,4}", label)
+    except TypeError:
+        # A label that is not a string (e.g. None) cannot be matched at all;
+        # report it as an invalid label like any other malformed value.
+        match = None
+
+    # Test the match explicitly instead of subscripting None: the error is
+    # then raised outside any "except" block, so the traceback is not chained
+    # to an unrelated "'NoneType' object is not subscriptable" TypeError.
+    if match is None:
+        raise Exception(f"The label {label} is not valid in BIOES/BILOU format.")
+
+    return match[1]
+
+
 def parse_bio(path: str) -> dict:
    """Parse a BIO file to get text content, character-level NE labels and entity types count.

 @@ -75,13 +93,17 @@ def parse_bio(path: str) -> dict:
        if index != 0:

            # If new word has same tag as previous, not new entity and in entity, continue entity
-            if last_tag == tag and "B" not in label and tag != NOT_ENTITY_TAG:
+            if (
+                last_tag == tag
+                and get_position_label(label) not in BEGINNING_POS
+                and tag != NOT_ENTITY_TAG
+            ):
                labels.append(f"I-{last_tag}")

            # If new word begins a new entity of different type, check for nested entity to correctly tag the space
            elif (
                last_tag != tag
-                and "B" in label
+                and get_position_label(label) in BEGINNING_POS
                and tag != NOT_ENTITY_TAG
                and last_tag != NOT_ENTITY_TAG
            ):
 @@ -99,7 +121,7 @@ def parse_bio(path: str) -> dict:
                # Check for continuation of the original entity
                if (
                    index < len(lines)
-                    and "B" not in future_label
+                    and get_position_label(future_label) not in BEGINNING_POS
                    and get_type_label(future_label) == last_tag
                ):
                    labels.append(f"I-{last_tag}")
 @@ -117,13 +139,13 @@ def parse_bio(path: str) -> dict:
                in_nested_entity = False

        # Add a tag for each letter in the word
-        if "B" in label:
+        if get_position_label(label) in BEGINNING_POS:
            labels += [f"B-{tag}"] + [f"I-{tag}"] * (len(word) - 1)
        else:
            labels += [label] * len(word)

        # Count nb entity for each type
-        if "B" in label:
+        if get_position_label(label) in BEGINNING_POS:
            entity_count[tag] = entity_count.get(tag, 0) + 1
            entity_count["All"] += 1

 @@ -171,7 +193,7 @@ def look_for_further_entity_part(index, tag, characters, labels):
            index += 1
        while (
            index < len(characters)
-            and "B" not in labels[index]
+            and get_position_label(labels[index]) not in BEGINNING_POS
            and get_type_label(labels[index]) == tag
        ):
            visited.append(index)
 @@ -254,7 +276,7 @@ def compute_matches(
        else:

            # If beginning new entity
-            if "B" in label_ref:
+            if get_position_label(label_ref) in BEGINNING_POS:
                current_ref, current_compar = [], []
                last_tag = tag_ref
                found_aligned_beginning = False
 @@ -269,18 +291,21 @@ def compute_matches(
                    continue

                # If just beginning new entity, backtrack tags on prediction string
-                if len(current_ref) == 1 and "B" not in labels_predict[i]:
+                if (
+                    len(current_ref) == 1
+                    and get_position_label(labels_predict[i]) not in BEGINNING_POS
+                ):
                    j = i - 1
                    while (
                        j >= 0
                        and get_type_label(labels_predict[j]) == tag_ref
-                        and "B" not in labels_predict[j]
+                        and get_position_label(labels_predict[j]) not in BEGINNING_POS
                        and j not in visited_predict
                    ):
                        j -= 1

                    if (
-                        "B" in labels_predict[j]
+                        get_position_label(labels_predict[j]) in BEGINNING_POS
                        and get_type_label(labels_predict[j]) == tag_ref
                        and j not in visited_predict
                    ):
 @@ -372,7 +397,7 @@ def get_labels_aligned(original: str, aligned: str, labels_original: list) -> li
        elif not char == original[index_original]:
            new_label = (
                last_label
-                if "B" not in last_label
+                if get_position_label(last_label) not in BEGINNING_POS
                else f"I-{get_type_label(last_label)}"
            )