From fc146a8d07d7ebce06a770709a33feeb6f78483f Mon Sep 17 00:00:00 2001
From: Eva Bardou <>
Date: Thu, 23 Nov 2023 12:15:23 +0000
Subject: [PATCH] Convert NER prediction to BIO format

 dan/        |  85 ++++++++++++++++++++++++++++++++
 tests/ | 120 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 205 insertions(+)
 create mode 100644 dan/
 create mode 100644 tests/

diff --git a/dan/ b/dan/
new file mode 100644
index 00000000..d676d937
--- /dev/null
+++ b/dan/
@@ -0,0 +1,85 @@
+import logging
+import re
+from typing import Dict, List
+from dan.utils import EntityType
+logger = logging.getLogger(__name__)
+def convert(text: str, ner_tokens: Dict[str, EntityType]) -> str:
+    """Convert text annotated with NER marker tokens into IOB-tagged lines.
+
+    Marker tokens are detached from the surrounding words, the text is split
+    on whitespace, and every remaining word is emitted as ``"<word> <tag>"``
+    where the tag is ``O``, ``B-<name>`` or ``I-<name>``.
+
+    Args:
+        text: Text containing the starting (and optionally ending) tokens.
+        ner_tokens: Maps each entity name to an object exposing ``start`` and
+            ``end`` token strings. An empty ``end`` means the entity type has
+            no ending token: it then runs until the next starting token.
+
+    Returns:
+        One ``"<word> <tag>"`` entry per word, joined with newlines.
+
+    Raises:
+        AssertionError: If an ending token does not close the most recently
+            opened entity.
+    """
+    # Mapping to find the starting token paired with an ending token.
+    # Empty ending tokens are skipped: they can never appear as a split word.
+    mapping_end_start: Dict[str, str] = {
+        entity_type.end: entity_type.start
+        for entity_type in ner_tokens.values()
+        if entity_type.end
+    }
+    # Mapping to find the entity name for a starting token
+    mapping_start_name: Dict[str, str] = {
+        entity_type.start: name for name, entity_type in ner_tokens.items()
+    }
+    # Spacing starting tokens and ending tokens so that split() isolates them.
+    # Tokens are escaped and joined as alternatives (longest first) so that
+    # regex-special or multi-character tokens are matched literally and whole.
+    markers: List[str] = [
+        marker for marker in (*mapping_start_name, *mapping_end_start) if marker
+    ]
+    if markers:
+        alternatives = sorted(dict.fromkeys(markers), key=len, reverse=True)
+        tokens_spacing: re.Pattern = re.compile(
+            "(" + "|".join(map(re.escape, alternatives)) + ")"
+        )
+        text = tokens_spacing.sub(r" \1 ", text)
+    iob: List[str] = []  # List of IOB formatted strings
+    entity_types: List[str] = []  # Stack of starting tokens of opened entities
+    inside: bool = False  # Whether the current entity already emitted a word
+    for token in text.split():
+        # Encountering a starting token
+        if token in mapping_start_name:
+            entity_types.append(token)
+            # The next word must open the new entity with a B- tag
+            inside = False
+            continue
+        # Encountering an ending token
+        if token in mapping_end_start:
+            if not entity_types:
+                logger.warning(
+                    f"Missing starting token for ending token {token}, skipping the entity"
+                )
+                continue
+            # Making sure this ending token closes the current entity.
+            # Raised explicitly so the check survives `python -O`.
+            if entity_types[-1] != mapping_end_start[token]:
+                raise AssertionError(
+                    f"Ending token {token} doesn't match the starting token {entity_types[-1]}"
+                )
+            # Removing the current entity from the stack as it is its end
+            entity_types.pop()
+            # If there are still entities in the stack, we continue in the parent one
+            # Else, we are not in any entity anymore
+            # NOTE(review): a parent whose first word belongs to a nested child
+            # resumes with I- without ever having emitted B- -- confirm intended.
+            inside = bool(entity_types)
+            continue
+        # The token is not part of an entity
+        if not entity_types:
+            iob.append(f"{token} O")
+            continue
+        # The token is part of at least one entity: tag it with the innermost one
+        entity_name: str = mapping_start_name[entity_types[-1]]
+        if inside:
+            # Inside the same entity
+            iob.append(f"{token} I-{entity_name}")
+            continue
+        # Starting a new entity
+        iob.append(f"{token} B-{entity_name}")
+        inside = True
+    # Concatenating all formatted iob strings
+    return "\n".join(iob)
diff --git a/tests/ b/tests/
new file mode 100644
index 00000000..fce81268
--- /dev/null
+++ b/tests/
@@ -0,0 +1,120 @@
+import logging
+import pytest
+from import convert
+from dan.utils import EntityType
+# Sample using starting tokens only: an entity runs until the next starting token.
+ST_TEXT = """ⒶBryan B ⒷParis ⒸJanuary 1st, 1987
+ⒶJoe J ⒷGrenoble ⒸAugust 24, 1995
+ⒶHannah H ⒷLille ⒸSeptember 15, 2002"""
+# Sample using starting and ending tokens, including a nested entity.
+ST_ET_TEXT = """ⒶBryanⒷ and ⒶJoeⒷ will visit the ⒸEiffel TowerⒹ in ⒸParisⒹ next ⒺTuesdayⒻ.
+ⒶHannahⒷ will visit the ⒸPlace ⒶCharles de GaulleⒷ étoileⒹ on ⒺWednesdayⒻ."""
+def test_convert_with_error():
+    """An ending token that closes the wrong entity raises an AssertionError."""
+    ner_tokens = {
+        "Person": EntityType(start="Ⓐ", end="Ⓑ"),
+        "Location": EntityType(start="Ⓒ", end="Ⓓ"),
+    }
+    with pytest.raises(
+        AssertionError, match="Ending token Ⓓ doesn't match the starting token Ⓐ"
+    ):
+        convert("ⒶFredⒹ", ner_tokens)
+def test_convert_with_warnings(caplog):
+    """Ending tokens without a starting token are skipped and logged as warnings."""
+    ner_tokens = {
+        "Person": EntityType(start="Ⓐ", end="Ⓑ"),
+        "Location": EntityType(start="Ⓒ", end="Ⓓ"),
+    }
+    assert convert("BryanⒷ and ⒶJoeⒷ will visit the Eiffel TowerⒹ", ner_tokens).split(
+        "\n"
+    ) == [
+        "Bryan O",
+        "and O",
+        "Joe B-Person",
+        "will O",
+        "visit O",
+        "the O",
+        "Eiffel O",
+        "Tower O",
+    ]
+    # One warning per orphan ending token, in order of appearance
+    assert [(level, message) for _, level, message in caplog.record_tuples] == [
+        (
+            logging.WARNING,
+            "Missing starting token for ending token Ⓑ, skipping the entity",
+        ),
+        (
+            logging.WARNING,
+            "Missing starting token for ending token Ⓓ, skipping the entity",
+        ),
+    ]
+def test_convert_starting_tokens():
+    """With starting tokens only, each entity lasts until the next starting token."""
+    ner_tokens = {
+        "Person": EntityType(start="Ⓐ"),
+        "Location": EntityType(start="Ⓑ"),
+        "Date": EntityType(start="Ⓒ"),
+    }
+    assert convert(ST_TEXT, ner_tokens).split("\n") == [
+        "Bryan B-Person",
+        "B I-Person",
+        "Paris B-Location",
+        "January B-Date",
+        "1st, I-Date",
+        "1987 I-Date",
+        "Joe B-Person",
+        "J I-Person",
+        "Grenoble B-Location",
+        "August B-Date",
+        "24, I-Date",
+        "1995 I-Date",
+        "Hannah B-Person",
+        "H I-Person",
+        "Lille B-Location",
+        "September B-Date",
+        "15, I-Date",
+        "2002 I-Date",
+    ]
+def test_convert_starting_and_ending_tokens():
+    """With ending tokens, words outside entities are tagged O and nesting is handled."""
+    ner_tokens = {
+        "Person": EntityType(start="Ⓐ", end="Ⓑ"),
+        "Location": EntityType(start="Ⓒ", end="Ⓓ"),
+        "Date": EntityType(start="Ⓔ", end="Ⓕ"),
+    }
+    assert convert(ST_ET_TEXT, ner_tokens).split("\n") == [
+        "Bryan B-Person",
+        "and O",
+        "Joe B-Person",
+        "will O",
+        "visit O",
+        "the O",
+        "Eiffel B-Location",
+        "Tower I-Location",
+        "in O",
+        "Paris B-Location",
+        "next O",
+        "Tuesday B-Date",
+        ". O",
+        "Hannah B-Person",
+        "will O",
+        "visit O",
+        "the O",
+        "Place B-Location",
+        "Charles B-Person",
+        "de I-Person",
+        "Gaulle I-Person",
+        "étoile I-Location",
+        "on O",
+        "Wednesday B-Date",
+        ". O",
+    ]