Skip to content
Snippets Groups Projects

Convert NER prediction to BIO format

Merged Eva Bardou requested to merge convert-to-BIO into main
All threads resolved!
1 file
+ 59
39
Compare changes
  • Side-by-side
  • Inline
+ 59
39
from operator import attrgetter
from typing import Dict, List

from dan.utils import EntityType
def find_starting_token(ner_tokens, ending_token):
    """Return the start marker paired with *ending_token*.

    Scans the ``EntityType`` values of *ner_tokens* for the one whose
    ``end`` attribute equals *ending_token* and returns its ``start``
    attribute, or ``None`` when no entity uses that ending marker.
    """
    return next(
        (spec.start for spec in ner_tokens.values() if spec.end == ending_token),
        None,
    )
def find_entity_name(ner_tokens, starting_token):
    """Return the entity name whose start marker is *starting_token*.

    Looks up the (name, ``EntityType``) pairs of *ner_tokens* and returns
    the name of the first entry whose ``start`` attribute equals
    *starting_token*, or ``None`` when no entity uses that start marker.
    """
    return next(
        (label for label, spec in ner_tokens.items() if spec.start == starting_token),
        None,
    )
def convert(text: str, ner_tokens: Dict[str, EntityType]) -> str:
    """Convert a predicted text with inline NER markers into IOB format.

    Every word of *text* is emitted on its own line as ``"<word> <tag>"``,
    where the tag is ``O`` (outside any entity), ``B-<name>`` (first word of
    an entity) or ``I-<name>`` (following words of the same entity).

    Args:
        text: Predicted text containing the entity start (and optionally
            end) markers declared in *ner_tokens*.
        ner_tokens: Mapping from entity name to its ``EntityType``, whose
            ``start``/``end`` attributes are the marker strings.
            NOTE(review): assumes ``end`` may be empty for marker schemes
            that only use start tokens — confirm against ``EntityType``.

    Returns:
        The IOB-formatted string, one ``"word tag"`` pair per line.

    Raises:
        AssertionError: If an ending marker does not close the most
            recently opened entity (ill-nested prediction).
    """
    starting_tokens: List[str] = list(map(attrgetter("start"), ner_tokens.values()))
    ending_tokens: List[str] = list(map(attrgetter("end"), ner_tokens.values()))
    # Some marker schemes have no closing markers at all (empty `end`s).
    has_ending_tokens: bool = any(ending_tokens)

    # Surround every marker with spaces so `text.split()` isolates it as its
    # own token. Empty markers must be skipped: `str.replace("", " x ")`
    # would insert the padding between every character of the text.
    for marker in starting_tokens + ending_tokens:
        if marker:
            text = text.replace(marker, f" {marker} ")

    iob: List[str] = []  # IOB formatted lines, one per word
    entity_types: List[str] = []  # Stack of currently open entity start markers
    inside: bool = False  # Whether the previous word was inside an entity

    for token in text.split():
        # Encountering a starting token: open a (possibly nested) entity.
        if token in starting_tokens:
            entity_types.append(token)
            # The next word begins a fresh entity, so it must be tagged B-.
            inside = False
            continue

        # Encountering an ending token: it must close the innermost entity.
        if has_ending_tokens and token in ending_tokens:
            assert entity_types, f"Ending token {token} found outside of any entity"
            # Making sure this ending token closes the current entity
            assert entity_types[-1] == find_starting_token(ner_tokens, token)
            # Removing the current entity from the queue as it is its end
            entity_types.pop()
            # If entities remain in the queue, we continue in the parent one;
            # else we are not in any entity anymore.
            inside = bool(entity_types)
            continue

        # Plain word outside of any entity.
        if not entity_types:
            iob.append(f"{token} O")
            continue

        # Plain word inside the innermost open entity.
        entity_name = find_entity_name(ner_tokens, entity_types[-1])
        if inside:
            # Continuing the same entity.
            iob.append(f"{token} I-{entity_name}")
        else:
            # Starting a new entity.
            iob.append(f"{token} B-{entity_name}")
            inside = True

    # Concatenating all formatted IOB lines.
    return "\n".join(iob)
Loading