Eva Bardou · Solene Tarride
--- a/dan/bio.py 0 → 100644

+ 57

− 0
+++ b/dan/bio.py 0 → 100644

+ 57

− 0
+from typing import Dict
+
+from dan.utils import EntityType
+
+
+def convert(text: str, ner_tokens: Dict[str, EntityType]) -> str:
+    """Convert a text annotated with NER marker characters to IOB lines.
+
+    Each word of ``text`` that is not itself a marker produces one output
+    line of the form ``"<word> <tag>"``:
+
+    * ``O`` when no entity is currently open,
+    * ``B-<name>`` for the first word following a start marker,
+    * ``I-<name>`` for the following words of the same entity, and for the
+      words of a parent entity once a nested entity has been closed.
+
+    Markers may be glued to words (``"ⓟJohn"``) or stand alone. They are
+    expected to be single characters. Entity types whose ``end`` is empty
+    are never closed explicitly: the entity simply runs until the next start
+    marker.
+
+    :param text: Annotated text to convert.
+    :param ner_tokens: Mapping of entity name to an object exposing the
+        ``start`` and ``end`` marker strings of that entity type.
+    :return: The IOB lines joined with newlines (empty string when ``text``
+        holds no word).
+    :raises AssertionError: If an end marker does not close the entity that
+        was opened last, or if no entity is open at all.
+    """
+    # Lookup tables built once, so each word is tagged exactly once instead
+    # of once per known entity type.
+    start_to_name: Dict[str, str] = {
+        entity_type.start: name for name, entity_type in ner_tokens.items()
+    }
+    end_to_start: Dict[str, str] = {
+        entity_type.end: entity_type.start
+        for entity_type in ner_tokens.values()
+        if entity_type.end  # Entity types without end marker are skipped
+    }
+    markers = start_to_name.keys() | end_to_start.keys()
+
+    # Isolate the markers so that splitting on whitespace yields them as
+    # standalone tokens, wherever they were attached in the original text.
+    spaced_text: str = "".join(
+        f" {char} " if char in markers else char for char in text
+    )
+
+    iob_lines: list[str] = []  # One "<word> <tag>" entry per word
+    entity_types: list[str] = []  # Stack of start markers of open entities
+    inside: bool = False  # Whether the current entity already got its B- tag
+
+    for token in spaced_text.split():
+        if token in start_to_name:
+            # Opening an entity: its first word must be tagged B-
+            entity_types.append(token)
+            inside = False
+            continue
+
+        if token in end_to_start:
+            # Make sure the token is the closing of the current entity
+            if not entity_types or entity_types[-1] != end_to_start[token]:
+                raise AssertionError(
+                    f"Ending token {token!r} does not close the current entity"
+                )
+
+            entity_types.pop()
+
+            # If a parent entity is still open we carry on inside it,
+            # otherwise we are outside of any entity
+            inside = bool(entity_types)
+            continue
+
+        # Regular word outside of any entity
+        if not entity_types:
+            iob_lines.append(f"{token} O")
+            continue
+
+        # Regular word inside the most recently opened entity
+        name = start_to_name[entity_types[-1]]
+        if inside:
+            iob_lines.append(f"{token} I-{name}")
+        else:
+            iob_lines.append(f"{token} B-{name}")
+            inside = True
+
+    return "\n".join(iob_lines)