From 7756f7cdf9f123470d1d495ed54c444c18f1c88d Mon Sep 17 00:00:00 2001
From: EvaBardou <bardou@teklia.com>
Date: Tue, 21 Nov 2023 19:27:00 +0100
Subject: [PATCH] Working version

---
 dan/bio.py | 98 ++++++++++++++++++++++++++++++++----------------------
 1 file changed, 59 insertions(+), 39 deletions(-)

diff --git a/dan/bio.py b/dan/bio.py
index f4ad1258..0e875201 100644
--- a/dan/bio.py
+++ b/dan/bio.py
@@ -1,57 +1,77 @@
-from typing import Dict
+from operator import attrgetter
+from typing import Dict, List
 
 from dan.utils import EntityType
 
 
+def find_starting_token(ner_tokens, ending_token):
+    for entity_type in ner_tokens.values():
+        if entity_type.end == ending_token:
+            return entity_type.start
+
+
+def find_entity_name(ner_tokens, starting_token):
+    for name, entity_type in ner_tokens.items():
+        if entity_type.start == starting_token:
+            return name
+
+
 def convert(text: str, ner_tokens: Dict[str, EntityType]) -> str:
-    iob_string: str = ""  # Full IOB formatted string
-    entity_types: list[str] = []  # Encountered entity types
-    inside: bool = False  # Whether we are inside an entity
+    # Spacing starting tokens and ending tokens (if necessary)
+    starting_tokens: List[str] = list(map(attrgetter("start"), ner_tokens.values()))
+    for starting_token in starting_tokens:
+        text = text.replace(starting_token, f" {starting_token} ")
 
-    for word in text.split():
-        if not word:
-            continue
+    ending_tokens: List[str] = list(map(attrgetter("end"), ner_tokens.values()))
+    for ending_token in ending_tokens:
+        text = text.replace(ending_token, f" {ending_token} ")
 
-        token_iterator = iter(ner_tokens.items())
-        while (ner_token := next(token_iterator, None)) is not None:
-            name, entity_type = ner_token
+    has_ending_tokens: bool = bool(len(ending_tokens))
 
-            if word[0] == entity_type.start:
-                word = word[1:]
+    iob: List[str] = []  # List of IOB formatted strings
+    entity_types: List[str] = []  # Encountered entity types
+    inside: bool = False  # Whether we are inside an entity
+    for token in text.split():
+        # Encountering a starting token
+        if token in starting_tokens:
+            entity_types.append(token)
+
+            # Stopping any current entity type
+            if has_ending_tokens:
+                inside = False
 
-                entity_types.append(entity_type.start)
+            continue
 
-                if entity_type.end:
-                    inside = False
-                    continue
+        # Encountering an ending token
+        elif has_ending_tokens and token in ending_tokens:
+            # Making sure this ending token closes the current entity
+            assert entity_types[-1] == find_starting_token(ner_tokens, token)
 
-            elif entity_type.end and word[-1] == entity_type.end:
-                # Make sure the token is the closing of the current entity
-                assert entity_types[-1] == entity_type.start
+            # Removing the current entity from the queue as this token ends it
+            entity_types.pop()
 
-                word = word[:-1]
+            # If there are still entities in the queue, we continue in the parent one
+            # Else, we are not in any entity anymore
+            inside = bool(entity_types)
 
-                # Remove from queue
-                entity_types.pop()
+            continue
 
-                # if there is no more entity, you remove inside
-                # else we continue parent entity
-                inside = bool(entity_types)
+        # The token is not part of an entity
+        if not entity_types:
+            iob.append(f"{token} O")
+            continue
 
-            # Not a NER token
+        # The token is part of at least one entity
+        entity_name = find_entity_name(ner_tokens, entity_types[-1])
 
-            # If there is no current entity type
-            if not entity_types:
-                iob_string += f"\n{word} O"
-                continue
+        if inside:
+            # Inside the same entity
+            iob.append(f"{token} I-{entity_name}")
+            continue
 
-            # There is at least one entity type
-            if inside:
-                # We are inside an entity
-                iob_string += f"\n{word} I-{name}"
-            else:
-                # We are starting an entity
-                inside = True
-                iob_string += f"\n{word} B-{name}"
+        # Starting a new entity
+        iob.append(f"{token} B-{entity_name}")
+        inside = True
 
-    print(iob_string)
+    # Concatenating all formatted iob strings
+    return "\n".join(iob)
-- 
GitLab