Eva Bardou · 26236d27 · 3131fe57 · 7de169e6 · f6d5979a · 2d53fdc8
--- a/dan/bio.py 0 → 100644

+ 93

− 0
+++ b/dan/bio.py 0 → 100644

+ 93

− 0
+import logging
+from operator import attrgetter
+from typing import Dict, List, Optional
+
+from dan.utils import EntityType
+
+logger = logging.getLogger(__name__)
+
+
def find_starting_token(
    ner_tokens: Dict[str, EntityType], ending_token: str
) -> Optional[str]:
    """Return the starting token paired with the given ending token.

    Args:
        ner_tokens: Mapping of entity names to entity types; each value
            exposes ``start`` and ``end`` token attributes.
        ending_token: Ending token to look up.

    Returns:
        The ``start`` attribute of the first entity type (in mapping order)
        whose ``end`` equals ``ending_token``, or ``None`` if none matches.
    """
    for entity_type in ner_tokens.values():
        if entity_type.end == ending_token:
            return entity_type.start
    # No entity type uses this ending token
    return None
+
+
def find_entity_name(
    ner_tokens: Dict[str, EntityType], starting_token: str
) -> Optional[str]:
    """Return the name of the entity introduced by the given starting token.

    Args:
        ner_tokens: Mapping of entity names to entity types; each value
            exposes a ``start`` token attribute.
        starting_token: Starting token to look up.

    Returns:
        The first key (in mapping order) whose entity type has a ``start``
        equal to ``starting_token``, or ``None`` if none matches.
    """
    for name, entity_type in ner_tokens.items():
        if entity_type.start == starting_token:
            return name
    # No entity type uses this starting token
    return None
+
+
def convert(text: str, ner_tokens: Dict[str, EntityType]) -> str:
    """Convert a text annotated with entity tokens to IOB-formatted lines.

    The text is split on whitespace after isolating every starting/ending
    token. Each remaining word yields one output line: ``"<word> O"`` outside
    any entity, ``"<word> B-<name>"`` for the first word following a starting
    token, and ``"<word> I-<name>"`` for the following words of that entity.
    Entities may be nested: closing an inner entity resumes the parent one
    with ``I-`` tags.

    Args:
        text: Text containing the starting (and optionally ending) tokens.
        ner_tokens: Mapping of entity names to entity types; each value
            exposes ``start`` and ``end`` token attributes. An empty ``end``
            means this entity type has no ending token.

    Returns:
        The IOB lines joined with newlines (empty string if there is no word).

    Raises:
        AssertionError: If an ending token does not close the most recently
            opened entity.
    """
    # Lookup tables built once; ``setdefault`` keeps the first match in
    # mapping order when several entity types share a token.
    start_to_name: Dict[str, str] = {}
    end_to_start: Dict[str, str] = {}
    for name, entity_type in ner_tokens.items():
        start_to_name.setdefault(entity_type.start, name)
        if entity_type.end:
            end_to_start.setdefault(entity_type.end, entity_type.start)

    # Spacing starting tokens and ending tokens (if necessary).
    # Empty tokens are skipped: ``str.replace("", ...)`` would insert the
    # replacement between every character and shred the text.
    for starting_token in start_to_name:
        if starting_token:
            text = text.replace(starting_token, f" {starting_token} ")
    for ending_token in end_to_start:
        text = text.replace(ending_token, f" {ending_token} ")

    iob: List[str] = []  # List of IOB formatted strings
    open_entities: List[str] = []  # Stack of starting tokens of open entities
    inside: bool = False  # Whether a word was already emitted for the current entity
    for token in text.split():
        # Encountering a starting token: a new entity begins
        if token in start_to_name:
            open_entities.append(token)
            inside = False
            continue

        # Encountering an ending token
        if token in end_to_start:
            if not open_entities:
                logger.warning(
                    "Missing starting token for ending token %s, skipping the entity",
                    token,
                )
                continue

            # Making sure this ending token closes the current entity.
            # Raised explicitly so the check survives ``python -O``.
            if open_entities[-1] != end_to_start[token]:
                raise AssertionError(
                    f"Ending token {token} doesn't match the starting token {open_entities[-1]}"
                )

            # The current entity ends; resume the parent one, if any
            open_entities.pop()
            inside = bool(open_entities)
            continue

        # The token is not part of an entity
        if not open_entities:
            iob.append(f"{token} O")
            continue

        # The token is part of at least one entity: tag it with the innermost
        entity_name = start_to_name[open_entities[-1]]
        prefix = "I" if inside else "B"
        iob.append(f"{token} {prefix}-{entity_name}")
        inside = True

    # Concatenating all formatted iob strings
    return "\n".join(iob)