From 12144cda8865c6c1679e7a5e3b8c6927790ba969 Mon Sep 17 00:00:00 2001
From: EvaBardou <bardou@teklia.com>
Date: Tue, 21 Nov 2023 18:14:57 +0100
Subject: [PATCH] Convert NER prediction to BIO format

---
 dan/bio.py | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 dan/bio.py

diff --git a/dan/bio.py b/dan/bio.py
new file mode 100644
index 00000000..f4ad1258
--- /dev/null
+++ b/dan/bio.py
@@ -0,0 +1,57 @@
+from typing import Dict
+
+from dan.utils import EntityType
+
+
+def convert(text: str, ner_tokens: Dict[str, EntityType]) -> str:
+    iob_string: str = ""  # Full IOB formatted string
+    entity_types: list[str] = []  # Encountered entity types
+    inside: bool = False  # Whether we are inside an entity
+
+    for word in text.split():
+        if not word:
+            continue
+
+        token_iterator = iter(ner_tokens.items())
+        while (ner_token := next(token_iterator, None)) is not None:
+            name, entity_type = ner_token
+
+            if word[0] == entity_type.start:
+                word = word[1:]
+
+                entity_types.append(entity_type.start)
+
+                if entity_type.end:
+                    inside = False
+                    continue
+
+            elif entity_type.end and word[-1] == entity_type.end:
+                # Make sure the token is the closing of the current entity
+                assert entity_types[-1] == entity_type.start
+
+                word = word[:-1]
+
+                # Remove from queue
+                entity_types.pop()
+
+                # if there is no more entity, you remove inside
+                # else we continue parent entity
+                inside = bool(entity_types)
+
+            # Not a NER token
+
+            # If there is no current entity type
+            if not entity_types:
+                iob_string += f"\n{word} O"
+                continue
+
+            # There is at least one entity type
+            if inside:
+                # We are inside an entity
+                iob_string += f"\n{word} I-{name}"
+            else:
+                # We are starting an entity
+                inside = True
+                iob_string += f"\n{word} B-{name}"
+
+    print(iob_string)
-- 
GitLab