from typing import TYPE_CHECKING, Dict

if TYPE_CHECKING:
    # Only needed for the annotation on ``convert``; nothing below touches the
    # class at runtime, entity types are only read through ``.start``/``.end``.
    from dan.utils import EntityType


def convert(text: str, ner_tokens: "Dict[str, EntityType]") -> str:
    """Convert a text annotated with NER tokens to the BIO (IOB) format.

    The text is split on whitespace. Every resulting word is emitted on its
    own line as ``"<word> <tag>"`` where ``<tag>`` is ``O`` outside of any
    entity, ``B-<name>`` for the first word of an entity and ``I-<name>`` for
    the following ones. NER tokens themselves never appear in the output.

    Starting tokens are recognised at the beginning of a word and ending
    tokens at its end; both may also stand alone between two spaces. Several
    tokens may be stacked on the same word (nested entities). When a nested
    entity is closed, the following words continue the parent entity with an
    ``I-`` tag. Entity types whose ``end`` is empty are never closed
    explicitly: they last until the next starting token.

    Tokens are matched against a single character of the word, so ``start``
    and ``end`` are expected to be one character long.

    Args:
        text: Text containing the words and the NER tokens.
        ner_tokens: Mapping from entity name to an object exposing the
            ``start`` and ``end`` token strings of that entity.

    Returns:
        The BIO lines joined with ``"\\n"`` (empty string when there is no
        word to emit).

    Raises:
        ValueError: If an ending token is met while no entity is open, or if
            it does not close the most recently opened entity.
    """
    # Starting token -> entity name, to label a word from the open entity.
    start_to_name: dict[str, str] = {
        entity_type.start: name for name, entity_type in ner_tokens.items()
    }
    # Ending token -> starting token. Entity types without an ending token
    # (empty ``end``) are skipped so that "" is never treated as a token.
    end_to_start: dict[str, str] = {
        entity_type.end: entity_type.start
        for entity_type in ner_tokens.values()
        if entity_type.end
    }

    iob_lines: list[str] = []  # One "<word> <tag>" entry per emitted word
    open_entities: list[str] = []  # Stack of starting tokens still open
    inside = False  # Whether the innermost entity already has a word

    for word in text.split():
        # Peel every starting token glued to the front of the word. The
        # emptiness check matters: a stand-alone token leaves nothing behind.
        while word and word[0] in start_to_name:
            open_entities.append(word[0])
            inside = False  # The next word begins this new entity
            word = word[1:]

        # Peel ending tokens glued to the back of the word. They are applied
        # only after the word is emitted, since the word still belongs to
        # the entity being closed.
        closing_tokens: list[str] = []
        while word and word[-1] in end_to_start:
            closing_tokens.append(word[-1])
            word = word[:-1]

        if word:
            if not open_entities:
                iob_lines.append(f"{word} O")
            else:
                # Label with the innermost open entity, not with whichever
                # entity type happened to be inspected last.
                name = start_to_name[open_entities[-1]]
                prefix = "I" if inside else "B"
                iob_lines.append(f"{word} {prefix}-{name}")
                inside = True

        # Tokens were collected right-to-left, i.e. outermost first; close
        # them innermost first so each one matches the top of the stack.
        for end_token in reversed(closing_tokens):
            if not open_entities:
                raise ValueError(
                    f"Ending token `{end_token}` found but no entity is open"
                )
            if open_entities[-1] != end_to_start[end_token]:
                raise ValueError(
                    f"Ending token `{end_token}` doesn't match the starting "
                    f"token `{open_entities[-1]}`"
                )
            open_entities.pop()
            # Still inside the parent entity if one remains, else outside.
            inside = bool(open_entities)

    return "\n".join(iob_lines)