From 7756f7cdf9f123470d1d495ed54c444c18f1c88d Mon Sep 17 00:00:00 2001 From: EvaBardou <bardou@teklia.com> Date: Tue, 21 Nov 2023 19:27:00 +0100 Subject: [PATCH] Working version --- dan/bio.py | 98 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 59 insertions(+), 39 deletions(-) diff --git a/dan/bio.py b/dan/bio.py index f4ad1258..0e875201 100644 --- a/dan/bio.py +++ b/dan/bio.py @@ -1,57 +1,77 @@ -from typing import Dict +from operator import attrgetter +from typing import Dict, List from dan.utils import EntityType +def find_starting_token(ner_tokens, ending_token): + for entity_type in ner_tokens.values(): + if entity_type.end == ending_token: + return entity_type.start + + +def find_entity_name(ner_tokens, starting_token): + for name, entity_type in ner_tokens.items(): + if entity_type.start == starting_token: + return name + + def convert(text: str, ner_tokens: Dict[str, EntityType]) -> str: - iob_string: str = "" # Full IOB formatted string - entity_types: list[str] = [] # Encountered entity types - inside: bool = False # Whether we are inside an entity + # Spacing starting tokens and ending tokens (if necessary) + starting_tokens: List[str] = list(map(attrgetter("start"), ner_tokens.values())) + for starting_token in starting_tokens: + text = text.replace(starting_token, f" {starting_token} ") - for word in text.split(): - if not word: - continue + ending_tokens: List[str] = list(map(attrgetter("end"), ner_tokens.values())) + for ending_token in ending_tokens: + text = text.replace(ending_token, f" {ending_token} ") - token_iterator = iter(ner_tokens.items()) - while (ner_token := next(token_iterator, None)) is not None: - name, entity_type = ner_token + has_ending_tokens: bool = bool(len(ending_tokens)) - if word[0] == entity_type.start: - word = word[1:] + iob: List[str] = [] # List of IOB formatted strings + entity_types: List[str] = [] # Encountered entity types + inside: bool = False # Whether we are inside an entity + for token in text.split(): + # Encountering a starting token + if token in starting_tokens: + entity_types.append(token) + + # Stopping any current entity type + if has_ending_tokens: + inside = False - entity_types.append(entity_type.start) + continue - if entity_type.end: - inside = False - continue + # Encountering an ending token + elif has_ending_tokens and token in ending_tokens: + # Making sure this ending token closes the current entity + assert entity_types[-1] == find_starting_token(ner_tokens, token) - elif entity_type.end and word[-1] == entity_type.end: - # Make sure the token is the closing of the current entity - assert entity_types[-1] == entity_type.start + # Removing the current entity from the queue as it is its end + entity_types.pop() - word = word[:-1] + # If there is still entities in the queue, we continue in the parent one + # Else, we are not in any entity anymore + inside = bool(entity_types) - # Remove from queue - entity_types.pop() + continue - # if there is no more entity, you remove inside - # else we continue parent entity - inside = bool(entity_types) + # The token is not part of an entity + if not entity_types: + iob.append(f"{token} O") + continue - # Not a NER token + # The token is part of at least one entity + entity_name = find_entity_name(ner_tokens, entity_types[-1]) - # If there is no current entity type - if not entity_types: - iob_string += f"\n{word} O" - continue + if inside: + # Inside the same entity + iob.append(f"{token} I-{entity_name}") + continue - # There is at least one entity type - if inside: - # We are inside an entity - iob_string += f"\n{word} I-{name}" - else: - # We are starting an entity - inside = True - iob_string += f"\n{word} B-{name}" + # Starting a new entity + iob.append(f"{token} B-{entity_name}") + inside = True - print(iob_string) + # Concatenating all formatted iob strings + return "\n".join(iob) -- GitLab