Skip to content
Snippets Groups Projects
Commit 12144cda authored by Eva Bardou's avatar Eva Bardou :frog:
Browse files

Convert NER prediction to BIO format

parent 65128721
No related branches found
No related tags found
1 merge request: !325 "Convert NER prediction to BIO format"
from typing import Dict
from dan.utils import EntityType
def convert(text: str, ner_tokens: Dict[str, "EntityType"]) -> str:
    """Convert a text annotated with NER tokens to the IOB (BIO) format.

    The text is split on whitespace. Each entity is opened by the ``start``
    character of one of the ``ner_tokens`` and, when that entity type defines
    a non-empty ``end`` character, closed by it. Tokens may be glued to a word
    (``<Jean``, ``Dupont>``) or stand alone as their own word. Entities may be
    nested: closing an inner entity resumes the enclosing one.

    Args:
        text: Text containing words and single-character NER tokens.
        ner_tokens: Mapping from entity name to an object exposing ``start``
            and ``end`` attributes (``end`` may be empty when the entity type
            has no closing token).

    Returns:
        One ``"<word> <tag>"`` line per word, joined with newlines, where the
        tag is ``O`` outside any entity, ``B-<name>`` for the first word of an
        entity and ``I-<name>`` for the following words. An empty string is
        returned when the text contains no word.

    Raises:
        AssertionError: If an end token is met while no entity is open, or if
            it does not close the most recently opened entity.
    """
    # Lookup tables built once, instead of scanning every NER token per word.
    start_to_name: Dict[str, str] = {
        entity_type.start: name for name, entity_type in ner_tokens.items()
    }
    end_to_start: Dict[str, str] = {
        entity_type.end: entity_type.start
        for entity_type in ner_tokens.values()
        if entity_type.end
    }

    iob_lines: list[str] = []  # IOB formatted lines, joined at the end
    entity_types: list[str] = []  # Stack of start tokens of the open entities
    inside: bool = False  # Whether the current entity already has a word

    for word in text.split():
        # Opening tokens glued to the beginning of the word (possibly nested).
        while word and word[0] in start_to_name:
            entity_types.append(word[0])
            # A new entity always begins with a B- tag, whether or not its
            # type defines an end token.
            inside = False
            word = word[1:]

        # Closing tokens glued to the end of the word. They are only applied
        # AFTER the word is emitted so that the last word of an entity is
        # still tagged as part of it.
        closing: list[str] = []
        while word and word[-1] in end_to_start:
            closing.append(word[-1])
            word = word[:-1]

        # A standalone token leaves an empty word: nothing to emit.
        if word:
            if not entity_types:
                iob_lines.append(f"{word} O")
            else:
                # The tag name comes from the innermost open entity.
                name = start_to_name[entity_types[-1]]
                prefix = "I" if inside else "B"
                iob_lines.append(f"{word} {prefix}-{name}")
                inside = True

        # Tokens were collected right-to-left; close them in reading order.
        for end_token in reversed(closing):
            # Explicit raise (not ``assert``) so the check survives ``-O``.
            if not entity_types:
                raise AssertionError(
                    f"Missing starting token for ending token {end_token}"
                )
            if entity_types[-1] != end_to_start[end_token]:
                raise AssertionError(
                    f"Ending token {end_token} doesn't match "
                    f"the starting token {entity_types[-1]}"
                )
            entity_types.pop()
            # Resume the parent entity if there is one, else leave entities.
            inside = bool(entity_types)

    return "\n".join(iob_lines)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment