Skip to content
Snippets Groups Projects

Convert NER prediction to BIO format

Merged Eva Bardou requested to merge convert-to-BIO into main
All threads resolved!
1 file
+ 57
0
Compare changes
  • Side-by-side
  • Inline
dan/bio.py 0 → 100644
+ 57
0
from typing import Dict
from dan.utils import EntityType
def convert(text: str, ner_tokens: Dict[str, "EntityType"]) -> str:
    """Convert a NER-annotated transcription to BIO (IOB) format.

    ``text`` is split on whitespace. Entity tokens are single characters
    glued to a word (or standing alone as their own word): a start token is
    the first character of a word, an end token is its last character.
    Tokens are stripped from the words and every remaining word is emitted
    on its own line as ``<word> <tag>`` where ``<tag>`` is ``O``,
    ``B-<name>`` or ``I-<name>``, ``<name>`` being the innermost entity
    currently open.

    Args:
        text: Transcription containing the entity tokens.
        ner_tokens: Mapping from entity name to an object exposing ``start``
            and ``end`` attributes (``end`` may be empty/falsy when the
            entity type has no closing token).

    Returns:
        The BIO formatted string. Every line, including the first one, is
        preceded by a newline, matching what this function has always
        printed. An input without words yields an empty string.

    Raises:
        ValueError: If an end token does not close the innermost open entity
            (including when no entity is open at all).
    """
    # Token character -> entity name lookups, built once for the whole text.
    start_names = {
        entity_type.start: name for name, entity_type in ner_tokens.items()
    }
    end_names = {
        entity_type.end: name
        for name, entity_type in ner_tokens.items()
        if entity_type.end
    }

    lines: list[str] = []  # One "<word> <tag>" entry per emitted word
    entity_types: list[str] = []  # Stack of the names of the open entities
    inside: bool = False  # Whether the innermost entity already got a word

    for word in text.split():
        # Strip every leading start token; each one opens a new entity whose
        # first word must be tagged "B-", whether or not it has an end token.
        while word and word[0] in start_names:
            entity_types.append(start_names[word[0]])
            word = word[1:]
            inside = False

        # Strip trailing end tokens but only remember them for now: the word
        # they are attached to still belongs to the entity being closed.
        closing: list[str] = []
        while word and word[-1] in end_names:
            closing.append(end_names[word[-1]])
            word = word[:-1]

        # A word made only of tokens produces no line.
        if word:
            if not entity_types:
                lines.append(f"{word} O")
            elif inside:
                lines.append(f"{word} I-{entity_types[-1]}")
            else:
                lines.append(f"{word} B-{entity_types[-1]}")
                inside = True

        # End tokens were collected right-to-left; close them in text order.
        for closed_name in reversed(closing):
            if not entity_types or entity_types[-1] != closed_name:
                current = entity_types[-1] if entity_types else None
                raise ValueError(
                    f"End token of entity {closed_name!r} does not close "
                    f"the current entity ({current!r})"
                )
            entity_types.pop()
        if closing:
            # Resume the parent entity if there is one, else leave entities.
            inside = bool(entity_types)

    iob_string = "".join(f"\n{line}" for line in lines)
    # Printing is kept so callers relying on stdout see the same output; the
    # string is now also returned, as the signature has always advertised.
    print(iob_string)
    return iob_string
Loading