Skip to content
Snippets Groups Projects

Convert NER prediction to BIO format

Merged Eva Bardou requested to merge convert-to-BIO into main
All threads resolved!
1 file
+ 59
39
Compare changes
  • Side-by-side
  • Inline
+ 59
39
from operator import attrgetter
from typing import Dict, List

from dan.utils import EntityType
def find_starting_token(ner_tokens, ending_token):
    """Return the start marker paired with *ending_token*.

    Scans the ``EntityType`` values of *ner_tokens* for the one whose
    ``end`` attribute equals *ending_token* and returns its ``start``
    attribute, or ``None`` when no entity uses that ending marker.
    """
    return next(
        (spec.start for spec in ner_tokens.values() if spec.end == ending_token),
        None,
    )
def find_entity_name(ner_tokens, starting_token):
    """Return the entity name whose start marker is *starting_token*.

    Looks up the (name, ``EntityType``) pairs of *ner_tokens* and returns
    the name of the first entry whose ``start`` attribute equals
    *starting_token*, or ``None`` when no entity uses that start marker.
    """
    return next(
        (label for label, spec in ner_tokens.items() if spec.start == starting_token),
        None,
    )
def convert(text: str, ner_tokens: Dict[str, EntityType]) -> str:
    """Convert a predicted text with inline NER markers into IOB format.

    Every word of *text* is emitted on its own line as ``"<word> <tag>"``,
    where the tag is ``O`` (outside any entity), ``B-<name>`` (first word of
    an entity) or ``I-<name>`` (following words of the same entity).

    Args:
        text: Predicted text containing the entity start (and optionally
            end) markers declared in *ner_tokens*.
        ner_tokens: Mapping from entity name to its ``EntityType``, whose
            ``start``/``end`` attributes are the marker strings.
            NOTE(review): assumes ``end`` may be empty for marker schemes
            that only use start tokens — confirm against ``EntityType``.

    Returns:
        The IOB-formatted string, one ``"word tag"`` pair per line.

    Raises:
        AssertionError: If an ending marker does not close the most
            recently opened entity (ill-nested prediction).
    """
    starting_tokens: List[str] = list(map(attrgetter("start"), ner_tokens.values()))
    ending_tokens: List[str] = list(map(attrgetter("end"), ner_tokens.values()))
    # Some marker schemes have no closing markers at all (empty `end`s).
    has_ending_tokens: bool = any(ending_tokens)

    # Surround every marker with spaces so `text.split()` isolates it as its
    # own token. Empty markers must be skipped: `str.replace("", " x ")`
    # would insert the padding between every character of the text.
    for marker in starting_tokens + ending_tokens:
        if marker:
            text = text.replace(marker, f" {marker} ")

    iob: List[str] = []  # IOB formatted lines, one per word
    entity_types: List[str] = []  # Stack of currently open entity start markers
    inside: bool = False  # Whether the previous word was inside an entity

    for token in text.split():
        # Encountering a starting token: open a (possibly nested) entity.
        if token in starting_tokens:
            entity_types.append(token)
            # The next word begins a fresh entity, so it must be tagged B-.
            inside = False
            continue

        # Encountering an ending token: it must close the innermost entity.
        if has_ending_tokens and token in ending_tokens:
            assert entity_types, f"Ending token {token} found outside of any entity"
            # Making sure this ending token closes the current entity
            assert entity_types[-1] == find_starting_token(ner_tokens, token)
            # Removing the current entity from the queue as it is its end
            entity_types.pop()
            # If entities remain in the queue, we continue in the parent one;
            # else we are not in any entity anymore.
            inside = bool(entity_types)
            continue

        # Plain word outside of any entity.
        if not entity_types:
            iob.append(f"{token} O")
            continue

        # Plain word inside the innermost open entity.
        entity_name = find_entity_name(ner_tokens, entity_types[-1])
        if inside:
            # Continuing the same entity.
            iob.append(f"{token} I-{entity_name}")
        else:
            # Starting a new entity.
            iob.append(f"{token} B-{entity_name}")
            inside = True

    # Concatenating all formatted IOB lines.
    return "\n".join(iob)
Loading