Skip to content
Snippets Groups Projects

Convert NER prediction to BIO format

Merged Eva Bardou requested to merge convert-to-BIO into main
Files
2
dan/bio.py 0 → 100644
+ 93
0
import logging
from operator import attrgetter
from typing import Dict, List, Optional
from dan.utils import EntityType
logger = logging.getLogger(__name__)
def find_starting_token(
    ner_tokens: Dict[str, EntityType], ending_token: str
) -> Optional[str]:
    """Return the starting token paired with the given ending token.

    The entity types are scanned in the iteration order of ``ner_tokens``;
    the first one whose ``end`` attribute equals ``ending_token`` wins.

    :param ner_tokens: Entity types to search, keyed by entity name.
    :param ending_token: Ending token to look up.
    :return: The ``start`` attribute of the first matching entity type,
        or ``None`` when no entity type uses this ending token.
    """
    matching_starts = (
        candidate.start
        for candidate in ner_tokens.values()
        if candidate.end == ending_token
    )
    return next(matching_starts, None)
def find_entity_name(
    ner_tokens: Dict[str, EntityType], starting_token: str
) -> Optional[str]:
    """Return the name of the entity type opened by the given starting token.

    The entity types are scanned in the iteration order of ``ner_tokens``;
    the first one whose ``start`` attribute equals ``starting_token`` wins.

    :param ner_tokens: Entity types to search, keyed by entity name.
    :param starting_token: Starting token to look up.
    :return: The key of the first matching entity type, or ``None`` when no
        entity type uses this starting token.
    """
    matching_names = (
        entity_name
        for entity_name, candidate in ner_tokens.items()
        if candidate.start == starting_token
    )
    return next(matching_names, None)
def convert(text: str, ner_tokens: Dict[str, EntityType]) -> str:
    """Convert a text annotated with NER tokens to the IOB format.

    Every whitespace-separated word of ``text`` produces one ``"<word> <tag>"``
    line. The tag is ``O`` outside of any entity, ``B-<name>`` for the first
    word of an entity and ``I-<name>`` for the following words. Starting and
    ending tokens are never emitted themselves.

    Entities may be nested: a starting token met inside an entity opens a
    child entity, and once the child is closed by its ending token the next
    words continue the parent with ``I-`` tags. Entity types without an
    ending token stay open until another starting token is met.

    :param text: Text containing the starting/ending tokens.
    :param ner_tokens: Entity types keyed by entity name; each value exposes
        ``start`` and ``end`` token attributes (``end`` may be empty).
    :return: The IOB lines joined with newlines (empty string if no word).
    :raises AssertionError: If an ending token does not close the most
        recently opened entity.
    """
    # Lookup tables built once, instead of scanning `ner_tokens` for each word.
    # `setdefault` keeps the first entity type declaring a given token.
    # Empty tokens are left out: `str.replace("", ...)` would insert spaces
    # between every character, and `str.split()` never yields an empty word.
    name_by_start: Dict[str, str] = {}
    start_by_end: Dict[str, str] = {}
    for name, entity_type in ner_tokens.items():
        if entity_type.start:
            name_by_start.setdefault(entity_type.start, name)
        if entity_type.end:
            start_by_end.setdefault(entity_type.end, entity_type.start)

    # Spacing starting tokens, then ending tokens, so that `split()` isolates
    # them from the surrounding words
    for marker in (*name_by_start, *start_by_end):
        text = text.replace(marker, f" {marker} ")

    iob: List[str] = []  # List of IOB formatted strings
    open_entities: List[str] = []  # Stack of starting tokens of open entities
    inside: bool = False  # Whether the innermost entity already has a word

    for token in text.split():
        # Encountering a starting token: the next word begins a new entity
        if token in name_by_start:
            open_entities.append(token)
            inside = False
            continue

        # Encountering an ending token
        if token in start_by_end:
            if not open_entities:
                logger.warning(
                    "Missing starting token for ending token %s, skipping the entity",
                    token,
                )
                continue

            # Making sure this ending token closes the current entity.
            # Raised explicitly so the check is not stripped under `python -O`.
            if open_entities[-1] != start_by_end[token]:
                raise AssertionError(
                    f"Ending token {token} doesn't match the starting token {open_entities[-1]}"
                )

            # Closing the current entity; if a parent entity is still open,
            # the following words continue it
            open_entities.pop()
            inside = bool(open_entities)
            continue

        # The token is not part of an entity
        if not open_entities:
            iob.append(f"{token} O")
            continue

        # The token is part of at least one entity: tag it with the innermost
        prefix = "I" if inside else "B"
        iob.append(f"{token} {prefix}-{name_by_start[open_entities[-1]]}")
        inside = True

    # Concatenating all formatted IOB strings
    return "\n".join(iob)
Loading