Skip to content
Snippets Groups Projects

Convert NER prediction to BIO format

Merged Eva Bardou requested to merge convert-to-BIO into main
All threads resolved!
Files
2
dan/bio.py 0 → 100644
+ 85
0
import logging
import re
from typing import Dict, List
from dan.utils import EntityType
logger = logging.getLogger(__name__)
def convert(text: str, ner_tokens: Dict[str, EntityType]) -> str:
    """Convert a text annotated with NER tokens into BIO-formatted lines.

    Every whitespace-separated word of ``text`` that is not itself a NER token
    becomes one output line ``"<word> <tag>"`` where the tag is ``O`` (outside
    any entity), ``B-<name>`` (first word of an entity) or ``I-<name>``
    (following word of the same entity). NER tokens are never emitted.

    Entities may be nested: open entities are kept on a stack and words are
    tagged with the innermost one. After a nested entity is closed, the words
    that follow are tagged ``I-`` for the parent entity. An entity type whose
    ending token is the empty string is never closed explicitly; it stays the
    current entity until another starting token is met.

    :param text: Text containing the starting/ending tokens of ``ner_tokens``.
    :param ner_tokens: Mapping from entity name to an object exposing the
        ``start`` and ``end`` token strings of that entity.
    :return: The BIO lines joined with newlines (empty string if no word).
    :raises AssertionError: If an ending token does not close the innermost
        open entity.
    """
    # Mapping to find a starting token for an ending token efficiently
    mapping_end_start: Dict[str, str] = {
        entity_type.end: entity_type.start for entity_type in ner_tokens.values()
    }
    # Mapping to find the entity name for a starting token efficiently
    mapping_start_name: Dict[str, str] = {
        entity_type.start: name for name, entity_type in ner_tokens.items()
    }

    # Empty strings stand for "no token" and must not reach the pattern.
    special_tokens = {
        token for token in (*mapping_start_name, *mapping_end_start) if token
    }

    if special_tokens:
        # Surround every NER token with spaces so that `split` isolates it.
        # Tokens are escaped (they may be regex metacharacters) and tried
        # longest first so a token is never shadowed by one of its prefixes.
        ordered_tokens = sorted(special_tokens, key=lambda tok: (-len(tok), tok))
        tokens_spacing = re.compile(
            "(" + "|".join(re.escape(tok) for tok in ordered_tokens) + ")"
        )
        text = tokens_spacing.sub(r" \1 ", text)

    iob: List[str] = []  # List of IOB formatted strings
    entity_types: List[str] = []  # Stack of starting tokens of open entities
    inside: bool = False  # Whether the current entity already got its B- tag

    for token in text.split():
        # Encountering a starting token
        if token in mapping_start_name:
            entity_types.append(token)
            # The next word begins a new entity
            inside = False
            continue

        # Encountering an ending token (`split` never yields "", so the
        # placeholder used when ending tokens are disabled cannot match)
        if token in mapping_end_start:
            if not entity_types:
                logger.warning(
                    f"Missing starting token for ending token {token}, skipping the entity"
                )
                continue

            # Making sure this ending token closes the current entity.
            # Raised explicitly so the check survives `python -O`.
            if entity_types[-1] != mapping_end_start[token]:
                raise AssertionError(
                    f"Ending token {token} doesn't match the starting token {entity_types[-1]}"
                )

            # Removing the current entity from the stack as it is its end
            entity_types.pop()

            # If there are still open entities, we continue in the parent one
            # Else, we are not in any entity anymore
            inside = bool(entity_types)
            continue

        # The token is not part of an entity
        if not entity_types:
            iob.append(f"{token} O")
            continue

        # The token is part of at least one entity
        entity_name: str = mapping_start_name[entity_types[-1]]

        if inside:
            # Inside the same entity
            iob.append(f"{token} I-{entity_name}")
            continue

        # Starting a new entity
        iob.append(f"{token} B-{entity_name}")
        inside = True

    # Concatenating all formatted iob strings
    return "\n".join(iob)
Loading