Skip to content
Snippets Groups Projects
Commit 7b32a134 authored by Solene Tarride's avatar Solene Tarride
Browse files

Support nested entities with NestedDocument and NestedToken

parent f1e7208f
No related branches found
No related tags found
1 merge request: !8 Draft: Support nested entities
Pipeline #203240 failed
......@@ -25,3 +25,9 @@ def add_validate_parser(subcommands):
parser.add_argument(
"filepaths", help="Files to validate.", type=_check_bio_ext, nargs="*"
)
parser.add_argument(
"--allow-nested",
help="Whether to allow nested entities.",
action="store_true",
default=False,
)
......@@ -105,7 +105,7 @@ class Token:
return _make_ner_label(tag=self.tag, label=self.label)
@property
def labels(self) -> list[str]:
def char_labels(self) -> list[str]:
"""Character-level IOB labels.
Examples:
......@@ -335,7 +335,7 @@ class Document:
tags = []
for token, next_token in pairwise(self.tokens + [None]):
# Add token tags
tags.extend(token.labels)
tags.extend(token.char_labels)
if next_token and (
token.label == next_token.label and not next_token.tag == Tag.BEGINNING
):
......
"""Parse nested BIO files."""
import logging
import re
from dataclasses import dataclass, field
from itertools import pairwise
from operator import attrgetter
from pathlib import Path
from bio_parser.parse.document import Token, Tag, Span
# Pattern applied (via `.match`, i.e. anchored at the start of the line) to one
# line of a nested BIO file:
#   - `text`:   the first run of non-whitespace characters (the word);
#   - `labels`: everything after the separating whitespace, i.e. one or more
#               non-whitespace runs (the IOB labels) with optional whitespace
#               between/after them.
PARSE_BIO_LINE = re.compile(r"(?P<text>[^\s]+)\s+(?P<labels>(?:[^\s]+(?:\-[^\s]+)?\s*)+)")
"""Regex that parses a line of a BIO file"""
# Module-level logger, named after this module.
_logger = logging.getLogger(__name__)
@dataclass(slots=True)
class NestedToken:
    """Token as tokenized in the BIO document, that may contain multiple labels.

    A nested BIO line holds one word followed by one or more whitespace-separated
    IOB labels (e.g. ``Jean B-child B-name``). Each label is exposed as its own
    flat ``Token`` built from the same word and the same index.
    """

    idx: int
    """Index of the nested token in the document."""

    text: str
    """Text representation of the nested token."""

    @property
    def _data(self) -> list[str]:
        """Split the nested BIO line into one flat BIO line per label.

        Returns:
            list[str]: One ``"<word> <label>"`` string per label, in line order.

        Raises:
            ValueError: If the line is not a word followed by at least one label.

        Examples:
            >>> NestedToken(idx=0, text="Jean B-child B-name")._data
            ['Jean B-child', 'Jean B-name']
        """
        parsed = PARSE_BIO_LINE.match(self.text)
        if parsed is None:
            # `match` returns None on malformed lines (empty line, word without
            # label, leading whitespace...): fail with an explicit message.
            raise ValueError(
                f"Could not parse nested BIO line n°{self.idx}: {self.text!r}"
            )
        word = parsed.group("text")
        # Split on any whitespace run: the regex accepts tabs and repeated
        # spaces between labels, so splitting on a single " " is not enough.
        return [f"{word} {label}" for label in parsed.group("labels").split()]

    @property
    def tokens(self) -> list[Token]:
        """List of flat tokens associated to the nested token.

        Examples:
            >>> NestedToken(idx=0, text="Jean B-child B-name").tokens
            [Token(idx=0, text='Jean B-child'), Token(idx=0, text='Jean B-name')]
        """
        return [Token(idx=self.idx, text=text_repr) for text_repr in self._data]

    @property
    def word(self) -> str:
        """Text content of the nested token (taken from its first flat token).

        Examples:
            >>> NestedToken(idx=0, text="Jean B-child B-name").word
            'Jean'
        """
        return self.tokens[0].word

    @property
    def labels(self) -> list[str | None]:
        """Named entity types of this token, one per flat token.

        Examples:
            >>> NestedToken(idx=0, text="Jean B-child B-name").labels
            ['child', 'name']
        """
        return [token.label for token in self.tokens]

    @property
    def tags(self) -> list[Tag]:
        """IOB tags of this token, one per flat token.

        Examples:
            >>> NestedToken(idx=0, text="Jean B-child B-name").tags
            [<Tag.BEGINNING: 'B'>, <Tag.BEGINNING: 'B'>]
        """
        return [token.tag for token in self.tokens]

    @property
    def iob_labels(self) -> list[str]:
        """IOB labels (Tag + Entity) of this token, one per flat token.

        Examples:
            >>> NestedToken(idx=0, text="Jean B-child B-name").iob_labels
            ['B-child', 'B-name']
        """
        return [token.iob_label for token in self.tokens]

    @property
    def char_labels(self) -> list[list[str]]:
        """Character-level IOB labels, one list per flat token.

        Examples:
            >>> NestedToken(idx=0, text="Jean B-child B-name").char_labels
            [['B-child', 'I-child', 'I-child', 'I-child'], ['B-name', 'I-name', 'I-name', 'I-name']]
        """
        return [token.char_labels for token in self.tokens]

    @property
    def chars(self) -> list[str]:
        """The list of characters making up the token.

        Examples:
            >>> NestedToken(idx=0, text="Jean B-child B-name").chars
            ['J', 'e', 'a', 'n']
        """
        return self.tokens[0].chars
@dataclass(slots=True)
class NestedDocument:
"""Representation of a BIO document."""
filename: str
"""Document filename"""
bio_repr: str
"""Full BIO representation of the Document"""
nested_tokens: list[NestedToken] = field(default_factory=list)
"""List of the nested tokens in the Document"""
spans: list[Span] = field(default_factory=list)
"""List of the spans in the Document"""
def __post_init__(self):
"""Parses the tokens and the entity spans in the document."""
current_spans : dict[str, Span] = {} # Keep track of current spans by category
for idx, line in enumerate(self.bio_repr.splitlines()):
try:
nested_token = NestedToken(idx=idx, text=line)
self.nested_tokens.append(nested_token)
for idx, token in enumerate(nested_token.tokens):
# Build spans
match token.tag:
case Tag.OUTSIDE:
# Close all current spans
for span in current_spans.values():
self.spans.append(span)
current_spans = {}
case Tag.INSIDE:
if token.label in current_spans:
# Continue current span
current_spans[token.label].add_token(token)
else:
Exception, f"Found `{Tag.INSIDE}` before `{Tag.BEGINNING}`."
case Tag.BEGINNING:
# End existing span if necessary
if token.label in current_spans:
span = current_spans.pop(token.label)
self.spans.append(span)
# Start a new span
current_spans[token.label] = Span()
current_spans[token.label].add_token(token)
except AssertionError as e:
_logger.error(f"Error on token n°{token.idx}: {e}")
raise Exception from e
# Last spans
for span in current_spans.values():
self.spans.append(span)
@property
def words(self) -> list[str]:
"""List of words making up the document."""
return list(map(attrgetter("word"), self.nested_tokens))
@property
def entities(self) -> list[tuple[str, str]]:
"""List of entities making up the document."""
return list(
map(attrgetter("label", "text"), filter(attrgetter("label"), self.spans)),
)
@property
def word_entities(self) -> list[tuple[str, str]]:
"""List of entities in the words making up the document."""
return list(
map(attrgetter("labels", "word"), filter(lambda x: x.labels[0] != None, self.nested_tokens)),
)
@property
def text(self) -> str:
"""Join every word of the span by a whitespace."""
return " ".join(map(attrgetter("word"), self.nested_tokens))
@property
def chars(self) -> list[str]:
r"""Characters making up the token.
Examples:
>>> Document(bio_repr="I B-Animal\nrun I-Animal").chars
['I', ' ', 'r', 'u', 'n']
"""
return list(self.text)
@classmethod
def from_file(cls, filepath: Path) -> "NestedDocument":
"""Load a Document from a IOB file.
Args:
filepath (Path): Path to the file to load.
Returns:
Document: Parsed document
"""
return NestedDocument(filepath.stem, filepath.read_text())
......@@ -5,11 +5,12 @@ from dataclasses import asdict
from pathlib import Path
from bio_parser.parse.document import Document
from bio_parser.parse.nested_document import NestedDocument
logger = logging.getLogger(__name__)
def run(filepaths: list[Path]) -> None:
def run(filepaths: list[Path], allow_nested=False) -> None:
"""Validate the construction of multiple BIO files.
Args:
......@@ -18,7 +19,11 @@ def run(filepaths: list[Path]) -> None:
for filepath in filepaths:
logger.info(f"Parsing file @ `{filepath}`")
try:
doc = Document.from_file(filepath)
doc = (
NestedDocument.from_file(filepath)
if allow_nested
else Document.from_file(filepath)
)
filepath.with_suffix(".json").write_text(json.dumps(asdict(doc), indent=2))
except Exception as e:
logger.error(f"Could not load the file @ `{filepath}`: {e}")
......
Charles B-child B-name
né I-child
à I-child
Beaune I-child B-location
en I-child
1836 I-child B-date
père O
Jean B-father B-name
Bigre I-father B-surname
charpentier I-father B-occupation
de I-father
cette I-father B-location
paroisse I-father I-location
mère O
Marie B-mother B-name
......@@ -7,7 +7,7 @@ FILEPATH = DATA_DIR / "valid.bio"
@pytest.fixture()
def document():
def document() -> Document:
return Document.from_file(FILEPATH)
......@@ -131,7 +131,7 @@ def test_parse_token(document: Document):
assert token.iob_label == "B-GPE"
# Check labels
assert token.labels == ["B-GPE", "I-GPE", "I-GPE"]
assert token.char_labels == ["B-GPE", "I-GPE", "I-GPE"]
# Check chars
assert token.chars == ["S", "a", "n"]
......@@ -152,7 +152,7 @@ def test_parse_token(document: Document):
assert token.iob_label == "I-GPE"
# Check labels
assert token.labels == [
assert token.char_labels == [
"I-GPE",
"I-GPE",
"I-GPE",
......@@ -183,7 +183,7 @@ def test_parse_token(document: Document):
assert token.iob_label == "O"
# Check labels
assert token.labels == ["O", "O", "O", "O", "O", "O"]
assert token.char_labels == ["O", "O", "O", "O", "O", "O"]
# Check chars
assert token.chars == ["r", "o", "b", "o", "t", "s"]
......
import pytest
from bio_parser.parse.document import Document, Span, Tag, Token, _make_ner_label
from bio_parser.parse.nested_document import NestedDocument, NestedToken
from tests.parse import DATA_DIR
# Sample nested BIO file shared by the tests of this module.
FILEPATH = DATA_DIR / "valid_nested.bio"


@pytest.fixture()
def nested_document() -> NestedDocument:
    """Provide the document parsed from the nested BIO sample file."""
    parsed_document = NestedDocument.from_file(FILEPATH)
    return parsed_document
def test_parse_document(nested_document: NestedDocument):
    """Check words, entities, word entities, text and chars of the nested sample."""
    # Check words
    # NOTE: `text` is the whitespace-join of `words`, so the second word has to
    # be "né" (as in the sample file) for both expectations to hold together.
    assert nested_document.words == [
        "Charles",
        "né",
        "à",
        "Beaune",
        "en",
        "1836",
        "père",
        "Jean",
        "Bigre",
        "charpentier",
        "de",
        "cette",
        "paroisse",
        "mère",
        "Marie",
    ]

    # Check entities
    assert nested_document.entities == [
        ("child", "Charles né à Beaune en 1836"),
        ("name", "Charles"),
        ("location", "Beaune"),
        ("date", "1836"),
        ("father", "Jean Bigre charpentier de cette paroisse"),
        ("name", "Jean"),
        ("surname", "Bigre"),
        ("occupation", "charpentier"),
        ("location", "cette paroisse"),
        ("mother", "Marie"),
        ("name", "Marie"),
    ]

    # Check word entities
    assert nested_document.word_entities == [
        (["child", "name"], "Charles"),
        (["child"], "né"),
        (["child"], "à"),
        (["child", "location"], "Beaune"),
        (["child"], "en"),
        (["child", "date"], "1836"),
        (["father", "name"], "Jean"),
        (["father", "surname"], "Bigre"),
        (["father", "occupation"], "charpentier"),
        (["father"], "de"),
        (["father", "location"], "cette"),
        (["father", "location"], "paroisse"),
        (["mother", "name"], "Marie"),
    ]

    # Check text
    expected_text = (
        "Charles né à Beaune en 1836 père Jean Bigre charpentier "
        "de cette paroisse mère Marie"
    )
    assert nested_document.text == expected_text

    # Check chars
    assert nested_document.chars == list(expected_text)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment