diff --git a/bio_parser/parse/__init__.py b/bio_parser/parse/__init__.py index 06ae06dc92286f48f1a6652f6d3d3244933c70d4..2271f9c29b99d23209ecaf85e9998d803339c285 100644 --- a/bio_parser/parse/__init__.py +++ b/bio_parser/parse/__init__.py @@ -25,3 +25,9 @@ def add_validate_parser(subcommands): parser.add_argument( "filepaths", help="Files to validate.", type=_check_bio_ext, nargs="*" ) + parser.add_argument( + "--allow-nested", + help="Whether to allow nested entities.", + action="store_true", + default=False, + ) diff --git a/bio_parser/parse/document.py b/bio_parser/parse/document.py index f06e24c6da0aea2c7cbe8dcea19cfa7c0af0cc91..66c389e0ffb84e3bc9adb103d39144308403a3cd 100644 --- a/bio_parser/parse/document.py +++ b/bio_parser/parse/document.py @@ -105,7 +105,7 @@ class Token: return _make_ner_label(tag=self.tag, label=self.label) @property - def labels(self) -> list[str]: + def char_labels(self) -> list[str]: """Character-level IOB labels. Examples: @@ -335,7 +335,7 @@ class Document: tags = [] for token, next_token in pairwise(self.tokens + [None]): # Add token tags - tags.extend(token.labels) + tags.extend(token.char_labels) if next_token and ( token.label == next_token.label and not next_token.tag == Tag.BEGINNING ): diff --git a/bio_parser/parse/nested_document.py b/bio_parser/parse/nested_document.py new file mode 100644 index 0000000000000000000000000000000000000000..3174959a02ad5bfa9bf01f95cf765e3b168dda0b --- /dev/null +++ b/bio_parser/parse/nested_document.py @@ -0,0 +1,207 @@ +"""Parse nested BIO files.""" +import logging +import re +from dataclasses import dataclass, field +from itertools import pairwise +from operator import attrgetter +from pathlib import Path +from bio_parser.parse.document import Token, Tag, Span + +PARSE_BIO_LINE = re.compile(r"(?P<text>[^\s]+)\s+(?P<labels>(?:[^\s]+(?:\-[^\s]+)?\s*)+)") + +"""Regex that parses a line of a BIO file""" + +_logger = logging.getLogger(__name__) + + +@dataclass(slots=True) +class NestedToken: + 
"""Token as tokenized in the BIO document, that may contain multiple labels.""" + + idx: int + """Index of the nested token in the document.""" + text: str + """Text representation of the nested token.""" + + @property + def _data(self) -> list[str]: + """Nested BIO line parsing.""" + parsed_global = PARSE_BIO_LINE.match(self.text) + text = parsed_global.group("text") + labels = list(parsed_global.group("labels").strip().split(" ")) + return [f"{text} {label}" for label in labels] + + @property + def tokens(self) -> list[Token]: + """List of flat tokens associated to the nested token. + + Examples: + >>> NestedToken(idx=0, text="Jean B-child B-name").tokens + [Token(idx=0, text='Jean B-child'), Token(idx=0, text='Jean B-name')] + """ + return [Token(idx=self.idx, text=text_repr) for text_repr in self._data] + + @property + def word(self) -> str: + """Text content of the nested token. + + Examples: + >>> NestedToken(idx=0, text="Jean B-child B-name").word + 'Jean' + """ + return self.tokens[0].word + + @property + def labels(self) -> list[str | None]: + """Named entity type of this token. + + Examples: + >>> NestedToken(idx=0, text="Jean B-child B-name").label + ["child", "name"] + """ + return [token.label for token in self.tokens] + + @property + def tags(self) -> list[Tag]: + """IOB tags of named entity tag. + + Examples: + >>> NestedToken(idx=0, text="Jean B-child B-name").tags + [<Tag.BEGINNING: 'B'>, <Tag.BEGINNING: 'B'>] + """ + return [token.tag for token in self.tokens] + + @property + def iob_labels(self) -> list[str]: + """IOB label (Tag + Entity). + + Examples: + >>> NestedToken(idx=0, text="Jean B-child B-name").iob_label + ['B-child', 'B-name'] + """ + return [token.iob_label for token in self.tokens] + + @property + def char_labels(self) -> list[list[str]]: + """Character-level IOB labels. 
+ + Examples: + >>> NestedToken(idx=0, text="Jean B-child B-name").char_labels + [['B-child', 'I-child', 'I-child', 'I-child'], ['B-name', 'I-name', 'I-name', 'I-name']] + """ + return [token.char_labels for token in self.tokens] + + @property + def chars(self) -> list[str]: + """The list of characters making up the token. + + Examples: + >>> NestedToken(idx=0, text="Jean B-child B-name").chars + ['J', 'e', 'a', 'n'] + """ + return self.tokens[0].chars + + +@dataclass(slots=True) +class NestedDocument: + """Representation of a BIO document.""" + + filename: str + """Document filename""" + bio_repr: str + """Full BIO representation of the Document""" + nested_tokens: list[NestedToken] = field(default_factory=list) + """List of the nested tokens in the Document""" + + spans: list[Span] = field(default_factory=list) + """List of the spans in the Document""" + + def __post_init__(self): + """Parses the tokens and the entity spans in the document.""" + current_spans: dict[str, Span] = {} # Keep track of current spans by category + for idx, line in enumerate(self.bio_repr.splitlines()): + try: + nested_token = NestedToken(idx=idx, text=line) + self.nested_tokens.append(nested_token) + + for token in nested_token.tokens: + # Build spans + match token.tag: + case Tag.OUTSIDE: + # Close all current spans + for span in current_spans.values(): + self.spans.append(span) + current_spans = {} + + case Tag.INSIDE: + if token.label in current_spans: + # Continue current span + current_spans[token.label].add_token(token) + + else: + raise AssertionError(f"Found `{Tag.INSIDE}` before `{Tag.BEGINNING}`.") 
+ + case Tag.BEGINNING: + # End existing span if necessary + if token.label in current_spans: + span = current_spans.pop(token.label) + self.spans.append(span) + + # Start a new span + current_spans[token.label] = Span() + current_spans[token.label].add_token(token) + + except AssertionError as e: + _logger.error(f"Error on token n°{token.idx}: {e}") + raise Exception from e + + # Last spans + for span in current_spans.values(): + self.spans.append(span) + + @property + def words(self) -> list[str]: + """List of words making up the document.""" + return list(map(attrgetter("word"), self.nested_tokens)) + + @property + def entities(self) -> list[tuple[str, str]]: + """List of entities making up the document.""" + return list( + map(attrgetter("label", "text"), filter(attrgetter("label"), self.spans)), + ) + + @property + def word_entities(self) -> list[tuple[str, str]]: + """List of entities in the words making up the document.""" + return list( + map(attrgetter("labels", "word"), filter(lambda x: x.labels[0] is not None, self.nested_tokens)), + ) + + @property + def text(self) -> str: + """Join every word of the document with a whitespace.""" + return " ".join(map(attrgetter("word"), self.nested_tokens)) + + + @property + def chars(self) -> list[str]: + r"""Characters making up the document. + + Examples: + >>> NestedDocument(filename="doc", bio_repr="I B-Animal\nrun I-Animal").chars + ['I', ' ', 'r', 'u', 'n'] + """ + return list(self.text) + + @classmethod + def from_file(cls, filepath: Path) -> "NestedDocument": + """Load a NestedDocument from an IOB file. + + Args: + filepath (Path): Path to the file to load. 
+ + Returns: + Document: Parsed document + """ + return NestedDocument(filepath.stem, filepath.read_text()) diff --git a/bio_parser/parse/validate.py b/bio_parser/parse/validate.py index 6f83192e82329f741ca5071ec912a6dc1a9c66c2..7d09b40912a1c7c0da4f1a6a70fdb78ceccd354a 100644 --- a/bio_parser/parse/validate.py +++ b/bio_parser/parse/validate.py @@ -5,11 +5,12 @@ from dataclasses import asdict from pathlib import Path from bio_parser.parse.document import Document +from bio_parser.parse.nested_document import NestedDocument logger = logging.getLogger(__name__) -def run(filepaths: list[Path]) -> None: +def run(filepaths: list[Path], allow_nested=False) -> None: """Validate the construction of multiple BIO files. Args: @@ -18,7 +19,11 @@ def run(filepaths: list[Path]) -> None: for filepath in filepaths: logger.info(f"Parsing file @ `{filepath}`") try: - doc = Document.from_file(filepath) + doc = ( + NestedDocument.from_file(filepath) + if allow_nested + else Document.from_file(filepath) + ) filepath.with_suffix(".json").write_text(json.dumps(asdict(doc), indent=2)) except Exception as e: logger.error(f"Could not load the file @ `{filepath}`: {e}") diff --git a/tests/fixtures/parse/valid_nested.bio b/tests/fixtures/parse/valid_nested.bio new file mode 100644 index 0000000000000000000000000000000000000000..006431374ed3c4bb637f1e1452f76878341ad406 --- /dev/null +++ b/tests/fixtures/parse/valid_nested.bio @@ -0,0 +1,15 @@ +Charles B-child B-name +né I-child +à I-child +Beaune I-child B-location +en I-child +1836 I-child B-date +père O +Jean B-father B-name +Bigre I-father B-surname +charpentier I-father B-occupation +de I-father +cette I-father B-location +paroisse I-father I-location +mère O +Marie B-mother B-name diff --git a/tests/parse/test_document.py b/tests/parse/test_document.py index 565f2a1d5dd5489ce78a66f1bf62057e90cf35de..6b090a944fc4e1334728b83731d47a3e395eeabe 100644 --- a/tests/parse/test_document.py +++ b/tests/parse/test_document.py @@ -7,7 +7,7 @@ 
FILEPATH = DATA_DIR / "valid.bio" @pytest.fixture() -def document(): +def document() -> Document: return Document.from_file(FILEPATH) @@ -131,7 +131,7 @@ def test_parse_token(document: Document): assert token.iob_label == "B-GPE" # Check labels - assert token.labels == ["B-GPE", "I-GPE", "I-GPE"] + assert token.char_labels == ["B-GPE", "I-GPE", "I-GPE"] # Check chars assert token.chars == ["S", "a", "n"] @@ -152,7 +152,7 @@ def test_parse_token(document: Document): assert token.iob_label == "I-GPE" # Check labels - assert token.labels == [ + assert token.char_labels == [ "I-GPE", "I-GPE", "I-GPE", @@ -183,7 +183,7 @@ def test_parse_token(document: Document): assert token.iob_label == "O" # Check labels - assert token.labels == ["O", "O", "O", "O", "O", "O"] + assert token.char_labels == ["O", "O", "O", "O", "O", "O"] # Check chars assert token.chars == ["r", "o", "b", "o", "t", "s"] diff --git a/tests/parse/test_nested_document.py b/tests/parse/test_nested_document.py new file mode 100644 index 0000000000000000000000000000000000000000..8bb7d211912c2f2f7a50fcadc21732088125a51e --- /dev/null +++ b/tests/parse/test_nested_document.py @@ -0,0 +1,56 @@ +import pytest +from bio_parser.parse.document import Document, Span, Tag, Token, _make_ner_label +from bio_parser.parse.nested_document import NestedDocument, NestedToken + +from tests.parse import DATA_DIR + +FILEPATH = DATA_DIR / "valid_nested.bio" + + +@pytest.fixture() +def nested_document() -> NestedDocument: + return NestedDocument.from_file(FILEPATH) + + +def test_parse_document(nested_document: NestedDocument): + # Check words + assert nested_document.words == ["Charles", "né", "à ", "Beaune", "en", "1836", "père", "Jean", "Bigre", "charpentier", "de", "cette", "paroisse", "mère", "Marie"] + + # Check entities + assert nested_document.entities == [ + ("child", "Charles né à Beaune en 1836"), + ("name", "Charles"), + ("location", "Beaune"), + ("date", "1836"), + ("father", "Jean Bigre charpentier de cette 
paroisse"), + ("name", "Jean"), + ("surname", "Bigre"), + ("occupation", "charpentier"), + ("location", "cette paroisse"), + ("mother", "Marie"), + ("name", "Marie")] + + # Check word entities + assert nested_document.word_entities == [ + (["child", "name"], "Charles"), + (["child"], "né"), (["child"], "à "), + (["child", "location"], "Beaune"), + (["child"], "en"), + (["child", "date"], "1836"), + (["father", "name"], "Jean"), + (["father", "surname"], "Bigre"), + (["father", "occupation"], "charpentier"), + (["father"], "de"), + (["father", "location"], "cette"), + (["father", "location"], "paroisse"), + (["mother", "name"], "Marie") + ] + + # Check text + assert nested_document.text == "Charles né à Beaune en 1836 père Jean Bigre charpentier de cette paroisse mère Marie" + + # Check chars + assert nested_document.chars == list( + "Charles né à Beaune en 1836 père Jean Bigre charpentier de cette paroisse mère Marie" + ) +