From 9a1ec89cfc05ac156cddb5cd0de40464b3136865 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sol=C3=A8ne=20Tarride?= <starride@teklia.com> Date: Wed, 18 Dec 2024 23:28:07 +0100 Subject: [PATCH] Build hierarchy and nested spans --- bio_parser/parse/__init__.py | 8 ++ bio_parser/parse/nested_document.py | 68 ++++++++- bio_parser/parse/validate.py | 5 +- config.yml | 10 ++ requirements.txt | 1 + tests/parse/test_nested_document.py | 6 +- tests/parse/test_validate.py | 212 ++++++++++++++++++++++++++++ 7 files changed, 303 insertions(+), 7 deletions(-) create mode 100644 config.yml diff --git a/bio_parser/parse/__init__.py b/bio_parser/parse/__init__.py index 2271f9c..6a75105 100644 --- a/bio_parser/parse/__init__.py +++ b/bio_parser/parse/__init__.py @@ -5,6 +5,8 @@ Validate a given BIO file. from argparse import ArgumentParser from pathlib import Path +import yaml + from bio_parser.parse.validate import run @@ -14,6 +16,11 @@ def _check_bio_ext(filename: str) -> Path: return filepath +def _load_yaml(config: str) -> Path: + with Path(config).open() as file: + return yaml.safe_load(file) + + def add_validate_parser(subcommands): parser: ArgumentParser = subcommands.add_parser( "validate", @@ -25,6 +32,7 @@ def add_validate_parser(subcommands): parser.add_argument( "filepaths", help="Files to validate.", type=_check_bio_ext, nargs="*" ) + parser.add_argument("config", help="Config with entity hierarchy.", type=_load_yaml) parser.add_argument( "--allow-nested", help="Whether to allow nested entities.", diff --git a/bio_parser/parse/nested_document.py b/bio_parser/parse/nested_document.py index 5e497cc..465dd5f 100644 --- a/bio_parser/parse/nested_document.py +++ b/bio_parser/parse/nested_document.py @@ -4,6 +4,7 @@ import re from dataclasses import dataclass, field from operator import attrgetter from pathlib import Path +from typing import Any from bio_parser.parse.document import Span, Tag, Token @@ -110,16 +111,26 @@ class NestedDocument: filename: str """Document filename""" + bio_repr: str """Full BIO representation of the Document""" + + entity_hierarchy: dict[int, list[str]] + """Hierarchy between entities""" + nested_tokens: list[NestedToken] = field(default_factory=list) """List of the nested tokens in the Document""" spans: list[Span] = field(default_factory=list) """List of the spans in the Document""" - def __post_init__(self): - """Parses the tokens and the entity spans in the document.""" + nested_spans: list[dict[str, Any]] = field(default_factory=list) + """List of the nested spans in the Document""" + + hierarchy: list[dict[str, Any]] = field(default_factory=list) + """Hierarchy required for metrics""" + + def _build_spans(self): current_spans: dict[str, Span] = {} # Keep track of current spans by category for idx, line in enumerate(self.bio_repr.splitlines()): try: @@ -160,6 +171,55 @@ class NestedDocument: for span in current_spans.values(): self.spans.append(span) + def _build_nested_spans(self) -> list[dict[str, Span | list[Span]]]: + """Span hierarchy based on entity config.""" + + def get_span_level(span): + for level, categories in self.entity_hierarchy.items(): + if span.label in categories: + return level + return + + def is_inside(span, parent_span): + return ( + (span.idx >= parent_span.idx) + and (span.end <= parent_span.end) + and (parent_span != span) + ) + + def get_children(parent, candidates): + return [span for span in candidates if is_inside(span, parent)] + + parent_spans = [span for span in self.spans if get_span_level(span) == 0] + + self.nested_spans = [ + {"parent": span, "children": get_children(span, self.spans)} + for span in parent_spans + ] + + def _build_hierarchy(self) -> None: + self.hierarchy = [ + { + "category": span["parent"].label, + "children": [ + {"category": child.label, "children": child.text} + for child in span["children"] + ], + } + for span in self.nested_spans + ] + + def __post_init__(self): + """Parses the tokens and the entity spans in the document.""" + # Build spans + self._build_spans() + + # Build nested spans with hierarchy + self._build_nested_spans() + + # Build a simple hierarchy + self._build_hierarchy() + @property def words(self) -> list[str]: """List of words making up the document.""" @@ -198,7 +258,7 @@ class NestedDocument: return list(self.text) @classmethod - def from_file(cls, filepath: Path) -> "NestedDocument": + def from_file(cls, filepath: Path, config: dict) -> "NestedDocument": """Load a Document from a IOB file. Args: @@ -207,4 +267,4 @@ class NestedDocument: Returns: Document: Parsed document """ - return NestedDocument(filepath.stem, filepath.read_text()) + return NestedDocument(filepath.stem, filepath.read_text(), config) diff --git a/bio_parser/parse/validate.py b/bio_parser/parse/validate.py index 7d09b40..d0c0fc3 100644 --- a/bio_parser/parse/validate.py +++ b/bio_parser/parse/validate.py @@ -10,7 +10,7 @@ from bio_parser.parse.nested_document import NestedDocument logger = logging.getLogger(__name__) -def run(filepaths: list[Path], allow_nested=False) -> None: +def run(filepaths: list[Path], config={}, allow_nested=False) -> None: """Validate the construction of multiple BIO files. Args: @@ -20,11 +20,12 @@ def run(filepaths: list[Path], allow_nested=False) -> None: logger.info(f"Parsing file @ `{filepath}`") try: doc = ( - NestedDocument.from_file(filepath) + NestedDocument.from_file(filepath, config) if allow_nested else Document.from_file(filepath) ) filepath.with_suffix(".json").write_text(json.dumps(asdict(doc), indent=2)) + except Exception as e: logger.error(f"Could not load the file @ `{filepath}`: {e}") logger.info(f"The file @ `{filepath}` is valid!") diff --git a/config.yml b/config.yml new file mode 100644 index 0000000..0e66326 --- /dev/null +++ b/config.yml @@ -0,0 +1,10 @@ +0: + - child + - father + - mother +1: + - name + - surname + - surname + - occupation + - date diff --git a/requirements.txt b/requirements.txt index 9bc8586..a294442 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ +pyaml==24.12.1 rich==13.7.0 diff --git a/tests/parse/test_nested_document.py b/tests/parse/test_nested_document.py index 658b775..e275321 100644 --- a/tests/parse/test_nested_document.py +++ b/tests/parse/test_nested_document.py @@ -5,11 +5,15 @@ from bio_parser.parse.nested_document import NestedDocument, NestedToken from tests.parse import DATA_DIR FILEPATH = DATA_DIR / "valid_nested.bio" +CONFIG = { + 0: ["child", "father", "mother"], + 1: ["name", "surname", "occupation", "location", "date"], +} @pytest.fixture() def nested_document() -> NestedDocument: - return NestedDocument.from_file(FILEPATH) + return NestedDocument.from_file(FILEPATH, CONFIG) def test_parse_document(nested_document: NestedDocument): diff --git a/tests/parse/test_validate.py b/tests/parse/test_validate.py index b4e4076..6f57330 100644 --- a/tests/parse/test_validate.py +++ b/tests/parse/test_validate.py @@ -39,3 +39,215 @@ def test_valid(): # Cleanup output.unlink() + + +def test_valid_nested(): + filepath = DATA_DIR / "valid_nested.bio" + config = { + 0: ["child", "father", "mother"], + 1: ["name", "surname", "occupation", "location", "date"], + } + + validate([filepath], config, allow_nested=True) + + # A JSON file should have been generated + output = filepath.with_suffix(".json") + assert output.exists() + + # Check content of JSON + assert json.loads(output.read_text()) == { + "filename": "valid_nested", + "bio_repr": "Charles B-child B-name\nné I-child\nà I-child\nBeaune I-child B-location\nen I-child\n1836 I-child B-date\npère O\nJean B-father B-name\nBigre I-father B-surname\ncharpentier I-father B-occupation\nde I-father\ncette I-father B-location\nparoisse I-father I-location\nmère O\nMarie B-mother B-name\n", + "entity_hierarchy": { + "0": ["child", "father", "mother"], + "1": ["name", "surname", "occupation", "location", "date"], + }, + "nested_tokens": [ + {"idx": 0, "text": "Charles B-child B-name"}, + {"idx": 1, "text": "né I-child"}, + {"idx": 2, "text": "à I-child"}, + {"idx": 3, "text": "Beaune I-child B-location"}, + {"idx": 4, "text": "en I-child"}, + {"idx": 5, "text": "1836 I-child B-date"}, + {"idx": 6, "text": "père O"}, + {"idx": 7, "text": "Jean B-father B-name"}, + {"idx": 8, "text": "Bigre I-father B-surname"}, + {"idx": 9, "text": "charpentier I-father B-occupation"}, + {"idx": 10, "text": "de I-father"}, + {"idx": 11, "text": "cette I-father B-location"}, + {"idx": 12, "text": "paroisse I-father I-location"}, + {"idx": 13, "text": "mère O"}, + {"idx": 14, "text": "Marie B-mother B-name"}, + ], + "spans": [ + { + "tokens": [ + {"idx": 0, "text": "Charles B-child"}, + {"idx": 1, "text": "né I-child"}, + {"idx": 2, "text": "à I-child"}, + {"idx": 3, "text": "Beaune I-child"}, + {"idx": 4, "text": "en I-child"}, + {"idx": 5, "text": "1836 I-child"}, + ] + }, + {"tokens": [{"idx": 0, "text": "Charles B-name"}]}, + {"tokens": [{"idx": 3, "text": "Beaune B-location"}]}, + {"tokens": [{"idx": 5, "text": "1836 B-date"}]}, + { + "tokens": [ + {"idx": 7, "text": "Jean B-father"}, + {"idx": 8, "text": "Bigre I-father"}, + {"idx": 9, "text": "charpentier I-father"}, + {"idx": 10, "text": "de I-father"}, + {"idx": 11, "text": "cette I-father"}, + {"idx": 12, "text": "paroisse I-father"}, + ] + }, + {"tokens": [{"idx": 7, "text": "Jean B-name"}]}, + {"tokens": [{"idx": 8, "text": "Bigre B-surname"}]}, + {"tokens": [{"idx": 9, "text": "charpentier B-occupation"}]}, + { + "tokens": [ + {"idx": 11, "text": "cette B-location"}, + {"idx": 12, "text": "paroisse I-location"}, + ] + }, + {"tokens": [{"idx": 14, "text": "Marie B-mother"}]}, + {"tokens": [{"idx": 14, "text": "Marie B-name"}]}, + ], + "nested_spans": [ + { + "parent": { + "tokens": [ + {"idx": 0, "text": "Charles B-child"}, + {"idx": 1, "text": "né I-child"}, + {"idx": 2, "text": "à I-child"}, + {"idx": 3, "text": "Beaune I-child"}, + {"idx": 4, "text": "en I-child"}, + {"idx": 5, "text": "1836 I-child"}, + ] + }, + "children": [ + {"tokens": [{"idx": 0, "text": "Charles B-name"}]}, + {"tokens": [{"idx": 3, "text": "Beaune B-location"}]}, + {"tokens": [{"idx": 5, "text": "1836 B-date"}]}, + ], + }, + { + "parent": { + "tokens": [ + {"idx": 7, "text": "Jean B-father"}, + {"idx": 8, "text": "Bigre I-father"}, + {"idx": 9, "text": "charpentier I-father"}, + {"idx": 10, "text": "de I-father"}, + {"idx": 11, "text": "cette I-father"}, + {"idx": 12, "text": "paroisse I-father"}, + ] + }, + "children": [ + {"tokens": [{"idx": 7, "text": "Jean B-name"}]}, + {"tokens": [{"idx": 8, "text": "Bigre B-surname"}]}, + {"tokens": [{"idx": 9, "text": "charpentier B-occupation"}]}, + { + "tokens": [ + {"idx": 11, "text": "cette B-location"}, + {"idx": 12, "text": "paroisse I-location"}, + ] + }, + ], + }, + { + "parent": {"tokens": [{"idx": 14, "text": "Marie B-mother"}]}, + "children": [{"tokens": [{"idx": 14, "text": "Marie B-name"}]}], + }, + ], + "hierarchy": [ + { + "category": "child", + "children": [ + {"category": "name", "children": "Charles"}, + {"category": "location", "children": "Beaune"}, + {"category": "date", "children": "1836"}, + ], + }, + { + "category": "father", + "children": [ + {"category": "name", "children": "Jean"}, + {"category": "surname", "children": "Bigre"}, + {"category": "occupation", "children": "charpentier"}, + {"category": "location", "children": "cette paroisse"}, + ], + }, + { + "category": "mother", + "children": [{"category": "name", "children": "Marie"}], + }, + ], + } + + # Cleanup + output.unlink() + + +def test_valid_not_nested(): + filepath = DATA_DIR / "valid_nested.bio" + config = { + 0: ["child", "father", "mother"], + 1: ["name", "surname", "occupation", "location", "date"], + } + + validate([filepath], config, allow_nested=False) + + # A JSON file should have been generated + output = filepath.with_suffix(".json") + assert output.exists() + + # Check content of JSON + assert json.loads(output.read_text()) == { + "filename": "valid_nested", + "bio_repr": "Charles B-child B-name\nné I-child\nà I-child\nBeaune I-child B-location\nen I-child\n1836 I-child B-date\npère O\nJean B-father B-name\nBigre I-father B-surname\ncharpentier I-father B-occupation\nde I-father\ncette I-father B-location\nparoisse I-father I-location\nmère O\nMarie B-mother B-name\n", + "tokens": [ + {"idx": 0, "text": "Charles B-child B-name"}, + {"idx": 1, "text": "né I-child"}, + {"idx": 2, "text": "à I-child"}, + {"idx": 3, "text": "Beaune I-child B-location"}, + {"idx": 4, "text": "en I-child"}, + {"idx": 5, "text": "1836 I-child B-date"}, + {"idx": 6, "text": "père O"}, + {"idx": 7, "text": "Jean B-father B-name"}, + {"idx": 8, "text": "Bigre I-father B-surname"}, + {"idx": 9, "text": "charpentier I-father B-occupation"}, + {"idx": 10, "text": "de I-father"}, + {"idx": 11, "text": "cette I-father B-location"}, + {"idx": 12, "text": "paroisse I-father I-location"}, + {"idx": 13, "text": "mère O"}, + {"idx": 14, "text": "Marie B-mother B-name"}, + ], + "spans": [ + { + "tokens": [ + {"idx": 0, "text": "Charles B-child B-name"}, + {"idx": 1, "text": "né I-child"}, + {"idx": 2, "text": "à I-child"}, + {"idx": 3, "text": "Beaune I-child B-location"}, + {"idx": 4, "text": "en I-child"}, + {"idx": 5, "text": "1836 I-child B-date"}, + ] + }, + { + "tokens": [ + {"idx": 7, "text": "Jean B-father B-name"}, + {"idx": 8, "text": "Bigre I-father B-surname"}, + {"idx": 9, "text": "charpentier I-father B-occupation"}, + {"idx": 10, "text": "de I-father"}, + {"idx": 11, "text": "cette I-father B-location"}, + {"idx": 12, "text": "paroisse I-father I-location"}, + ] + }, + {"tokens": [{"idx": 14, "text": "Marie B-mother B-name"}]}, + ], + } + + # Cleanup + output.unlink() -- GitLab