From 9a1ec89cfc05ac156cddb5cd0de40464b3136865 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sol=C3=A8ne=20Tarride?= <starride@teklia.com>
Date: Wed, 18 Dec 2024 23:28:07 +0100
Subject: [PATCH] Build hierarchy and nested spans

---
 bio_parser/parse/__init__.py        |   8 ++
 bio_parser/parse/nested_document.py |  68 ++++++++-
 bio_parser/parse/validate.py        |   5 +-
 config.yml                          |  10 ++
 requirements.txt                    |   1 +
 tests/parse/test_nested_document.py |   6 +-
 tests/parse/test_validate.py        | 212 ++++++++++++++++++++++++++++
 7 files changed, 303 insertions(+), 7 deletions(-)
 create mode 100644 config.yml

diff --git a/bio_parser/parse/__init__.py b/bio_parser/parse/__init__.py
index 2271f9c..6a75105 100644
--- a/bio_parser/parse/__init__.py
+++ b/bio_parser/parse/__init__.py
@@ -5,6 +5,8 @@ Validate a given BIO file.
 from argparse import ArgumentParser
 from pathlib import Path
 
+import yaml
+
 from bio_parser.parse.validate import run
 
 
@@ -14,6 +16,11 @@ def _check_bio_ext(filename: str) -> Path:
     return filepath
 
 
+def _load_yaml(config: str) -> dict:  # argparse `type=` hook: path -> parsed YAML
+    with Path(config).open(encoding="utf-8") as file:  # not the locale default
+        return yaml.safe_load(file)  # safe_load: plain Python objects only
+
+
 def add_validate_parser(subcommands):
     parser: ArgumentParser = subcommands.add_parser(
         "validate",
@@ -25,6 +32,7 @@ def add_validate_parser(subcommands):
     parser.add_argument(
         "filepaths", help="Files to validate.", type=_check_bio_ext, nargs="*"
     )
+    parser.add_argument("config", help="Config with entity hierarchy.", type=_load_yaml)
     parser.add_argument(
         "--allow-nested",
         help="Whether to allow nested entities.",
diff --git a/bio_parser/parse/nested_document.py b/bio_parser/parse/nested_document.py
index 5e497cc..465dd5f 100644
--- a/bio_parser/parse/nested_document.py
+++ b/bio_parser/parse/nested_document.py
@@ -4,6 +4,7 @@ import re
 from dataclasses import dataclass, field
 from operator import attrgetter
 from pathlib import Path
+from typing import Any
 
 from bio_parser.parse.document import Span, Tag, Token
 
@@ -110,16 +111,26 @@ class NestedDocument:
 
     filename: str
     """Document filename"""
+
     bio_repr: str
     """Full BIO representation of the Document"""
+
+    entity_hierarchy: dict[int, list[str]]
+    """Hierarchy between entities"""
+
     nested_tokens: list[NestedToken] = field(default_factory=list)
     """List of the nested tokens in the Document"""
 
     spans: list[Span] = field(default_factory=list)
     """List of the spans in the Document"""
 
-    def __post_init__(self):
-        """Parses the tokens and the entity spans in the document."""
+    nested_spans: list[dict[str, Any]] = field(default_factory=list)
+    """List of the nested spans in the Document"""
+
+    hierarchy: list[dict[str, Any]] = field(default_factory=list)
+    """Hierarchy required for metrics"""
+
+    def _build_spans(self):
         current_spans: dict[str, Span] = {}  # Keep track of current spans by category
         for idx, line in enumerate(self.bio_repr.splitlines()):
             try:
@@ -160,6 +171,55 @@ class NestedDocument:
         for span in current_spans.values():
             self.spans.append(span)
 
+    def _build_nested_spans(self) -> None:
+        """Group every span under the level-0 span that contains it.
+
+        Fills ``self.nested_spans`` with ``{"parent": ..., "children": [...]}`` dicts.
+        """
+
+        def get_span_level(span: Span) -> int | None:
+            for level, categories in self.entity_hierarchy.items():
+                if span.label in categories:
+                    return level
+            return None  # label is not listed at any level of the hierarchy
+
+        def is_inside(span: Span, parent_span: Span) -> bool:
+            # Equal start/end still counts as inside; a span is never its own child.
+            within = parent_span.idx <= span.idx and span.end <= parent_span.end
+            return within and parent_span != span
+
+        parents = [span for span in self.spans if get_span_level(span) == 0]
+        self.nested_spans = [
+            {
+                "parent": parent,
+                "children": [span for span in self.spans if is_inside(span, parent)],
+            }
+            for parent in parents
+        ]
+
+    def _build_hierarchy(self) -> None:
+        """Condense ``self.nested_spans`` into category/children dictionaries."""
+
+        def summarise(entry):
+            top = entry["parent"].label
+            kids = entry["children"]
+            leaves = [{"category": kid.label, "children": kid.text} for kid in kids]
+            return {"category": top, "children": leaves}
+
+        # One entry per parent span, in the order of self.nested_spans.
+        self.hierarchy = [summarise(entry) for entry in self.nested_spans]
+
+    def __post_init__(self):
+        """Parse the document, then derive nested spans and the hierarchy.
+
+        The stages run in a fixed order because each one reads what the
+        previous stage stored on the instance.
+        """
+        # Order: flat spans -> parent/children grouping -> simplified summary.
+        stages = (self._build_spans, self._build_nested_spans, self._build_hierarchy)
+        for stage in stages:
+            stage()
+
     @property
     def words(self) -> list[str]:
         """List of words making up the document."""
@@ -198,7 +258,7 @@ class NestedDocument:
         return list(self.text)
 
     @classmethod
-    def from_file(cls, filepath: Path) -> "NestedDocument":
+    def from_file(cls, filepath: Path, config: dict) -> "NestedDocument":
         """Load a Document from a IOB file.
 
         Args:
@@ -207,4 +267,4 @@ class NestedDocument:
         Returns:
             Document: Parsed document
         """
-        return NestedDocument(filepath.stem, filepath.read_text())
+        return NestedDocument(filepath.stem, filepath.read_text(), config)
diff --git a/bio_parser/parse/validate.py b/bio_parser/parse/validate.py
index 7d09b40..d0c0fc3 100644
--- a/bio_parser/parse/validate.py
+++ b/bio_parser/parse/validate.py
@@ -10,7 +10,7 @@ from bio_parser.parse.nested_document import NestedDocument
 logger = logging.getLogger(__name__)
 
 
-def run(filepaths: list[Path], allow_nested=False) -> None:
+def run(filepaths: list[Path], config: dict | None = None, allow_nested=False) -> None:
     """Validate the construction of multiple BIO files.
 
     Args:
@@ -20,11 +20,12 @@ def run(filepaths: list[Path], allow_nested=False) -> None:
         logger.info(f"Parsing file @ `{filepath}`")
         try:
             doc = (
-                NestedDocument.from_file(filepath)
+                NestedDocument.from_file(filepath, config or {})  # None -> {}
                 if allow_nested
                 else Document.from_file(filepath)
             )
             filepath.with_suffix(".json").write_text(json.dumps(asdict(doc), indent=2))
+
         except Exception as e:
             logger.error(f"Could not load the file @ `{filepath}`: {e}")
         logger.info(f"The file @ `{filepath}` is valid!")
diff --git a/config.yml b/config.yml
new file mode 100644
index 0000000..0e66326
--- /dev/null
+++ b/config.yml
@@ -0,0 +1,10 @@
+0:
+  - child
+  - father
+  - mother
+1:
+  - name
+  - surname
+  - location
+  - occupation
+  - date
diff --git a/requirements.txt b/requirements.txt
index 9bc8586..a294442 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,2 @@
+PyYAML==6.0.2
 rich==13.7.0
diff --git a/tests/parse/test_nested_document.py b/tests/parse/test_nested_document.py
index 658b775..e275321 100644
--- a/tests/parse/test_nested_document.py
+++ b/tests/parse/test_nested_document.py
@@ -5,11 +5,15 @@ from bio_parser.parse.nested_document import NestedDocument, NestedToken
 from tests.parse import DATA_DIR
 
 FILEPATH = DATA_DIR / "valid_nested.bio"
+CONFIG = {
+    0: ["child", "father", "mother"],
+    1: ["name", "surname", "occupation", "location", "date"],
+}
 
 
 @pytest.fixture()
 def nested_document() -> NestedDocument:
-    return NestedDocument.from_file(FILEPATH)
+    return NestedDocument.from_file(FILEPATH, CONFIG)
 
 
 def test_parse_document(nested_document: NestedDocument):
diff --git a/tests/parse/test_validate.py b/tests/parse/test_validate.py
index b4e4076..6f57330 100644
--- a/tests/parse/test_validate.py
+++ b/tests/parse/test_validate.py
@@ -39,3 +39,215 @@ def test_valid():
 
     # Cleanup
     output.unlink()
+
+
+def test_valid_nested():
+    filepath = DATA_DIR / "valid_nested.bio"
+    config = {
+        0: ["child", "father", "mother"],
+        1: ["name", "surname", "occupation", "location", "date"],
+    }
+
+    validate([filepath], config, allow_nested=True)
+
+    # Validation should have written a JSON file next to the BIO file
+    output = filepath.with_suffix(".json")
+    assert output.exists()
+
+    # The JSON must hold tokens, flat spans, nested spans and the hierarchy
+    assert json.loads(output.read_text()) == {
+        "filename": "valid_nested",
+        "bio_repr": "Charles B-child B-name\nné I-child\nà I-child\nBeaune I-child B-location\nen I-child\n1836 I-child B-date\npère O\nJean B-father B-name\nBigre I-father B-surname\ncharpentier I-father B-occupation\nde I-father\ncette I-father B-location\nparoisse I-father I-location\nmère O\nMarie B-mother B-name\n",
+        "entity_hierarchy": {  # int levels become string keys once dumped to JSON
+            "0": ["child", "father", "mother"],
+            "1": ["name", "surname", "occupation", "location", "date"],
+        },
+        "nested_tokens": [
+            {"idx": 0, "text": "Charles B-child B-name"},
+            {"idx": 1, "text": "né I-child"},
+            {"idx": 2, "text": "à I-child"},
+            {"idx": 3, "text": "Beaune I-child B-location"},
+            {"idx": 4, "text": "en I-child"},
+            {"idx": 5, "text": "1836 I-child B-date"},
+            {"idx": 6, "text": "père O"},
+            {"idx": 7, "text": "Jean B-father B-name"},
+            {"idx": 8, "text": "Bigre I-father B-surname"},
+            {"idx": 9, "text": "charpentier I-father B-occupation"},
+            {"idx": 10, "text": "de I-father"},
+            {"idx": 11, "text": "cette I-father B-location"},
+            {"idx": 12, "text": "paroisse I-father I-location"},
+            {"idx": 13, "text": "mère O"},
+            {"idx": 14, "text": "Marie B-mother B-name"},
+        ],
+        "spans": [
+            {
+                "tokens": [
+                    {"idx": 0, "text": "Charles B-child"},
+                    {"idx": 1, "text": "né I-child"},
+                    {"idx": 2, "text": "à I-child"},
+                    {"idx": 3, "text": "Beaune I-child"},
+                    {"idx": 4, "text": "en I-child"},
+                    {"idx": 5, "text": "1836 I-child"},
+                ]
+            },
+            {"tokens": [{"idx": 0, "text": "Charles B-name"}]},
+            {"tokens": [{"idx": 3, "text": "Beaune B-location"}]},
+            {"tokens": [{"idx": 5, "text": "1836 B-date"}]},
+            {
+                "tokens": [
+                    {"idx": 7, "text": "Jean B-father"},
+                    {"idx": 8, "text": "Bigre I-father"},
+                    {"idx": 9, "text": "charpentier I-father"},
+                    {"idx": 10, "text": "de I-father"},
+                    {"idx": 11, "text": "cette I-father"},
+                    {"idx": 12, "text": "paroisse I-father"},
+                ]
+            },
+            {"tokens": [{"idx": 7, "text": "Jean B-name"}]},
+            {"tokens": [{"idx": 8, "text": "Bigre B-surname"}]},
+            {"tokens": [{"idx": 9, "text": "charpentier B-occupation"}]},
+            {
+                "tokens": [
+                    {"idx": 11, "text": "cette B-location"},
+                    {"idx": 12, "text": "paroisse I-location"},
+                ]
+            },
+            {"tokens": [{"idx": 14, "text": "Marie B-mother"}]},
+            {"tokens": [{"idx": 14, "text": "Marie B-name"}]},
+        ],
+        "nested_spans": [
+            {
+                "parent": {
+                    "tokens": [
+                        {"idx": 0, "text": "Charles B-child"},
+                        {"idx": 1, "text": "né I-child"},
+                        {"idx": 2, "text": "à I-child"},
+                        {"idx": 3, "text": "Beaune I-child"},
+                        {"idx": 4, "text": "en I-child"},
+                        {"idx": 5, "text": "1836 I-child"},
+                    ]
+                },
+                "children": [
+                    {"tokens": [{"idx": 0, "text": "Charles B-name"}]},
+                    {"tokens": [{"idx": 3, "text": "Beaune B-location"}]},
+                    {"tokens": [{"idx": 5, "text": "1836 B-date"}]},
+                ],
+            },
+            {
+                "parent": {
+                    "tokens": [
+                        {"idx": 7, "text": "Jean B-father"},
+                        {"idx": 8, "text": "Bigre I-father"},
+                        {"idx": 9, "text": "charpentier I-father"},
+                        {"idx": 10, "text": "de I-father"},
+                        {"idx": 11, "text": "cette I-father"},
+                        {"idx": 12, "text": "paroisse I-father"},
+                    ]
+                },
+                "children": [
+                    {"tokens": [{"idx": 7, "text": "Jean B-name"}]},
+                    {"tokens": [{"idx": 8, "text": "Bigre B-surname"}]},
+                    {"tokens": [{"idx": 9, "text": "charpentier B-occupation"}]},
+                    {
+                        "tokens": [
+                            {"idx": 11, "text": "cette B-location"},
+                            {"idx": 12, "text": "paroisse I-location"},
+                        ]
+                    },
+                ],
+            },
+            {
+                "parent": {"tokens": [{"idx": 14, "text": "Marie B-mother"}]},
+                "children": [{"tokens": [{"idx": 14, "text": "Marie B-name"}]}],
+            },
+        ],
+        "hierarchy": [
+            {
+                "category": "child",
+                "children": [
+                    {"category": "name", "children": "Charles"},
+                    {"category": "location", "children": "Beaune"},
+                    {"category": "date", "children": "1836"},
+                ],
+            },
+            {
+                "category": "father",
+                "children": [
+                    {"category": "name", "children": "Jean"},
+                    {"category": "surname", "children": "Bigre"},
+                    {"category": "occupation", "children": "charpentier"},
+                    {"category": "location", "children": "cette paroisse"},
+                ],
+            },
+            {
+                "category": "mother",
+                "children": [{"category": "name", "children": "Marie"}],
+            },
+        ],
+    }
+
+    # Cleanup: remove the generated JSON file
+    output.unlink()
+
+
+def test_valid_not_nested():
+    filepath = DATA_DIR / "valid_nested.bio"
+    config = {
+        0: ["child", "father", "mother"],
+        1: ["name", "surname", "occupation", "location", "date"],
+    }
+
+    validate([filepath], config, allow_nested=False)
+
+    # Validation should have written a JSON file next to the BIO file
+    output = filepath.with_suffix(".json")
+    assert output.exists()
+
+    # Flat output: only tokens and spans, no hierarchy-related keys
+    assert json.loads(output.read_text()) == {
+        "filename": "valid_nested",
+        "bio_repr": "Charles B-child B-name\nné I-child\nà I-child\nBeaune I-child B-location\nen I-child\n1836 I-child B-date\npère O\nJean B-father B-name\nBigre I-father B-surname\ncharpentier I-father B-occupation\nde I-father\ncette I-father B-location\nparoisse I-father I-location\nmère O\nMarie B-mother B-name\n",
+        "tokens": [
+            {"idx": 0, "text": "Charles B-child B-name"},
+            {"idx": 1, "text": "né I-child"},
+            {"idx": 2, "text": "à I-child"},
+            {"idx": 3, "text": "Beaune I-child B-location"},
+            {"idx": 4, "text": "en I-child"},
+            {"idx": 5, "text": "1836 I-child B-date"},
+            {"idx": 6, "text": "père O"},
+            {"idx": 7, "text": "Jean B-father B-name"},
+            {"idx": 8, "text": "Bigre I-father B-surname"},
+            {"idx": 9, "text": "charpentier I-father B-occupation"},
+            {"idx": 10, "text": "de I-father"},
+            {"idx": 11, "text": "cette I-father B-location"},
+            {"idx": 12, "text": "paroisse I-father I-location"},
+            {"idx": 13, "text": "mère O"},
+            {"idx": 14, "text": "Marie B-mother B-name"},
+        ],
+        "spans": [
+            {
+                "tokens": [
+                    {"idx": 0, "text": "Charles B-child B-name"},
+                    {"idx": 1, "text": "né I-child"},
+                    {"idx": 2, "text": "à I-child"},
+                    {"idx": 3, "text": "Beaune I-child B-location"},
+                    {"idx": 4, "text": "en I-child"},
+                    {"idx": 5, "text": "1836 I-child B-date"},
+                ]
+            },
+            {
+                "tokens": [
+                    {"idx": 7, "text": "Jean B-father B-name"},
+                    {"idx": 8, "text": "Bigre I-father B-surname"},
+                    {"idx": 9, "text": "charpentier I-father B-occupation"},
+                    {"idx": 10, "text": "de I-father"},
+                    {"idx": 11, "text": "cette I-father B-location"},
+                    {"idx": 12, "text": "paroisse I-father I-location"},
+                ]
+            },
+            {"tokens": [{"idx": 14, "text": "Marie B-mother B-name"}]},
+        ],
+    }
+
+    # Cleanup: remove the generated JSON file
+    output.unlink()
-- 
GitLab