From 7b32a134636c21d4ac030e5262b5d0f0c08aac18 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sol=C3=A8ne=20Tarride?= <starride@teklia.com>
Date: Thu, 12 Dec 2024 18:48:22 +0100
Subject: [PATCH] Support nested entities with NestedDocument and NestedToken

---
 bio_parser/parse/__init__.py          |   6 +
 bio_parser/parse/document.py          |   4 +-
 bio_parser/parse/nested_document.py   | 207 ++++++++++++++++++++++++++
 bio_parser/parse/validate.py          |   9 +-
 tests/fixtures/parse/valid_nested.bio |  15 ++
 tests/parse/test_document.py          |   8 +-
 tests/parse/test_nested_document.py   |  56 +++++++
 7 files changed, 297 insertions(+), 8 deletions(-)
 create mode 100644 bio_parser/parse/nested_document.py
 create mode 100644 tests/fixtures/parse/valid_nested.bio
 create mode 100644 tests/parse/test_nested_document.py

diff --git a/bio_parser/parse/__init__.py b/bio_parser/parse/__init__.py
index 06ae06d..2271f9c 100644
--- a/bio_parser/parse/__init__.py
+++ b/bio_parser/parse/__init__.py
@@ -25,3 +25,9 @@ def add_validate_parser(subcommands):
     parser.add_argument(
         "filepaths", help="Files to validate.", type=_check_bio_ext, nargs="*"
     )
+    parser.add_argument(
+        "--allow-nested",
+        help="Whether to allow nested entities.",
+        action="store_true",
+        default=False,
+    )
diff --git a/bio_parser/parse/document.py b/bio_parser/parse/document.py
index f06e24c..66c389e 100644
--- a/bio_parser/parse/document.py
+++ b/bio_parser/parse/document.py
@@ -105,7 +105,7 @@ class Token:
         return _make_ner_label(tag=self.tag, label=self.label)
 
     @property
-    def labels(self) -> list[str]:
+    def char_labels(self) -> list[str]:
         """Character-level IOB labels.
 
         Examples:
@@ -335,7 +335,7 @@ class Document:
         tags = []
         for token, next_token in pairwise(self.tokens + [None]):
             # Add token tags
-            tags.extend(token.labels)
+            tags.extend(token.char_labels)
             if next_token and (
                 token.label == next_token.label and not next_token.tag == Tag.BEGINNING
             ):
diff --git a/bio_parser/parse/nested_document.py b/bio_parser/parse/nested_document.py
new file mode 100644
index 0000000..3174959
--- /dev/null
+++ b/bio_parser/parse/nested_document.py
@@ -0,0 +1,207 @@
+"""Parse nested BIO files."""
+import logging
+import re
+from dataclasses import dataclass, field
+from itertools import pairwise
+from operator import attrgetter
+from pathlib import Path
+from bio_parser.parse.document import Token, Tag, Span
+
+PARSE_BIO_LINE = re.compile(r"(?P<text>[^\s]+)\s+(?P<labels>(?:[^\s]+(?:\-[^\s]+)?\s*)+)")
+"""Regex that parses a line of a nested BIO file: a word followed by one or more labels."""
+
+
+_logger = logging.getLogger(__name__)
+
+
+@dataclass(slots=True)
+class NestedToken:
+    """Token as tokenized in the BIO document, that may contain multiple labels."""
+
+    idx: int
+    """Index of the nested token in the document."""
+    text: str
+    """Text representation of the nested token."""
+
+    @property
+    def _data(self) -> list[str]:
+        """Nested BIO line parsing: one flat `<word> <label>` string per label."""
+        parsed = PARSE_BIO_LINE.match(self.text)
+        if parsed is None:  # AssertionError is what NestedDocument catches and logs
+            raise AssertionError(f"Could not parse line `{self.text}`.")
+        return [f"{parsed['text']} {label}" for label in parsed["labels"].split()]
+
+    @property
+    def tokens(self) -> list[Token]:
+        """List of flat tokens associated to the nested token.
+
+        Examples:
+            >>> NestedToken(idx=0, text="Jean B-child B-name").tokens
+            [Token(idx=0, text='Jean B-child'), Token(idx=0, text='Jean B-name')]
+        """
+        return [Token(idx=self.idx, text=text_repr) for text_repr in self._data]
+
+    @property
+    def word(self) -> str:
+        """Text content of the nested token.
+
+        Examples:
+            >>> NestedToken(idx=0, text="Jean B-child B-name").word
+            'Jean'
+        """
+        return self.tokens[0].word
+
+    @property
+    def labels(self) -> list[str | None]:
+        """Named entity types of this token, one per flat token.
+
+        Examples:
+            >>> NestedToken(idx=0, text="Jean B-child B-name").labels
+            ['child', 'name']
+        """
+        return [token.label for token in self.tokens]
+
+    @property
+    def tags(self) -> list[Tag]:
+        """IOB tags of this token, one per flat token.
+
+        Examples:
+            >>> NestedToken(idx=0, text="Jean B-child B-name").tags
+            [<Tag.BEGINNING: 'B'>, <Tag.BEGINNING: 'B'>]
+        """
+        return [token.tag for token in self.tokens]
+
+    @property
+    def iob_labels(self) -> list[str]:
+        """IOB labels (Tag + Entity), one per flat token.
+
+        Examples:
+            >>> NestedToken(idx=0, text="Jean B-child B-name").iob_labels
+            ['B-child', 'B-name']
+        """
+        return [token.iob_label for token in self.tokens]
+
+    @property
+    def char_labels(self) -> list[list[str]]:
+        """Character-level IOB labels.
+
+        Examples:
+            >>> NestedToken(idx=0, text="Jean B-child B-name").char_labels
+            [['B-child', 'I-child', 'I-child', 'I-child'], ['B-name', 'I-name', 'I-name', 'I-name']]
+        """
+        return [token.char_labels for token in self.tokens]
+
+    @property
+    def chars(self) -> list[str]:
+        """The list of characters making up the token.
+
+        Examples:
+            >>> NestedToken(idx=0, text="Jean B-child B-name").chars
+            ['J', 'e', 'a', 'n']
+        """
+        return self.tokens[0].chars
+
+
+@dataclass(slots=True)
+class NestedDocument:
+    """Representation of a BIO document whose tokens may carry several labels."""
+
+    filename: str
+    """Document filename"""
+    bio_repr: str
+    """Full BIO representation of the Document"""
+    nested_tokens: list[NestedToken] = field(default_factory=list)
+    """List of the nested tokens in the Document"""
+
+    spans: list[Span] = field(default_factory=list)
+    """List of the spans in the Document"""
+
+    def __post_init__(self) -> None:
+        """Parse the tokens and the entity spans in the document."""
+        current_spans: dict[str, Span] = {}  # Open spans, keyed by entity label
+        for idx, line in enumerate(self.bio_repr.splitlines()):
+            try:
+                nested_token = NestedToken(idx=idx, text=line)
+                self.nested_tokens.append(nested_token)
+
+                for token in nested_token.tokens:
+                    # Build spans
+                    match token.tag:
+                        case Tag.OUTSIDE:
+                            # Close all current spans
+                            self.spans.extend(current_spans.values())
+                            current_spans = {}
+
+                        case Tag.INSIDE:
+                            # AssertionError so the handler below logs the line index
+                            if token.label not in current_spans:
+                                raise AssertionError(
+                                    f"Found `{Tag.INSIDE}` before `{Tag.BEGINNING}`."
+                                )
+                            # Continue current span
+                            current_spans[token.label].add_token(token)
+
+                        case Tag.BEGINNING:
+                            # End existing span if necessary
+                            if token.label in current_spans:
+                                self.spans.append(current_spans.pop(token.label))
+
+                            # Start a new span
+                            current_spans[token.label] = Span()
+                            current_spans[token.label].add_token(token)
+
+            except AssertionError as e:
+                # `idx` is always bound here, unlike `token` when the line fails early
+                _logger.error(f"Error on token n°{idx}: {e}")
+                raise Exception from e
+
+        # Last spans
+        self.spans.extend(current_spans.values())
+
+    @property
+    def words(self) -> list[str]:
+        """List of words making up the document."""
+        return [nested_token.word for nested_token in self.nested_tokens]
+
+    @property
+    def entities(self) -> list[tuple[str, str]]:
+        """List of entities making up the document, as (label, text) pairs."""
+        return list(
+            map(attrgetter("label", "text"), filter(attrgetter("label"), self.spans)),
+        )
+
+    @property
+    def word_entities(self) -> list[tuple[list[str | None], str]]:
+        """List of (labels, word) pairs for the nested tokens whose first label is set."""
+        return [
+            (nested_token.labels, nested_token.word)
+            for nested_token in self.nested_tokens
+            if nested_token.labels[0] is not None
+        ]
+
+    @property
+    def text(self) -> str:
+        """Join every word of the document by a whitespace."""
+        return " ".join(self.words)
+
+    @property
+    def chars(self) -> list[str]:
+        r"""Characters making up the document text.
+
+        Examples:
+            >>> NestedDocument(filename="doc", bio_repr="I B-Animal\nrun I-Animal").chars
+            ['I', ' ', 'r', 'u', 'n']
+        """
+        return list(self.text)
+
+    @classmethod
+    def from_file(cls, filepath: Path) -> "NestedDocument":
+        """Load a NestedDocument from a IOB file.
+
+        Args:
+            filepath (Path): Path to the file to load.
+
+        Returns:
+            NestedDocument: Parsed document
+        """
+        return cls(filepath.stem, filepath.read_text())
diff --git a/bio_parser/parse/validate.py b/bio_parser/parse/validate.py
index 6f83192..7d09b40 100644
--- a/bio_parser/parse/validate.py
+++ b/bio_parser/parse/validate.py
@@ -5,11 +5,12 @@ from dataclasses import asdict
 from pathlib import Path
 
 from bio_parser.parse.document import Document
+from bio_parser.parse.nested_document import NestedDocument
 
 logger = logging.getLogger(__name__)
 
 
-def run(filepaths: list[Path]) -> None:
+def run(filepaths: list[Path], allow_nested=False) -> None:
     """Validate the construction of multiple BIO files.
 
     Args:
@@ -18,7 +19,11 @@ def run(filepaths: list[Path]) -> None:
     for filepath in filepaths:
         logger.info(f"Parsing file @ `{filepath}`")
         try:
-            doc = Document.from_file(filepath)
+            doc = (
+                NestedDocument.from_file(filepath)
+                if allow_nested
+                else Document.from_file(filepath)
+            )
             filepath.with_suffix(".json").write_text(json.dumps(asdict(doc), indent=2))
         except Exception as e:
             logger.error(f"Could not load the file @ `{filepath}`: {e}")
diff --git a/tests/fixtures/parse/valid_nested.bio b/tests/fixtures/parse/valid_nested.bio
new file mode 100644
index 0000000..0064313
--- /dev/null
+++ b/tests/fixtures/parse/valid_nested.bio
@@ -0,0 +1,15 @@
+Charles B-child B-name
+nÃ© I-child
+Ã  I-child 
+Beaune I-child B-location
+en I-child
+1836 I-child B-date
+pÃ¨re O
+Jean B-father B-name
+Bigre I-father B-surname
+charpentier I-father B-occupation
+de I-father
+cette I-father B-location
+paroisse I-father I-location
+mÃ¨re O
+Marie B-mother B-name
diff --git a/tests/parse/test_document.py b/tests/parse/test_document.py
index 565f2a1..6b090a9 100644
--- a/tests/parse/test_document.py
+++ b/tests/parse/test_document.py
@@ -7,7 +7,7 @@ FILEPATH = DATA_DIR / "valid.bio"
 
 
 @pytest.fixture()
-def document():
+def document() -> Document:
     return Document.from_file(FILEPATH)
 
 
@@ -131,7 +131,7 @@ def test_parse_token(document: Document):
     assert token.iob_label == "B-GPE"
 
     # Check labels
-    assert token.labels == ["B-GPE", "I-GPE", "I-GPE"]
+    assert token.char_labels == ["B-GPE", "I-GPE", "I-GPE"]
 
     # Check chars
     assert token.chars == ["S", "a", "n"]
@@ -152,7 +152,7 @@ def test_parse_token(document: Document):
     assert token.iob_label == "I-GPE"
 
     # Check labels
-    assert token.labels == [
+    assert token.char_labels == [
         "I-GPE",
         "I-GPE",
         "I-GPE",
@@ -183,7 +183,7 @@ def test_parse_token(document: Document):
     assert token.iob_label == "O"
 
     # Check labels
-    assert token.labels == ["O", "O", "O", "O", "O", "O"]
+    assert token.char_labels == ["O", "O", "O", "O", "O", "O"]
 
     # Check chars
     assert token.chars == ["r", "o", "b", "o", "t", "s"]
diff --git a/tests/parse/test_nested_document.py b/tests/parse/test_nested_document.py
new file mode 100644
index 0000000..8bb7d21
--- /dev/null
+++ b/tests/parse/test_nested_document.py
@@ -0,0 +1,56 @@
+import pytest
+from bio_parser.parse.document import Document, Span, Tag, Token, _make_ner_label
+from bio_parser.parse.nested_document import NestedDocument, NestedToken
+
+from tests.parse import DATA_DIR
+
+FILEPATH = DATA_DIR / "valid_nested.bio"
+
+
+@pytest.fixture()
+def nested_document() -> NestedDocument:
+    return NestedDocument.from_file(FILEPATH)  # re-parsed for each test (default fixture scope)
+
+
+def test_parse_document(nested_document: NestedDocument):
+    """Check the words, spans, per-word labels, text and characters of the fixture."""
+    expected_words = ["Charles", "nÃ©", "Ã ", "Beaune", "en", "1836", "pÃ¨re", "Jean", "Bigre", "charpentier", "de", "cette", "paroisse", "mÃ¨re", "Marie"]
+    expected_entities = [
+        ("child", "Charles nÃ© Ã  Beaune en 1836"),
+        ("name", "Charles"),
+        ("location", "Beaune"),
+        ("date", "1836"),
+        ("father", "Jean Bigre charpentier de cette paroisse"),
+        ("name", "Jean"),
+        ("surname", "Bigre"),
+        ("occupation", "charpentier"),
+        ("location", "cette paroisse"),
+        ("mother", "Marie"),
+        ("name", "Marie"),
+    ]
+
+    expected_word_entities = [
+        (["child", "name"], "Charles"),
+        (["child"], "nÃ©"),
+        (["child"], "Ã "),
+        (["child", "location"], "Beaune"),
+        (["child"], "en"),
+        (["child", "date"], "1836"),
+        (["father", "name"], "Jean"),
+        (["father", "surname"], "Bigre"),
+        (["father", "occupation"], "charpentier"),
+        (["father"], "de"),
+        (["father", "location"], "cette"),
+        (["father", "location"], "paroisse"),
+        (["mother", "name"], "Marie"),
+    ]
+    expected_text = "Charles nÃ© Ã  Beaune en 1836 pÃ¨re Jean Bigre charpentier de cette paroisse mÃ¨re Marie"
+
+    # Token-level views first, then the character-level ones
+    assert nested_document.words == expected_words
+    assert nested_document.entities == expected_entities
+    assert nested_document.word_entities == expected_word_entities
+    assert nested_document.text == expected_text
+    assert nested_document.chars == list(expected_text)
+
-- 
GitLab