Fix lint & tests

7bc3eb2c · Solene Tarride · 044ba51c
Commit 7bc3eb2c authored 2 months ago by Solene Tarride
--- a/bio_parser/parse/nested_document.py
+++ b/bio_parser/parse/nested_document.py
@@ -2,12 +2,14 @@
 import logging
 import re
 from dataclasses import dataclass, field
-from itertools import pairwise
 from operator import attrgetter
 from pathlib import Path
-from bio_parser.parse.document import Token, Tag, Span

-PARSE_BIO_LINE = re.compile(r"(?P<text>[^\s]+)\s+(?P<labels>(?:[^\s]+(?:\-[^\s]+)?\s*)+)")
+from bio_parser.parse.document import Span, Tag, Token
+
+PARSE_BIO_LINE = re.compile(
+    r"(?P<text>[^\s]+)\s+(?P<labels>(?:[^\s]+(?:\-[^\s]+)?\s*)+)"
+)

 """Regex that parses a line of a BIO file"""

@@ -60,7 +62,7 @@ class NestedToken:
            ["child", "name"]
        """
        return [token.label for token in self.tokens]
-        
+
    @property
    def tags(self) -> list[Tag]:
        """IOB tags of named entity tag.
@@ -70,7 +72,7 @@ class NestedToken:
            [<Tag.BEGINNING: 'B'>, <Tag.BEGINNING: 'B'>]
        """
        return [token.tag for token in self.tokens]
-    
+
    @property
    def iob_labels(self) -> list[str]:
        """IOB label (Tag + Entity).
@@ -118,7 +120,7 @@ class NestedDocument:

    def __post_init__(self):
        """Parses the tokens and the entity spans in the document."""
-        current_spans : dict[str, Span] = {} # Keep track of current spans by category
+        current_spans: dict[str, Span] = {}  # Keep track of current spans by category
        for idx, line in enumerate(self.bio_repr.splitlines()):
            try:
                nested_token = NestedToken(idx=idx, text=line)
@@ -134,12 +136,11 @@ class NestedDocument:
                            current_spans = {}

                        case Tag.INSIDE:
-                            if token.label in current_spans:
-                                # Continue current span
-                                current_spans[token.label].add_token(token)
-
-                            else:
-                                Exception, f"Found `{Tag.INSIDE}` before `{Tag.BEGINNING}`."
+                            assert (
+                                token.label in current_spans
+                            ), f"Found `{Tag.INSIDE}` before `{Tag.BEGINNING}`."
+                            # Continue current span
+                            current_spans[token.label].add_token(token)

                        case Tag.BEGINNING:
                            # End existing span if necessary
@@ -175,7 +176,10 @@ class NestedDocument:
    def word_entities(self) -> list[tuple[str, str]]:
        """List of entities in the words making up the document."""
        return list(
-            map(attrgetter("labels", "word"), filter(lambda x: x.labels[0] != None, self.nested_tokens)),
+            map(
+                attrgetter("labels", "word"),
+                filter(lambda x: x.labels[0] is not None, self.nested_tokens),
+            ),
        )

    @property
@@ -183,7 +187,6 @@ class NestedDocument:
        """Join every word of the span by a whitespace."""
        return " ".join(map(attrgetter("word"), self.nested_tokens))

-
    @property
    def chars(self) -> list[str]:
        r"""Characters making up the token.

--- a/tests/fixtures/parse/valid_nested.bio
+++ b/tests/fixtures/parse/valid_nested.bio
 Charles B-child B-name
 né I-child
-à I-child 
+à I-child
 Beaune I-child B-location
 en I-child
 1836 I-child B-date

--- a/tests/parse/test_nested_document.py
+++ b/tests/parse/test_nested_document.py
 import pytest
-from bio_parser.parse.document import Document, Span, Tag, Token
+from bio_parser.parse.document import Tag
 from bio_parser.parse.nested_document import NestedDocument, NestedToken

 from tests.parse import DATA_DIR
@@ -14,41 +14,61 @@ def nested_document() -> NestedDocument:

 def test_parse_document(nested_document: NestedDocument):
    # Check words
-    assert nested_document.words == ["Charles", "né", "à", "Beaune", "en", "1836", "père", "Jean", "Bigre", "charpentier", "de", "cette", "paroisse", "mère", "Marie"]
+    assert nested_document.words == [
+        "Charles",
+        "né",
+        "à",
+        "Beaune",
+        "en",
+        "1836",
+        "père",
+        "Jean",
+        "Bigre",
+        "charpentier",
+        "de",
+        "cette",
+        "paroisse",
+        "mère",
+        "Marie",
+    ]

    # Check entities
    assert nested_document.entities == [
-        ("child", "Charles né à Beaune en 1836"), 
-        ("name", "Charles"), 
-        ("location", "Beaune"), 
-        ("date", "1836"), 
-        ("father", "Jean Bigre charpentier de cette paroisse"), 
+        ("child", "Charles né à Beaune en 1836"),
+        ("name", "Charles"),
+        ("location", "Beaune"),
+        ("date", "1836"),
+        ("father", "Jean Bigre charpentier de cette paroisse"),
        ("name", "Jean"),
-        ("surname", "Bigre"), 
-        ("occupation", "charpentier"), 
-        ("location", "cette paroisse"), 
-        ("mother", "Marie"), 
-        ("name", "Marie")]
+        ("surname", "Bigre"),
+        ("occupation", "charpentier"),
+        ("location", "cette paroisse"),
+        ("mother", "Marie"),
+        ("name", "Marie"),
+    ]

    # Check word entities
    assert nested_document.word_entities == [
-        (["child", "name"], "Charles"), 
-        (["child"], "né"), 
-        (["child"], "à"), 
-        (["child", "location"], "Beaune"), 
-        (["child"], "en"), 
-        (["child", "date"], "1836"), 
-        (["father", "name"], "Jean"), 
-        (["father", "surname"], "Bigre"), 
-        (["father", "occupation"], "charpentier"), 
-        (["father"], "de"), 
-        (["father", "location"], "cette"), 
-        (["father", "location"], "paroisse"), 
-        (["mother", "name"], "Marie")
-        ]
+        (["child", "name"], "Charles"),
+        (["child"], "né"),
+        (["child"], "à"),
+        (["child", "location"], "Beaune"),
+        (["child"], "en"),
+        (["child", "date"], "1836"),
+        (["father", "name"], "Jean"),
+        (["father", "surname"], "Bigre"),
+        (["father", "occupation"], "charpentier"),
+        (["father"], "de"),
+        (["father", "location"], "cette"),
+        (["father", "location"], "paroisse"),
+        (["mother", "name"], "Marie"),
+    ]

    # Check text
-    assert nested_document.text == "Charles né à Beaune en 1836 père Jean Bigre charpentier de cette paroisse mère Marie"
+    assert (
+        nested_document.text
+        == "Charles né à Beaune en 1836 père Jean Bigre charpentier de cette paroisse mère Marie"
+    )

    # Check chars
    assert nested_document.chars == list(
@@ -56,7 +76,6 @@ def test_parse_document(nested_document: NestedDocument):
    )


-
 def test_parse_nested_token(nested_document: NestedDocument):
    nested_token: NestedToken = nested_document.nested_tokens[0]

@@ -74,9 +93,9 @@ def test_parse_nested_token(nested_document: NestedDocument):

    # Check labels
    assert nested_token.char_labels == [
-        ['B-child', 'I-child', 'I-child', 'I-child', 'I-child', 'I-child', 'I-child'], 
-        ['B-name', 'I-name', 'I-name', 'I-name', 'I-name', 'I-name', 'I-name']
-        ]
+        ["B-child", "I-child", "I-child", "I-child", "I-child", "I-child", "I-child"],
+        ["B-name", "I-name", "I-name", "I-name", "I-name", "I-name", "I-name"],
+    ]

    # Check chars
    assert nested_token.chars == ["C", "h", "a", "r", "l", "e", "s"]
@@ -98,9 +117,16 @@ def test_parse_nested_token(nested_document: NestedDocument):

    # Check labels
    assert nested_token.char_labels == [
-        ['I-child', 'I-child', 'I-child', 'I-child', 'I-child', 'I-child'], 
-        ['B-location', 'I-location', 'I-location', 'I-location', 'I-location', 'I-location']
-        ]
+        ["I-child", "I-child", "I-child", "I-child", "I-child", "I-child"],
+        [
+            "B-location",
+            "I-location",
+            "I-location",
+            "I-location",
+            "I-location",
+            "I-location",
+        ],
+    ]

    # Check chars
    assert nested_token.chars == ["B", "e", "a", "u", "n", "e"]
@@ -121,8 +147,7 @@ def test_parse_nested_token(nested_document: NestedDocument):
    assert nested_token.iob_labels == ["O"]

    # Check labels
-    assert nested_token.char_labels == [['O', 'O', 'O', 'O']]
+    assert nested_token.char_labels == [["O", "O", "O", "O"]]

    # Check chars
    assert nested_token.chars == ["m", "è", "r", "e"]
-