From 7bc3eb2c7dfe3dd170fbed7c5de678f1183c8868 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sol=C3=A8ne=20Tarride?= <starride@teklia.com>
Date: Mon, 16 Dec 2024 11:02:21 +0100
Subject: [PATCH] Fix lint & tests

---
 bio_parser/parse/nested_document.py   | 31 +++++----
 tests/fixtures/parse/valid_nested.bio |  2 +-
 tests/parse/test_nested_document.py   | 97 +++++++++++++++++----------
 3 files changed, 79 insertions(+), 51 deletions(-)

diff --git a/bio_parser/parse/nested_document.py b/bio_parser/parse/nested_document.py
index 3174959..5e497cc 100644
--- a/bio_parser/parse/nested_document.py
+++ b/bio_parser/parse/nested_document.py
@@ -2,12 +2,14 @@
 import logging
 import re
 from dataclasses import dataclass, field
-from itertools import pairwise
 from operator import attrgetter
 from pathlib import Path
-from bio_parser.parse.document import Token, Tag, Span
 
-PARSE_BIO_LINE = re.compile(r"(?P<text>[^\s]+)\s+(?P<labels>(?:[^\s]+(?:\-[^\s]+)?\s*)+)")
+from bio_parser.parse.document import Span, Tag, Token
+
+PARSE_BIO_LINE = re.compile(
+    r"(?P<text>[^\s]+)\s+(?P<labels>(?:[^\s]+(?:\-[^\s]+)?\s*)+)"
+)
 
 """Regex that parses a line of a BIO file"""
 
@@ -60,7 +62,7 @@ class NestedToken:
             ["child", "name"]
         """
         return [token.label for token in self.tokens]
-        
+
     @property
     def tags(self) -> list[Tag]:
         """IOB tags of named entity tag.
@@ -70,7 +72,7 @@ class NestedToken:
             [<Tag.BEGINNING: 'B'>, <Tag.BEGINNING: 'B'>]
         """
         return [token.tag for token in self.tokens]
-    
+
     @property
     def iob_labels(self) -> list[str]:
         """IOB label (Tag + Entity).
@@ -118,7 +120,7 @@ class NestedDocument:
 
     def __post_init__(self):
         """Parses the tokens and the entity spans in the document."""
-        current_spans : dict[str, Span] = {} # Keep track of current spans by category
+        current_spans: dict[str, Span] = {}  # Keep track of current spans by category
         for idx, line in enumerate(self.bio_repr.splitlines()):
             try:
                 nested_token = NestedToken(idx=idx, text=line)
@@ -134,12 +136,11 @@ class NestedDocument:
                             current_spans = {}
 
                         case Tag.INSIDE:
-                            if token.label in current_spans:
-                                # Continue current span
-                                current_spans[token.label].add_token(token)
-
-                            else:
-                                Exception, f"Found `{Tag.INSIDE}` before `{Tag.BEGINNING}`."
+                            assert (
+                                token.label in current_spans
+                            ), f"Found `{Tag.INSIDE}` before `{Tag.BEGINNING}`."
+                            # Continue current span
+                            current_spans[token.label].add_token(token)
 
                         case Tag.BEGINNING:
                             # End existing span if necessary
@@ -175,7 +176,10 @@ class NestedDocument:
     def word_entities(self) -> list[tuple[str, str]]:
         """List of entities in the words making up the document."""
         return list(
-            map(attrgetter("labels", "word"), filter(lambda x: x.labels[0] != None, self.nested_tokens)),
+            map(
+                attrgetter("labels", "word"),
+                filter(lambda x: x.labels[0] is not None, self.nested_tokens),
+            ),
         )
 
     @property
@@ -183,7 +187,6 @@ class NestedDocument:
         """Join every word of the span by a whitespace."""
         return " ".join(map(attrgetter("word"), self.nested_tokens))
 
-
     @property
     def chars(self) -> list[str]:
         r"""Characters making up the token.
diff --git a/tests/fixtures/parse/valid_nested.bio b/tests/fixtures/parse/valid_nested.bio
index 0064313..3d44c55 100644
--- a/tests/fixtures/parse/valid_nested.bio
+++ b/tests/fixtures/parse/valid_nested.bio
@@ -1,6 +1,6 @@
 Charles B-child B-name
 nÃ© I-child
-Ã  I-child 
+Ã  I-child
 Beaune I-child B-location
 en I-child
 1836 I-child B-date
diff --git a/tests/parse/test_nested_document.py b/tests/parse/test_nested_document.py
index d2474e4..658b775 100644
--- a/tests/parse/test_nested_document.py
+++ b/tests/parse/test_nested_document.py
@@ -1,5 +1,5 @@
 import pytest
-from bio_parser.parse.document import Document, Span, Tag, Token
+from bio_parser.parse.document import Tag
 from bio_parser.parse.nested_document import NestedDocument, NestedToken
 
 from tests.parse import DATA_DIR
@@ -14,41 +14,61 @@ def nested_document() -> NestedDocument:
 
 def test_parse_document(nested_document: NestedDocument):
     # Check words
-    assert nested_document.words == ["Charles", "nÃ©", "Ã ", "Beaune", "en", "1836", "pÃ¨re", "Jean", "Bigre", "charpentier", "de", "cette", "paroisse", "mÃ¨re", "Marie"]
+    assert nested_document.words == [
+        "Charles",
+        "nÃ©",
+        "Ã ",
+        "Beaune",
+        "en",
+        "1836",
+        "pÃ¨re",
+        "Jean",
+        "Bigre",
+        "charpentier",
+        "de",
+        "cette",
+        "paroisse",
+        "mÃ¨re",
+        "Marie",
+    ]
 
     # Check entities
     assert nested_document.entities == [
-        ("child", "Charles nÃ© Ã  Beaune en 1836"), 
-        ("name", "Charles"), 
-        ("location", "Beaune"), 
-        ("date", "1836"), 
-        ("father", "Jean Bigre charpentier de cette paroisse"), 
+        ("child", "Charles nÃ© Ã  Beaune en 1836"),
+        ("name", "Charles"),
+        ("location", "Beaune"),
+        ("date", "1836"),
+        ("father", "Jean Bigre charpentier de cette paroisse"),
         ("name", "Jean"),
-        ("surname", "Bigre"), 
-        ("occupation", "charpentier"), 
-        ("location", "cette paroisse"), 
-        ("mother", "Marie"), 
-        ("name", "Marie")]
+        ("surname", "Bigre"),
+        ("occupation", "charpentier"),
+        ("location", "cette paroisse"),
+        ("mother", "Marie"),
+        ("name", "Marie"),
+    ]
 
     # Check word entities
     assert nested_document.word_entities == [
-        (["child", "name"], "Charles"), 
-        (["child"], "nÃ©"), 
-        (["child"], "Ã "), 
-        (["child", "location"], "Beaune"), 
-        (["child"], "en"), 
-        (["child", "date"], "1836"), 
-        (["father", "name"], "Jean"), 
-        (["father", "surname"], "Bigre"), 
-        (["father", "occupation"], "charpentier"), 
-        (["father"], "de"), 
-        (["father", "location"], "cette"), 
-        (["father", "location"], "paroisse"), 
-        (["mother", "name"], "Marie")
-        ]
+        (["child", "name"], "Charles"),
+        (["child"], "nÃ©"),
+        (["child"], "Ã "),
+        (["child", "location"], "Beaune"),
+        (["child"], "en"),
+        (["child", "date"], "1836"),
+        (["father", "name"], "Jean"),
+        (["father", "surname"], "Bigre"),
+        (["father", "occupation"], "charpentier"),
+        (["father"], "de"),
+        (["father", "location"], "cette"),
+        (["father", "location"], "paroisse"),
+        (["mother", "name"], "Marie"),
+    ]
 
     # Check text
-    assert nested_document.text == "Charles nÃ© Ã  Beaune en 1836 pÃ¨re Jean Bigre charpentier de cette paroisse mÃ¨re Marie"
+    assert (
+        nested_document.text
+        == "Charles nÃ© Ã  Beaune en 1836 pÃ¨re Jean Bigre charpentier de cette paroisse mÃ¨re Marie"
+    )
 
     # Check chars
     assert nested_document.chars == list(
@@ -56,7 +76,6 @@ def test_parse_document(nested_document: NestedDocument):
     )
 
 
-
 def test_parse_nested_token(nested_document: NestedDocument):
     nested_token: NestedToken = nested_document.nested_tokens[0]
 
@@ -74,9 +93,9 @@ def test_parse_nested_token(nested_document: NestedDocument):
 
     # Check labels
     assert nested_token.char_labels == [
-        ['B-child', 'I-child', 'I-child', 'I-child', 'I-child', 'I-child', 'I-child'], 
-        ['B-name', 'I-name', 'I-name', 'I-name', 'I-name', 'I-name', 'I-name']
-        ]
+        ["B-child", "I-child", "I-child", "I-child", "I-child", "I-child", "I-child"],
+        ["B-name", "I-name", "I-name", "I-name", "I-name", "I-name", "I-name"],
+    ]
 
     # Check chars
     assert nested_token.chars == ["C", "h", "a", "r", "l", "e", "s"]
@@ -98,9 +117,16 @@ def test_parse_nested_token(nested_document: NestedDocument):
 
     # Check labels
     assert nested_token.char_labels == [
-        ['I-child', 'I-child', 'I-child', 'I-child', 'I-child', 'I-child'], 
-        ['B-location', 'I-location', 'I-location', 'I-location', 'I-location', 'I-location']
-        ]
+        ["I-child", "I-child", "I-child", "I-child", "I-child", "I-child"],
+        [
+            "B-location",
+            "I-location",
+            "I-location",
+            "I-location",
+            "I-location",
+            "I-location",
+        ],
+    ]
 
     # Check chars
     assert nested_token.chars == ["B", "e", "a", "u", "n", "e"]
@@ -121,8 +147,7 @@ def test_parse_nested_token(nested_document: NestedDocument):
     assert nested_token.iob_labels == ["O"]
 
     # Check labels
-    assert nested_token.char_labels == [['O', 'O', 'O', 'O']]
+    assert nested_token.char_labels == [["O", "O", "O", "O"]]
 
     # Check chars
     assert nested_token.chars == ["m", "Ã¨", "r", "e"]
-
-- 
GitLab