From 7bc3eb2c7dfe3dd170fbed7c5de678f1183c8868 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sol=C3=A8ne=20Tarride?= <starride@teklia.com> Date: Mon, 16 Dec 2024 11:02:21 +0100 Subject: [PATCH] Fix lint & tests --- bio_parser/parse/nested_document.py | 31 +++++---- tests/fixtures/parse/valid_nested.bio | 2 +- tests/parse/test_nested_document.py | 97 +++++++++++++++++---------- 3 files changed, 79 insertions(+), 51 deletions(-) diff --git a/bio_parser/parse/nested_document.py b/bio_parser/parse/nested_document.py index 3174959..5e497cc 100644 --- a/bio_parser/parse/nested_document.py +++ b/bio_parser/parse/nested_document.py @@ -2,12 +2,14 @@ import logging import re from dataclasses import dataclass, field -from itertools import pairwise from operator import attrgetter from pathlib import Path -from bio_parser.parse.document import Token, Tag, Span -PARSE_BIO_LINE = re.compile(r"(?P<text>[^\s]+)\s+(?P<labels>(?:[^\s]+(?:\-[^\s]+)?\s*)+)") +from bio_parser.parse.document import Span, Tag, Token + +PARSE_BIO_LINE = re.compile( + r"(?P<text>[^\s]+)\s+(?P<labels>(?:[^\s]+(?:\-[^\s]+)?\s*)+)" +) """Regex that parses a line of a BIO file""" @@ -60,7 +62,7 @@ class NestedToken: ["child", "name"] """ return [token.label for token in self.tokens] - + @property def tags(self) -> list[Tag]: """IOB tags of named entity tag. @@ -70,7 +72,7 @@ class NestedToken: [<Tag.BEGINNING: 'B'>, <Tag.BEGINNING: 'B'>] """ return [token.tag for token in self.tokens] - + @property def iob_labels(self) -> list[str]: """IOB label (Tag + Entity). @@ -118,7 +120,7 @@ class NestedDocument: def __post_init__(self): """Parses the tokens and the entity spans in the document.""" - current_spans : dict[str, Span] = {} # Keep track of current spans by category + current_spans: dict[str, Span] = {} # Keep track of current spans by category for idx, line in enumerate(self.bio_repr.splitlines()): try: nested_token = NestedToken(idx=idx, text=line) @@ -134,12 +136,11 @@ class NestedDocument: current_spans = {} case Tag.INSIDE: - if token.label in current_spans: - # Continue current span - current_spans[token.label].add_token(token) - - else: - Exception, f"Found `{Tag.INSIDE}` before `{Tag.BEGINNING}`." + assert ( + token.label in current_spans + ), f"Found `{Tag.INSIDE}` before `{Tag.BEGINNING}`." + # Continue current span + current_spans[token.label].add_token(token) case Tag.BEGINNING: # End existing span if necessary @@ -175,7 +176,10 @@ class NestedDocument: def word_entities(self) -> list[tuple[str, str]]: """List of entities in the words making up the document.""" return list( - map(attrgetter("labels", "word"), filter(lambda x: x.labels[0] != None, self.nested_tokens)), + map( + attrgetter("labels", "word"), + filter(lambda x: x.labels[0] is not None, self.nested_tokens), + ), ) @property @@ -183,7 +187,6 @@ class NestedDocument: """Join every word of the span by a whitespace.""" return " ".join(map(attrgetter("word"), self.nested_tokens)) - @property def chars(self) -> list[str]: r"""Characters making up the token. diff --git a/tests/fixtures/parse/valid_nested.bio b/tests/fixtures/parse/valid_nested.bio index 0064313..3d44c55 100644 --- a/tests/fixtures/parse/valid_nested.bio +++ b/tests/fixtures/parse/valid_nested.bio @@ -1,6 +1,6 @@ Charles B-child B-name né I-child -à I-child +à I-child Beaune I-child B-location en I-child 1836 I-child B-date diff --git a/tests/parse/test_nested_document.py b/tests/parse/test_nested_document.py index d2474e4..658b775 100644 --- a/tests/parse/test_nested_document.py +++ b/tests/parse/test_nested_document.py @@ -1,5 +1,5 @@ import pytest -from bio_parser.parse.document import Document, Span, Tag, Token +from bio_parser.parse.document import Tag from bio_parser.parse.nested_document import NestedDocument, NestedToken from tests.parse import DATA_DIR @@ -14,41 +14,61 @@ def nested_document() -> NestedDocument: def test_parse_document(nested_document: NestedDocument): # Check words - assert nested_document.words == ["Charles", "né", "à ", "Beaune", "en", "1836", "père", "Jean", "Bigre", "charpentier", "de", "cette", "paroisse", "mère", "Marie"] + assert nested_document.words == [ + "Charles", + "né", + "à ", + "Beaune", + "en", + "1836", + "père", + "Jean", + "Bigre", + "charpentier", + "de", + "cette", + "paroisse", + "mère", + "Marie", + ] # Check entities assert nested_document.entities == [ - ("child", "Charles né à Beaune en 1836"), - ("name", "Charles"), - ("location", "Beaune"), - ("date", "1836"), - ("father", "Jean Bigre charpentier de cette paroisse"), + ("child", "Charles né à Beaune en 1836"), + ("name", "Charles"), + ("location", "Beaune"), + ("date", "1836"), + ("father", "Jean Bigre charpentier de cette paroisse"), ("name", "Jean"), - ("surname", "Bigre"), - ("occupation", "charpentier"), - ("location", "cette paroisse"), - ("mother", "Marie"), - ("name", "Marie")] + ("surname", "Bigre"), + ("occupation", "charpentier"), + ("location", "cette paroisse"), + ("mother", "Marie"), + ("name", "Marie"), + ] # Check word entities assert nested_document.word_entities == [ - (["child", "name"], "Charles"), - (["child"], "né"), - (["child"], "à "), - (["child", "location"], "Beaune"), - (["child"], "en"), - (["child", "date"], "1836"), - (["father", "name"], "Jean"), - (["father", "surname"], "Bigre"), - (["father", "occupation"], "charpentier"), - (["father"], "de"), - (["father", "location"], "cette"), - (["father", "location"], "paroisse"), - (["mother", "name"], "Marie") - ] + (["child", "name"], "Charles"), + (["child"], "né"), + (["child"], "à "), + (["child", "location"], "Beaune"), + (["child"], "en"), + (["child", "date"], "1836"), + (["father", "name"], "Jean"), + (["father", "surname"], "Bigre"), + (["father", "occupation"], "charpentier"), + (["father"], "de"), + (["father", "location"], "cette"), + (["father", "location"], "paroisse"), + (["mother", "name"], "Marie"), + ] # Check text - assert nested_document.text == "Charles né à Beaune en 1836 père Jean Bigre charpentier de cette paroisse mère Marie" + assert ( + nested_document.text + == "Charles né à Beaune en 1836 père Jean Bigre charpentier de cette paroisse mère Marie" + ) # Check chars assert nested_document.chars == list( @@ -56,7 +76,6 @@ def test_parse_document(nested_document: NestedDocument): ) - def test_parse_nested_token(nested_document: NestedDocument): nested_token: NestedToken = nested_document.nested_tokens[0] @@ -74,9 +93,9 @@ def test_parse_nested_token(nested_document: NestedDocument): # Check labels assert nested_token.char_labels == [ - ['B-child', 'I-child', 'I-child', 'I-child', 'I-child', 'I-child', 'I-child'], - ['B-name', 'I-name', 'I-name', 'I-name', 'I-name', 'I-name', 'I-name'] - ] + ["B-child", "I-child", "I-child", "I-child", "I-child", "I-child", "I-child"], + ["B-name", "I-name", "I-name", "I-name", "I-name", "I-name", "I-name"], + ] # Check chars assert nested_token.chars == ["C", "h", "a", "r", "l", "e", "s"] @@ -98,9 +117,16 @@ def test_parse_nested_token(nested_document: NestedDocument): # Check labels assert nested_token.char_labels == [ - ['I-child', 'I-child', 'I-child', 'I-child', 'I-child', 'I-child'], - ['B-location', 'I-location', 'I-location', 'I-location', 'I-location', 'I-location'] - ] + ["I-child", "I-child", "I-child", "I-child", "I-child", "I-child"], + [ + "B-location", + "I-location", + "I-location", + "I-location", + "I-location", + "I-location", + ], + ] # Check chars assert nested_token.chars == ["B", "e", "a", "u", "n", "e"] @@ -121,8 +147,7 @@ def test_parse_nested_token(nested_document: NestedDocument): assert nested_token.iob_labels == ["O"] # Check labels - assert nested_token.char_labels == [['O', 'O', 'O', 'O']] + assert nested_token.char_labels == [["O", "O", "O", "O"]] # Check chars assert nested_token.chars == ["m", "è", "r", "e"] - -- GitLab