Skip to content
Snippets Groups Projects
Commit 7bc3eb2c authored by Solene Tarride
Browse files

Fix lint & tests

parent 044ba51c
No related branches found
No related tags found
1 merge request: !8 (Draft: Support nested entities)
Pipeline #203611 failed
......@@ -2,12 +2,14 @@
import logging
import re
from dataclasses import dataclass, field
from itertools import pairwise
from operator import attrgetter
from pathlib import Path
from bio_parser.parse.document import Token, Tag, Span
PARSE_BIO_LINE = re.compile(r"(?P<text>[^\s]+)\s+(?P<labels>(?:[^\s]+(?:\-[^\s]+)?\s*)+)")
from bio_parser.parse.document import Span, Tag, Token
PARSE_BIO_LINE = re.compile(
r"(?P<text>[^\s]+)\s+(?P<labels>(?:[^\s]+(?:\-[^\s]+)?\s*)+)"
)
"""Regex that parses a line of a BIO file"""
......@@ -60,7 +62,7 @@ class NestedToken:
["child", "name"]
"""
return [token.label for token in self.tokens]
@property
def tags(self) -> list[Tag]:
"""IOB tags of named entity tag.
......@@ -70,7 +72,7 @@ class NestedToken:
[<Tag.BEGINNING: 'B'>, <Tag.BEGINNING: 'B'>]
"""
return [token.tag for token in self.tokens]
@property
def iob_labels(self) -> list[str]:
"""IOB label (Tag + Entity).
......@@ -118,7 +120,7 @@ class NestedDocument:
def __post_init__(self):
"""Parses the tokens and the entity spans in the document."""
current_spans : dict[str, Span] = {} # Keep track of current spans by category
current_spans: dict[str, Span] = {} # Keep track of current spans by category
for idx, line in enumerate(self.bio_repr.splitlines()):
try:
nested_token = NestedToken(idx=idx, text=line)
......@@ -134,12 +136,11 @@ class NestedDocument:
current_spans = {}
case Tag.INSIDE:
if token.label in current_spans:
# Continue current span
current_spans[token.label].add_token(token)
else:
Exception, f"Found `{Tag.INSIDE}` before `{Tag.BEGINNING}`."
assert (
token.label in current_spans
), f"Found `{Tag.INSIDE}` before `{Tag.BEGINNING}`."
# Continue current span
current_spans[token.label].add_token(token)
case Tag.BEGINNING:
# End existing span if necessary
......@@ -175,7 +176,10 @@ class NestedDocument:
def word_entities(self) -> list[tuple[str, str]]:
"""List of entities in the words making up the document."""
return list(
map(attrgetter("labels", "word"), filter(lambda x: x.labels[0] != None, self.nested_tokens)),
map(
attrgetter("labels", "word"),
filter(lambda x: x.labels[0] is not None, self.nested_tokens),
),
)
@property
......@@ -183,7 +187,6 @@ class NestedDocument:
"""Join every word of the span by a whitespace."""
return " ".join(map(attrgetter("word"), self.nested_tokens))
@property
def chars(self) -> list[str]:
r"""Characters making up the token.
......
Charles B-child B-name
né I-child
à I-child
à I-child
Beaune I-child B-location
en I-child
1836 I-child B-date
......
import pytest
from bio_parser.parse.document import Document, Span, Tag, Token
from bio_parser.parse.document import Tag
from bio_parser.parse.nested_document import NestedDocument, NestedToken
from tests.parse import DATA_DIR
......@@ -14,41 +14,61 @@ def nested_document() -> NestedDocument:
def test_parse_document(nested_document: NestedDocument):
# Check words
assert nested_document.words == ["Charles", "", "à", "Beaune", "en", "1836", "père", "Jean", "Bigre", "charpentier", "de", "cette", "paroisse", "mère", "Marie"]
assert nested_document.words == [
"Charles",
"",
"à",
"Beaune",
"en",
"1836",
"père",
"Jean",
"Bigre",
"charpentier",
"de",
"cette",
"paroisse",
"mère",
"Marie",
]
# Check entities
assert nested_document.entities == [
("child", "Charles né à Beaune en 1836"),
("name", "Charles"),
("location", "Beaune"),
("date", "1836"),
("father", "Jean Bigre charpentier de cette paroisse"),
("child", "Charles né à Beaune en 1836"),
("name", "Charles"),
("location", "Beaune"),
("date", "1836"),
("father", "Jean Bigre charpentier de cette paroisse"),
("name", "Jean"),
("surname", "Bigre"),
("occupation", "charpentier"),
("location", "cette paroisse"),
("mother", "Marie"),
("name", "Marie")]
("surname", "Bigre"),
("occupation", "charpentier"),
("location", "cette paroisse"),
("mother", "Marie"),
("name", "Marie"),
]
# Check word entities
assert nested_document.word_entities == [
(["child", "name"], "Charles"),
(["child"], ""),
(["child"], "à"),
(["child", "location"], "Beaune"),
(["child"], "en"),
(["child", "date"], "1836"),
(["father", "name"], "Jean"),
(["father", "surname"], "Bigre"),
(["father", "occupation"], "charpentier"),
(["father"], "de"),
(["father", "location"], "cette"),
(["father", "location"], "paroisse"),
(["mother", "name"], "Marie")
]
(["child", "name"], "Charles"),
(["child"], ""),
(["child"], "à"),
(["child", "location"], "Beaune"),
(["child"], "en"),
(["child", "date"], "1836"),
(["father", "name"], "Jean"),
(["father", "surname"], "Bigre"),
(["father", "occupation"], "charpentier"),
(["father"], "de"),
(["father", "location"], "cette"),
(["father", "location"], "paroisse"),
(["mother", "name"], "Marie"),
]
# Check text
assert nested_document.text == "Charles né à Beaune en 1836 père Jean Bigre charpentier de cette paroisse mère Marie"
assert (
nested_document.text
== "Charles né à Beaune en 1836 père Jean Bigre charpentier de cette paroisse mère Marie"
)
# Check chars
assert nested_document.chars == list(
......@@ -56,7 +76,6 @@ def test_parse_document(nested_document: NestedDocument):
)
def test_parse_nested_token(nested_document: NestedDocument):
nested_token: NestedToken = nested_document.nested_tokens[0]
......@@ -74,9 +93,9 @@ def test_parse_nested_token(nested_document: NestedDocument):
# Check labels
assert nested_token.char_labels == [
['B-child', 'I-child', 'I-child', 'I-child', 'I-child', 'I-child', 'I-child'],
['B-name', 'I-name', 'I-name', 'I-name', 'I-name', 'I-name', 'I-name']
]
["B-child", "I-child", "I-child", "I-child", "I-child", "I-child", "I-child"],
["B-name", "I-name", "I-name", "I-name", "I-name", "I-name", "I-name"],
]
# Check chars
assert nested_token.chars == ["C", "h", "a", "r", "l", "e", "s"]
......@@ -98,9 +117,16 @@ def test_parse_nested_token(nested_document: NestedDocument):
# Check labels
assert nested_token.char_labels == [
['I-child', 'I-child', 'I-child', 'I-child', 'I-child', 'I-child'],
['B-location', 'I-location', 'I-location', 'I-location', 'I-location', 'I-location']
]
["I-child", "I-child", "I-child", "I-child", "I-child", "I-child"],
[
"B-location",
"I-location",
"I-location",
"I-location",
"I-location",
"I-location",
],
]
# Check chars
assert nested_token.chars == ["B", "e", "a", "u", "n", "e"]
......@@ -121,8 +147,7 @@ def test_parse_nested_token(nested_document: NestedDocument):
assert nested_token.iob_labels == ["O"]
# Check labels
assert nested_token.char_labels == [['O', 'O', 'O', 'O']]
assert nested_token.char_labels == [["O", "O", "O", "O"]]
# Check chars
assert nested_token.chars == ["m", "è", "r", "e"]
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment