# Patch for tests/parse/test_nested_document.py.
#   * Import hunk: `_make_ner_label` is dropped from the names imported from
#     bio_parser.parse.document.
#   * test_parse_document hunk: the "né" and "à " word_entities tuples, previously
#     on one line, are put on separate lines.
#   * New test_parse_nested_token: for nested_tokens[0], [3] and [-2] it asserts
#     word, labels, tags, iob_labels, char_labels and chars.
# NOTE(review): the hunks below sit on two physical lines, so the original line
# breaks and indentation seem to have been flattened to single spaces — restore
# them before feeding this to `git apply`; TODO confirm against the upstream commit.
diff --git a/tests/parse/test_nested_document.py b/tests/parse/test_nested_document.py index 8bb7d211912c2f2f7a50fcadc21732088125a51e..d2474e41289314ee8afbb2faf8bd4f7be91edcf1 100644 --- a/tests/parse/test_nested_document.py +++ b/tests/parse/test_nested_document.py @@ -1,5 +1,5 @@ import pytest -from bio_parser.parse.document import Document, Span, Tag, Token, _make_ner_label +from bio_parser.parse.document import Document, Span, Tag, Token from bio_parser.parse.nested_document import NestedDocument, NestedToken from tests.parse import DATA_DIR @@ -33,7 +33,8 @@ def test_parse_document(nested_document: NestedDocument): # Check word entities assert nested_document.word_entities == [ (["child", "name"], "Charles"), - (["child"], "né"), (["child"], "à "), + (["child"], "né"), + (["child"], "à "), (["child", "location"], "Beaune"), (["child"], "en"), (["child", "date"], "1836"), @@ -54,3 +55,74 @@ def test_parse_document(nested_document: NestedDocument): "Charles né à Beaune en 1836 père Jean Bigre charpentier de cette paroisse mère Marie" ) + + +def test_parse_nested_token(nested_document: NestedDocument): + nested_token: NestedToken = nested_document.nested_tokens[0] + + # Check word + assert nested_token.word == "Charles" + + # Check labels + assert nested_token.labels == ["child", "name"] + + # Check tags + assert nested_token.tags == [Tag.BEGINNING, Tag.BEGINNING] + + # Check IOB labels + assert nested_token.iob_labels == ["B-child", "B-name"] + + # Check per-character labels + assert nested_token.char_labels == [ + ['B-child', 'I-child', 'I-child', 'I-child', 'I-child', 'I-child', 'I-child'], + ['B-name', 'I-name', 'I-name', 'I-name', 'I-name', 'I-name', 'I-name'] + ] + + # Check chars + assert nested_token.chars == ["C", "h", "a", "r", "l", "e", "s"] + + # Token whose first tag is INSIDE (I-) + nested_token: NestedToken = nested_document.nested_tokens[3] + + # Check word + assert nested_token.word == "Beaune" + + # Check labels + assert nested_token.labels == ["child", "location"] + + # Check tags + 
assert nested_token.tags == [Tag.INSIDE, Tag.BEGINNING] + + # Check IOB labels + assert nested_token.iob_labels == ["I-child", "B-location"] + + # Check per-character labels + assert nested_token.char_labels == [ + ['I-child', 'I-child', 'I-child', 'I-child', 'I-child', 'I-child'], + ['B-location', 'I-location', 'I-location', 'I-location', 'I-location', 'I-location'] + ] + + # Check chars + assert nested_token.chars == ["B", "e", "a", "u", "n", "e"] + + # Token tagged OUTSIDE (O) + nested_token: NestedToken = nested_document.nested_tokens[-2] + + # Check word + assert nested_token.word == "mère" + + # Check labels + assert nested_token.labels == [None] + + # Check tags + assert nested_token.tags == [Tag.OUTSIDE] + + # Check IOB labels + assert nested_token.iob_labels == ["O"] + + # Check per-character labels + assert nested_token.char_labels == [['O', 'O', 'O', 'O']] + + # Check chars + assert nested_token.chars == ["m", "è", "r", "e"] +