test_parse_bio.py

import re

import pytest

from nerval import ALL_ENTITIES, evaluate
from nerval.parse import get_type_label, parse_line

expected_parsed_annot = {
    "entity_count": {ALL_ENTITIES: 3, "DAT": 1, "LOC": 1, "PER": 1},
    "labels": [
        "B-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "O",
        "O",
        "O",
        "O",
        "O",
        "O",
        "O",
        "O",
        "O",
        "O",
        "O",
        "O",
        "O",
        "B-LOC",
        "I-LOC",
        "I-LOC",
        "I-LOC",
        "I-LOC",
        "O",
        "O",
        "O",
        "O",
        "B-DAT",
        "I-DAT",
        "I-DAT",
        "I-DAT",
        "O",
        "O",
    ],
    "words": "Gérard de Nerval was born in Paris in 1808 .",
}

expected_parsed_predict = {
    "entity_count": {ALL_ENTITIES: 3, "DAT": 1, "***": 1, "PER": 1},
    "labels": [
        "B-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "O",
        "O",
        "O",
        "O",
        "O",
        "O",
        "O",
        "O",
        "O",
        "O",
        "B-***",
        "I-***",
        "I-***",
        "I-***",
        "I-***",
        "O",
        "O",
        "O",
        "O",
        "B-DAT",
        "I-DAT",
        "I-DAT",
        "I-DAT",
        "O",
        "O",
        "O",
    ],
    "words": "G*rard de *N*erval bo*rn in Paris in 1833 *.",
}

expected_parsed_end_of_file = {
    "entity_count": {ALL_ENTITIES: 3, "LOC": 2, "PER": 1},
    "labels": [
        "B-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "B-LOC",
        "I-LOC",
        "I-LOC",
        "I-LOC",
        "I-LOC",
        "I-LOC",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "I-PER",
        "O",
        "B-LOC",
        "I-LOC",
        "I-LOC",
        "I-LOC",
        "I-LOC",
        "I-LOC",
        "I-LOC",
    ],
    "words": "Louis par la grâce de Dieu roy de France et de Navarre",
}


@pytest.mark.parametrize(
    ("test_input", "expected"),
    [
        (pytest.lazy_fixture("fake_annot_bio"), expected_parsed_annot),
        (pytest.lazy_fixture("fake_predict_bio"), expected_parsed_predict),
        (pytest.lazy_fixture("empty_bio"), None),
        (pytest.lazy_fixture("fake_annot_with_empty_lines_bio"), expected_parsed_annot),
        (pytest.lazy_fixture("bioeslu_bio"), expected_parsed_annot),
        (pytest.lazy_fixture("end_of_file_bio"), expected_parsed_end_of_file),
    ],
)
def test_parse_bio(test_input, expected):
    lines = test_input.read_text().strip().splitlines()
    assert evaluate.parse_bio(lines) == expected


def test_parse_bio_bad_input(bad_bio):
    lines = bad_bio.read_text().strip().splitlines()
    with pytest.raises(
        Exception, match=re.escape("The file is not in BIO format: check line 1 (file)")
    ):
        evaluate.parse_bio(lines)


@pytest.mark.parametrize(
    ("line", "word", "label"),
    [
        ("Hi B-ORG", "Hi", "B-ORG"),
        ("Hi B-Org or maybe not org", "Hi", "B-Org or maybe not org"),
        ("1258 B-Date et Lieu", "1258", "B-Date et Lieu"),
        ("Devoti B-Sous-titre", "Devoti", "B-Sous-titre"),
    ],
)
def test_parse_line(line, word, label):
    assert parse_line(index=0, line=line) == (word, label)


@pytest.mark.parametrize(
    "line",
    [("HiB-ORG"), ("HiB-ORG or maybe not")],
)
def test_parse_line_crash(line):
    with pytest.raises(
        Exception,
        match=re.escape(f"The file is not in BIO format: check line 0 ({line})"),
    ):
        parse_line(index=0, line=line)


@pytest.mark.parametrize(
    ("label", "expected_type"),
    [
        ("B-ORG", "ORG"),
        ("B-Date et Lieu", "Date et Lieu"),
        ("I-Date et Lieu", "Date et Lieu"),
        ("B-Sous-titre", "Sous-titre"),
    ],
)
def test_get_type_label(label, expected_type):
    assert get_type_label(label) == expected_type