diff --git a/nerval/parse.py b/nerval/parse.py index d0e1fe5467bd9a6ee3313821851c84a9ae971f98..5b9276301da07918b503f823a96bcfe4f5efff17 100644 --- a/nerval/parse.py +++ b/nerval/parse.py @@ -5,7 +5,7 @@ from pathlib import Path NOT_ENTITY_TAG = "O" BEGINNING_POS = ["B", "S", "U"] -REGEX_IOB_LINE = re.compile(r"^(.*) ([BIESLUO]-?.*)$") +REGEX_IOB_LINE = re.compile(r"^(\S*) ((?:[BIESLU]-|O).*)$") REGEX_LABEL = re.compile(r"[BIESLU]-(.*)$") @@ -45,7 +45,11 @@ def parse_line(index: int, line: str, path: Path): assert match_iob - return match_iob.group(1, 2) + word, label = match_iob.group(1, 2) + # We should have either one - (BLIU-) or none at all (O) + assert label.count("-") <= 1 + + return word, label except AssertionError: raise ( Exception( diff --git a/tests/test_parse_bio.py b/tests/test_parse_bio.py index 7b20b5ed97f498776559f85be6e2b7a833aec9c1..625e9a25f30228b0d3d19cab9a0ca7e6cf14b1e2 100644 --- a/tests/test_parse_bio.py +++ b/tests/test_parse_bio.py @@ -4,7 +4,7 @@ from pathlib import Path import pytest from nerval import evaluate -from nerval.parse import parse_line +from nerval.parse import get_type_label, parse_line expected_parsed_annot = { "entity_count": {"All": 3, "DAT": 1, "LOC": 1, "PER": 1}, @@ -199,6 +199,7 @@ def test_parse_bio_no_input(): ( ("Hi B-ORG", "Hi", "B-ORG"), ("Hi B-Org or maybe not org", "Hi", "B-Org or maybe not org"), + ("1258 B-Date et Lieu", "1258", "B-Date et Lieu"), ), ) def test_parse_line(line, word, label): @@ -207,11 +208,20 @@ def test_parse_line(line, word, label): @pytest.mark.parametrize( "line", - ( - ("HiB-ORG"), - ("HiB-ORG or maybe not"), - ), + (("HiB-ORG"), ("HiB-ORG or maybe not"), ("Hello B-surname and L-ocation")), ) def test_parse_line_crash(line): with pytest.raises(Exception): parse_line(index=0, line=line, path=Path("")) + + +@pytest.mark.parametrize( + "label, expected_type", + ( + ("B-ORG", "ORG"), + ("B-Date et Lieu", "Date et Lieu"), + ("I-Date et Lieu", "Date et Lieu"), + ), +) +def test_get_type_label(label, expected_type): + assert get_type_label(label) == expected_type