diff --git a/nerval/parse.py b/nerval/parse.py index dd140f2cd8b0bb3096c601e90684c8fd2c578ba8..5b9276301da07918b503f823a96bcfe4f5efff17 100644 --- a/nerval/parse.py +++ b/nerval/parse.py @@ -5,7 +5,7 @@ from pathlib import Path NOT_ENTITY_TAG = "O" BEGINNING_POS = ["B", "S", "U"] -REGEX_IOB_LINE = re.compile(r"^(.*) ((?:[BIESLU]-|O).*)$") +REGEX_IOB_LINE = re.compile(r"^(\S*) ((?:[BIESLU]-|O).*)$") REGEX_LABEL = re.compile(r"[BIESLU]-(.*)$") @@ -45,7 +45,11 @@ def parse_line(index: int, line: str, path: Path): assert match_iob - return match_iob.group(1, 2) + word, label = match_iob.group(1, 2) + # We should have either one - (BLIU-) or none at all (O) + assert label.count("-") <= 1 + + return word, label except AssertionError: raise ( Exception( diff --git a/tests/test_parse_bio.py b/tests/test_parse_bio.py index 681ff5264b8f6dc9c598d6e2e9bc8ace264380d8..625e9a25f30228b0d3d19cab9a0ca7e6cf14b1e2 100644 --- a/tests/test_parse_bio.py +++ b/tests/test_parse_bio.py @@ -208,10 +208,7 @@ def test_parse_line(line, word, label): @pytest.mark.parametrize( "line", - ( - ("HiB-ORG"), - ("HiB-ORG or maybe not"), - ), + (("HiB-ORG"), ("HiB-ORG or maybe not"), ("Hello B-surname and L-ocation")), ) def test_parse_line_crash(line): with pytest.raises(Exception):