From 660fa714fb0bf74b6de93f6c50cfafb8b40596c1 Mon Sep 17 00:00:00 2001 From: Yoann Schneider <yschneider@teklia.com> Date: Mon, 6 Mar 2023 15:59:35 +0100 Subject: [PATCH] forbid spaces in token and - in labels --- nerval/parse.py | 8 ++++++-- tests/test_parse_bio.py | 5 +---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/nerval/parse.py b/nerval/parse.py index dd140f2..5b92763 100644 --- a/nerval/parse.py +++ b/nerval/parse.py @@ -5,7 +5,7 @@ from pathlib import Path NOT_ENTITY_TAG = "O" BEGINNING_POS = ["B", "S", "U"] -REGEX_IOB_LINE = re.compile(r"^(.*) ((?:[BIESLU]-|O).*)$") +REGEX_IOB_LINE = re.compile(r"^(\S*) ((?:[BIESLU]-|O).*)$") REGEX_LABEL = re.compile(r"[BIESLU]-(.*)$") @@ -45,7 +45,11 @@ def parse_line(index: int, line: str, path: Path): assert match_iob - return match_iob.group(1, 2) + word, label = match_iob.group(1, 2) + # We should have either one - (BLIU-) or none at all (O) + assert label.count("-") <= 1 + + return word, label except AssertionError: raise ( Exception( diff --git a/tests/test_parse_bio.py b/tests/test_parse_bio.py index 681ff52..625e9a2 100644 --- a/tests/test_parse_bio.py +++ b/tests/test_parse_bio.py @@ -208,10 +208,7 @@ def test_parse_line(line, word, label): @pytest.mark.parametrize( "line", - ( - ("HiB-ORG"), - ("HiB-ORG or maybe not"), - ), + (("HiB-ORG"), ("HiB-ORG or maybe not"), ("Hello B-surname and L-ocation")), ) def test_parse_line_crash(line): with pytest.raises(Exception): -- GitLab