From 032c22676615cab5d102fc2138c87adbfa827bfa Mon Sep 17 00:00:00 2001 From: Yoann Schneider <yschneider@teklia.com> Date: Mon, 6 Mar 2023 15:02:28 +0000 Subject: [PATCH] Allow spaces in label (followup) --- nerval/parse.py | 8 ++++++-- tests/test_parse_bio.py | 20 +++++++++++++++----- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/nerval/parse.py b/nerval/parse.py index d0e1fe5..5b92763 100644 --- a/nerval/parse.py +++ b/nerval/parse.py @@ -5,7 +5,7 @@ from pathlib import Path NOT_ENTITY_TAG = "O" BEGINNING_POS = ["B", "S", "U"] -REGEX_IOB_LINE = re.compile(r"^(.*) ([BIESLUO]-?.*)$") +REGEX_IOB_LINE = re.compile(r"^(\S*) ((?:[BIESLU]-|O).*)$") REGEX_LABEL = re.compile(r"[BIESLU]-(.*)$") @@ -45,7 +45,11 @@ def parse_line(index: int, line: str, path: Path): assert match_iob - return match_iob.group(1, 2) + word, label = match_iob.group(1, 2) + # We should have either one - (BLIU-) or none at all (O) + assert label.count("-") <= 1 + + return word, label except AssertionError: raise ( Exception( diff --git a/tests/test_parse_bio.py b/tests/test_parse_bio.py index 7b20b5e..625e9a2 100644 --- a/tests/test_parse_bio.py +++ b/tests/test_parse_bio.py @@ -4,7 +4,7 @@ from pathlib import Path import pytest from nerval import evaluate -from nerval.parse import parse_line +from nerval.parse import get_type_label, parse_line expected_parsed_annot = { "entity_count": {"All": 3, "DAT": 1, "LOC": 1, "PER": 1}, @@ -199,6 +199,7 @@ def test_parse_bio_no_input(): ( ("Hi B-ORG", "Hi", "B-ORG"), ("Hi B-Org or maybe not org", "Hi", "B-Org or maybe not org"), + ("1258 B-Date et Lieu", "1258", "B-Date et Lieu"), ), ) def test_parse_line(line, word, label): @@ -207,11 +208,20 @@ def test_parse_line(line, word, label): @pytest.mark.parametrize( "line", - ( - ("HiB-ORG"), - ("HiB-ORG or maybe not"), - ), + (("HiB-ORG"), ("HiB-ORG or maybe not"), ("Hello B-surname and L-ocation")), ) def test_parse_line_crash(line): with pytest.raises(Exception): parse_line(index=0, line=line, path=Path("")) + + +@pytest.mark.parametrize( + "label, expected_type", + ( + ("B-ORG", "ORG"), + ("B-Date et Lieu", "Date et Lieu"), + ("I-Date et Lieu", "Date et Lieu"), + ), +) +def test_get_type_label(label, expected_type): + assert get_type_label(label) == expected_type -- GitLab