From de1275d29db79474c71907fc279b3680af56979f Mon Sep 17 00:00:00 2001 From: Yoann Schneider <yschneider@teklia.com> Date: Mon, 6 Mar 2023 10:07:13 +0000 Subject: [PATCH] Support spaces in label --- nerval/parse.py | 33 ++++++++++++++++++++------------- tests/test_parse_bio.py | 24 ++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 13 deletions(-) diff --git a/nerval/parse.py b/nerval/parse.py index 60c50fd..d0e1fe5 100644 --- a/nerval/parse.py +++ b/nerval/parse.py @@ -5,6 +5,9 @@ from pathlib import Path NOT_ENTITY_TAG = "O" BEGINNING_POS = ["B", "S", "U"] +REGEX_IOB_LINE = re.compile(r"^(.*) ([BIESLUO]-?.*)$") +REGEX_LABEL = re.compile(r"[BIESLU]-(.*)$") + def get_type_label(label: str) -> str: """Return the type (tag) of a label @@ -12,11 +15,7 @@ def get_type_label(label: str) -> str: Input format: "[BIESLU]-type" """ try: - tag = ( - NOT_ENTITY_TAG - if label == NOT_ENTITY_TAG - else re.match(r"[BIESLU]-(.*)$", label)[1] - ) + tag = NOT_ENTITY_TAG if label == NOT_ENTITY_TAG else REGEX_LABEL.match(label)[1] except TypeError: raise (Exception(f"The label {label} is not valid in BIOES/BIOLU format.")) @@ -40,6 +39,21 @@ def get_position_label(label: str) -> str: return pos +def parse_line(index: int, line: str, path: Path): + try: + match_iob = REGEX_IOB_LINE.search(line) + + assert match_iob + + return match_iob.group(1, 2) + except AssertionError: + raise ( + Exception( + f"The file @ {path} is not in BIO format: check line {index} ({line})" + ) + ) + + def parse_bio(path: Path) -> dict: """Parse a BIO file to get text content, character-level NE labels and entity types count. @@ -68,14 +82,7 @@ def parse_bio(path: Path) -> dict: containing_tag = None for index, line in enumerate(lines): - try: - word, label = line.split() - except ValueError: - raise ( - Exception( - f"The file {path} given in input is not in BIO format: check line {index} ({line})" - ) - ) + word, label = parse_line(index, line, path) # Preserve hyphens to avoid confusion with the hyphens added later during alignment word = word.replace("-", "§") diff --git a/tests/test_parse_bio.py b/tests/test_parse_bio.py index db161b8..7b20b5e 100644 --- a/tests/test_parse_bio.py +++ b/tests/test_parse_bio.py @@ -4,6 +4,7 @@ from pathlib import Path import pytest from nerval import evaluate +from nerval.parse import parse_line expected_parsed_annot = { "entity_count": {"All": 3, "DAT": 1, "LOC": 1, "PER": 1}, @@ -191,3 +192,26 @@ def test_parse_bio_bad_input(bad_bio): def test_parse_bio_no_input(): with pytest.raises(AssertionError): evaluate.parse_bio(Path("not_a_bio")) + + +@pytest.mark.parametrize( + "line, word, label", + ( + ("Hi B-ORG", "Hi", "B-ORG"), + ("Hi B-Org or maybe not org", "Hi", "B-Org or maybe not org"), + ), +) +def test_parse_line(line, word, label): + assert parse_line(index=0, line=line, path=Path("")) == (word, label) + + +@pytest.mark.parametrize( + "line", + ( + ("HiB-ORG"), + ("HiB-ORG or maybe not"), + ), +) +def test_parse_line_crash(line): + with pytest.raises(Exception): + parse_line(index=0, line=line, path=Path("")) -- GitLab