From de1275d29db79474c71907fc279b3680af56979f Mon Sep 17 00:00:00 2001
From: Yoann Schneider <yschneider@teklia.com>
Date: Mon, 6 Mar 2023 10:07:13 +0000
Subject: [PATCH] Support spaces in label

---
 nerval/parse.py         | 33 ++++++++++++++++++++-------------
 tests/test_parse_bio.py | 24 ++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 13 deletions(-)

diff --git a/nerval/parse.py b/nerval/parse.py
index 60c50fd..d0e1fe5 100644
--- a/nerval/parse.py
+++ b/nerval/parse.py
@@ -5,6 +5,9 @@ from pathlib import Path
 NOT_ENTITY_TAG = "O"
 BEGINNING_POS = ["B", "S", "U"]
 
+REGEX_IOB_LINE = re.compile(r"^(.*) ([BIESLUO]-?.*)$")
+REGEX_LABEL = re.compile(r"[BIESLU]-(.*)$")
+
 
 def get_type_label(label: str) -> str:
     """Return the type (tag) of a label
@@ -12,11 +15,7 @@ def get_type_label(label: str) -> str:
     Input format: "[BIESLU]-type"
     """
     try:
-        tag = (
-            NOT_ENTITY_TAG
-            if label == NOT_ENTITY_TAG
-            else re.match(r"[BIESLU]-(.*)$", label)[1]
-        )
+        tag = NOT_ENTITY_TAG if label == NOT_ENTITY_TAG else REGEX_LABEL.match(label)[1]
     except TypeError:
         raise (Exception(f"The label {label} is not valid in BIOES/BIOLU format."))
 
@@ -40,6 +39,21 @@ def get_position_label(label: str) -> str:
     return pos
 
 
+def parse_line(index: int, line: str, path: Path) -> tuple:
+    """Split a BIO line into its (word, label) pair; labels may contain spaces.
+
+    Raises an Exception naming `path` and `index` when the line does not match.
+    """
+    match_iob = REGEX_IOB_LINE.search(line)
+    # Explicit check instead of `assert`, which is stripped under `python -O`.
+    if match_iob is None:
+        raise Exception(
+            f"The file @ {path} is not in BIO format: check line {index} ({line})"
+        )
+
+    return match_iob.group(1, 2)
+
+
 def parse_bio(path: Path) -> dict:
     """Parse a BIO file to get text content, character-level NE labels and entity types count.
 
@@ -68,14 +82,7 @@ def parse_bio(path: Path) -> dict:
     containing_tag = None
 
     for index, line in enumerate(lines):
-        try:
-            word, label = line.split()
-        except ValueError:
-            raise (
-                Exception(
-                    f"The file {path} given in input is not in BIO format: check line {index} ({line})"
-                )
-            )
+        word, label = parse_line(index, line, path)
 
         # Preserve hyphens to avoid confusion with the hyphens added later during alignment
         word = word.replace("-", "§")
diff --git a/tests/test_parse_bio.py b/tests/test_parse_bio.py
index db161b8..7b20b5e 100644
--- a/tests/test_parse_bio.py
+++ b/tests/test_parse_bio.py
@@ -4,6 +4,7 @@ from pathlib import Path
 import pytest
 
 from nerval import evaluate
+from nerval.parse import parse_line
 
 expected_parsed_annot = {
     "entity_count": {"All": 3, "DAT": 1, "LOC": 1, "PER": 1},
@@ -191,3 +192,26 @@ def test_parse_bio_bad_input(bad_bio):
 def test_parse_bio_no_input():
     with pytest.raises(AssertionError):
         evaluate.parse_bio(Path("not_a_bio"))
+
+
+@pytest.mark.parametrize(
+    ("line", "word", "label"),
+    [
+        ("Hi B-ORG", "Hi", "B-ORG"),
+        ("Hi B-Org or maybe not org", "Hi", "B-Org or maybe not org"),
+    ],
+)
+def test_parse_line(line, word, label):
+    assert (word, label) == parse_line(index=0, line=line, path=Path(""))
+
+
+@pytest.mark.parametrize(
+    "line",
+    [
+        "HiB-ORG",
+        "HiB-ORG or maybe not",
+    ],
+)
+def test_parse_line_crash(line):
+    with pytest.raises(Exception, match="not in BIO format"):
+        parse_line(index=0, line=line, path=Path(""))
-- 
GitLab