From 032c22676615cab5d102fc2138c87adbfa827bfa Mon Sep 17 00:00:00 2001
From: Yoann Schneider <yschneider@teklia.com>
Date: Mon, 6 Mar 2023 15:02:28 +0000
Subject: [PATCH] Allow spaces in label (followup)

---
 nerval/parse.py         |  8 ++++++--
 tests/test_parse_bio.py | 20 +++++++++++++++-----
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/nerval/parse.py b/nerval/parse.py
index d0e1fe5..5b92763 100644
--- a/nerval/parse.py
+++ b/nerval/parse.py
@@ -5,7 +5,7 @@ from pathlib import Path
 NOT_ENTITY_TAG = "O"
 BEGINNING_POS = ["B", "S", "U"]
 
-REGEX_IOB_LINE = re.compile(r"^(.*) ([BIESLUO]-?.*)$")
+REGEX_IOB_LINE = re.compile(r"^(\S*) ((?:[BIESLU]-|O).*)$")
 REGEX_LABEL = re.compile(r"[BIESLU]-(.*)$")
 
 
@@ -45,7 +45,11 @@ def parse_line(index: int, line: str, path: Path):
 
         assert match_iob
 
-        return match_iob.group(1, 2)
+        word, label = match_iob.group(1, 2)
+        # At most one hyphen is allowed: after the prefix ([BIESLU]-), or none (O)
+        assert label.count("-") <= 1
+
+        return word, label
     except AssertionError:
         raise (
             Exception(
diff --git a/tests/test_parse_bio.py b/tests/test_parse_bio.py
index 7b20b5e..625e9a2 100644
--- a/tests/test_parse_bio.py
+++ b/tests/test_parse_bio.py
@@ -4,7 +4,7 @@ from pathlib import Path
 import pytest
 
 from nerval import evaluate
-from nerval.parse import parse_line
+from nerval.parse import get_type_label, parse_line
 
 expected_parsed_annot = {
     "entity_count": {"All": 3, "DAT": 1, "LOC": 1, "PER": 1},
@@ -199,6 +199,7 @@ def test_parse_bio_no_input():
     (
         ("Hi B-ORG", "Hi", "B-ORG"),
         ("Hi B-Org or maybe not org", "Hi", "B-Org or maybe not org"),
+        ("1258 B-Date et Lieu", "1258", "B-Date et Lieu"),
     ),
 )
 def test_parse_line(line, word, label):
@@ -207,11 +208,20 @@ def test_parse_line(line, word, label):
 
 @pytest.mark.parametrize(
     "line",
-    (
-        ("HiB-ORG"),
-        ("HiB-ORG or maybe not"),
-    ),
+    (("HiB-ORG"), ("HiB-ORG or maybe not"), ("Hello B-surname and L-ocation")),
 )
 def test_parse_line_crash(line):
     with pytest.raises(Exception):
         parse_line(index=0, line=line, path=Path(""))
+
+
+@pytest.mark.parametrize(
+    "label, expected_type",
+    (
+        ("B-ORG", "ORG"),
+        ("B-Date et Lieu", "Date et Lieu"),
+        ("I-Date et Lieu", "Date et Lieu"),
+    ),
+)
+def test_get_type_label(label, expected_type):
+    assert get_type_label(label) == expected_type
-- 
GitLab