From 660fa714fb0bf74b6de93f6c50cfafb8b40596c1 Mon Sep 17 00:00:00 2001
From: Yoann Schneider <yschneider@teklia.com>
Date: Mon, 6 Mar 2023 15:59:35 +0100
Subject: [PATCH] Forbid spaces in tokens and hyphens in label names

---
 nerval/parse.py         | 8 ++++++--
 tests/test_parse_bio.py | 5 +----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/nerval/parse.py b/nerval/parse.py
index dd140f2..5b92763 100644
--- a/nerval/parse.py
+++ b/nerval/parse.py
@@ -5,7 +5,7 @@ from pathlib import Path
 NOT_ENTITY_TAG = "O"
 BEGINNING_POS = ["B", "S", "U"]
 
-REGEX_IOB_LINE = re.compile(r"^(.*) ((?:[BIESLU]-|O).*)$")
+REGEX_IOB_LINE = re.compile(r"^(\S*) ((?:[BIESLU]-|O).*)$")
 REGEX_LABEL = re.compile(r"[BIESLU]-(.*)$")
 
 
@@ -45,7 +45,11 @@ def parse_line(index: int, line: str, path: Path):
 
         assert match_iob
 
-        return match_iob.group(1, 2)
+        word, label = match_iob.group(1, 2)
+        # A label holds at most one hyphen: the one in its [BIESLU]- prefix, or none at all (O)
+        assert label.count("-") <= 1
+
+        return word, label
     except AssertionError:
         raise (
             Exception(
diff --git a/tests/test_parse_bio.py b/tests/test_parse_bio.py
index 681ff52..625e9a2 100644
--- a/tests/test_parse_bio.py
+++ b/tests/test_parse_bio.py
@@ -208,10 +208,7 @@ def test_parse_line(line, word, label):
 
 @pytest.mark.parametrize(
     "line",
-    (
-        ("HiB-ORG"),
-        ("HiB-ORG or maybe not"),
-    ),
+    (("HiB-ORG"), ("HiB-ORG or maybe not"), ("Hello B-surname and L-ocation")),
 )
 def test_parse_line_crash(line):
     with pytest.raises(Exception):
-- 
GitLab