Skip to content
Snippets Groups Projects
Verified Commit 660fa714 authored by Yoann Schneider's avatar Yoann Schneider :tennis:
Browse files

Forbid spaces in tokens and "-" in label names

parent e56dff03
No related branches found
No related tags found
1 merge request: !25 "Allow spaces in label (followup)"
Pipeline #104019 passed
......@@ -5,7 +5,7 @@ from pathlib import Path
NOT_ENTITY_TAG = "O"
BEGINNING_POS = ["B", "S", "U"]
REGEX_IOB_LINE = re.compile(r"^(.*) ((?:[BIESLU]-|O).*)$")
REGEX_IOB_LINE = re.compile(r"^(\S*) ((?:[BIESLU]-|O).*)$")
REGEX_LABEL = re.compile(r"[BIESLU]-(.*)$")
......@@ -45,7 +45,11 @@ def parse_line(index: int, line: str, path: Path):
assert match_iob
return match_iob.group(1, 2)
word, label = match_iob.group(1, 2)
# We should have either one - (BLIU-) or none at all (O)
assert label.count("-") <= 1
return word, label
except AssertionError:
raise (
Exception(
......
......@@ -208,10 +208,7 @@ def test_parse_line(line, word, label):
@pytest.mark.parametrize(
"line",
(
("HiB-ORG"),
("HiB-ORG or maybe not"),
),
(("HiB-ORG"), ("HiB-ORG or maybe not"), ("Hello B-surname and L-ocation")),
)
def test_parse_line_crash(line):
with pytest.raises(Exception):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment