Skip to content
Snippets Groups Projects
Verified Commit 660fa714 authored by Yoann Schneider :tennis:
Browse files

forbid spaces in token and - in labels

parent e56dff03
No related branches found
No related tags found
1 merge request: !25 Allow spaces in label (followup)
Pipeline #104019 passed
...@@ -5,7 +5,7 @@ from pathlib import Path ...@@ -5,7 +5,7 @@ from pathlib import Path
NOT_ENTITY_TAG = "O" NOT_ENTITY_TAG = "O"
BEGINNING_POS = ["B", "S", "U"] BEGINNING_POS = ["B", "S", "U"]
REGEX_IOB_LINE = re.compile(r"^(.*) ((?:[BIESLU]-|O).*)$") REGEX_IOB_LINE = re.compile(r"^(\S*) ((?:[BIESLU]-|O).*)$")
REGEX_LABEL = re.compile(r"[BIESLU]-(.*)$") REGEX_LABEL = re.compile(r"[BIESLU]-(.*)$")
...@@ -45,7 +45,11 @@ def parse_line(index: int, line: str, path: Path): ...@@ -45,7 +45,11 @@ def parse_line(index: int, line: str, path: Path):
assert match_iob assert match_iob
return match_iob.group(1, 2) word, label = match_iob.group(1, 2)
# We should have either one - (BLIU-) or none at all (O)
assert label.count("-") <= 1
return word, label
except AssertionError: except AssertionError:
raise ( raise (
Exception( Exception(
......
...@@ -208,10 +208,7 @@ def test_parse_line(line, word, label): ...@@ -208,10 +208,7 @@ def test_parse_line(line, word, label):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"line", "line",
( (("HiB-ORG"), ("HiB-ORG or maybe not"), ("Hello B-surname and L-ocation")),
("HiB-ORG"),
("HiB-ORG or maybe not"),
),
) )
def test_parse_line_crash(line): def test_parse_line_crash(line):
with pytest.raises(Exception): with pytest.raises(Exception):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment