Skip to content
Snippets Groups Projects
Commit 032c2267 authored by Yoann Schneider :tennis:
Browse files

Allow spaces in label (followup)

parent de1275d2
No related branches found
No related tags found
1 merge request: !25 Allow spaces in label (followup)
Pipeline #104020 passed
...@@ -5,7 +5,7 @@ from pathlib import Path ...@@ -5,7 +5,7 @@ from pathlib import Path
NOT_ENTITY_TAG = "O" NOT_ENTITY_TAG = "O"
BEGINNING_POS = ["B", "S", "U"] BEGINNING_POS = ["B", "S", "U"]
REGEX_IOB_LINE = re.compile(r"^(.*) ([BIESLUO]-?.*)$") REGEX_IOB_LINE = re.compile(r"^(\S*) ((?:[BIESLU]-|O).*)$")
REGEX_LABEL = re.compile(r"[BIESLU]-(.*)$") REGEX_LABEL = re.compile(r"[BIESLU]-(.*)$")
...@@ -45,7 +45,11 @@ def parse_line(index: int, line: str, path: Path): ...@@ -45,7 +45,11 @@ def parse_line(index: int, line: str, path: Path):
assert match_iob assert match_iob
return match_iob.group(1, 2) word, label = match_iob.group(1, 2)
# We should have either one - (BLIU-) or none at all (O)
assert label.count("-") <= 1
return word, label
except AssertionError: except AssertionError:
raise ( raise (
Exception( Exception(
......
...@@ -4,7 +4,7 @@ from pathlib import Path ...@@ -4,7 +4,7 @@ from pathlib import Path
import pytest import pytest
from nerval import evaluate from nerval import evaluate
from nerval.parse import parse_line from nerval.parse import get_type_label, parse_line
expected_parsed_annot = { expected_parsed_annot = {
"entity_count": {"All": 3, "DAT": 1, "LOC": 1, "PER": 1}, "entity_count": {"All": 3, "DAT": 1, "LOC": 1, "PER": 1},
...@@ -199,6 +199,7 @@ def test_parse_bio_no_input(): ...@@ -199,6 +199,7 @@ def test_parse_bio_no_input():
( (
("Hi B-ORG", "Hi", "B-ORG"), ("Hi B-ORG", "Hi", "B-ORG"),
("Hi B-Org or maybe not org", "Hi", "B-Org or maybe not org"), ("Hi B-Org or maybe not org", "Hi", "B-Org or maybe not org"),
("1258 B-Date et Lieu", "1258", "B-Date et Lieu"),
), ),
) )
def test_parse_line(line, word, label): def test_parse_line(line, word, label):
...@@ -207,11 +208,20 @@ def test_parse_line(line, word, label): ...@@ -207,11 +208,20 @@ def test_parse_line(line, word, label):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"line", "line",
( (("HiB-ORG"), ("HiB-ORG or maybe not"), ("Hello B-surname and L-ocation")),
("HiB-ORG"),
("HiB-ORG or maybe not"),
),
) )
def test_parse_line_crash(line): def test_parse_line_crash(line):
with pytest.raises(Exception): with pytest.raises(Exception):
parse_line(index=0, line=line, path=Path("")) parse_line(index=0, line=line, path=Path(""))
@pytest.mark.parametrize(
"label, expected_type",
(
("B-ORG", "ORG"),
("B-Date et Lieu", "Date et Lieu"),
("I-Date et Lieu", "Date et Lieu"),
),
)
def test_get_type_label(label, expected_type):
assert get_type_label(label) == expected_type
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment