Skip to content
Snippets Groups Projects
Commit de1275d2 authored by Yoann Schneider's avatar Yoann Schneider :tennis:
Browse files

Support spaces in label

parent 28a38eb7
No related branches found
No related tags found
1 merge request: !24 "Support spaces in label"
Pipeline #104017 passed
......@@ -5,6 +5,9 @@ from pathlib import Path
# Label carried by tokens that do not belong to any entity.
NOT_ENTITY_TAG = "O"
# Position prefixes listed as entity beginnings (presumably B=begin, S=single,
# U=unit in the BIOES/BIOLU schemes -- confirm against get_position_label).
BEGINNING_POS = ["B", "S", "U"]
# One line of a BIO file: "<word> <label>".
# Group 1 is the word, group 2 the label. The label may contain spaces, so the
# word is restricted to a whitespace-free token (\S*): a greedy ".*" would let
# the word swallow the start of a multi-word label whenever a later label word
# begins with one of B/I/E/S/L/U/O (e.g. "Hi B-Org Unit" -> word "Hi B-Org").
REGEX_IOB_LINE = re.compile(r"^(\S*) ([BIESLUO]-?.*)$")
# A "[BIESLU]-type" label; group 1 captures the type.
REGEX_LABEL = re.compile(r"[BIESLU]-(.*)$")
def get_type_label(label: str) -> str:
"""Return the type (tag) of a label
......@@ -12,11 +15,7 @@ def get_type_label(label: str) -> str:
Input format: "[BIESLU]-type"
"""
try:
tag = (
NOT_ENTITY_TAG
if label == NOT_ENTITY_TAG
else re.match(r"[BIESLU]-(.*)$", label)[1]
)
tag = NOT_ENTITY_TAG if label == NOT_ENTITY_TAG else REGEX_LABEL.match(label)[1]
except TypeError:
raise (Exception(f"The label {label} is not valid in BIOES/BIOLU format."))
......@@ -40,6 +39,21 @@ def get_position_label(label: str) -> str:
return pos
def parse_line(index: int, line: str, path: Path) -> tuple:
    """Split one line of a BIO file into its word and its label.

    The line is matched against ``REGEX_IOB_LINE``; its two captured groups
    are returned as a ``(word, label)`` tuple.

    :param index: Position of the line in the file, only used in the error message.
    :param line: Raw line to parse.
    :param path: Path of the file being parsed, only used in the error message.
    :return: Tuple ``(word, label)``.
    :raises Exception: If the line does not match ``REGEX_IOB_LINE``.
    """
    match_iob = REGEX_IOB_LINE.search(line)
    # Explicit check instead of `assert`: assertions are removed when Python
    # runs with -O, which would turn a malformed line into an AttributeError
    # on `None.group` rather than the descriptive error below.
    if match_iob is None:
        raise Exception(
            f"The file @ {path} is not in BIO format: check line {index} ({line})"
        )
    return match_iob.group(1, 2)
def parse_bio(path: Path) -> dict:
"""Parse a BIO file to get text content, character-level NE labels and entity types count.
......@@ -68,14 +82,7 @@ def parse_bio(path: Path) -> dict:
containing_tag = None
for index, line in enumerate(lines):
try:
word, label = line.split()
except ValueError:
raise (
Exception(
f"The file {path} given in input is not in BIO format: check line {index} ({line})"
)
)
word, label = parse_line(index, line, path)
# Preserve hyphens to avoid confusion with the hyphens added later during alignment
word = word.replace("-", "§")
......
......@@ -4,6 +4,7 @@ from pathlib import Path
import pytest
from nerval import evaluate
from nerval.parse import parse_line
expected_parsed_annot = {
"entity_count": {"All": 3, "DAT": 1, "LOC": 1, "PER": 1},
......@@ -191,3 +192,26 @@ def test_parse_bio_bad_input(bad_bio):
def test_parse_bio_no_input():
with pytest.raises(AssertionError):
evaluate.parse_bio(Path("not_a_bio"))
@pytest.mark.parametrize(
    ("line", "word", "label"),
    [
        # Plain single-token label.
        ("Hi B-ORG", "Hi", "B-ORG"),
        # Label containing spaces: everything after the word belongs to it.
        ("Hi B-Org or maybe not org", "Hi", "B-Org or maybe not org"),
    ],
)
def test_parse_line(line, word, label):
    """A well-formed line is split into its (word, label) pair."""
    parsed = parse_line(index=0, line=line, path=Path(""))
    expected = (word, label)
    assert parsed == expected
@pytest.mark.parametrize(
    "line",
    (
        # No space between the word and the label.
        "HiB-ORG",
        # Spaces are present, but none of them is followed by a valid label.
        "HiB-ORG or maybe not",
    ),
)
def test_parse_line_crash(line):
    """A line that is not "<word> <label>" is rejected with the BIO format error."""
    # `match` pins the expected message: a bare `pytest.raises(Exception)`
    # would also pass on unrelated failures (TypeError, NameError, ...).
    with pytest.raises(Exception, match="is not in BIO format"):
        parse_line(index=0, line=line, path=Path(""))
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment