Skip to content
Snippets Groups Projects

Support spaces in label

Merged Yoann Schneider requested to merge support-spaces-in-entity-names into master
2 files
+ 44
13
Compare changes
  • Side-by-side
  • Inline
Files
2
+ 20
13
@@ -5,6 +5,9 @@ from pathlib import Path
# Label that get_type_label returns unchanged instead of extracting a type
# from it (presumably the tag of tokens outside any entity — confirm).
NOT_ENTITY_TAG = "O"
# Position prefixes; presumably those that open a new entity — usage is not
# visible here, verify against callers.
BEGINNING_POS = ["B", "S", "U"]
# One line of a BIO file: group 1 is the text before the separating space,
# group 2 is the label (a letter among BIESLUO, an optional hyphen, then any text).
# NOTE(review): group 1 is greedy, so the split happens at the LAST space that is
# followed by one of those letters; a label such as "B-Some Location" would be
# split before "Location". Confirm whether such labels must be supported.
REGEX_IOB_LINE = re.compile(r"^(.*) ([BIESLUO]-?.*)$")
# Label with a position prefix: captures everything after "<letter>-" as the type.
REGEX_LABEL = re.compile(r"[BIESLU]-(.*)$")
def get_type_label(label: str) -> str:
"""Return the type (tag) of a label
@@ -12,11 +15,7 @@ def get_type_label(label: str) -> str:
Input format: "[BIESLU]-type"
"""
try:
tag = (
NOT_ENTITY_TAG
if label == NOT_ENTITY_TAG
else re.match(r"[BIESLU]-(.*)$", label)[1]
)
tag = NOT_ENTITY_TAG if label == NOT_ENTITY_TAG else REGEX_LABEL.match(label)[1]
except TypeError:
raise (Exception(f"The label {label} is not valid in BIOES/BIOLU format."))
@@ -40,6 +39,21 @@ def get_position_label(label: str) -> str:
return pos
def parse_line(index: int, line: str, path: Path) -> tuple:
    """Split one line of a BIO file into its word and its label.

    The line is searched with ``REGEX_IOB_LINE`` and the two captured groups
    are returned as ``(word, label)``.

    Args:
        index: Position of the line in the file; only used in the error message.
        line: Raw line to parse.
        path: Path of the file being parsed; only used in the error message.

    Returns:
        A 2-tuple holding groups 1 and 2 captured by ``REGEX_IOB_LINE``.

    Raises:
        Exception: If the line does not match ``REGEX_IOB_LINE``.
    """
    match_iob = REGEX_IOB_LINE.search(line)
    # Explicit check instead of `assert`: assertions are removed under
    # `python -O`, which would turn a malformed line into an AttributeError
    # on `None.group` rather than the intended format error.
    if match_iob is None:
        raise Exception(
            f"The file @ {path} is not in BIO format: check line {index} ({line})"
        )
    return match_iob.group(1, 2)
def parse_bio(path: Path) -> dict:
"""Parse a BIO file to get text content, character-level NE labels and entity types count.
@@ -68,14 +82,7 @@ def parse_bio(path: Path) -> dict:
containing_tag = None
for index, line in enumerate(lines):
try:
word, label = line.split()
except ValueError:
raise (
Exception(
f"The file {path} given in input is not in BIO format: check line {index} ({line})"
)
)
word, label = parse_line(index, line, path)
# Preserve hyphens to avoid confusion with the hyphens added later during alignment
word = word.replace("-", "§")
Loading