Skip to content
Snippets Groups Projects

Handle all beginning labels

Merged Thibault Lavigne requested to merge handle_all_beginning_labels into master
3 files
+ 143
55
Compare changes
  • Side-by-side
  • Inline
Files
3
+ 36
11
@@ -12,6 +12,7 @@ import termtables as tt
# Label carried by tokens that do not belong to any entity.
NOT_ENTITY_TAG = "O"
# NOTE(review): usage is not visible in this excerpt — presumably a score cutoff; confirm.
THRESHOLD = 0.30
# Position prefixes treated as opening a new entity when compared against
# the result of get_position_label().
BEGINNING_POS = ["B", "S", "U"]
def get_type_label(label: str) -> str:
@@ -31,6 +32,23 @@ def get_type_label(label: str) -> str:
return tag
def get_position_label(label: str) -> str:
    """Extract the position prefix of a label.

    Expected input is either the non-entity tag (NOT_ENTITY_TAG), which is
    returned unchanged, or a string starting with one of the letters
    B, I, E, L, U, S followed by "-" and at least three more characters,
    in which case that leading letter is returned.

    Raises:
        Exception: if the label matches neither of the two forms above.
    """
    # The non-entity tag has no prefix to strip: it is its own position.
    if label == NOT_ENTITY_TAG:
        return NOT_ENTITY_TAG

    try:
        # re.match returns None on failure; subscripting None raises the
        # TypeError that is translated into a readable error below.
        return re.match(r"([BIELUS])-.{3,4}", label)[1]
    except TypeError:
        raise Exception(f"The label {label} is not valid in BIOES/BILOU format.")
def parse_bio(path: str) -> dict:
"""Parse a BIO file to get text content, character-level NE labels and entity types count.
@@ -75,13 +93,17 @@ def parse_bio(path: str) -> dict:
if index != 0:
# If new word has same tag as previous, not new entity and in entity, continue entity
if last_tag == tag and "B" not in label and tag != NOT_ENTITY_TAG:
if (
last_tag == tag
and get_position_label(label) not in BEGINNING_POS
and tag != NOT_ENTITY_TAG
):
labels.append(f"I-{last_tag}")
# If new word begins a new entity of different type, check for nested entity to correctly tag the space
elif (
last_tag != tag
and "B" in label
and get_position_label(label) in BEGINNING_POS
and tag != NOT_ENTITY_TAG
and last_tag != NOT_ENTITY_TAG
):
@@ -99,7 +121,7 @@ def parse_bio(path: str) -> dict:
# Check for continuation of the original entity
if (
index < len(lines)
and "B" not in future_label
and get_position_label(future_label) not in BEGINNING_POS
and get_type_label(future_label) == last_tag
):
labels.append(f"I-{last_tag}")
@@ -117,13 +139,13 @@ def parse_bio(path: str) -> dict:
in_nested_entity = False
# Add a tag for each letter in the word
if "B" in label:
if get_position_label(label) in BEGINNING_POS:
labels += [f"B-{tag}"] + [f"I-{tag}"] * (len(word) - 1)
else:
labels += [label] * len(word)
# Count nb entity for each type
if "B" in label:
if get_position_label(label) in BEGINNING_POS:
entity_count[tag] = entity_count.get(tag, 0) + 1
entity_count["All"] += 1
@@ -171,7 +193,7 @@ def look_for_further_entity_part(index, tag, characters, labels):
index += 1
while (
index < len(characters)
and "B" not in labels[index]
and get_position_label(labels[index]) not in BEGINNING_POS
and get_type_label(labels[index]) == tag
):
visited.append(index)
@@ -254,7 +276,7 @@ def compute_matches(
else:
# If beginning new entity
if "B" in label_ref:
if get_position_label(label_ref) in BEGINNING_POS:
current_ref, current_compar = [], []
last_tag = tag_ref
found_aligned_beginning = False
@@ -269,18 +291,21 @@ def compute_matches(
continue
# If just beginning new entity, backtrack tags on prediction string
if len(current_ref) == 1 and "B" not in labels_predict[i]:
if (
len(current_ref) == 1
and get_position_label(labels_predict[i]) not in BEGINNING_POS
):
j = i - 1
while (
j >= 0
and get_type_label(labels_predict[j]) == tag_ref
and "B" not in labels_predict[j]
and get_position_label(labels_predict[j]) not in BEGINNING_POS
and j not in visited_predict
):
j -= 1
if (
"B" in labels_predict[j]
get_position_label(labels_predict[j]) in BEGINNING_POS
and get_type_label(labels_predict[j]) == tag_ref
and j not in visited_predict
):
@@ -372,7 +397,7 @@ def get_labels_aligned(original: str, aligned: str, labels_original: list) -> li
elif not char == original[index_original]:
new_label = (
last_label
if "B" not in last_label
if get_position_label(last_label) not in BEGINNING_POS
else f"I-{get_type_label(last_label)}"
)
Loading