Skip to content
Snippets Groups Projects
Commit 9d5df0ab authored by Yoann Schneider's avatar Yoann Schneider :tennis:
Browse files

Correct char label between entities

parent 253bd2f7
No related branches found
No related tags found
1 merge request!5Correct char label between entities
Pipeline #154690 passed
......@@ -315,7 +315,7 @@ class Document:
def char_labels(self) -> list[str]:
r"""Character-level IOB labels.
Spaces between two tokens with the same label get the same label, others get 'O'.
Spaces between two tokens that are part of the same entity (and therefore share the same label) get that label, others get 'O'.
Examples:
The space between 'I' and 'run' is tagged as 'I-Animal', because it's the same named entity label.
......@@ -325,12 +325,18 @@ class Document:
The space between 'run' and 'fast' is tagged as 'O', because it's not the same label.
>>> Document(bio_repr="run B-Animal\nfast O").char_labels
['B-Animal', 'I-Animal', 'I-Animal', 'O', 'O', 'O', 'O', 'O']
The space between 'run' and 'cat' is tagged as 'O', because it's not the same entity.
>>> Document(bio_repr="run B-Animal\ncat B-Animal").char_labels
['B-Animal', 'I-Animal', 'I-Animal', 'O', 'B-Animal', 'I-Animal', 'I-Animal']
"""
tags = []
for token, next_token in pairwise(self.tokens + [None]):
# Add token tags
tags.extend(token.labels)
if next_token and token.label == next_token.label:
if next_token and (
token.label == next_token.label and not next_token.tag == Tag.BEGINNING
):
tags.append(next_token.iob_label)
elif next_token:
tags.append(Tag.OUTSIDE.value)
......
......@@ -189,6 +189,25 @@ def test_parse_token(document: Document):
assert token.chars == ["r", "o", "b", "o", "t", "s"]
def test_consecutive_entities():
    """Check the character labels of two back-to-back entities sharing a label.

    The BIO input holds two tokens that each open their own entity:

        dog B-Animal
        cat B-Animal
    """
    doc = Document("dog B-Animal\ncat B-Animal")

    expected_chars = ["d", "o", "g", " ", "c", "a", "t"]
    expected_labels = [
        # "dog"
        "B-Animal",
        "I-Animal",
        "I-Animal",
        # Character between two new entities should be set to O
        "O",
        # "cat"
        "B-Animal",
        "I-Animal",
        "I-Animal",
    ]

    assert doc.chars == expected_chars
    assert doc.char_labels == expected_labels
@pytest.mark.parametrize(
"annotation",
["Something something", "Something A-GPE", "Something GPE-A", "Something A"],
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment