diff --git a/bio_parser/parse/document.py b/bio_parser/parse/document.py index 983c5ae551bb55d85164961e7f9271bff3ed2b0f..6be99e9a6fcbccd914034cc4ac5b85a2604b2b8e 100644 --- a/bio_parser/parse/document.py +++ b/bio_parser/parse/document.py @@ -315,7 +315,7 @@ class Document: def char_labels(self) -> list[str]: r"""Character-level IOB labels. - Spaces between two tokens with the same label get the same label, others get 'O'. + Spaces between two tokens that are part of the same entity get the same label, others get 'O'. Examples: The space between 'I' and 'run' is tagged as 'I-Animal', because it's the same named entity label. @@ -325,12 +325,18 @@ class Document: The space between 'run' and 'fast' is tagged as 'O', because it's not the same label. >>> Document(bio_repr="run B-Animal\nfast O").char_labels ['B-Animal', 'I-Animal', 'I-Animal', 'O', 'O', 'O', 'O', 'O'] + + The space between 'run' and 'cat' is tagged as 'O', because it's not the same entity. + >>> Document(bio_repr="run B-Animal\ncat B-Animal").char_labels + ['B-Animal', 'I-Animal', 'I-Animal', 'O', 'B-Animal', 'I-Animal', 'I-Animal'] """ tags = [] for token, next_token in pairwise(self.tokens + [None]): # Add token tags tags.extend(token.labels) - if next_token and token.label == next_token.label: + if next_token and ( + token.label == next_token.label and not next_token.tag == Tag.BEGINNING + ): tags.append(next_token.iob_label) elif next_token: tags.append(Tag.OUTSIDE.value) diff --git a/tests/parse/test_document.py b/tests/parse/test_document.py index e8243e39de0767873e8b18abff97629c758b519a..1329516ff2525cd1068db2eb12ae27dd8d7c91da 100644 --- a/tests/parse/test_document.py +++ b/tests/parse/test_document.py @@ -189,6 +189,25 @@ def test_parse_token(document: Document): assert token.chars == ["r", "o", "b", "o", "t", "s"] +def test_consecutive_entities(): + # BIO FILE + # dog B-Animal + # cat B-Animal + document = Document("dog B-Animal\ncat B-Animal") + + assert document.chars == ["d", 
"o", "g", " ", "c", "a", "t"] + + assert document.char_labels == [ + "B-Animal", + "I-Animal", + "I-Animal", + "O", # Character between two new entities should be set to O + "B-Animal", + "I-Animal", + "I-Animal", + ] + + @pytest.mark.parametrize( "annotation", ["Something something", "Something A-GPE", "Something GPE-A", "Something A"],