From d77e9801adfe549ef6057d8f0c2240377934c51c Mon Sep 17 00:00:00 2001 From: Blanche Miret <bmiret@teklia.com> Date: Wed, 2 Jun 2021 07:33:54 +0000 Subject: [PATCH] Handle all beginning labels --- nerval/evaluate.py | 47 ++++++++++---- tests/bioues.bio | 10 +++ tests/test_parse_bio.py | 141 +++++++++++++++++++++++++++------------- 3 files changed, 143 insertions(+), 55 deletions(-) create mode 100644 tests/bioues.bio diff --git a/nerval/evaluate.py b/nerval/evaluate.py index 8b23527..3621865 100644 --- a/nerval/evaluate.py +++ b/nerval/evaluate.py @@ -12,6 +12,7 @@ import termtables as tt NOT_ENTITY_TAG = "O" THRESHOLD = 0.30 +BEGINNING_POS = ["B", "S", "U"] def get_type_label(label: str) -> str: @@ -31,6 +32,23 @@ def get_type_label(label: str) -> str: return tag +def get_position_label(label: str) -> str: + """Return the position of a label + + Input format: "[BIELUS]-type" + """ + try: + pos = ( + NOT_ENTITY_TAG + if label == NOT_ENTITY_TAG + else re.match(r"([BIELUS])-.{3,4}", label)[1] + ) + except TypeError: + raise (Exception(f"The label {label} is not valid in BIOES/BILOU format.")) + + return pos + + def parse_bio(path: str) -> dict: """Parse a BIO file to get text content, character-level NE labels and entity types count. @@ -75,13 +93,17 @@ def parse_bio(path: str) -> dict: if index != 0: # If new word has same tag as previous, not new entity and in entity, continue entity - if last_tag == tag and "B" not in label and tag != NOT_ENTITY_TAG: + if ( + last_tag == tag + and get_position_label(label) not in BEGINNING_POS + and tag != NOT_ENTITY_TAG + ): labels.append(f"I-{last_tag}") # If new word begins a new entity of different type, check for nested entity to correctly tag the space elif ( last_tag != tag - and "B" in label + and get_position_label(label) in BEGINNING_POS and tag != NOT_ENTITY_TAG and last_tag != NOT_ENTITY_TAG ): @@ -99,7 +121,7 @@ def parse_bio(path: str) -> dict: # Check for continuation of the original entity if ( index < len(lines) - and "B" not in future_label + and get_position_label(future_label) not in BEGINNING_POS and get_type_label(future_label) == last_tag ): labels.append(f"I-{last_tag}") @@ -117,13 +139,13 @@ def parse_bio(path: str) -> dict: in_nested_entity = False # Add a tag for each letter in the word - if "B" in label: + if get_position_label(label) in BEGINNING_POS: labels += [f"B-{tag}"] + [f"I-{tag}"] * (len(word) - 1) else: labels += [label] * len(word) # Count nb entity for each type - if "B" in label: + if get_position_label(label) in BEGINNING_POS: entity_count[tag] = entity_count.get(tag, 0) + 1 entity_count["All"] += 1 @@ -171,7 +193,7 @@ def look_for_further_entity_part(index, tag, characters, labels): index += 1 while ( index < len(characters) - and "B" not in labels[index] + and get_position_label(labels[index]) not in BEGINNING_POS and get_type_label(labels[index]) == tag ): visited.append(index) @@ -254,7 +276,7 @@ def compute_matches( else: # If beginning new entity - if "B" in label_ref: + if get_position_label(label_ref) in BEGINNING_POS: current_ref, current_compar = [], [] last_tag = tag_ref found_aligned_beginning = False @@ -269,18 +291,21 @@ def compute_matches( continue # If just beginning new entity, backtrack tags on prediction string - if len(current_ref) == 1 and "B" not in labels_predict[i]: + if ( + len(current_ref) == 1 + and get_position_label(labels_predict[i]) not in BEGINNING_POS + ): j = i - 1 while ( j >= 0 and get_type_label(labels_predict[j]) == tag_ref - and "B" not in labels_predict[j] + and get_position_label(labels_predict[j]) not in BEGINNING_POS and j not in visited_predict ): j -= 1 if ( - "B" in labels_predict[j] + get_position_label(labels_predict[j]) in BEGINNING_POS and get_type_label(labels_predict[j]) == tag_ref and j not in visited_predict ): @@ -372,7 +397,7 @@ def get_labels_aligned(original: str, aligned: str, labels_original: list) -> li elif not char == original[index_original]: new_label = ( last_label - if "B" not in last_label + if get_position_label(last_label) not in BEGINNING_POS else f"I-{get_type_label(last_label)}" ) diff --git a/tests/bioues.bio b/tests/bioues.bio new file mode 100644 index 0000000..6faa87d --- /dev/null +++ b/tests/bioues.bio @@ -0,0 +1,10 @@ +Gérard B-PER +de I-PER +Nerval I-PER +was O +born O +in O +Paris U-LOC +in O +1808 S-DAT +. O diff --git a/tests/test_parse_bio.py b/tests/test_parse_bio.py index 2572dc8..7084c5e 100644 --- a/tests/test_parse_bio.py +++ b/tests/test_parse_bio.py @@ -8,58 +8,110 @@ EMPTY_BIO = "tests/test_empty.bio" BAD_BIO = "tests/test_bad.bio" FAKE_ANNOT_BIO = "tests/test_annot.bio" FAKE_PREDICT_BIO = "tests/test_predict.bio" +BIOUES_BIO = "tests/bioues.bio" + -# fmt: off expected_parsed_annot = { - 'entity_count': {'All': 3, 'DAT': 1, 'LOC': 1, 'PER': 1}, - 'labels': [ - 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', - 'O', - 'O', 'O', 'O', - 'O', - 'O', 'O', 'O', 'O', - 'O', - 'O', 'O', - 'O', - 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', - 'O', - 'O', 'O', - 'O', - 'B-DAT', 'I-DAT', 'I-DAT', 'I-DAT', - 'O', - 'O' + "entity_count": {"All": 3, "DAT": 1, "LOC": 1, "PER": 1}, + "labels": [ + "B-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "B-LOC", + "I-LOC", + "I-LOC", + "I-LOC", + "I-LOC", + "O", + "O", + "O", + "O", + "B-DAT", + "I-DAT", + "I-DAT", + "I-DAT", + "O", + "O", ], - 'words': 'Gérard de Nerval was born in Paris in 1808 .' + "words": "Gérard de Nerval was born in Paris in 1808 .", } expected_parsed_predict = { - 'entity_count': {'All': 3, 'DAT': 1, '***': 1, 'PER': 1}, - 'labels': [ - 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', - 'O', - 'O', 'O', 'O', 'O', 'O', - 'O', - 'O', 'O', - 'O', - 'B-***', 'I-***', 'I-***', 'I-***', 'I-***', - 'O', - 'O', 'O', - 'O', - 'B-DAT', 'I-DAT', 'I-DAT', 'I-DAT', - 'O', - 'O', 'O' + "entity_count": {"All": 3, "DAT": 1, "***": 1, "PER": 1}, + "labels": [ + "B-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "B-***", + "I-***", + "I-***", + "I-***", + "I-***", + "O", + "O", + "O", + "O", + "B-DAT", + "I-DAT", + "I-DAT", + "I-DAT", + "O", + "O", + "O", ], - 'words': 'G*rard de *N*erval bo*rn in Paris in 1833 *.' + "words": "G*rard de *N*erval bo*rn in Paris in 1833 *.", } -# fmt: on @pytest.mark.parametrize( @@ -68,6 +120,7 @@ expected_parsed_predict = { (FAKE_ANNOT_BIO, expected_parsed_annot), (FAKE_PREDICT_BIO, expected_parsed_predict), (EMPTY_BIO, None), + (BIOUES_BIO, expected_parsed_annot), ], ) def test_parse_bio(test_input, expected): -- GitLab