From bb3fcc721cb17b6c7e8de46b8cd11a2db68b48fe Mon Sep 17 00:00:00 2001 From: Blanche Miret <bmiret@teklia.com> Date: Wed, 2 Jun 2021 11:22:26 +0000 Subject: [PATCH] Fix indexerror in parsing --- nerval/evaluate.py | 15 +++---- tests/{bioues.bio => bioeslu.bio} | 0 tests/end_of_file.bio | 12 ++++++ tests/test_compute_matches.py | 4 +- tests/test_parse_bio.py | 67 ++++++++++++++++++++++++++++++- 5 files changed, 87 insertions(+), 11 deletions(-) rename tests/{bioues.bio => bioeslu.bio} (100%) create mode 100644 tests/end_of_file.bio diff --git a/nerval/evaluate.py b/nerval/evaluate.py index 3621865..100a6c0 100644 --- a/nerval/evaluate.py +++ b/nerval/evaluate.py @@ -18,16 +18,16 @@ BEGINNING_POS = ["B", "S", "U"] def get_type_label(label: str) -> str: """Return the type (tag) of a label - Input format: "[BIELUS]-type" + Input format: "[BIESLU]-type" """ try: tag = ( NOT_ENTITY_TAG if label == NOT_ENTITY_TAG - else re.match(r"[BIELUS]-(.{3,4})", label)[1] + else re.match(r"[BIESLU]-(.{3,4})", label)[1] ) except TypeError: - raise (Exception(f"The label {label} is not valid in BIOES/BILOU format.")) + raise (Exception(f"The label {label} is not valid in BIOES/BIOLU format.")) return tag @@ -35,16 +35,16 @@ def get_type_label(label: str) -> str: def get_position_label(label: str) -> str: """Return the position of a label - Input format: "[BIELUS]-type" + Input format: "[BIESLU]-type" """ try: pos = ( NOT_ENTITY_TAG if label == NOT_ENTITY_TAG - else re.match(r"([BIELUS])-.{3,4}", label)[1] + else re.match(r"([BIESLU])-.{3,4}", label)[1] ) except TypeError: - raise (Exception(f"The label {label} is not valid in BIOES/BILOU format.")) + raise (Exception(f"The label {label} is not valid in BIOES/BIOLU format.")) return pos @@ -116,7 +116,8 @@ def parse_bio(path: str) -> dict: and get_type_label(future_label) != last_tag ): index += 1 - future_label = lines[index].split()[1] + if index < len(lines): + future_label = lines[index].split()[1] # Check for continuation of the original entity if ( diff --git a/tests/bioues.bio b/tests/bioeslu.bio similarity index 100% rename from tests/bioues.bio rename to tests/bioeslu.bio diff --git a/tests/end_of_file.bio b/tests/end_of_file.bio new file mode 100644 index 0000000..56f8e52 --- /dev/null +++ b/tests/end_of_file.bio @@ -0,0 +1,12 @@ +Louis B-PER +par I-PER +la I-PER +grâce I-PER +de I-PER +Dieu I-PER +roy I-PER +de I-PER +France B-LOC +et I-PER +de I-PER +Navarre B-LOC diff --git a/tests/test_compute_matches.py b/tests/test_compute_matches.py index d2f0a0b..f472e84 100644 --- a/tests/test_compute_matches.py +++ b/tests/test_compute_matches.py @@ -33,7 +33,7 @@ fake_tags_aligned_nested_perfect = [ 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', - 'I-PER', + 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O' @@ -62,7 +62,7 @@ fake_tags_aligned_nested_false = [ 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', - 'I-PER', + 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O' diff --git a/tests/test_parse_bio.py b/tests/test_parse_bio.py index 7084c5e..23df00b 100644 --- a/tests/test_parse_bio.py +++ b/tests/test_parse_bio.py @@ -8,7 +8,8 @@ EMPTY_BIO = "tests/test_empty.bio" BAD_BIO = "tests/test_bad.bio" FAKE_ANNOT_BIO = "tests/test_annot.bio" FAKE_PREDICT_BIO = "tests/test_predict.bio" -BIOUES_BIO = "tests/bioues.bio" +BIOESLU_BIO = "tests/bioeslu.bio" +END_OF_FILE_BIO = "tests/end_of_file.bio" expected_parsed_annot = { @@ -113,6 +114,67 @@ expected_parsed_predict = { "words": "G*rard de *N*erval bo*rn in Paris in 1833 *.", } +expected_parsed_end_of_file = { + "entity_count": {"All": 3, "LOC": 2, "PER": 1}, + "labels": [ + "B-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "B-LOC", + "I-LOC", + "I-LOC", + "I-LOC", + "I-LOC", + "I-LOC", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "O", + "B-LOC", + "I-LOC", + "I-LOC", + "I-LOC", + "I-LOC", + "I-LOC", + "I-LOC", + ], + "words": "Louis par la grâce de Dieu roy de France et de Navarre", +} + @pytest.mark.parametrize( "test_input, expected", @@ -120,7 +182,8 @@ expected_parsed_predict = { (FAKE_ANNOT_BIO, expected_parsed_annot), (FAKE_PREDICT_BIO, expected_parsed_predict), (EMPTY_BIO, None), - (BIOUES_BIO, expected_parsed_annot), + (BIOESLU_BIO, expected_parsed_annot), + (END_OF_FILE_BIO, expected_parsed_end_of_file), ], ) def test_parse_bio(test_input, expected): -- GitLab