diff --git a/nerval/evaluate.py b/nerval/evaluate.py index 3621865b992d31afc2a0033d8a7ab217fd321134..100a6c0b6e02d7bf8dcc9a73780fb014c6bcc499 100644 --- a/nerval/evaluate.py +++ b/nerval/evaluate.py @@ -18,16 +18,16 @@ BEGINNING_POS = ["B", "S", "U"] def get_type_label(label: str) -> str: """Return the type (tag) of a label - Input format: "[BIELUS]-type" + Input format: "[BIESLU]-type" """ try: tag = ( NOT_ENTITY_TAG if label == NOT_ENTITY_TAG - else re.match(r"[BIELUS]-(.{3,4})", label)[1] + else re.match(r"[BIESLU]-(.{3,4})", label)[1] ) except TypeError: - raise (Exception(f"The label {label} is not valid in BIOES/BILOU format.")) + raise (Exception(f"The label {label} is not valid in BIOES/BIOLU format.")) return tag @@ -35,16 +35,16 @@ def get_type_label(label: str) -> str: def get_position_label(label: str) -> str: """Return the position of a label - Input format: "[BIELUS]-type" + Input format: "[BIESLU]-type" """ try: pos = ( NOT_ENTITY_TAG if label == NOT_ENTITY_TAG - else re.match(r"([BIELUS])-.{3,4}", label)[1] + else re.match(r"([BIESLU])-.{3,4}", label)[1] ) except TypeError: - raise (Exception(f"The label {label} is not valid in BIOES/BILOU format.")) + raise (Exception(f"The label {label} is not valid in BIOES/BIOLU format.")) return pos @@ -116,7 +116,8 @@ def parse_bio(path: str) -> dict: and get_type_label(future_label) != last_tag ): index += 1 - future_label = lines[index].split()[1] + if index < len(lines): + future_label = lines[index].split()[1] # Check for continuation of the original entity if ( diff --git a/tests/bioues.bio b/tests/bioeslu.bio similarity index 100% rename from tests/bioues.bio rename to tests/bioeslu.bio diff --git a/tests/end_of_file.bio b/tests/end_of_file.bio new file mode 100644 index 0000000000000000000000000000000000000000..56f8e52b33f4ac07ceaa78c8e7c0ff34aa367d5d --- /dev/null +++ b/tests/end_of_file.bio @@ -0,0 +1,12 @@ +Louis B-PER +par I-PER +la I-PER +grâce I-PER +de I-PER +Dieu I-PER +roy I-PER +de I-PER +France B-LOC +et I-PER +de I-PER +Navarre B-LOC diff --git a/tests/test_compute_matches.py b/tests/test_compute_matches.py index d2f0a0b9478af6273f29342aff6ef6ec5db6b515..f472e84d790fc9053481063daddadf9dfc5c8d93 100644 --- a/tests/test_compute_matches.py +++ b/tests/test_compute_matches.py @@ -33,7 +33,7 @@ fake_tags_aligned_nested_perfect = [ 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', - 'I-PER', + 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O' @@ -62,7 +62,7 @@ fake_tags_aligned_nested_false = [ 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', - 'I-PER', + 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O' diff --git a/tests/test_parse_bio.py b/tests/test_parse_bio.py index 7084c5e2a7d4ed6aa62f7fd793655a350a0f9ea6..23df00beadf0d158bb782c95f0a64c1978ced10d 100644 --- a/tests/test_parse_bio.py +++ b/tests/test_parse_bio.py @@ -8,7 +8,8 @@ EMPTY_BIO = "tests/test_empty.bio" BAD_BIO = "tests/test_bad.bio" FAKE_ANNOT_BIO = "tests/test_annot.bio" FAKE_PREDICT_BIO = "tests/test_predict.bio" -BIOUES_BIO = "tests/bioues.bio" +BIOESLU_BIO = "tests/bioeslu.bio" +END_OF_FILE_BIO = "tests/end_of_file.bio" expected_parsed_annot = { @@ -113,6 +114,67 @@ expected_parsed_predict = { "words": "G*rard de *N*erval bo*rn in Paris in 1833 *.", } +expected_parsed_end_of_file = { + "entity_count": {"All": 3, "LOC": 2, "PER": 1}, + "labels": [ + "B-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "B-LOC", + "I-LOC", + "I-LOC", + "I-LOC", + "I-LOC", + "I-LOC", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "O", + "B-LOC", + "I-LOC", + "I-LOC", + "I-LOC", + "I-LOC", + "I-LOC", + "I-LOC", + ], + "words": "Louis par la grâce de Dieu roy de France et de Navarre", +} + @pytest.mark.parametrize( "test_input, expected", @@ -120,7 +182,8 @@ expected_parsed_predict = { (FAKE_ANNOT_BIO, expected_parsed_annot), (FAKE_PREDICT_BIO, expected_parsed_predict), (EMPTY_BIO, None), - (BIOUES_BIO, expected_parsed_annot), + (BIOESLU_BIO, expected_parsed_annot), + (END_OF_FILE_BIO, expected_parsed_end_of_file), ], ) def test_parse_bio(test_input, expected):