Skip to content
Snippets Groups Projects
Commit bb3fcc72 authored by Blanche Miret, committed by kermorvant
Browse files

Fix indexerror in parsing

parent fd3db043
No related branches found
No related tags found
1 merge request: !9 "Fix indexerror in parsing"
......@@ -18,16 +18,16 @@ BEGINNING_POS = ["B", "S", "U"]
def get_type_label(label: str) -> str:
    """Return the type (tag) of a label.

    Input format: "[BIESLU]-type"

    A label equal to NOT_ENTITY_TAG is returned as is. For any other label,
    the returned value is the group captured by ``.{3,4}`` right after the
    "<position letter>-" prefix, i.e. the 3 to 4 characters following it.

    Raises:
        Exception: if the label does not match the expected format.
    """
    if label == NOT_ENTITY_TAG:
        return NOT_ENTITY_TAG

    try:
        # re.match returns None when the label does not match; subscripting
        # None raises the TypeError handled below.
        return re.match(r"[BIESLU]-(.{3,4})", label)[1]
    except TypeError as err:
        raise Exception(
            f"The label {label} is not valid in BIOES/BIOLU format."
        ) from err
......@@ -35,16 +35,16 @@ def get_type_label(label: str) -> str:
def get_position_label(label: str) -> str:
    """Return the position of a label.

    Input format: "[BIESLU]-type"

    A label equal to NOT_ENTITY_TAG is returned as is. For any other label,
    the returned value is the single leading position letter (one of
    B, I, E, S, L, U) that precedes the "-type" suffix.

    Raises:
        Exception: if the label does not match the expected format.
    """
    if label == NOT_ENTITY_TAG:
        return NOT_ENTITY_TAG

    try:
        # re.match returns None when the label does not match; subscripting
        # None raises the TypeError handled below.
        return re.match(r"([BIESLU])-.{3,4}", label)[1]
    except TypeError as err:
        raise Exception(
            f"The label {label} is not valid in BIOES/BIOLU format."
        ) from err
......@@ -116,7 +116,8 @@ def parse_bio(path: str) -> dict:
and get_type_label(future_label) != last_tag
):
index += 1
future_label = lines[index].split()[1]
if index < len(lines):
future_label = lines[index].split()[1]
# Check for continuation of the original entity
if (
......
File moved
Louis B-PER
par I-PER
la I-PER
grâce I-PER
de I-PER
Dieu I-PER
roy I-PER
de I-PER
France B-LOC
et I-PER
de I-PER
Navarre B-LOC
......@@ -33,7 +33,7 @@ fake_tags_aligned_nested_perfect = [
'I-PER', 'I-PER',
'I-PER',
'I-PER', 'I-PER',
'I-PER',
'O',
'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC',
'O',
'O'
......@@ -62,7 +62,7 @@ fake_tags_aligned_nested_false = [
'I-PER', 'I-PER',
'I-PER',
'I-PER', 'I-PER',
'I-PER',
'O',
'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC',
'O',
'O'
......
......@@ -8,7 +8,8 @@ EMPTY_BIO = "tests/test_empty.bio"
BAD_BIO = "tests/test_bad.bio"
FAKE_ANNOT_BIO = "tests/test_annot.bio"
FAKE_PREDICT_BIO = "tests/test_predict.bio"
BIOUES_BIO = "tests/bioues.bio"
BIOESLU_BIO = "tests/bioeslu.bio"
END_OF_FILE_BIO = "tests/end_of_file.bio"
expected_parsed_annot = {
......@@ -113,6 +114,67 @@ expected_parsed_predict = {
"words": "G*rard de *N*erval bo*rn in Paris in 1833 *.",
}
# Expected value paired with END_OF_FILE_BIO in the test_parse_bio parameters.
# The label list has 54 entries, the same as the number of characters in
# "words"; it is written as consecutive runs instead of one literal per entry.
expected_parsed_end_of_file = {
    "entity_count": {"All": 3, "LOC": 2, "PER": 1},
    "labels": (
        ["B-PER"] + ["I-PER"] * 33  # 34 labels
        + ["B-LOC"] + ["I-LOC"] * 5  # 6 labels
        + ["I-PER"] * 6
        + ["O"]
        + ["B-LOC"] + ["I-LOC"] * 6  # 7 labels
    ),
    "words": "Louis par la grâce de Dieu roy de France et de Navarre",
}
@pytest.mark.parametrize(
"test_input, expected",
......@@ -120,7 +182,8 @@ expected_parsed_predict = {
(FAKE_ANNOT_BIO, expected_parsed_annot),
(FAKE_PREDICT_BIO, expected_parsed_predict),
(EMPTY_BIO, None),
(BIOUES_BIO, expected_parsed_annot),
(BIOESLU_BIO, expected_parsed_annot),
(END_OF_FILE_BIO, expected_parsed_end_of_file),
],
)
def test_parse_bio(test_input, expected):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment