Skip to content
Snippets Groups Projects
Commit e2b6bf95 authored by kermorvant
Browse files

Merge branch 'fix_indexerror_in_parsing' into 'master'

Fix indexerror in parsing

See merge request teklia/nerval!9
parents fd3db043 bb3fcc72
No related branches found
No related tags found
1 merge request: !9 Fix indexerror in parsing
Pipeline #103810 passed
......@@ -18,16 +18,16 @@ BEGINNING_POS = ["B", "S", "U"]
def get_type_label(label: str) -> str:
"""Return the type (tag) of a label
Input format: "[BIELUS]-type"
Input format: "[BIESLU]-type"
"""
try:
tag = (
NOT_ENTITY_TAG
if label == NOT_ENTITY_TAG
else re.match(r"[BIELUS]-(.{3,4})", label)[1]
else re.match(r"[BIESLU]-(.{3,4})", label)[1]
)
except TypeError:
raise (Exception(f"The label {label} is not valid in BIOES/BILOU format."))
raise (Exception(f"The label {label} is not valid in BIOES/BIOLU format."))
return tag
......@@ -35,16 +35,16 @@ def get_type_label(label: str) -> str:
def get_position_label(label: str) -> str:
"""Return the position of a label
Input format: "[BIELUS]-type"
Input format: "[BIESLU]-type"
"""
try:
pos = (
NOT_ENTITY_TAG
if label == NOT_ENTITY_TAG
else re.match(r"([BIELUS])-.{3,4}", label)[1]
else re.match(r"([BIESLU])-.{3,4}", label)[1]
)
except TypeError:
raise (Exception(f"The label {label} is not valid in BIOES/BILOU format."))
raise (Exception(f"The label {label} is not valid in BIOES/BIOLU format."))
return pos
......@@ -116,7 +116,8 @@ def parse_bio(path: str) -> dict:
and get_type_label(future_label) != last_tag
):
index += 1
future_label = lines[index].split()[1]
if index < len(lines):
future_label = lines[index].split()[1]
# Check for continuation of the original entity
if (
......
File moved
Louis B-PER
par I-PER
la I-PER
grâce I-PER
de I-PER
Dieu I-PER
roy I-PER
de I-PER
France B-LOC
et I-PER
de I-PER
Navarre B-LOC
......@@ -33,7 +33,7 @@ fake_tags_aligned_nested_perfect = [
'I-PER', 'I-PER',
'I-PER',
'I-PER', 'I-PER',
'I-PER',
'O',
'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC',
'O',
'O'
......@@ -62,7 +62,7 @@ fake_tags_aligned_nested_false = [
'I-PER', 'I-PER',
'I-PER',
'I-PER', 'I-PER',
'I-PER',
'O',
'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC',
'O',
'O'
......
......@@ -8,7 +8,8 @@ EMPTY_BIO = "tests/test_empty.bio"
BAD_BIO = "tests/test_bad.bio"
FAKE_ANNOT_BIO = "tests/test_annot.bio"
FAKE_PREDICT_BIO = "tests/test_predict.bio"
BIOUES_BIO = "tests/bioues.bio"
BIOESLU_BIO = "tests/bioeslu.bio"
END_OF_FILE_BIO = "tests/end_of_file.bio"
expected_parsed_annot = {
......@@ -113,6 +114,67 @@ expected_parsed_predict = {
"words": "G*rard de *N*erval bo*rn in Paris in 1833 *.",
}
expected_parsed_end_of_file = {
"entity_count": {"All": 3, "LOC": 2, "PER": 1},
"labels": [
"B-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"B-LOC",
"I-LOC",
"I-LOC",
"I-LOC",
"I-LOC",
"I-LOC",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"I-PER",
"O",
"B-LOC",
"I-LOC",
"I-LOC",
"I-LOC",
"I-LOC",
"I-LOC",
"I-LOC",
],
"words": "Louis par la grâce de Dieu roy de France et de Navarre",
}
@pytest.mark.parametrize(
"test_input, expected",
......@@ -120,7 +182,8 @@ expected_parsed_predict = {
(FAKE_ANNOT_BIO, expected_parsed_annot),
(FAKE_PREDICT_BIO, expected_parsed_predict),
(EMPTY_BIO, None),
(BIOUES_BIO, expected_parsed_annot),
(BIOESLU_BIO, expected_parsed_annot),
(END_OF_FILE_BIO, expected_parsed_end_of_file),
],
)
def test_parse_bio(test_input, expected):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment