diff --git a/README.md b/README.md index 637e0c80cf00311e2353b0ba4c390e4f7b9fbd08..2a3848e0020239c8cdd0f1e81fcc90fd91b7c5b8 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ Nerval is an evaluation library written in python implementing a metric for named-entity recognition evaluation on noisy text, typically to measure NER performances on OCR or Handwritten text recognition predictions. Expected inputs are a ground truth and a prediction BIOES/BILOU files without any ''§'' occurrences, this character having a special meaning during evaluation. +It also works by designating a CSV file with file matches (one pair per row, with the annotation file in the first column and the prediction file in the second column). ## Usage @@ -31,7 +32,7 @@ You can now use Nerval in command line : ``` $ nerval -a/--annot <annot_file.bio> -p/--predict <predict-file.bio> \ - [-t/--threshold <threshold_value>] + -m/--multiple <1|2> [-t/--threshold <threshold_value>] [-c/--csv <correspondence_file.csv>] [-f/--folder <bio_folder>] ``` The threshold value should be between 0 and 1. It designates the acceptable number of characters differing between an annotated and a predicted entity - over the number of characters in the annotated entity - to consider it as a match. Default value is 0.30. 0 would impose perfect matches, 1 would allow completely different strings to be considered as a match. @@ -58,6 +59,12 @@ We also provide two annotation and prediction toy files, which are identical for $ nerval -a demo/toy_test_annot.bio -p demo/toy_test_predict.bio ``` +You can also indicate a folder and a CSV file to run multiple evaluations at once. + +``` +$ nerval -m 2 -c demo/mapping_file.csv -f demo/bio_folder +``` + ## Metric This metric uses string alignment at character level. 
diff --git a/demo/bio_folder/demo_annot.bio b/demo/bio_folder/demo_annot.bio new file mode 100644 index 0000000000000000000000000000000000000000..cf16200fcd0436d943f71127f63894e428fafea7 --- /dev/null +++ b/demo/bio_folder/demo_annot.bio @@ -0,0 +1,82 @@ +Césaire B-PER +Alphonse I-PER +Garon I-PER +marraine O +Adeline B-PER +Dionne I-PER +, O +soussignés O +Lecture O +faite O +Adéline O +Dionne O +Arsène O +Côté O +Arpin O +R O +Le O +onze B-DAT +aout I-DAT +mil I-DAT +neuf I-DAT +cent I-DAT +un I-DAT +nous O +prêtre O +soussigné O +avons O +baptisé O +Marie B-PER +Luce I-PER +Louise I-PER +, O +née O +la B-DAT +veille I-DAT +, O +fille O +légitime O +de O +Carmel B-PER +Côté I-PER +, O +cordonnier B-OCC +, O +pré O +- O +sent O +, O +déclarant O +ne O +savoir O +signer O +, O +et O +de O +Eugé B-PER +nie I-PER +Fréchette I-PER +, O +de O +cette B-LOC +paroisse I-LOC +. O +Parrain O +Napoléon B-PER +Fréchette I-PER +, O +marraine O +Adeline B-PER +Tremblay I-PER +, O +soussignés O +, O +de O +Ste B-LOC +Luce I-LOC +, O +Lec O +- O +ture O +faite O +. O diff --git a/demo/bio_folder/demo_predict.bio b/demo/bio_folder/demo_predict.bio new file mode 100644 index 0000000000000000000000000000000000000000..7e01c2d127aa47c95c49ab87a06e06d86f9af9b0 --- /dev/null +++ b/demo/bio_folder/demo_predict.bio @@ -0,0 +1,81 @@ +Césaire B-PER +Alphonse O +Garon B-PER +marraine O +Adeline B-PER +Dionne I-PER +, O +soussignés O +Lecture O +faite O +Adéline O +Dionne O +Arsène O +Côté O +Arpin O +R O +Le O +onze B-DAT +aout I-DAT +mil I-DAT +neuf I-DAT +cent I-DAT +un O +nous O +pretre O +soussigné O +avons O +baptisé O +Marie B-PER +Luce I-PER +Louise I-PER +, O +née O +la B-DAT +veille I-DAT +, O +fille O +légitime O +de O +Carmel B-PER +Côté I-PER +, O +cordonnier B-OCC +, O +pré O +- O +sent O +, O +déclarant O +ne O +savoir O +signer O +, O +et O +de O +Eugé B-PER +nie I-PER +Fréchette I-PER +, O +de O +cette B-LOC +paroisse I-LOC +. 
O +Parrain O +Napoléon B-PER +Fréchette I-PER +, O +marraine O +Adéline B-PER +Tremblay I-PER +, O +sousignés O +, O +de O +St B-LOC +. I-LOC +Luce O +, O +Lec O +ture O +faite O diff --git a/demo/bio_folder/toy_test_annot.bio b/demo/bio_folder/toy_test_annot.bio new file mode 100644 index 0000000000000000000000000000000000000000..5a941ee8ce361e67dbd213b2b2cd5d6a6a53e4ef --- /dev/null +++ b/demo/bio_folder/toy_test_annot.bio @@ -0,0 +1,39 @@ +John B-PER +Ronald I-PER +Reuel I-PER +Tolkien I-PER +was O +born O +on O +three B-DAT +January I-DAT +eighteen I-DAT +ninety I-DAT +- I-DAT +two I-DAT +in O +Bloemfontein B-LOC +in O +the O +Orange B-LOC +Free I-LOC +State I-LOC +, O +to O +Arthur B-PER +Reuel I-PER +Tolkien I-PER +, O +an O +English O +bank B-OCC +manager I-OCC +, O +and O +his O +wife O +Mabel B-PER +, O +née O +Suffield B-PER +. O diff --git a/demo/bio_folder/toy_test_predict.bio b/demo/bio_folder/toy_test_predict.bio new file mode 100644 index 0000000000000000000000000000000000000000..5a941ee8ce361e67dbd213b2b2cd5d6a6a53e4ef --- /dev/null +++ b/demo/bio_folder/toy_test_predict.bio @@ -0,0 +1,39 @@ +John B-PER +Ronald I-PER +Reuel I-PER +Tolkien I-PER +was O +born O +on O +three B-DAT +January I-DAT +eighteen I-DAT +ninety I-DAT +- I-DAT +two I-DAT +in O +Bloemfontein B-LOC +in O +the O +Orange B-LOC +Free I-LOC +State I-LOC +, O +to O +Arthur B-PER +Reuel I-PER +Tolkien I-PER +, O +an O +English O +bank B-OCC +manager I-OCC +, O +and O +his O +wife O +Mabel B-PER +, O +née O +Suffield B-PER +. 
def run_multiple(file_csv, folder, threshold) -> None:
    """Run the evaluation on every annotation/prediction pair listed in a CSV file.

    Each non-empty row of the CSV names an annotation file (first column) and
    a prediction file (second column). Both names are looked up by basename
    among the ``.bio`` files found recursively under ``folder``. For every
    pair, the prediction file name is printed and ``run`` is called on the
    two resolved paths.

    :param file_csv: Path to the CSV file listing the file pairs.
    :param folder: Folder searched recursively for the ``.bio`` files.
    :param threshold: Distance threshold forwarded unchanged to ``run``.
    :raises Exception: If ``folder`` is not a directory, if a row holds fewer
        than two file names, or if a listed file is not found under ``folder``.
    """
    # newline="" lets the csv module deal with line endings itself
    with open(file_csv, "r", newline="") as read_obj:
        # Blank lines are returned as empty rows; they describe no pair
        list_cor = [row for row in reader(read_obj) if row]

    if not os.path.isdir(folder):
        raise Exception("the path indicated does not lead to a folder.")

    # Escape the folder so that characters such as "[" are not read as glob syntax
    pattern = os.path.join(glob.escape(str(folder)), "**", "*.bio")
    # Index the files once by basename instead of rescanning the list for each
    # row; when a basename appears several times, the last match is kept
    bio_by_name = {
        os.path.basename(path): path for path in glob.glob(pattern, recursive=True)
    }

    for row in list_cor:
        if len(row) < 2:
            raise Exception(
                f"Expected an annotation and a prediction file name, got: {row}"
            )
        annot_name, predict_name = (cell.strip() for cell in row[:2])
        annot = bio_by_name.get(annot_name)
        predict = bio_by_name.get(predict_name)

        missing = [
            name
            for name, path in ((annot_name, annot), (predict_name, predict))
            if path is None
        ]
        if missing:
            # Raise a real exception: raising a plain string is itself a
            # TypeError in Python 3 and would hide the names of the missing files
            raise Exception(f"No file found for files {', '.join(missing)}")

        print(os.path.basename(predict))
        run(annot, predict, threshold)
        print()
a/tests/test_folder/test_demo_annot.bio b/tests/test_folder/test_demo_annot.bio new file mode 100644 index 0000000000000000000000000000000000000000..cf16200fcd0436d943f71127f63894e428fafea7 --- /dev/null +++ b/tests/test_folder/test_demo_annot.bio @@ -0,0 +1,82 @@ +Césaire B-PER +Alphonse I-PER +Garon I-PER +marraine O +Adeline B-PER +Dionne I-PER +, O +soussignés O +Lecture O +faite O +Adéline O +Dionne O +Arsène O +Côté O +Arpin O +R O +Le O +onze B-DAT +aout I-DAT +mil I-DAT +neuf I-DAT +cent I-DAT +un I-DAT +nous O +prêtre O +soussigné O +avons O +baptisé O +Marie B-PER +Luce I-PER +Louise I-PER +, O +née O +la B-DAT +veille I-DAT +, O +fille O +légitime O +de O +Carmel B-PER +Côté I-PER +, O +cordonnier B-OCC +, O +pré O +- O +sent O +, O +déclarant O +ne O +savoir O +signer O +, O +et O +de O +Eugé B-PER +nie I-PER +Fréchette I-PER +, O +de O +cette B-LOC +paroisse I-LOC +. O +Parrain O +Napoléon B-PER +Fréchette I-PER +, O +marraine O +Adeline B-PER +Tremblay I-PER +, O +soussignés O +, O +de O +Ste B-LOC +Luce I-LOC +, O +Lec O +- O +ture O +faite O +. O diff --git a/tests/test_folder/test_demo_predict.bio b/tests/test_folder/test_demo_predict.bio new file mode 100644 index 0000000000000000000000000000000000000000..7e01c2d127aa47c95c49ab87a06e06d86f9af9b0 --- /dev/null +++ b/tests/test_folder/test_demo_predict.bio @@ -0,0 +1,81 @@ +Césaire B-PER +Alphonse O +Garon B-PER +marraine O +Adeline B-PER +Dionne I-PER +, O +soussignés O +Lecture O +faite O +Adéline O +Dionne O +Arsène O +Côté O +Arpin O +R O +Le O +onze B-DAT +aout I-DAT +mil I-DAT +neuf I-DAT +cent I-DAT +un O +nous O +pretre O +soussigné O +avons O +baptisé O +Marie B-PER +Luce I-PER +Louise I-PER +, O +née O +la B-DAT +veille I-DAT +, O +fille O +légitime O +de O +Carmel B-PER +Côté I-PER +, O +cordonnier B-OCC +, O +pré O +- O +sent O +, O +déclarant O +ne O +savoir O +signer O +, O +et O +de O +Eugé B-PER +nie I-PER +Fréchette I-PER +, O +de O +cette B-LOC +paroisse I-LOC +. 
O +Parrain O +Napoléon B-PER +Fréchette I-PER +, O +marraine O +Adéline B-PER +Tremblay I-PER +, O +sousignés O +, O +de O +St B-LOC +. I-LOC +Luce O +, O +Lec O +ture O +faite O diff --git a/tests/test_folder/test_toy_annot.bio b/tests/test_folder/test_toy_annot.bio new file mode 100644 index 0000000000000000000000000000000000000000..5a941ee8ce361e67dbd213b2b2cd5d6a6a53e4ef --- /dev/null +++ b/tests/test_folder/test_toy_annot.bio @@ -0,0 +1,39 @@ +John B-PER +Ronald I-PER +Reuel I-PER +Tolkien I-PER +was O +born O +on O +three B-DAT +January I-DAT +eighteen I-DAT +ninety I-DAT +- I-DAT +two I-DAT +in O +Bloemfontein B-LOC +in O +the O +Orange B-LOC +Free I-LOC +State I-LOC +, O +to O +Arthur B-PER +Reuel I-PER +Tolkien I-PER +, O +an O +English O +bank B-OCC +manager I-OCC +, O +and O +his O +wife O +Mabel B-PER +, O +née O +Suffield B-PER +. O diff --git a/tests/test_folder/test_toy_predict.bio b/tests/test_folder/test_toy_predict.bio new file mode 100644 index 0000000000000000000000000000000000000000..5a941ee8ce361e67dbd213b2b2cd5d6a6a53e4ef --- /dev/null +++ b/tests/test_folder/test_toy_predict.bio @@ -0,0 +1,39 @@ +John B-PER +Ronald I-PER +Reuel I-PER +Tolkien I-PER +was O +born O +on O +three B-DAT +January I-DAT +eighteen I-DAT +ninety I-DAT +- I-DAT +two I-DAT +in O +Bloemfontein B-LOC +in O +the O +Orange B-LOC +Free I-LOC +State I-LOC +, O +to O +Arthur B-PER +Reuel I-PER +Tolkien I-PER +, O +an O +English O +bank B-OCC +manager I-OCC +, O +and O +his O +wife O +Mabel B-PER +, O +née O +Suffield B-PER +. 
def test_run_multiple():
    """Check that ``run_multiple`` raises when the mapped files are not in the folder.

    The mapping fixture lists ``demo_annot.bio``/``demo_predict.bio`` and
    ``toy_test_annot.bio``/``toy_test_predict.bio``, while the fixture folder
    only holds ``test_``-prefixed files, so no pair can be resolved.
    """
    # The other fixtures in this module are addressed relative to the
    # repository root ("tests/..."). Without that prefix the call fails while
    # opening the CSV and the file-matching code is never exercised.
    csv_file = f"tests/{CSV_FILE}"
    bio_folder = f"tests/{BIO_FOLDER}"

    with pytest.raises(Exception):
        evaluate.run_multiple(csv_file, bio_folder, THRESHOLD)