From 1925e390e261a92bfa9a26c5b3c0eb4166e8b1c8 Mon Sep 17 00:00:00 2001 From: Charlotte Mauvezin <charlotte.mauvezin@irht.cnrs.fr> Date: Wed, 22 Dec 2021 11:02:01 +0100 Subject: [PATCH] Multiple input --- demo/annot/demo_annot.bio | 82 ++++++++++++++++++++++++++++++++++ demo/annot/toy_test_annot.bio | 39 ++++++++++++++++ demo/cor.csv | 3 ++ demo/pred/demo_predict.bio | 81 +++++++++++++++++++++++++++++++++ demo/pred/toy_test_predict.bio | 39 ++++++++++++++++ nerval/evaluate.py | 50 ++++++++++++++++++++- requirements.txt | 1 + 7 files changed, 294 insertions(+), 1 deletion(-) create mode 100644 demo/annot/demo_annot.bio create mode 100644 demo/annot/toy_test_annot.bio create mode 100644 demo/cor.csv create mode 100644 demo/pred/demo_predict.bio create mode 100644 demo/pred/toy_test_predict.bio diff --git a/demo/annot/demo_annot.bio b/demo/annot/demo_annot.bio new file mode 100644 index 0000000..cf16200 --- /dev/null +++ b/demo/annot/demo_annot.bio @@ -0,0 +1,82 @@ +Césaire B-PER +Alphonse I-PER +Garon I-PER +marraine O +Adeline B-PER +Dionne I-PER +, O +soussignés O +Lecture O +faite O +Adéline O +Dionne O +Arsène O +Côté O +Arpin O +R O +Le O +onze B-DAT +aout I-DAT +mil I-DAT +neuf I-DAT +cent I-DAT +un I-DAT +nous O +prêtre O +soussigné O +avons O +baptisé O +Marie B-PER +Luce I-PER +Louise I-PER +, O +née O +la B-DAT +veille I-DAT +, O +fille O +légitime O +de O +Carmel B-PER +Côté I-PER +, O +cordonnier B-OCC +, O +pré O +- O +sent O +, O +déclarant O +ne O +savoir O +signer O +, O +et O +de O +Eugé B-PER +nie I-PER +Fréchette I-PER +, O +de O +cette B-LOC +paroisse I-LOC +. O +Parrain O +Napoléon B-PER +Fréchette I-PER +, O +marraine O +Adeline B-PER +Tremblay I-PER +, O +soussignés O +, O +de O +Ste B-LOC +Luce I-LOC +, O +Lec O +- O +ture O +faite O +. 
O diff --git a/demo/annot/toy_test_annot.bio b/demo/annot/toy_test_annot.bio new file mode 100644 index 0000000..5a941ee --- /dev/null +++ b/demo/annot/toy_test_annot.bio @@ -0,0 +1,39 @@ +John B-PER +Ronald I-PER +Reuel I-PER +Tolkien I-PER +was O +born O +on O +three B-DAT +January I-DAT +eighteen I-DAT +ninety I-DAT +- I-DAT +two I-DAT +in O +Bloemfontein B-LOC +in O +the O +Orange B-LOC +Free I-LOC +State I-LOC +, O +to O +Arthur B-PER +Reuel I-PER +Tolkien I-PER +, O +an O +English O +bank B-OCC +manager I-OCC +, O +and O +his O +wife O +Mabel B-PER +, O +née O +Suffield B-PER +. O diff --git a/demo/cor.csv b/demo/cor.csv new file mode 100644 index 0000000..a0f41c6 --- /dev/null +++ b/demo/cor.csv @@ -0,0 +1,3 @@ +annot,predict +demo_annot.bio,demo_predict.bio +toy_test_annot.bio,toy_test_predict.bio \ No newline at end of file diff --git a/demo/pred/demo_predict.bio b/demo/pred/demo_predict.bio new file mode 100644 index 0000000..7e01c2d --- /dev/null +++ b/demo/pred/demo_predict.bio @@ -0,0 +1,81 @@ +Césaire B-PER +Alphonse O +Garon B-PER +marraine O +Adeline B-PER +Dionne I-PER +, O +soussignés O +Lecture O +faite O +Adéline O +Dionne O +Arsène O +Côté O +Arpin O +R O +Le O +onze B-DAT +aout I-DAT +mil I-DAT +neuf I-DAT +cent I-DAT +un O +nous O +pretre O +soussigné O +avons O +baptisé O +Marie B-PER +Luce I-PER +Louise I-PER +, O +née O +la B-DAT +veille I-DAT +, O +fille O +légitime O +de O +Carmel B-PER +Côté I-PER +, O +cordonnier B-OCC +, O +pré O +- O +sent O +, O +déclarant O +ne O +savoir O +signer O +, O +et O +de O +Eugé B-PER +nie I-PER +Fréchette I-PER +, O +de O +cette B-LOC +paroisse I-LOC +. O +Parrain O +Napoléon B-PER +Fréchette I-PER +, O +marraine O +Adéline B-PER +Tremblay I-PER +, O +sousignés O +, O +de O +St B-LOC +. 
I-LOC +Luce O +, O +Lec O +ture O +faite O diff --git a/demo/pred/toy_test_predict.bio b/demo/pred/toy_test_predict.bio new file mode 100644 index 0000000..5a941ee --- /dev/null +++ b/demo/pred/toy_test_predict.bio @@ -0,0 +1,39 @@ +John B-PER +Ronald I-PER +Reuel I-PER +Tolkien I-PER +was O +born O +on O +three B-DAT +January I-DAT +eighteen I-DAT +ninety I-DAT +- I-DAT +two I-DAT +in O +Bloemfontein B-LOC +in O +the O +Orange B-LOC +Free I-LOC +State I-LOC +, O +to O +Arthur B-PER +Reuel I-PER +Tolkien I-PER +, O +an O +English O +bank B-OCC +manager I-OCC +, O +and O +his O +wife O +Mabel B-PER +, O +née O +Suffield B-PER +. O diff --git a/nerval/evaluate.py b/nerval/evaluate.py index 6e2fef9..de56dd6 100644 --- a/nerval/evaluate.py +++ b/nerval/evaluate.py @@ -9,6 +9,11 @@ import editdistance import edlib import termtables as tt +import glob +from pathlib import Path +import pandas as pd + + NOT_ENTITY_TAG = "O" THRESHOLD = 0.30 @@ -533,6 +538,39 @@ def run(annotation: str, prediction: str, threshold: int) -> dict: return scores
+def _index_bio_files(folder):
+    """Map the name of each BIO file found recursively in folder to its path."""
+    # os.path.join also accepts a pathlib.Path; on duplicate names the last wins
+    paths = glob.glob(os.path.join(folder, "**", "*.bio"), recursive=True)
+    return {os.path.basename(path): path for path in paths}
+
+
+def run_multiple(file_csv, annot_folder, predict_folder, threshold):
+    """Run the evaluation on every file pair listed in a csv file.
+
+    Each csv row names the BIO files to compare in its "annot" and "predict"
+    columns; a row whose files are not both found is reported and skipped.
+    """
+    df_cor = pd.read_csv(file_csv)
+
+    if not (os.path.isdir(annot_folder) and os.path.isdir(predict_folder)):
+        print("Error this is no folder")
+        return
+
+    # Index both folders once instead of rescanning them for every row
+    annot_files = _index_bio_files(annot_folder)
+    predict_files = _index_bio_files(predict_folder)
+
+    for index, row in df_cor.iterrows():
+        annot = annot_files.get(row["annot"])
+        predict = predict_files.get(row["predict"])
+        if annot and predict:
+            run(annot, predict, threshold)
+            print("")
+        else:
+            print(f"No file found for row {index}")
+
+
 def threshold_float_type(arg): """Type function for argparse.""" try: @@ -564,9 +602,19 @@ def main(): default=THRESHOLD, type=threshold_float_type, ) + parser.add_argument( + "-c", + "--csv", + help="csv with the correlation between the annotation bio file and the predict bio file", + required=False, + type=Path, + ) args = parser.parse_args() - run(args.annot, args.predict, args.threshold) + if args.csv: + run_multiple(args.csv, args.annot, args.predict, args.threshold) + else: + run(args.annot, args.predict, args.threshold) if __name__ == "__main__": diff --git a/requirements.txt b/requirements.txt index d6af2d0..70abc00 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ editdistance==0.5.3 edlib==1.3.8.post2 termtables==0.2.3 +pandas==1.3.4 \ No newline at end of file -- GitLab