From 61aec3df1b6963df77ec17c426981744e95a782c Mon Sep 17 00:00:00 2001 From: Charlotte Mauvezin <charlotte.mauvezin@irht.cnrs.fr> Date: Tue, 11 Jan 2022 09:47:16 +0000 Subject: [PATCH] Better parser --- README.md | 8 ++- demo/bio_folder/demo_annot.bio | 82 --------------------------- demo/bio_folder/demo_predict.bio | 81 --------------------------- demo/bio_folder/toy_test_annot.bio | 39 ------------- demo/bio_folder/toy_test_predict.bio | 39 ------------- nerval/evaluate.py | 83 ++++++++++++++-------------- 6 files changed, 48 insertions(+), 284 deletions(-) delete mode 100644 demo/bio_folder/demo_annot.bio delete mode 100644 demo/bio_folder/demo_predict.bio delete mode 100644 demo/bio_folder/toy_test_annot.bio delete mode 100644 demo/bio_folder/toy_test_predict.bio diff --git a/README.md b/README.md index 2a3848e..4a4f41c 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ Counting the spaces, 7 characters differ over 24 characters in the reference ent ### Demo ``` -$ nerval -a demo/demo_annot.bio -p demo/demo_predict.bio +$ nerval -a demo/bio_folder/demo_annot.bio -p demo/bio_folder/demo_predict.bio ``` We also provide two annotation and prediction toy files, which are identical for now and produce perfect scores. Feel free to play with the the text and entity tags in the prediction file to see the impact on the score. @@ -65,6 +65,12 @@ You can also indicate a folder and a csv file to have multiple evaluation at onc $ nerval -c demo/mapping_file.csv -f demo/bio_folder ``` +The verbose option is triggered with the `-v` flag: + +``` +$ nerval -a demo/bio_folder/demo_annot.bio -p demo/bio_folder/demo_predict.bio -v +``` + ## Metric This metric uses string alignment at character level. 
diff --git a/demo/bio_folder/demo_annot.bio b/demo/bio_folder/demo_annot.bio deleted file mode 100644 index cf16200..0000000 --- a/demo/bio_folder/demo_annot.bio +++ /dev/null @@ -1,82 +0,0 @@ -Césaire B-PER -Alphonse I-PER -Garon I-PER -marraine O -Adeline B-PER -Dionne I-PER -, O -soussignés O -Lecture O -faite O -Adéline O -Dionne O -Arsène O -Côté O -Arpin O -R O -Le O -onze B-DAT -aout I-DAT -mil I-DAT -neuf I-DAT -cent I-DAT -un I-DAT -nous O -prêtre O -soussigné O -avons O -baptisé O -Marie B-PER -Luce I-PER -Louise I-PER -, O -née O -la B-DAT -veille I-DAT -, O -fille O -légitime O -de O -Carmel B-PER -Côté I-PER -, O -cordonnier B-OCC -, O -pré O -- O -sent O -, O -déclarant O -ne O -savoir O -signer O -, O -et O -de O -Eugé B-PER -nie I-PER -Fréchette I-PER -, O -de O -cette B-LOC -paroisse I-LOC -. O -Parrain O -Napoléon B-PER -Fréchette I-PER -, O -marraine O -Adeline B-PER -Tremblay I-PER -, O -soussignés O -, O -de O -Ste B-LOC -Luce I-LOC -, O -Lec O -- O -ture O -faite O -. O diff --git a/demo/bio_folder/demo_predict.bio b/demo/bio_folder/demo_predict.bio deleted file mode 100644 index 7e01c2d..0000000 --- a/demo/bio_folder/demo_predict.bio +++ /dev/null @@ -1,81 +0,0 @@ -Césaire B-PER -Alphonse O -Garon B-PER -marraine O -Adeline B-PER -Dionne I-PER -, O -soussignés O -Lecture O -faite O -Adéline O -Dionne O -Arsène O -Côté O -Arpin O -R O -Le O -onze B-DAT -aout I-DAT -mil I-DAT -neuf I-DAT -cent I-DAT -un O -nous O -pretre O -soussigné O -avons O -baptisé O -Marie B-PER -Luce I-PER -Louise I-PER -, O -née O -la B-DAT -veille I-DAT -, O -fille O -légitime O -de O -Carmel B-PER -Côté I-PER -, O -cordonnier B-OCC -, O -pré O -- O -sent O -, O -déclarant O -ne O -savoir O -signer O -, O -et O -de O -Eugé B-PER -nie I-PER -Fréchette I-PER -, O -de O -cette B-LOC -paroisse I-LOC -. O -Parrain O -Napoléon B-PER -Fréchette I-PER -, O -marraine O -Adéline B-PER -Tremblay I-PER -, O -sousignés O -, O -de O -St B-LOC -. 
I-LOC -Luce O -, O -Lec O -ture O -faite O diff --git a/demo/bio_folder/toy_test_annot.bio b/demo/bio_folder/toy_test_annot.bio deleted file mode 100644 index 5a941ee..0000000 --- a/demo/bio_folder/toy_test_annot.bio +++ /dev/null @@ -1,39 +0,0 @@ -John B-PER -Ronald I-PER -Reuel I-PER -Tolkien I-PER -was O -born O -on O -three B-DAT -January I-DAT -eighteen I-DAT -ninety I-DAT -- I-DAT -two I-DAT -in O -Bloemfontein B-LOC -in O -the O -Orange B-LOC -Free I-LOC -State I-LOC -, O -to O -Arthur B-PER -Reuel I-PER -Tolkien I-PER -, O -an O -English O -bank B-OCC -manager I-OCC -, O -and O -his O -wife O -Mabel B-PER -, O -née O -Suffield B-PER -. O diff --git a/demo/bio_folder/toy_test_predict.bio b/demo/bio_folder/toy_test_predict.bio deleted file mode 100644 index 5a941ee..0000000 --- a/demo/bio_folder/toy_test_predict.bio +++ /dev/null @@ -1,39 +0,0 @@ -John B-PER -Ronald I-PER -Reuel I-PER -Tolkien I-PER -was O -born O -on O -three B-DAT -January I-DAT -eighteen I-DAT -ninety I-DAT -- I-DAT -two I-DAT -in O -Bloemfontein B-LOC -in O -the O -Orange B-LOC -Free I-LOC -State I-LOC -, O -to O -Arthur B-PER -Reuel I-PER -Tolkien I-PER -, O -an O -English O -bank B-OCC -manager I-OCC -, O -and O -his O -wife O -Mabel B-PER -, O -née O -Suffield B-PER -. 
O diff --git a/nerval/evaluate.py b/nerval/evaluate.py index 2f0070d..80335e9 100644 --- a/nerval/evaluate.py +++ b/nerval/evaluate.py @@ -595,11 +595,17 @@ def run_multiple(file_csv, folder, threshold, verbose): else: raise Exception(f"No file found for files {annot}, {predict}") if count: - print( - "Average scores in all corpus (mean of final files scores)\n" - f" * Precision: {round(precision/count, 3)}\n" - f" * Recall: {round(recall/count, 3)}\n" - f" * F1: {round(f1/count, 3)}\n" + print("Average score on all corpus") + tt.print( + [ + [ + round(precision / count, 3), + round(recall / count, 3), + round(f1 / count, 3), + ] + ], + ["Precision", "Recall", "F1"], + style=tt.styles.markdown, ) else: raise Exception("No file were counted") @@ -624,36 +630,24 @@ def main(): logging.basicConfig(level=logging.INFO) parser = argparse.ArgumentParser(description="Compute score of NER on predict.") - parser.add_argument( - "-m", - "--multiple", - help="Single if 1, multiple 2", - type=int, - required=True, - ) - parser.add_argument( + + group = parser.add_mutually_exclusive_group() + group.add_argument( "-a", "--annot", help="Annotation in BIO format.", ) - parser.add_argument( - "-p", - "--predict", - help="Prediction in BIO format.", - ) - parser.add_argument( - "-t", - "--threshold", - help="Set a distance threshold for the match between gold and predicted entity.", - default=THRESHOLD, - type=threshold_float_type, - ) - parser.add_argument( + group.add_argument( "-c", "--csv", help="Csv with the correlation between the annotation bio files and the predict bio files", type=Path, ) + parser.add_argument( + "-p", + "--predict", + help="Prediction in BIO format.", + ) parser.add_argument( "-f", "--folder", @@ -666,25 +660,30 @@ def main(): help="Print only the recap if False", action="store_false", ) + parser.add_argument( + "-t", + "--threshold", + help="Set a distance threshold for the match between gold and predicted entity.", + default=THRESHOLD, + 
type=threshold_float_type, + ) + args = parser.parse_args() - if args.multiple == 1 or args.multiple == 2: - if args.multiple == 2: - if not args.folder: - raise argparse.ArgumentError(args.folder, "-f must be given if -m is 2") - if not args.csv: - raise argparse.ArgumentError(args.folder, "-c must be given if -m is 2") - if args.folder and args.csv: - run_multiple(args.csv, args.folder, args.threshold, args.verbose) - if args.multiple == 1: - if not args.annot: - raise argparse.ArgumentError(args.folder, "-a must be given if -m is 1") - if not args.predict: - raise argparse.ArgumentError(args.folder, "-p must be given if -m is 1") - if args.annot and args.predict: - run(args.annot, args.predict, args.threshold, args.verbose) + if args.annot: + if not args.predict: + raise parser.error("You need to specify the path to a predict file with -p") + if args.annot and args.predict: + run(args.annot, args.predict, args.threshold, args.verbose) + elif args.csv: + if not args.folder: + raise parser.error( + "You need to specify the path to a folder of bio files with -f" + ) + if args.folder and args.csv: + run_multiple(args.csv, args.folder, args.threshold, args.verbose) else: - raise argparse.ArgumentTypeError("Value has to be 1 or 2") + raise parser.error("You need to specify the argument of input file") if __name__ == "__main__": -- GitLab