From 71a3e8c0d691662ecb8511ca6c0d005c10f85c56 Mon Sep 17 00:00:00 2001
From: Charlotte Mauvezin <charlotte.mauvezin@irht.cnrs.fr>
Date: Thu, 23 Dec 2021 12:27:25 +0100
Subject: [PATCH] Fix

---
 README.md                                     |  9 ++-
 demo/{annot => bio_folder}/demo_annot.bio     |  0
 demo/{pred => bio_folder}/demo_predict.bio    |  0
 demo/{annot => bio_folder}/toy_test_annot.bio |  0
 .../{pred => bio_folder}/toy_test_predict.bio |  0
 nerval/evaluate.py                            | 65 +++++++++++--------
 6 files changed, 47 insertions(+), 27 deletions(-)
 rename demo/{annot => bio_folder}/demo_annot.bio (100%)
 rename demo/{pred => bio_folder}/demo_predict.bio (100%)
 rename demo/{annot => bio_folder}/toy_test_annot.bio (100%)
 rename demo/{pred => bio_folder}/toy_test_predict.bio (100%)

diff --git a/README.md b/README.md
index 637e0c8..f20bca8 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,7 @@
 Nerval is an evaluation library written in python implementing a metric for named-entity recognition evaluation on noisy text, typically to measure NER performances on OCR or Handwritten text recognition predictions.
 
 Expected inputs are a ground truth and a prediction BIOES/BILOU files without any ''§'' occurrences, this character having a special meaning during evaluation.
+Nerval can also evaluate several file pairs at once, given a csv file listing the matches: one pair per row, with the annotation file in the first column and the prediction file in the second column.
 
 ## Usage
@@ -31,7 +32,7 @@ You can now use Nerval in command line :
 
 ```
 $ nerval -a/--annot <annot_file.bio> -p/--predict <predict-file.bio> \
-    [-t/--threshold <threshold_value>]
+    [-t/--threshold <threshold_value>] [-c/--csv <correspondence_file.csv>] [-f/--folder <bio_folder>]
 ```
 
 The threshold value should be between 0 and 1. It designates the acceptable number of characters differing between an annotated and a predicted entity - over the number of characters in the annotated entity - to consider it as a match. Default value is 0.30. 0 would impose perfect matches, 1 would allow completely different strings to be considered as a match.
@@ -58,6 +59,12 @@ We also provide two annotation and prediction toy files, which are identical for
 ```
 $ nerval -a demo/toy_test_annot.bio -p demo/toy_test_predict.bio
 ```
+
+You can also indicate a folder of bio files and a correspondence csv file to run multiple evaluations at once:
+
+```
+$ nerval -c demo/cor.csv -f demo/bio_folder/
+```
 
 ## Metric
 
 This metric uses string alignment at character level.
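For reference, a correspondence csv of the kind the README now describes could look like the sketch below. The header names here are arbitrary (an assumption: `pd.read_csv` only needs a first row to treat as a header, and the patched code reads the two columns by position); the file names are the bio files shipped in the demo folder.

```
annot,predict
demo_annot.bio,demo_predict.bio
toy_test_annot.bio,toy_test_predict.bio
```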
diff --git a/demo/annot/demo_annot.bio b/demo/bio_folder/demo_annot.bio
similarity index 100%
rename from demo/annot/demo_annot.bio
rename to demo/bio_folder/demo_annot.bio
diff --git a/demo/pred/demo_predict.bio b/demo/bio_folder/demo_predict.bio
similarity index 100%
rename from demo/pred/demo_predict.bio
rename to demo/bio_folder/demo_predict.bio
diff --git a/demo/annot/toy_test_annot.bio b/demo/bio_folder/toy_test_annot.bio
similarity index 100%
rename from demo/annot/toy_test_annot.bio
rename to demo/bio_folder/toy_test_annot.bio
diff --git a/demo/pred/toy_test_predict.bio b/demo/bio_folder/toy_test_predict.bio
similarity index 100%
rename from demo/pred/toy_test_predict.bio
rename to demo/bio_folder/toy_test_predict.bio
diff --git a/nerval/evaluate.py b/nerval/evaluate.py
index e14561e..ebdba10 100644
--- a/nerval/evaluate.py
+++ b/nerval/evaluate.py
@@ -84,9 +84,9 @@ def parse_bio(path: str) -> dict:
         try:
             word, label = line.split()
         except ValueError:
-            print(index)
-            print(line)
-            raise (Exception(f"The file {path} given in input is not in BIO format."))
+            raise Exception(
+                f"The file {path} given in input is not in BIO format: check line {index} ({line})"
+            )
 
         # Preserve hyphens to avoid confusion with the hyphens added later during alignment
         word = word.replace("-", "§")
@@ -538,37 +540,33 @@ def run(annotation: str, prediction: str, threshold: int) -> dict:
     return scores
 
 
-def run_multiple(file_csv, annot_folder, predict_folder, threshold):
-    """Run the program for multiple file (correlation indicated in the csv file)"""
+def run_multiple(file_csv, folder, threshold):
+    """Run the program for multiple files (correspondences given in the csv file)"""
     # Read the csv in a dataframe
     df_cor = pd.read_csv(file_csv)
 
-    # Check if the variable given are folder
-    if os.path.isdir(annot_folder) and os.path.isdir(predict_folder):
-        list_bio_annot = glob.glob(annot_folder + "/**/*.bio", recursive=True)
-        list_bio_predict = glob.glob(predict_folder + "/**/*.bio", recursive=True)
+    if os.path.isdir(folder):
+        list_bio_file = glob.glob(str(folder) + "/**/*.bio", recursive=True)
 
         for index, row in df_cor.iterrows():
             annot = None
             predict = None
 
-            # Check if the file exist
-            for file_annot in list_bio_annot:
-                if row["annot"] == os.path.basename(file_annot):
-                    annot = file_annot
-            for file_predict in list_bio_predict:
-                if row["predict"] == os.path.basename(file_predict):
-                    predict = file_predict
+            for file in list_bio_file:
+                if row.iloc[0] == os.path.basename(file):
+                    annot = file
+            for file in list_bio_file:
+                if row.iloc[1] == os.path.basename(file):
+                    predict = file
 
-            # Apply the evaluation
             if annot and predict:
                 run(annot, predict, threshold)
-                print("")
+                print()
             else:
-                print(f"No file found for row {index}")
+                raise Exception(f"No file found for row {index}")
     else:
-        print("Error this is no folder")
+        raise Exception(f"{folder} is not a folder")
 
 
 def threshold_float_type(arg):
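Aside: the lookup above rescans every bio file for each csv row. A dictionary-based sketch of the same logic is shown below; `resolve_pairs` is a hypothetical helper name, not part of nerval, and it assumes basenames are unique within the folder (when they are not, both this sketch and the patched loop keep the last match).

```python
import glob
import os

import pandas as pd


def resolve_pairs(file_csv, folder):
    """Yield one (annotation_path, prediction_path) pair per csv row."""
    # Index every bio file under the folder by basename, once.
    bio_files = glob.glob(os.path.join(str(folder), "**", "*.bio"), recursive=True)
    by_name = {os.path.basename(path): path for path in bio_files}

    for index, row in pd.read_csv(file_csv).iterrows():
        # Columns are read by position: annotation first, prediction second.
        annot = by_name.get(row.iloc[0])
        predict = by_name.get(row.iloc[1])
        if annot is None or predict is None:
            raise Exception(f"No file found for row {index}")
        yield annot, predict
```

Building the index once makes the lookup linear in the number of files instead of quadratic, which matters only for large folders but costs nothing here.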
@@ -589,10 +587,16 @@ def main():
     parser = argparse.ArgumentParser(description="Compute score of NER on predict.")
 
     parser.add_argument(
-        "-a", "--annot", help="Annotation in BIO format.", required=True
+        "-a",
+        "--annot",
+        help="Annotation in BIO format.",
+        required=False,
     )
     parser.add_argument(
-        "-p", "--predict", help="Prediction in BIO format.", required=True
+        "-p",
+        "--predict",
+        help="Prediction in BIO format.",
+        required=False,
     )
     parser.add_argument(
         "-t",
@@ -605,16 +609,25 @@ def main():
     parser.add_argument(
         "-c",
         "--csv",
-        help="csv with the correlation between the annotation bio file and the predict bio file",
+        help="Csv file mapping each annotation bio file to its prediction bio file",
+        required=False,
+        type=Path,
+    )
+    parser.add_argument(
+        "-f",
+        "--folder",
+        help="Folder containing the bio files referred to in the csv file",
         required=False,
         type=Path,
     )
     args = parser.parse_args()
 
-    if args.csv:
-        run_multiple(args.csv, args.annot, args.predict, args.threshold)
-    else:
+    if args.csv and args.folder:
+        run_multiple(args.csv, args.folder, args.threshold)
+    elif args.annot and args.predict:
         run(args.annot, args.predict, args.threshold)
+    else:
+        parser.error("Either --annot and --predict, or --csv and --folder must be given.")
 
 
 if __name__ == "__main__":
-- 
GitLab
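With this patch applied, the CLI accepts two modes; a usage sketch follows. Note that `demo/cor.csv` is the path the README references but is not a file added by this patch.

```
# Evaluate one annotation/prediction pair:
$ nerval -a demo/bio_folder/demo_annot.bio -p demo/bio_folder/demo_predict.bio

# Evaluate every pair listed in a correspondence csv:
$ nerval -c demo/cor.csv -f demo/bio_folder/
```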