From 71a3e8c0d691662ecb8511ca6c0d005c10f85c56 Mon Sep 17 00:00:00 2001
From: Charlotte Mauvezin <charlotte.mauvezin@irht.cnrs.fr>
Date: Thu, 23 Dec 2021 12:27:25 +0100
Subject: [PATCH] Fix

---
 README.md                                     |  9 ++-
 demo/{annot => bio_folder}/demo_annot.bio     |  0
 demo/{pred => bio_folder}/demo_predict.bio    |  0
 demo/{annot => bio_folder}/toy_test_annot.bio |  0
 .../{pred => bio_folder}/toy_test_predict.bio |  0
 nerval/evaluate.py                            | 65 +++++++++++--------
 6 files changed, 47 insertions(+), 27 deletions(-)
 rename demo/{annot => bio_folder}/demo_annot.bio (100%)
 rename demo/{pred => bio_folder}/demo_predict.bio (100%)
 rename demo/{annot => bio_folder}/toy_test_annot.bio (100%)
 rename demo/{pred => bio_folder}/toy_test_predict.bio (100%)

diff --git a/README.md b/README.md
index 637e0c8..f20bca8 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,7 @@
 Nerval is an evaluation library written in python implementing a metric for named-entity recognition evaluation on noisy text, typically to measure NER performances on OCR or Handwritten text recognition predictions.
 
 Expected inputs are a ground truth and a prediction BIOES/BILOU files without any ''§'' occurrences, this character having a special meaning during evaluation.
+Nerval can also evaluate several file pairs at once, given a csv file listing the matches: one pair per row, with the annotation file in the first column and the prediction file in the second column.
 
 ## Usage
@@ -31,7 +32,7 @@ You can now use Nerval in command line :
 
 ```
 $ nerval -a/--annot <annot_file.bio> -p/--predict <predict-file.bio> \
-    [-t/--threshold <threshold_value>]
+    [-t/--threshold <threshold_value>] [-c/--csv <correspondence_file.csv>] [-f/--folder <bio_folder>]
 ```
 
 The threshold value should be between 0 and 1. It designates the acceptable number of characters differing between an annotated and a predicted entity - over the number of characters in the annotated entity - to consider it as a match. Default value is 0.30. 0 would impose perfect matches, 1 would allow completely different strings to be considered as a match.
@@ -58,6 +59,12 @@ We also provide two annotation and prediction toy files, which are identical for
 ```
 $ nerval -a demo/toy_test_annot.bio -p demo/toy_test_predict.bio
 ```
+
+You can also indicate a folder of bio files and a correspondence csv file to run multiple evaluations at once:
+
+```
+$ nerval -c demo/cor.csv -f demo/bio_folder/
+```
 
 ## Metric
 
 This metric uses string alignment at character level.
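For reference, a correspondence csv of the kind the README now describes could look like the sketch below. The header names here are arbitrary (an assumption: `pd.read_csv` only needs a first row to treat as a header, and the patched code reads the two columns by position); the file names are the bio files shipped in the demo folder.

```
annot,predict
demo_annot.bio,demo_predict.bio
toy_test_annot.bio,toy_test_predict.bio
```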
diff --git a/demo/annot/demo_annot.bio b/demo/bio_folder/demo_annot.bio
similarity index 100%
rename from demo/annot/demo_annot.bio
rename to demo/bio_folder/demo_annot.bio
diff --git a/demo/pred/demo_predict.bio b/demo/bio_folder/demo_predict.bio
similarity index 100%
rename from demo/pred/demo_predict.bio
rename to demo/bio_folder/demo_predict.bio
diff --git a/demo/annot/toy_test_annot.bio b/demo/bio_folder/toy_test_annot.bio
similarity index 100%
rename from demo/annot/toy_test_annot.bio
rename to demo/bio_folder/toy_test_annot.bio
diff --git a/demo/pred/toy_test_predict.bio b/demo/bio_folder/toy_test_predict.bio
similarity index 100%
rename from demo/pred/toy_test_predict.bio
rename to demo/bio_folder/toy_test_predict.bio
diff --git a/nerval/evaluate.py b/nerval/evaluate.py
index e14561e..ebdba10 100644
--- a/nerval/evaluate.py
+++ b/nerval/evaluate.py
@@ -84,9 +84,9 @@ def parse_bio(path: str) -> dict:
         try:
             word, label = line.split()
         except ValueError:
-            print(index)
-            print(line)
-            raise (Exception(f"The file {path} given in input is not in BIO format."))
+            raise Exception(
+                f"The file {path} given in input is not in BIO format: check line {index} ({line})"
+            )
 
         # Preserve hyphens to avoid confusion with the hyphens added later during alignment
         word = word.replace("-", "§")
@@ -538,37 +540,33 @@ def run(annotation: str, prediction: str, threshold: int) -> dict:
     return scores
 
 
-def run_multiple(file_csv, annot_folder, predict_folder, threshold):
-    """Run the program for multiple file (correlation indicated in the csv file)"""
+def run_multiple(file_csv, folder, threshold):
+    """Run the program for multiple files (correspondences given in the csv file)"""
     # Read the csv in a dataframe
     df_cor = pd.read_csv(file_csv)
 
-    # Check if the variable given are folder
-    if os.path.isdir(annot_folder) and os.path.isdir(predict_folder):
-        list_bio_annot = glob.glob(annot_folder + "/**/*.bio", recursive=True)
-        list_bio_predict = glob.glob(predict_folder + "/**/*.bio", recursive=True)
+    if os.path.isdir(folder):
+        list_bio_file = glob.glob(str(folder) + "/**/*.bio", recursive=True)
 
         for index, row in df_cor.iterrows():
             annot = None
             predict = None
 
-            # Check if the file exist
-            for file_annot in list_bio_annot:
-                if row["annot"] == os.path.basename(file_annot):
-                    annot = file_annot
-            for file_predict in list_bio_predict:
-                if row["predict"] == os.path.basename(file_predict):
-                    predict = file_predict
+            for file in list_bio_file:
+                if row.iloc[0] == os.path.basename(file):
+                    annot = file
+            for file in list_bio_file:
+                if row.iloc[1] == os.path.basename(file):
+                    predict = file
 
-            # Apply the evaluation
             if annot and predict:
                 run(annot, predict, threshold)
-                print("")
+                print()
             else:
-                print(f"No file found for row {index}")
+                raise Exception(f"No file found for row {index}")
     else:
-        print("Error this is no folder")
+        raise Exception(f"{folder} is not a folder")
 
 
 def threshold_float_type(arg):
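Aside: the lookup above rescans every bio file for each csv row. A dictionary-based sketch of the same logic is shown below; `resolve_pairs` is a hypothetical helper name, not part of nerval, and it assumes basenames are unique within the folder (when they are not, both this sketch and the patched loop keep the last match).

```python
import glob
import os

import pandas as pd


def resolve_pairs(file_csv, folder):
    """Yield one (annotation_path, prediction_path) pair per csv row."""
    # Index every bio file under the folder by basename, once.
    bio_files = glob.glob(os.path.join(str(folder), "**", "*.bio"), recursive=True)
    by_name = {os.path.basename(path): path for path in bio_files}

    for index, row in pd.read_csv(file_csv).iterrows():
        # Columns are read by position: annotation first, prediction second.
        annot = by_name.get(row.iloc[0])
        predict = by_name.get(row.iloc[1])
        if annot is None or predict is None:
            raise Exception(f"No file found for row {index}")
        yield annot, predict
```

Building the index once makes the lookup linear in the number of files instead of quadratic, which matters only for large folders but costs nothing here.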
@@ -589,10 +587,16 @@ def main():
     parser = argparse.ArgumentParser(description="Compute score of NER on predict.")
 
     parser.add_argument(
-        "-a", "--annot", help="Annotation in BIO format.", required=True
+        "-a",
+        "--annot",
+        help="Annotation in BIO format.",
+        required=False,
     )
     parser.add_argument(
-        "-p", "--predict", help="Prediction in BIO format.", required=True
+        "-p",
+        "--predict",
+        help="Prediction in BIO format.",
+        required=False,
     )
     parser.add_argument(
         "-t",
@@ -605,16 +609,25 @@ def main():
     parser.add_argument(
         "-c",
         "--csv",
-        help="csv with the correlation between the annotation bio file and the predict bio file",
+        help="Csv file mapping each annotation bio file to its prediction bio file",
+        required=False,
+        type=Path,
+    )
+    parser.add_argument(
+        "-f",
+        "--folder",
+        help="Folder containing the bio files referred to in the csv file",
         required=False,
         type=Path,
     )
     args = parser.parse_args()
 
-    if args.csv:
-        run_multiple(args.csv, args.annot, args.predict, args.threshold)
-    else:
+    if args.csv and args.folder:
+        run_multiple(args.csv, args.folder, args.threshold)
+    elif args.annot and args.predict:
         run(args.annot, args.predict, args.threshold)
+    else:
+        parser.error("Either --annot and --predict, or --csv and --folder must be given.")
 
 
 if __name__ == "__main__":
-- 
GitLab
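With this patch applied, the CLI accepts two modes; a usage sketch follows. Note that `demo/cor.csv` is the path the README references but is not a file added by this patch.

```
# Evaluate one annotation/prediction pair:
$ nerval -a demo/bio_folder/demo_annot.bio -p demo/bio_folder/demo_predict.bio

# Evaluate every pair listed in a correspondence csv:
$ nerval -c demo/cor.csv -f demo/bio_folder/
```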