Commit 71a3e8c0 authored by Charlotte Mauvezin

Fix

parent 0aca621a
1 merge request: !10 Multiple input
Pipeline #103820 passed
@@ -7,6 +7,7 @@
Nerval is an evaluation library written in Python implementing a metric for named-entity recognition evaluation on noisy text, typically to measure NER performance on OCR or handwritten text recognition predictions.
Expected inputs are a ground truth file and a prediction file in BIOES/BILOU format, without any '§' occurrences, as this character has a special meaning during evaluation.
Nerval can also evaluate several annotation/prediction pairs at once: provide a CSV file listing the matched files, one pair per row, with the annotation file in the first column and the prediction file in the second column.
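For illustration, such a correspondence file might look as follows (the file names are hypothetical; only the column order matters, annotation first and prediction second):
```
annot,predict
letter_1_annot.bio,letter_1_predict.bio
letter_2_annot.bio,letter_2_predict.bio
```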
## Usage
@@ -31,7 +32,7 @@ You can now use Nerval in command line :
```
$ nerval -a/--annot <annot_file.bio> -p/--predict <predict-file.bio> \
    [-t/--threshold <threshold_value>] [-c/--csv <correspondence_file.csv>] [-f/--folder <folder>]
```
The threshold value should be between 0 and 1. It sets the proportion of characters that may differ between an annotated entity and a predicted entity (number of differing characters divided by the number of characters in the annotated entity) for the pair to still count as a match. The default value is 0.30; 0 imposes perfect matches, while 1 allows completely different strings to be considered a match.
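As a rough, self-contained sketch of this rule (Nerval derives the number of differing characters from its own character-level alignment, so exact counts can differ from a plain edit distance; this only illustrates the ratio test):

```python
def edit_distance(a: str, b: str) -> int:
    """Plain dynamic-programming Levenshtein distance."""
    previous = list(range(len(b) + 1))
    for i, char_a in enumerate(a, 1):
        current = [i]
        for j, char_b in enumerate(b, 1):
            current.append(
                min(
                    previous[j] + 1,                       # deletion
                    current[j - 1] + 1,                    # insertion
                    previous[j - 1] + (char_a != char_b),  # substitution
                )
            )
        previous = current
    return previous[-1]


annotated, predicted = "Georges", "Gorges"  # hypothetical entity strings
ratio = edit_distance(annotated, predicted) / len(annotated)
print(ratio <= 0.30)  # 1 differing character out of 7 -> ~0.14, counted as a match
```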
@@ -58,6 +59,12 @@ We also provide two annotation and prediction toy files, which are identical for
$ nerval -a demo/toy_test_annot.bio -p demo/toy_test_predict.bio
```
You can also point Nerval at a folder of BIO files and a correspondence csv file to run multiple evaluations at once.
```
$ nerval -c demo/cor.csv -f demo/
```
## Metric
This metric uses string alignment at character level.
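Purely as an illustration with hypothetical strings (not Nerval's actual output), such an alignment inserts padding hyphens so that the two strings have the same length and matching characters line up, which is why literal hyphens in the inputs are first replaced by '§':
```
annotation: Georges Washington
prediction: G-orges Washingt-n
```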
...
File moved
File moved
File moved
@@ -84,9 +84,11 @@ def parse_bio(path: str) -> dict:
        try:
            word, label = line.split()
        except ValueError:
            raise Exception(
                f"The file {path} given in input is not in BIO format: check line {index} ({line})"
            )

        # Preserve hyphens to avoid confusion with the hyphens added later during alignment
        word = word.replace("-", "§")
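For reference, parse_bio expects each non-empty line of the input files to hold a token and its tag separated by whitespace, as in this hypothetical BIO snippet:
```
George B-PER
Washington I-PER
was O
born O
```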
@@ -538,37 +540,33 @@ def run(annotation: str, prediction: str, threshold: int) -> dict:
    return scores


def run_multiple(file_csv, folder, threshold):
    """Run the program for multiple files (correspondences are given in the csv file)."""
    # Read the csv into a dataframe
    df_cor = pd.read_csv(file_csv)
    if os.path.isdir(folder):
        # Collect every BIO file found recursively under the folder
        list_bio_file = glob.glob(str(folder) + "/**/*.bio", recursive=True)
        for index, row in df_cor.iterrows():
            annot = None
            predict = None
            # First column: annotation file name, second column: prediction file name
            for file in list_bio_file:
                if row[0] == os.path.basename(file):
                    annot = file
            for file in list_bio_file:
                if row[1] == os.path.basename(file):
                    predict = file
            if annot and predict:
                run(annot, predict, threshold)
                print()
            else:
                raise Exception(f"No file found for row {index} of the csv file")
    else:
        raise Exception(f"{folder} is not a folder")
def threshold_float_type(arg):
@@ -589,10 +587,16 @@ def main():
    parser = argparse.ArgumentParser(description="Compute score of NER on predict.")
    parser.add_argument(
        "-a",
        "--annot",
        help="Annotation in BIO format.",
        required=False,
    )
    parser.add_argument(
        "-p",
        "--predict",
        help="Prediction in BIO format.",
        required=False,
    )
    parser.add_argument(
        "-t",
@@ -605,16 +609,25 @@ def main():
    parser.add_argument(
        "-c",
        "--csv",
        help="CSV listing the correspondence between the annotation bio files and the prediction bio files",
        required=False,
        type=Path,
    )
    parser.add_argument(
        "-f",
        "--folder",
        help="Folder containing the bio files referred to in the csv file",
        required=False,
        type=Path,
    )
    args = parser.parse_args()

    if args.csv and args.folder:
        run_multiple(args.csv, args.folder, args.threshold)
    elif args.annot and args.predict:
        run(args.annot, args.predict, args.threshold)
    else:
        raise Exception(
            "You did not give the proper input: provide either --annot and --predict, or --csv and --folder"
        )
if __name__ == "__main__":
...