From 1925e390e261a92bfa9a26c5b3c0eb4166e8b1c8 Mon Sep 17 00:00:00 2001
From: Charlotte Mauvezin <charlotte.mauvezin@irht.cnrs.fr>
Date: Wed, 22 Dec 2021 11:02:01 +0100
Subject: [PATCH] Multiple input

---
 demo/annot/demo_annot.bio      | 82 ++++++++++++++++++++++++++++++++++
 demo/annot/toy_test_annot.bio  | 39 ++++++++++++++++
 demo/cor.csv                   |  3 ++
 demo/pred/demo_predict.bio     | 81 +++++++++++++++++++++++++++++++++
 demo/pred/toy_test_predict.bio | 39 ++++++++++++++++
 nerval/evaluate.py             | 50 ++++++++++++++++++++-
 requirements.txt               |  1 +
 7 files changed, 294 insertions(+), 1 deletion(-)
 create mode 100644 demo/annot/demo_annot.bio
 create mode 100644 demo/annot/toy_test_annot.bio
 create mode 100644 demo/cor.csv
 create mode 100644 demo/pred/demo_predict.bio
 create mode 100644 demo/pred/toy_test_predict.bio

diff --git a/demo/annot/demo_annot.bio b/demo/annot/demo_annot.bio
new file mode 100644
index 0000000..cf16200
--- /dev/null
+++ b/demo/annot/demo_annot.bio
@@ -0,0 +1,82 @@
+Césaire B-PER
+Alphonse I-PER
+Garon I-PER
+marraine O
+Adeline B-PER
+Dionne I-PER
+, O
+soussignés O
+Lecture O
+faite O
+Adéline O
+Dionne O
+Arsène O
+Côté O
+Arpin O
+R O
+Le O
+onze B-DAT
+aout I-DAT
+mil I-DAT
+neuf I-DAT
+cent I-DAT
+un I-DAT
+nous O
+prêtre O
+soussigné O
+avons O
+baptisé O
+Marie B-PER
+Luce I-PER
+Louise I-PER
+, O
+née O
+la B-DAT
+veille I-DAT
+, O
+fille O
+légitime O
+de O
+Carmel B-PER
+Côté I-PER
+, O
+cordonnier B-OCC
+, O
+pré O
+- O
+sent O
+, O
+déclarant O
+ne O
+savoir O
+signer O
+, O
+et O
+de O
+Eugé B-PER
+nie I-PER
+Fréchette I-PER
+, O
+de O
+cette B-LOC
+paroisse I-LOC
+. O
+Parrain O
+Napoléon B-PER
+Fréchette I-PER
+, O
+marraine O
+Adeline B-PER
+Tremblay I-PER
+, O
+soussignés O
+, O
+de O
+Ste B-LOC
+Luce I-LOC
+, O
+Lec O
+- O
+ture O
+faite O
+. O
diff --git a/demo/annot/toy_test_annot.bio b/demo/annot/toy_test_annot.bio
new file mode 100644
index 0000000..5a941ee
--- /dev/null
+++ b/demo/annot/toy_test_annot.bio
@@ -0,0 +1,39 @@
+John B-PER
+Ronald I-PER
+Reuel I-PER
+Tolkien I-PER
+was O
+born O
+on O
+three B-DAT
+January I-DAT
+eighteen I-DAT
+ninety I-DAT
+- I-DAT
+two I-DAT
+in O
+Bloemfontein B-LOC
+in O
+the O
+Orange B-LOC
+Free I-LOC
+State I-LOC
+, O
+to O
+Arthur B-PER
+Reuel I-PER
+Tolkien I-PER
+, O
+an O
+English O
+bank B-OCC
+manager I-OCC
+, O
+and O
+his O
+wife O
+Mabel B-PER
+, O
+née O
+Suffield B-PER
+. O
diff --git a/demo/cor.csv b/demo/cor.csv
new file mode 100644
index 0000000..a0f41c6
--- /dev/null
+++ b/demo/cor.csv
@@ -0,0 +1,3 @@
+annot,predict
+demo_annot.bio,demo_predict.bio
+toy_test_annot.bio,toy_test_predict.bio
\ No newline at end of file
diff --git a/demo/pred/demo_predict.bio b/demo/pred/demo_predict.bio
new file mode 100644
index 0000000..7e01c2d
--- /dev/null
+++ b/demo/pred/demo_predict.bio
@@ -0,0 +1,81 @@
+Césaire B-PER
+Alphonse O
+Garon B-PER
+marraine O
+Adeline B-PER
+Dionne I-PER
+, O
+soussignés O
+Lecture O
+faite O
+Adéline O
+Dionne O
+Arsène O
+Côté O
+Arpin O
+R O
+Le O
+onze B-DAT
+aout I-DAT
+mil I-DAT
+neuf I-DAT
+cent I-DAT
+un O
+nous O
+pretre O
+soussigné O
+avons O
+baptisé O
+Marie B-PER
+Luce I-PER
+Louise I-PER
+, O
+née O
+la B-DAT
+veille I-DAT
+, O
+fille O
+légitime O
+de O
+Carmel B-PER
+Côté I-PER
+, O
+cordonnier B-OCC
+, O
+pré O
+- O
+sent O
+, O
+déclarant O
+ne O
+savoir O
+signer O
+, O
+et O
+de O
+Eugé B-PER
+nie I-PER
+Fréchette I-PER
+, O
+de O
+cette B-LOC
+paroisse I-LOC
+. O
+Parrain O
+Napoléon B-PER
+Fréchette I-PER
+, O
+marraine O
+Adéline B-PER
+Tremblay I-PER
+, O
+sousignés O
+, O
+de O
+St B-LOC
+. I-LOC
+Luce O
+, O
+Lec O
+ture O
+faite O
diff --git a/demo/pred/toy_test_predict.bio b/demo/pred/toy_test_predict.bio
new file mode 100644
index 0000000..5a941ee
--- /dev/null
+++ b/demo/pred/toy_test_predict.bio
@@ -0,0 +1,39 @@
+John B-PER
+Ronald I-PER
+Reuel I-PER
+Tolkien I-PER
+was O
+born O
+on O
+three B-DAT
+January I-DAT
+eighteen I-DAT
+ninety I-DAT
+- I-DAT
+two I-DAT
+in O
+Bloemfontein B-LOC
+in O
+the O
+Orange B-LOC
+Free I-LOC
+State I-LOC
+, O
+to O
+Arthur B-PER
+Reuel I-PER
+Tolkien I-PER
+, O
+an O
+English O
+bank B-OCC
+manager I-OCC
+, O
+and O
+his O
+wife O
+Mabel B-PER
+, O
+née O
+Suffield B-PER
+. O
diff --git a/nerval/evaluate.py b/nerval/evaluate.py
index 6e2fef9..de56dd6 100644
--- a/nerval/evaluate.py
+++ b/nerval/evaluate.py
@@ -9,6 +9,11 @@ import editdistance
 import edlib
 import termtables as tt
 
+import glob
+from pathlib import Path
+import pandas as pd
+
+
 NOT_ENTITY_TAG = "O"
 
 THRESHOLD = 0.30
@@ -533,6 +538,39 @@ def run(annotation: str, prediction: str, threshold: int) -> dict:
     return scores
 
 
+def run_multiple(file_csv, annot_folder, predict_folder, threshold):
+    """Run the evaluation on every annotation/prediction file pair listed in a CSV file, looking the files up in two folders; returns None."""
+    # Load the correspondence table; its "annot" and "predict" columns hold the file names to pair up
+    df_cor = pd.read_csv(file_csv)
+
+    # Both arguments must be existing directories, otherwise only an error message is printed (nothing is raised)
+    if os.path.isdir(annot_folder) and os.path.isdir(predict_folder):
+        list_bio_annot = glob.glob(annot_folder + "/**/*.bio", recursive=True)
+        list_bio_predict = glob.glob(predict_folder + "/**/*.bio", recursive=True)
+
+        for index, row in df_cor.iterrows():
+            annot = None
+            predict = None
+
+            # Match the listed names against the basenames of the .bio files found recursively (no break: the last match wins)
+            for file_annot in list_bio_annot:
+                if row["annot"] == os.path.basename(file_annot):
+                    annot = file_annot
+            for file_predict in list_bio_predict:
+                if row["predict"] == os.path.basename(file_predict):
+                    predict = file_predict
+
+            # Evaluate the pair only when both files were found; the value returned by run() is not collected
+            if annot and predict:
+                run(annot, predict, threshold)
+                print("")
+            else:
+                print(f"No file found for row {index}")
+
+    else:
+        print("Error this is no folder")
+
+
 def threshold_float_type(arg):
     """Type function for argparse."""
     try:
@@ -564,9 +602,19 @@ def main():
         default=THRESHOLD,
         type=threshold_float_type,
     )
+    parser.add_argument(
+        "-c",
+        "--csv",
+        help="csv with the correlation between the annotation bio file and the predict bio file",
+        required=False,
+        type=Path,
+    )
     args = parser.parse_args()
 
-    run(args.annot, args.predict, args.threshold)
+    if args.csv:
+        run_multiple(args.csv, args.annot, args.predict, args.threshold)
+    else:
+        run(args.annot, args.predict, args.threshold)
 
 
 if __name__ == "__main__":
diff --git a/requirements.txt b/requirements.txt
index d6af2d0..70abc00 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 editdistance==0.5.3
 edlib==1.3.8.post2
 termtables==0.2.3
+pandas==1.3.4
\ No newline at end of file
-- 
GitLab