From d9eb90635c2a4c17257933033696691264a4e7a8 Mon Sep 17 00:00:00 2001 From: Charlotte Mauvezin <charlotte.mauvezin@irht.cnrs.fr> Date: Mon, 3 Jan 2022 16:04:25 +0100 Subject: [PATCH] Rebase --- nerval/evaluate.py | 90 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 82 insertions(+), 8 deletions(-) diff --git a/nerval/evaluate.py b/nerval/evaluate.py index 6601a72..ab11dde 100644 --- a/nerval/evaluate.py +++ b/nerval/evaluate.py @@ -1,9 +1,12 @@ # -*- coding: utf-8 -*- import argparse +import glob import logging import os import re +from csv import reader +from pathlib import Path import editdistance import edlib @@ -81,7 +84,11 @@ def parse_bio(path: str) -> dict: try: word, label = line.split() except ValueError: - raise (Exception(f"The file {path} given in input is not in BIO format.")) + raise ( + Exception( + f"The file {path} given in input is not in BIO format: check line {index} ({line})" + ) + ) # Preserve hyphens to avoid confusion with the hyphens added later during alignment word = word.replace("-", "§") @@ -553,6 +560,37 @@ def run(annotation: str, prediction: str, threshold: int, verbose: bool) -> dict return scores +def run_multiple(file_csv, folder, threshold, verbose): + """Run the program for multiple files (correlation indicated in the csv file)""" + # Read the csv in a list + with open(file_csv, "r") as read_obj: + csv_reader = reader(read_obj) + list_cor = list(csv_reader) + + if os.path.isdir(folder): + list_bio_file = glob.glob(str(folder) + "/**/*.bio", recursive=True) + + for row in list_cor: + annot = None + predict = None + + for file in list_bio_file: + if row[0] == os.path.basename(file): + annot = file + for file in list_bio_file: + if row[1] == os.path.basename(file): + predict = file + + if annot and predict: + print(os.path.basename(predict)) + run(annot, predict, threshold, verbose) + print() + else: + raise f"No file found for files {annot}, {predict}" + else: + raise Exception("the path indicated does not lead to a folder.") + + def threshold_float_type(arg): """Type function for argparse.""" try: @@ -571,30 +609,66 @@ def main(): parser = argparse.ArgumentParser(description="Compute score of NER on predict.") parser.add_argument( - "-a", "--annot", help="Annotation in BIO format.", required=True + "-m", + "--multiple", + help="Single if 1, multiple 2", + type=int, + required=True, + ) + parser.add_argument( + "-a", + "--annot", + help="Annotation in BIO format.", ) parser.add_argument( - "-p", "--predict", help="Prediction in BIO format.", required=True + "-p", + "--predict", + help="Prediction in BIO format.", ) parser.add_argument( "-t", "--threshold", help="Set a distance threshold for the match between gold and predicted entity.", - required=False, default=THRESHOLD, type=threshold_float_type, ) + parser.add_argument( + "-c", + "--csv", + help="Csv with the correlation between the annotation bio files and the predict bio files", + type=Path, + ) + parser.add_argument( + "-f", + "--folder", + help="Folder containing the bio files referred to in the csv file", + type=Path, + ) parser.add_argument( "-v", "--verbose", - help="Print only the recap if False and detailed results if True (default)", + help="Print only the recap if False", action="store_false", - default="True", ) - args = parser.parse_args() - run(args.annot, args.predict, args.threshold, args.verbose) + if args.multiple == 1 or args.multiple == 2: + if args.multiple == 2: + if not args.folder: + raise argparse.ArgumentError(args.folder, "-f must be given if -m is 2") + if not args.csv: + raise argparse.ArgumentError(args.folder, "-c must be given if -m is 2") + if args.folder and args.csv: + run_multiple(args.csv, args.folder, args.threshold, args.verbose) + if args.multiple == 1: + if not args.annot: + raise argparse.ArgumentError(args.folder, "-a must be given if -m is 1") + if not args.predict: + raise argparse.ArgumentError(args.folder, "-p must be given if -m is 1") + if args.annot and args.predict: + run(args.annot, args.predict, args.threshold, args.verbose) + else: + raise argparse.ArgumentTypeError("Value has to be 1 or 2") if __name__ == "__main__": -- GitLab