Commit 71a3e8c0 authored by Charlotte Mauvezin

Fix

parent 0aca621a
1 merge request: !10 Multiple input
Pipeline #103820 passed
@@ -7,6 +7,7 @@
Nerval is an evaluation library written in Python implementing a metric for named-entity recognition evaluation on noisy text, typically to measure NER performance on OCR or handwritten text recognition predictions.
Expected inputs are a ground truth file and a prediction file in BIOES/BILOU format, without any '§' occurrences, as this character has a special meaning during evaluation.
Nerval can also evaluate several annotation/prediction pairs at once: provide a CSV file listing the matched files, one pair per row, with the annotation file in the first column and the prediction file in the second column.
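For illustration, such a correspondence file might look as follows (the file names are hypothetical; only the column order matters, annotation first and prediction second):
```
annot,predict
letter_1_annot.bio,letter_1_predict.bio
letter_2_annot.bio,letter_2_predict.bio
```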
## Usage
@@ -31,7 +32,7 @@ You can now use Nerval in command line :
```
$ nerval -a/--annot <annot_file.bio> -p/--predict <predict-file.bio> \
    [-t/--threshold <threshold_value>] [-c/--csv <correspondence_file.csv>] [-f/--folder <folder>]
```
The threshold value should be between 0 and 1. It sets the proportion of characters that may differ between an annotated entity and a predicted entity (number of differing characters divided by the number of characters in the annotated entity) for the pair to still count as a match. The default value is 0.30; 0 imposes perfect matches, while 1 allows completely different strings to be considered a match.
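As a rough, self-contained sketch of this rule (Nerval derives the number of differing characters from its own character-level alignment, so exact counts can differ from a plain edit distance; this only illustrates the ratio test):

```python
def edit_distance(a: str, b: str) -> int:
    """Plain dynamic-programming Levenshtein distance."""
    previous = list(range(len(b) + 1))
    for i, char_a in enumerate(a, 1):
        current = [i]
        for j, char_b in enumerate(b, 1):
            current.append(
                min(
                    previous[j] + 1,                       # deletion
                    current[j - 1] + 1,                    # insertion
                    previous[j - 1] + (char_a != char_b),  # substitution
                )
            )
        previous = current
    return previous[-1]


annotated, predicted = "Georges", "Gorges"  # hypothetical entity strings
ratio = edit_distance(annotated, predicted) / len(annotated)
print(ratio <= 0.30)  # 1 differing character out of 7 -> ~0.14, counted as a match
```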
@@ -58,6 +59,12 @@ We also provide two annotation and prediction toy files, which are identical for
$ nerval -a demo/toy_test_annot.bio -p demo/toy_test_predict.bio
```
You can also point Nerval at a folder of BIO files and a correspondence csv file to run multiple evaluations at once.
```
$ nerval -c demo/cor.csv -f demo/
```
## Metric
This metric uses string alignment at character level.
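Purely as an illustration with hypothetical strings (not Nerval's actual output), such an alignment inserts padding hyphens so that the two strings have the same length and matching characters line up, which is why literal hyphens in the inputs are first replaced by '§':
```
annotation: Georges Washington
prediction: G-orges Washingt-n
```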
...
File moved
File moved
File moved
@@ -84,9 +84,11 @@ def parse_bio(path: str) -> dict:
        try:
            word, label = line.split()
        except ValueError:
            raise Exception(
                f"The file {path} given in input is not in BIO format: check line {index} ({line})"
            )

        # Preserve hyphens to avoid confusion with the hyphens added later during alignment
        word = word.replace("-", "§")
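For reference, parse_bio expects each non-empty line of the input files to hold a token and its tag separated by whitespace, as in this hypothetical BIO snippet:
```
George B-PER
Washington I-PER
was O
born O
```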
@@ -538,37 +540,33 @@ def run(annotation: str, prediction: str, threshold: int) -> dict:
    return scores


def run_multiple(file_csv, folder, threshold):
    """Run the program for multiple files (correspondences are given in the csv file)."""
    # Read the csv into a dataframe
    df_cor = pd.read_csv(file_csv)
    if os.path.isdir(folder):
        # Collect every BIO file found recursively under the folder
        list_bio_file = glob.glob(str(folder) + "/**/*.bio", recursive=True)
        for index, row in df_cor.iterrows():
            annot = None
            predict = None
            # First column: annotation file name, second column: prediction file name
            for file in list_bio_file:
                if row[0] == os.path.basename(file):
                    annot = file
            for file in list_bio_file:
                if row[1] == os.path.basename(file):
                    predict = file
            if annot and predict:
                run(annot, predict, threshold)
                print()
            else:
                raise Exception(f"No file found for row {index} of the csv file")
    else:
        raise Exception(f"{folder} is not a folder")
def threshold_float_type(arg):
@@ -589,10 +587,16 @@ def main():
    parser = argparse.ArgumentParser(description="Compute score of NER on predict.")
    parser.add_argument(
        "-a",
        "--annot",
        help="Annotation in BIO format.",
        required=False,
    )
    parser.add_argument(
        "-p",
        "--predict",
        help="Prediction in BIO format.",
        required=False,
    )
    parser.add_argument(
        "-t",
@@ -605,16 +609,25 @@ def main():
    parser.add_argument(
        "-c",
        "--csv",
        help="CSV listing the correspondence between the annotation bio files and the prediction bio files",
        required=False,
        type=Path,
    )
    parser.add_argument(
        "-f",
        "--folder",
        help="Folder containing the bio files referred to in the csv file",
        required=False,
        type=Path,
    )
    args = parser.parse_args()

    if args.csv and args.folder:
        run_multiple(args.csv, args.folder, args.threshold)
    elif args.annot and args.predict:
        run(args.annot, args.predict, args.threshold)
    else:
        raise Exception(
            "You did not give the proper input: provide either --annot and --predict, or --csv and --folder"
        )
if __name__ == "__main__":
...