Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • atr/dan
1 result
Show changes
Commits on Source (5)
...@@ -16,7 +16,6 @@ class MetricManager: ...@@ -16,7 +16,6 @@ class MetricManager:
if "simara" in dataset_name and "page" in dataset_name: if "simara" in dataset_name and "page" in dataset_name:
self.post_processing_module = PostProcessingModuleSIMARA self.post_processing_module = PostProcessingModuleSIMARA
self.matching_tokens = SIMARA_MATCHING_TOKENS self.matching_tokens = SIMARA_MATCHING_TOKENS
self.edit_and_num_edge_nodes = edit_and_num_items_for_ged_from_str_simara
else: else:
self.matching_tokens = dict() self.matching_tokens = dict()
...@@ -246,7 +245,7 @@ class MetricManager: ...@@ -246,7 +245,7 @@ class MetricManager:
pp_pred.append(pp_module.post_process(pred)) pp_pred.append(pp_module.post_process(pred))
metrics["nb_pp_op_layout"].append(pp_module.num_op) metrics["nb_pp_op_layout"].append(pp_module.num_op)
metrics["nb_gt_layout_token"] = [ metrics["nb_gt_layout_token"] = [
len(keep_only_tokens(str_x, self.layout_tokens)) len(keep_only_ner_tokens(str_x, self.layout_tokens))
for str_x in values["str_x"] for str_x in values["str_x"]
] ]
edit_and_num_items = [ edit_and_num_items = [
...@@ -262,16 +261,16 @@ class MetricManager: ...@@ -262,16 +261,16 @@ class MetricManager:
return self.epoch_metrics[name] return self.epoch_metrics[name]
def keep_only_tokens(str, tokens): def keep_only_ner_tokens(str, tokens):
""" """
Remove all but layout tokens from string Remove all but ner tokens from string
""" """
return re.sub("([^" + tokens + "])", "", str) return re.sub("([^" + tokens + "])", "", str)
def keep_all_but_tokens(str, tokens): def keep_all_but_ner_tokens(str, tokens):
""" """
Remove all layout tokens from string Remove all ner tokens from string
""" """
return re.sub("([" + tokens + "])", "", str) return re.sub("([" + tokens + "])", "", str)
...@@ -310,7 +309,7 @@ def format_string_for_wer(str, layout_tokens, remove_punct=False): ...@@ -310,7 +309,7 @@ def format_string_for_wer(str, layout_tokens, remove_punct=False):
r"([\[\]{}/\\()\"'&+*=<>?.;:,!\-—_€#%°])", "", str r"([\[\]{}/\\()\"'&+*=<>?.;:,!\-—_€#%°])", "", str
) # remove punctuation ) # remove punctuation
if layout_tokens is not None: if layout_tokens is not None:
str = keep_all_but_tokens( str = keep_all_but_ner_tokens(
str, layout_tokens str, layout_tokens
) # remove layout tokens from metric ) # remove layout tokens from metric
str = re.sub("([ \n])+", " ", str).strip() # keep only one space character str = re.sub("([ \n])+", " ", str).strip() # keep only one space character
...@@ -322,7 +321,7 @@ def format_string_for_cer(str, layout_tokens): ...@@ -322,7 +321,7 @@ def format_string_for_cer(str, layout_tokens):
Format string for CER computation: remove layout tokens and extra spaces Format string for CER computation: remove layout tokens and extra spaces
""" """
if layout_tokens is not None: if layout_tokens is not None:
str = keep_all_but_tokens( str = keep_all_but_ner_tokens(
str, layout_tokens str, layout_tokens
) # remove layout tokens from metric ) # remove layout tokens from metric
str = re.sub("([\n])+", "\n", str) # remove consecutive line breaks str = re.sub("([\n])+", "\n", str) # remove consecutive line breaks
...@@ -378,8 +377,8 @@ def compute_layout_precision_per_threshold( ...@@ -378,8 +377,8 @@ def compute_layout_precision_per_threshold(
pred, begin_token, end_token, associated_score=score, order_by_score=True pred, begin_token, end_token, associated_score=score, order_by_score=True
) )
gt_list = extract_by_tokens(gt, begin_token, end_token) gt_list = extract_by_tokens(gt, begin_token, end_token)
pred_list = [keep_all_but_tokens(p, layout_tokens) for p in pred_list] pred_list = [keep_all_but_ner_tokens(p, layout_tokens) for p in pred_list]
gt_list = [keep_all_but_tokens(gt, layout_tokens) for gt in gt_list] gt_list = [keep_all_but_ner_tokens(gt, layout_tokens) for gt in gt_list]
precision_per_threshold = [ precision_per_threshold = [
compute_layout_AP_for_given_threshold(gt_list, pred_list, threshold / 100) compute_layout_AP_for_given_threshold(gt_list, pred_list, threshold / 100)
for threshold in range(5, 51, 5) for threshold in range(5, 51, 5)
...@@ -514,7 +513,7 @@ def str_to_graph_simara(str): ...@@ -514,7 +513,7 @@ def str_to_graph_simara(str):
Compute graph from string of layout tokens for the SIMARA dataset at page level Compute graph from string of layout tokens for the SIMARA dataset at page level
""" """
begin_layout_tokens = "".join(list(SIMARA_MATCHING_TOKENS.keys())) begin_layout_tokens = "".join(list(SIMARA_MATCHING_TOKENS.keys()))
layout_token_sequence = keep_only_tokens(str, begin_layout_tokens) layout_token_sequence = keep_only_ner_tokens(str, begin_layout_tokens)
g = nx.DiGraph() g = nx.DiGraph()
g.add_node("D", type="document", level=2, page=0) g.add_node("D", type="document", level=2, page=0)
token_name_dict = {"ⓘ": "I", "ⓓ": "D", "ⓢ": "S", "ⓒ": "C", "ⓟ": "P", "ⓐ": "A"} token_name_dict = {"ⓘ": "I", "ⓓ": "D", "ⓢ": "S", "ⓒ": "C", "ⓟ": "P", "ⓐ": "A"}
...@@ -549,16 +548,3 @@ def graph_edit_distance(g1, g2): ...@@ -549,16 +548,3 @@ def graph_edit_distance(g1, g2):
): ):
new_edit = v new_edit = v
return new_edit return new_edit
def edit_and_num_items_for_ged_from_str_simara(str_gt, str_pred):
"""
Compute graph edit distance and num nodes/edges for normalized graph edit distance
For the SIMARA dataset
"""
g_gt = str_to_graph_simara(str_gt)
g_pred = str_to_graph_simara(str_pred)
return (
graph_edit_distance(g_gt, g_pred),
g_gt.number_of_nodes() + g_gt.number_of_edges(),
)
...@@ -4,8 +4,6 @@ import json ...@@ -4,8 +4,6 @@ import json
import os import os
import pickle import pickle
import random import random
import sys
from datetime import date
from time import time from time import time
import numpy as np import numpy as np
...@@ -523,7 +521,6 @@ class GenericTrainingManager: ...@@ -523,7 +521,6 @@ class GenericTrainingManager:
return return
params = copy.deepcopy(self.params) params = copy.deepcopy(self.params)
params = class_to_str_dict(params) params = class_to_str_dict(params)
params["date"] = date.today().strftime("%d/%m/%Y")
total_params = 0 total_params = 0
for model_name in self.models.keys(): for model_name in self.models.keys():
current_params = compute_nb_params(self.models[model_name]) current_params = compute_nb_params(self.models[model_name])
...@@ -533,21 +530,6 @@ class GenericTrainingManager: ...@@ -533,21 +530,6 @@ class GenericTrainingManager:
] ]
total_params += current_params total_params += current_params
params["model_params"]["total_params"] = "{:,}".format(total_params) params["model_params"]["total_params"] = "{:,}".format(total_params)
params["hardware"] = dict()
if self.device != "cpu":
for i in range(self.params["training_params"]["nb_gpu"]):
params["hardware"][str(i)] = "{} {}".format(
torch.cuda.get_device_name(i), torch.cuda.get_device_properties(i)
)
else:
params["hardware"]["0"] = "CPU"
params["software"] = {
"python_version": sys.version,
"pytorch_version": torch.__version__,
"cuda_version": torch.version.cuda,
"cudnn_version": torch.backends.cudnn.version(),
}
with open(path, "w") as f: with open(path, "w") as f:
json.dump(params, f, indent=4) json.dump(params, f, indent=4)
...@@ -871,8 +853,9 @@ class GenericTrainingManager: ...@@ -871,8 +853,9 @@ class GenericTrainingManager:
with open(path, "w") as f: with open(path, "w") as f:
yaml.dump(metrics, stream=f) yaml.dump(metrics, stream=f)
# Log mlflow artifacts if mlflow_logging:
mlflow.log_artifact(path, "predictions") # Log mlflow artifacts
mlflow.log_artifact(path, "predictions")
def output_pred(self, name): def output_pred(self, name):
path = os.path.join( path = os.path.join(
...@@ -1104,14 +1087,7 @@ class Manager(OCRManager): ...@@ -1104,14 +1087,7 @@ class Manager(OCRManager):
reduced_size = [s[:2] for s in batch_data["imgs_reduced_shape"]] reduced_size = [s[:2] for s in batch_data["imgs_reduced_shape"]]
y_len = batch_data["labels_len"] y_len = batch_data["labels_len"]
# add errors in teacher forcing if "label_noise_scheduler" in self.params["training_params"]:
if (
"teacher_forcing_error_rate" in self.params["training_params"]
and self.params["training_params"]["teacher_forcing_error_rate"] is not None
):
error_rate = self.params["training_params"]["teacher_forcing_error_rate"]
simulated_y_pred, y_len = self.add_label_noise(y, y_len, error_rate)
elif "label_noise_scheduler" in self.params["training_params"]:
error_rate = ( error_rate = (
self.params["training_params"]["label_noise_scheduler"][ self.params["training_params"]["label_noise_scheduler"][
"min_error_rate" "min_error_rate"
......