From 79c51d7843dfe0493ffc982299024d70b5883fda Mon Sep 17 00:00:00 2001
From: Mélodie Boillet <boillet@teklia.com>
Date: Thu, 3 Aug 2023 08:31:40 +0200
Subject: [PATCH] Apply f0a6e38c

---
 dan/decoder.py                       |  12 +-
 dan/ocr/document/train.py            |   2 -
 dan/ocr/document/train_popp.py       | 317 +++++++++++++++++++++++++++
 docs/get_started/training.md         |   1 -
 docs/usage/train/parameters.md       |   2 -
 tests/conftest.py                    |   2 -
 tests/data/prediction/parameters.yml |   1 -
 7 files changed, 319 insertions(+), 18 deletions(-)
 create mode 100644 dan/ocr/document/train_popp.py

diff --git a/dan/decoder.py b/dan/decoder.py
index af5c28cb..69e372cf 100644
--- a/dan/decoder.py
+++ b/dan/decoder.py
@@ -305,14 +305,9 @@ class FeaturesUpdater(Module):
         self.pe_2d = PositionalEncoding2D(
             params["enc_dim"], params["h_max"], params["w_max"], params["device"]
         )
-        self.use_2d_positional_encoding = (
-            "use_2d_pe" not in params or params["use_2d_pe"]
-        )
 
     def get_pos_features(self, features):
-        if self.use_2d_positional_encoding:
-            return self.pe_2d(features)
-        return features
+        return self.pe_2d(features)
 
 
 class GlobalHTADecoder(Module):
@@ -326,7 +321,6 @@ class GlobalHTADecoder(Module):
         self.dec_att_win = (
             params["attention_win"] if params["attention_win"] is not None else 1
         )
-        self.use_1d_pe = "use_1d_pe" not in params or params["use_1d_pe"]
 
         self.features_updater = FeaturesUpdater(params)
         self.att_decoder = GlobalAttDecoder(params)
@@ -361,9 +355,7 @@ class GlobalHTADecoder(Module):
         pos_tokens = self.emb(tokens).permute(0, 2, 1)
 
         # Add 1D Positional Encoding
-        if self.use_1d_pe:
-            pos_tokens = self.pe_1d(pos_tokens, start=start)
-        pos_tokens = pos_tokens.permute(2, 0, 1)
+        pos_tokens = self.pe_1d(pos_tokens, start=start).permute(2, 0, 1)
 
         if num_pred is None:
             num_pred = tokens.size(1)
diff --git a/dan/ocr/document/train.py b/dan/ocr/document/train.py
index 6f5e17b3..ad4643b9 100644
--- a/dan/ocr/document/train.py
+++ b/dan/ocr/document/train.py
@@ -151,8 +151,6 @@ def get_config():
         "dec_pred_dropout": 0.1,  # dropout rate before decision layer
         "dec_att_dropout": 0.1,  # dropout rate in multi head attention
         "dec_dim_feedforward": 256,  # number of dimension for feedforward layer in transformer decoder layers
-        "use_2d_pe": True,  # use 2D positional embedding
-        "use_1d_pe": True,  # use 1D positional embedding
         "attention_win": 100,  # length of attention window
         # Curriculum dropout
         "dropout_scheduler": {
diff --git a/dan/ocr/document/train_popp.py b/dan/ocr/document/train_popp.py
new file mode 100644
index 00000000..6237748a
--- /dev/null
+++ b/dan/ocr/document/train_popp.py
@@ -0,0 +1,317 @@
+# -*- coding: utf-8 -*-
+import json
+import logging
+import random
+from copy import deepcopy
+from pathlib import Path
+
+import numpy as np
+import torch
+import torch.multiprocessing as mp
+from torch.optim import Adam
+
+from dan.decoder import GlobalHTADecoder
+from dan.encoder import FCN_Encoder
+from dan.manager.training import Manager
+from dan.mlflow import MLFLOW_AVAILABLE
+from dan.schedulers import exponential_dropout_scheduler
+from dan.transforms import Preprocessing
+from dan.utils import MLflowNotInstalled
+
+if MLFLOW_AVAILABLE:
+    import mlflow
+
+    from dan.mlflow import make_mlflow_request, start_mlflow_run
+
+
+logger = logging.getLogger(__name__)
+
+
+def train_and_test(rank, params, mlflow_logging=False):
+    torch.manual_seed(0)
+    torch.cuda.manual_seed(0)
+    np.random.seed(0)
+    random.seed(0)
+    torch.backends.cudnn.benchmark = False
+    torch.backends.cudnn.deterministic = True
+
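+    # Seeding above makes every run reproducible; each DDP worker receives its rank below.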
params["training_params"]["ddp_rank"] = rank + model = Manager(params) + model.load_model() + + if mlflow_logging: + logger.info("MLflow logging enabled") + + model.train(mlflow_logging=mlflow_logging) + + # load weights giving best CER on valid set + model.params["training_params"]["load_epoch"] = "best" + model.load_model() + + metrics = ["cer", "wer", "wer_no_punct", "time"] + for dataset_name in params["dataset_params"]["datasets"].keys(): + for set_name in ["test", "val", "train"]: + model.predict( + "{}-{}".format(dataset_name, set_name), + [ + (dataset_name, set_name), + ], + metrics, + output=True, + mlflow_logging=mlflow_logging, + ) + + +def get_config(): + """ + Retrieve model configuration + """ + dataset_name = "data/popp" + dataset_level = "page" + dataset_variant = "" + dataset_path = "." + params = { + # "mlflow": { + # "run_name": "Test log DAN", + # "run_id": None, + # "s3_endpoint_url": "", + # "tracking_uri": "", + # "experiment_id": "0", + # "aws_access_key_id": "", + # "aws_secret_access_key": "", + # }, + "dataset_params": { + "datasets": { + dataset_name: "{}/{}_{}{}".format( + dataset_path, dataset_name, dataset_level, dataset_variant + ), + }, + "train": { + "name": "{}-train".format(dataset_name), + "datasets": [ + (dataset_name, "train"), + ], + }, + "val": { + "{}-val".format(dataset_name): [ + (dataset_name, "val"), + ], + }, + "test": { + "{}-test".format(dataset_name): [ + (dataset_name, "test"), + ], + }, + "config": { + "load_in_memory": True, # Load all images in CPU memory + "worker_per_gpu": 4, # Num of parallel processes per gpu for data loading + "preprocessings": [ + { + "type": Preprocessing.MaxResize, + "max_width": 2000, + "max_height": 2000, + } + ], + "augmentation": True, + }, + }, + "model_params": { + "models": { + "encoder": FCN_Encoder, + "decoder": GlobalHTADecoder, + }, + # "transfer_learning": None, + "transfer_learning": { + # model_name: [state_dict_name, checkpoint_path, learnable, strict] + "encoder": [ + "encoder", + "pretrained-models/popp_sp.pt", + True, + True, + ], + "decoder": [ + "decoder", + "pretrained-models/popp_sp.pt", + True, + False, + ], + }, + "transfered_charset": True, # Transfer learning of the decision layer based on charset of the line HTR model + "additional_tokens": 1, # for decision layer = [<eot>, ], only for transferred charset + "input_channels": 3, # number of channels of input image + "dropout": 0.5, # dropout rate for encoder + "enc_dim": 256, # dimension of extracted features + "nb_layers": 5, # encoder + "h_max": 500, # maximum height for encoder output (for 2D positional embedding) + "w_max": 1000, # maximum width for encoder output (for 2D positional embedding) + "l_max": 15000, # max predicted sequence (for 1D positional embedding) + "dec_num_layers": 8, # number of transformer decoder layers + "dec_num_heads": 4, # number of heads in transformer decoder layers + "dec_res_dropout": 0.1, # dropout in transformer decoder layers + "dec_pred_dropout": 0.1, # dropout rate before decision layer + "dec_att_dropout": 0.1, # dropout rate in multi head attention + "dec_dim_feedforward": 256, # number of dimension for feedforward layer in transformer decoder layers + "use_2d_pe": True, # use 2D positional embedding + "use_1d_pe": True, # use 1D positional embedding + "use_lstm": False, + "attention_win": 100, # length of attention window + # Curriculum dropout + "dropout_scheduler": { + "function": exponential_dropout_scheduler, + "T": 5e4, + }, + }, + "training_params": { + "output_folder": 
"outputs/dan_esposalles_record", # folder name for checkpoint and results + "max_nb_epochs": 2, # maximum number of epochs before to stop + "max_training_time": 3600 + * 24 + * 1.9, # maximum time before to stop (in seconds) + "load_epoch": "last", # ["best", "last"]: last to continue training, best to evaluate + "interval_save_weights": None, # None: keep best and last only + "batch_size": 1, # mini-batch size for training + "valid_batch_size": 1, # mini-batch size for valdiation + "use_ddp": False, # Use DistributedDataParallel + "ddp_port": "20027", + "use_amp": True, # Enable automatic mix-precision + "nb_gpu": torch.cuda.device_count(), + "optimizers": { + "all": { + "class": Adam, + "args": { + "lr": 0.0001, + "amsgrad": False, + }, + }, + }, + "lr_schedulers": None, # Learning rate schedulers + "eval_on_valid": True, # Whether to eval and logs metrics on validation set during training or not + "eval_on_valid_interval": 5, # Interval (in epochs) to evaluate during training + "focus_metric": "cer", # Metrics to focus on to determine best epoch + "expected_metric_value": "low", # ["high", "low"] What is best for the focus metric value + "set_name_focus_metric": "{}-val".format( + dataset_name + ), # Which dataset to focus on to select best weights + "train_metrics": [ + "loss_ce", + "cer", + "wer", + "wer_no_punct", + ], # Metrics name for training + "eval_metrics": [ + "cer", + "wer", + "wer_no_punct", + ], # Metrics name for evaluation on validation set during training + "force_cpu": True, # True for debug purposes + "max_char_prediction": 10, # max number of token prediction + # Keep teacher forcing rate to 20% during whole training + "label_noise_scheduler": { + "min_error_rate": 0.2, + "max_error_rate": 0.2, + "total_num_steps": 5e4, + }, + }, + } + + return params, dataset_name + + +def serialize_config(config): + """ + Make every field of the configuration JSON-Serializable and remove sensitive information. 
+
+    - Classes are transformed using their name attribute
+    - Functions are cast to strings
+    """
+    # Work on a deep copy so the original config is not modified
+    serialized_config = deepcopy(config)
+
+    # Remove credentials from the config
+    serialized_config["mlflow"]["s3_endpoint_url"] = ""
+    serialized_config["mlflow"]["tracking_uri"] = ""
+    serialized_config["mlflow"]["aws_access_key_id"] = ""
+    serialized_config["mlflow"]["aws_secret_access_key"] = ""
+
+    # Replace classes with their names
+    serialized_config["model_params"]["models"]["encoder"] = serialized_config[
+        "model_params"
+    ]["models"]["encoder"].__name__
+    serialized_config["model_params"]["models"]["decoder"] = serialized_config[
+        "model_params"
+    ]["models"]["decoder"].__name__
+    serialized_config["training_params"]["optimizers"]["all"][
+        "class"
+    ] = serialized_config["training_params"]["optimizers"]["all"]["class"].__name__
+
+    # Cast functions and other non-serializable values to str
+    serialized_config["dataset_params"]["config"]["augmentation"] = str(
+        serialized_config["dataset_params"]["config"]["augmentation"]
+    )
+    serialized_config["model_params"]["dropout_scheduler"]["function"] = str(
+        serialized_config["model_params"]["dropout_scheduler"]["function"]
+    )
+    serialized_config["training_params"]["nb_gpu"] = str(
+        serialized_config["training_params"]["nb_gpu"]
+    )
+
+    return serialized_config
+
+
+def start_training(config, mlflow_logging: bool) -> None:
+    if (
+        config["training_params"]["use_ddp"]
+        and not config["training_params"]["force_cpu"]
+    ):
+        mp.spawn(
+            train_and_test,
+            args=(config, mlflow_logging),
+            nprocs=config["training_params"]["nb_gpu"],
+        )
+    else:
+        train_and_test(0, config, mlflow_logging)
+
+
+def run():
+    """
+    Main program: train a new model using a valid configuration
+    """
+
+    config, dataset_name = get_config()
+
+    if "mlflow" in config and not MLFLOW_AVAILABLE:
+        logger.error(
+            "Cannot log to MLflow. Please install the `mlflow` extra requirements."
+ ) + raise MLflowNotInstalled() + + if "mlflow" not in config: + start_training(config, mlflow_logging=False) + else: + labels_path = ( + Path(config["dataset_params"]["datasets"][dataset_name]) / "labels.json" + ) + with start_mlflow_run(config["mlflow"]) as (run, created): + if created: + logger.info(f"Started MLflow run with ID ({run.info.run_id})") + else: + logger.info(f"Resumed MLflow run with ID ({run.info.run_id})") + + make_mlflow_request( + mlflow_method=mlflow.set_tags, tags={"Dataset": dataset_name} + ) + # Get the labels json file + with open(labels_path) as json_file: + labels_artifact = json.load(json_file) + + # Log MLflow artifacts + for artifact, filename in [ + (serialize_config(config), "config.json"), + (labels_artifact, "labels.json"), + ]: + make_mlflow_request( + mlflow_method=mlflow.log_dict, + dictionary=artifact, + artifact_file=filename, + ) + start_training(config, mlflow_logging=True) diff --git a/docs/get_started/training.md b/docs/get_started/training.md index c92045bc..598eba39 100644 --- a/docs/get_started/training.md +++ b/docs/get_started/training.md @@ -60,7 +60,6 @@ parameters: l_max: int dec_pred_dropout: float attention_win: int - use_1d_pe: bool vocab_size: int h_max: int w_max: int diff --git a/docs/usage/train/parameters.md b/docs/usage/train/parameters.md index 981ab842..d659a1e0 100644 --- a/docs/usage/train/parameters.md +++ b/docs/usage/train/parameters.md @@ -145,8 +145,6 @@ For a detailed description of all augmentation transforms, see the [dedicated pa | `model_params.dec_pred_dropout` | Dropout rate before decision layer. | `float` | `0.1` | | `model_params.dec_att_dropout` | Dropout rate in multi head attention. | `float` | `0.1` | | `model_params.dec_dim_feedforward` | Number of dimensions for feedforward layer in transformer decoder layers. | `int` | `256` | -| `model_params.use_2d_pe` | Whether to use 2D positional embedding. | `bool` | `True` | -| `model_params.use_1d_pe` | Whether to use 1D positional embedding. | `bool` | `True` | | `model_params.attention_win` | Length of attention window. | `int` | `100` | | `model_params.dropout_scheduler.function` | Curriculum dropout scheduler. | custom class | `exponential_dropout_scheduler` | | `model_params.dropout_scheduler.T` | Exponential factor. | `float` | `5e4` | diff --git a/tests/conftest.py b/tests/conftest.py index 136cead1..e804cb36 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -99,8 +99,6 @@ def training_config(): "dec_pred_dropout": 0.1, # dropout rate before decision layer "dec_att_dropout": 0.1, # dropout rate in multi head attention "dec_dim_feedforward": 256, # number of dimension for feedforward layer in transformer decoder layers - "use_2d_pe": True, # use 2D positional embedding - "use_1d_pe": True, # use 1D positional embedding "attention_win": 100, # length of attention window # Curriculum dropout "dropout_scheduler": { diff --git a/tests/data/prediction/parameters.yml b/tests/data/prediction/parameters.yml index 101fe5c8..c299406c 100644 --- a/tests/data/prediction/parameters.yml +++ b/tests/data/prediction/parameters.yml @@ -12,7 +12,6 @@ parameters: l_max: 15000 dec_pred_dropout: 0.1 attention_win: 100 - use_1d_pe: True vocab_size: 96 h_max: 500 w_max: 1000 -- GitLab