diff --git a/dan/ocr/document/train_popp.py b/dan/ocr/document/train_popp.py
deleted file mode 100644
index 6237748a3e2c3786d6c3a99b5a641046330d01a5..0000000000000000000000000000000000000000
--- a/dan/ocr/document/train_popp.py
+++ /dev/null
@@ -1,317 +0,0 @@
-# -*- coding: utf-8 -*-
-import json
-import logging
-import random
-from copy import deepcopy
-from pathlib import Path
-
-import numpy as np
-import torch
-import torch.multiprocessing as mp
-from torch.optim import Adam
-
-from dan.decoder import GlobalHTADecoder
-from dan.encoder import FCN_Encoder
-from dan.manager.training import Manager
-from dan.mlflow import MLFLOW_AVAILABLE
-from dan.schedulers import exponential_dropout_scheduler
-from dan.transforms import Preprocessing
-from dan.utils import MLflowNotInstalled
-
-if MLFLOW_AVAILABLE:
-    import mlflow
-
-    from dan.mlflow import make_mlflow_request, start_mlflow_run
-
-
-logger = logging.getLogger(__name__)
-
-
-def train_and_test(rank, params, mlflow_logging=False):
-    torch.manual_seed(0)
-    torch.cuda.manual_seed(0)
-    np.random.seed(0)
-    random.seed(0)
-    torch.backends.cudnn.benchmark = False
-    torch.backends.cudnn.deterministic = True
-
-    params["training_params"]["ddp_rank"] = rank
-    model = Manager(params)
-    model.load_model()
-
-    if mlflow_logging:
-        logger.info("MLflow logging enabled")
-
-    model.train(mlflow_logging=mlflow_logging)
-
-    # load weights giving best CER on valid set
-    model.params["training_params"]["load_epoch"] = "best"
-    model.load_model()
-
-    metrics = ["cer", "wer", "wer_no_punct", "time"]
-    for dataset_name in params["dataset_params"]["datasets"].keys():
-        for set_name in ["test", "val", "train"]:
-            model.predict(
-                "{}-{}".format(dataset_name, set_name),
-                [
-                    (dataset_name, set_name),
-                ],
-                metrics,
-                output=True,
-                mlflow_logging=mlflow_logging,
-            )
-
-
-def get_config():
-    """
-    Retrieve model configuration
-    """
-    dataset_name = "data/popp"
-    dataset_level = "page"
-    dataset_variant = ""
-    dataset_path = "."
-    params = {
-        # "mlflow": {
-        #     "run_name": "Test log DAN",
-        #     "run_id": None,
-        #     "s3_endpoint_url": "",
-        #     "tracking_uri": "",
-        #     "experiment_id": "0",
-        #     "aws_access_key_id": "",
-        #     "aws_secret_access_key": "",
-        # },
-        "dataset_params": {
-            "datasets": {
-                dataset_name: "{}/{}_{}{}".format(
-                    dataset_path, dataset_name, dataset_level, dataset_variant
-                ),
-            },
-            "train": {
-                "name": "{}-train".format(dataset_name),
-                "datasets": [
-                    (dataset_name, "train"),
-                ],
-            },
-            "val": {
-                "{}-val".format(dataset_name): [
-                    (dataset_name, "val"),
-                ],
-            },
-            "test": {
-                "{}-test".format(dataset_name): [
-                    (dataset_name, "test"),
-                ],
-            },
-            "config": {
-                "load_in_memory": True,  # Load all images in CPU memory
-                "worker_per_gpu": 4,  # Number of parallel data-loading processes per GPU
-                "preprocessings": [
-                    {
-                        "type": Preprocessing.MaxResize,
-                        "max_width": 2000,
-                        "max_height": 2000,
-                    }
-                ],
-                "augmentation": True,
-            },
-        },
-        "model_params": {
-            "models": {
-                "encoder": FCN_Encoder,
-                "decoder": GlobalHTADecoder,
-            },
-            # "transfer_learning": None,
-            "transfer_learning": {
-                # model_name: [state_dict_name, checkpoint_path, learnable, strict]
-                "encoder": [
-                    "encoder",
-                    "pretrained-models/popp_sp.pt",
-                    True,
-                    True,
-                ],
-                "decoder": [
-                    "decoder",
-                    "pretrained-models/popp_sp.pt",
-                    True,
-                    False,
-                ],
-            },
-            "transfered_charset": True,  # Transfer learning of the decision layer based on the charset of the line HTR model
-            "additional_tokens": 1,  # for decision layer = [<eot>, ], only for transferred charset
-            "input_channels": 3,  # number of channels of the input image
-            "dropout": 0.5,  # dropout rate for the encoder
-            "enc_dim": 256,  # dimension of extracted features
-            "nb_layers": 5,  # encoder
-            "h_max": 500,  # maximum height for encoder output (for 2D positional embedding)
-            "w_max": 1000,  # maximum width for encoder output (for 2D positional embedding)
-            "l_max": 15000,  # maximum predicted sequence length (for 1D positional embedding)
-            "dec_num_layers": 8,  # number of transformer decoder layers
-            "dec_num_heads": 4,  # number of heads in transformer decoder layers
-            "dec_res_dropout": 0.1,  # dropout in transformer decoder layers
-            "dec_pred_dropout": 0.1,  # dropout rate before the decision layer
-            "dec_att_dropout": 0.1,  # dropout rate in multi-head attention
-            "dec_dim_feedforward": 256,  # dimension of the feedforward layer in transformer decoder layers
-            "use_2d_pe": True,  # use 2D positional embedding
-            "use_1d_pe": True,  # use 1D positional embedding
-            "use_lstm": False,
-            "attention_win": 100,  # length of attention window
-            # Curriculum dropout
-            "dropout_scheduler": {
-                "function": exponential_dropout_scheduler,
-                "T": 5e4,
-            },
-        },
-        "training_params": {
-            "output_folder": "outputs/dan_esposalles_record",  # folder name for checkpoints and results
-            "max_nb_epochs": 2,  # maximum number of epochs before stopping
-            "max_training_time": 3600
-            * 24
-            * 1.9,  # maximum training time before stopping (in seconds)
-            "load_epoch": "last",  # ["best", "last"]: last to continue training, best to evaluate
-            "interval_save_weights": None,  # None: keep best and last only
-            "batch_size": 1,  # mini-batch size for training
-            "valid_batch_size": 1,  # mini-batch size for validation
-            "use_ddp": False,  # use DistributedDataParallel
-            "ddp_port": "20027",
-            "use_amp": True,  # enable automatic mixed precision
-            "nb_gpu": torch.cuda.device_count(),
-            "optimizers": {
-                "all": {
-                    "class": Adam,
-                    "args": {
-                        "lr": 0.0001,
-                        "amsgrad": False,
-                    },
-                },
-            },
-            "lr_schedulers": None,  # learning rate schedulers
-            "eval_on_valid": True,  # whether to evaluate and log metrics on the validation set during training
-            "eval_on_valid_interval": 5,  # interval (in epochs) to evaluate during training
-            "focus_metric": "cer",  # metric to focus on to determine the best epoch
-            "expected_metric_value": "low",  # ["high", "low"]: whether high or low values are best for the focus metric
-            "set_name_focus_metric": "{}-val".format(
-                dataset_name
-            ),  # which dataset to focus on to select the best weights
-            "train_metrics": [
-                "loss_ce",
-                "cer",
-                "wer",
-                "wer_no_punct",
-            ],  # metric names for training
-            "eval_metrics": [
-                "cer",
-                "wer",
-                "wer_no_punct",
-            ],  # metric names for evaluation on the validation set during training
-            "force_cpu": True,  # True for debug purposes
-            "max_char_prediction": 10,  # maximum number of predicted tokens
-            # Keep the teacher forcing rate at 20% during the whole training
-            "label_noise_scheduler": {
-                "min_error_rate": 0.2,
-                "max_error_rate": 0.2,
-                "total_num_steps": 5e4,
-            },
-        },
-    }
-
-    return params, dataset_name
-
-
-def serialize_config(config):
-    """
-    Make every field of the configuration JSON-serializable and remove sensitive information.
-
-    - Classes are transformed using their name attribute
-    - Functions are cast to strings
-    """
-    # Create a copy of the original config without modifying it
-    serialized_config = deepcopy(config)
-
-    # Remove credentials from the config
-    serialized_config["mlflow"]["s3_endpoint_url"] = ""
-    serialized_config["mlflow"]["tracking_uri"] = ""
-    serialized_config["mlflow"]["aws_access_key_id"] = ""
-    serialized_config["mlflow"]["aws_secret_access_key"] = ""
-
-    # Get the name of the class
-    serialized_config["model_params"]["models"]["encoder"] = serialized_config[
-        "model_params"
-    ]["models"]["encoder"].__name__
-    serialized_config["model_params"]["models"]["decoder"] = serialized_config[
-        "model_params"
-    ]["models"]["decoder"].__name__
-    serialized_config["training_params"]["optimizers"]["all"][
-        "class"
-    ] = serialized_config["training_params"]["optimizers"]["all"]["class"].__name__
-
-    # Cast the functions to str
-    serialized_config["dataset_params"]["config"]["augmentation"] = str(
-        serialized_config["dataset_params"]["config"]["augmentation"]
-    )
-    serialized_config["model_params"]["dropout_scheduler"]["function"] = str(
-        serialized_config["model_params"]["dropout_scheduler"]["function"]
-    )
-    serialized_config["training_params"]["nb_gpu"] = str(
-        serialized_config["training_params"]["nb_gpu"]
-    )
-
-    return serialized_config
-
-
-def start_training(config, mlflow_logging: bool) -> None:
-    if (
-        config["training_params"]["use_ddp"]
-        and not config["training_params"]["force_cpu"]
-    ):
-        mp.spawn(
-            train_and_test,
-            args=(config, mlflow_logging),
-            nprocs=config["training_params"]["nb_gpu"],
-        )
-    else:
-        train_and_test(0, config, mlflow_logging)
-
-
-def run():
-    """
-    Main program: train a new model using a valid configuration.
-    """
-
-    config, dataset_name = get_config()
-
-    if "mlflow" in config and not MLFLOW_AVAILABLE:
-        logger.error(
-            "Cannot log to MLflow. Please install the `mlflow` extra requirements."
-        )
-        raise MLflowNotInstalled()
-
-    if "mlflow" not in config:
-        start_training(config, mlflow_logging=False)
-    else:
-        labels_path = (
-            Path(config["dataset_params"]["datasets"][dataset_name]) / "labels.json"
-        )
-        with start_mlflow_run(config["mlflow"]) as (run, created):
-            if created:
-                logger.info(f"Started MLflow run with ID ({run.info.run_id})")
-            else:
-                logger.info(f"Resumed MLflow run with ID ({run.info.run_id})")
-
-            make_mlflow_request(
-                mlflow_method=mlflow.set_tags, tags={"Dataset": dataset_name}
-            )
-            # Get the labels JSON file
-            with open(labels_path) as json_file:
-                labels_artifact = json.load(json_file)
-
-            # Log MLflow artifacts
-            for artifact, filename in [
-                (serialize_config(config), "config.json"),
-                (labels_artifact, "labels.json"),
-            ]:
-                make_mlflow_request(
-                    mlflow_method=mlflow.log_dict,
-                    dictionary=artifact,
-                    artifact_file=filename,
-                )
-            start_training(config, mlflow_logging=True)