
Fix version 0.2.0-dev3 and later

Merged Mélodie Boillet requested to merge fix-dev3 into main
# -*- coding: utf-8 -*-
import json
import logging
import random
from copy import deepcopy
from pathlib import Path

import numpy as np
import torch
import torch.multiprocessing as mp
from torch.optim import Adam

from dan.decoder import GlobalHTADecoder
from dan.encoder import FCN_Encoder
from dan.manager.training import Manager
from dan.mlflow import MLFLOW_AVAILABLE
from dan.schedulers import exponential_dropout_scheduler
from dan.transforms import Preprocessing
from dan.utils import MLflowNotInstalled

if MLFLOW_AVAILABLE:
    import mlflow

    from dan.mlflow import make_mlflow_request, start_mlflow_run

logger = logging.getLogger(__name__)
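
# `train_and_test` is the per-process entry point: under DDP it is spawned
# once per GPU and receives its process rank; otherwise it is called directly
# with rank 0 (see `start_training` below).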
def train_and_test(rank, params, mlflow_logging=False):
    # Seed all RNGs so that runs are reproducible
    torch.manual_seed(0)
    torch.cuda.manual_seed(0)
    np.random.seed(0)
    random.seed(0)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    params["training_params"]["ddp_rank"] = rank
    model = Manager(params)
    model.load_model()

    if mlflow_logging:
        logger.info("MLflow logging enabled")

    model.train(mlflow_logging=mlflow_logging)

    # Load the weights giving the best CER on the validation set
    model.params["training_params"]["load_epoch"] = "best"
    model.load_model()

    metrics = ["cer", "wer", "wer_no_punct", "time"]
    for dataset_name in params["dataset_params"]["datasets"].keys():
        for set_name in ["test", "val", "train"]:
            model.predict(
                "{}-{}".format(dataset_name, set_name),
                [
                    (dataset_name, set_name),
                ],
                metrics,
                output=True,
                mlflow_logging=mlflow_logging,
            )
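
# With the default configuration below (dataset_name = "data/popp"), the
# predict calls above are therefore named "data/popp-test", "data/popp-val"
# and "data/popp-train".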
def get_config():
    """
    Retrieve model configuration
    """
    dataset_name = "data/popp"
    dataset_level = "page"
    dataset_variant = ""
    dataset_path = "."
    params = {
        # "mlflow": {
        #     "run_name": "Test log DAN",
        #     "run_id": None,
        #     "s3_endpoint_url": "",
        #     "tracking_uri": "",
        #     "experiment_id": "0",
        #     "aws_access_key_id": "",
        #     "aws_secret_access_key": "",
        # },
        "dataset_params": {
            "datasets": {
                dataset_name: "{}/{}_{}{}".format(
                    dataset_path, dataset_name, dataset_level, dataset_variant
                ),
            },
            "train": {
                "name": "{}-train".format(dataset_name),
                "datasets": [
                    (dataset_name, "train"),
                ],
            },
            "val": {
                "{}-val".format(dataset_name): [
                    (dataset_name, "val"),
                ],
            },
            "test": {
                "{}-test".format(dataset_name): [
                    (dataset_name, "test"),
                ],
            },
            "config": {
                "load_in_memory": True,  # Load all images in CPU memory
                "worker_per_gpu": 4,  # Number of parallel data-loading processes per GPU
                "preprocessings": [
                    {
                        "type": Preprocessing.MaxResize,
                        "max_width": 2000,
                        "max_height": 2000,
                    }
                ],
                "augmentation": True,
            },
        },
        "model_params": {
            "models": {
                "encoder": FCN_Encoder,
                "decoder": GlobalHTADecoder,
            },
            # "transfer_learning": None,
            "transfer_learning": {
                # model_name: [state_dict_name, checkpoint_path, learnable, strict]
                "encoder": [
                    "encoder",
                    "pretrained-models/popp_sp.pt",
                    True,
                    True,
                ],
                "decoder": [
                    "decoder",
                    "pretrained-models/popp_sp.pt",
                    True,
                    False,
                ],
            },
            "transfered_charset": True,  # Transfer the decision layer based on the charset of the pretrained line HTR model
            "additional_tokens": 1,  # For the decision layer = [<eot>, ], only with a transferred charset
            "input_channels": 3,  # Number of channels of the input image
            "dropout": 0.5,  # Dropout rate for the encoder
            "enc_dim": 256,  # Dimension of the extracted features
            "nb_layers": 5,  # Number of encoder layers
            "h_max": 500,  # Maximum height of the encoder output (for 2D positional embedding)
            "w_max": 1000,  # Maximum width of the encoder output (for 2D positional embedding)
            "l_max": 15000,  # Maximum length of the predicted sequence (for 1D positional embedding)
            "dec_num_layers": 8,  # Number of transformer decoder layers
            "dec_num_heads": 4,  # Number of heads in the transformer decoder layers
            "dec_res_dropout": 0.1,  # Dropout in the transformer decoder layers
            "dec_pred_dropout": 0.1,  # Dropout rate before the decision layer
            "dec_att_dropout": 0.1,  # Dropout rate in multi-head attention
            "dec_dim_feedforward": 256,  # Dimension of the feedforward layer in the transformer decoder layers
            "use_2d_pe": True,  # Use 2D positional embedding
            "use_1d_pe": True,  # Use 1D positional embedding
            "use_lstm": False,
            "attention_win": 100,  # Length of the attention window
            # Curriculum dropout
            "dropout_scheduler": {
                "function": exponential_dropout_scheduler,
                "T": 5e4,
            },
        },
        "training_params": {
            "output_folder": "outputs/dan_esposalles_record",  # Folder name for checkpoints and results
            "max_nb_epochs": 2,  # Maximum number of epochs before stopping
            "max_training_time": 3600
            * 24
            * 1.9,  # Maximum training time before stopping (in seconds)
            "load_epoch": "last",  # ["best", "last"]: "last" to continue training, "best" to evaluate
            "interval_save_weights": None,  # None: keep best and last weights only
            "batch_size": 1,  # Mini-batch size for training
            "valid_batch_size": 1,  # Mini-batch size for validation
            "use_ddp": False,  # Use DistributedDataParallel
            "ddp_port": "20027",
            "use_amp": True,  # Enable automatic mixed precision
            "nb_gpu": torch.cuda.device_count(),
            "optimizers": {
                "all": {
                    "class": Adam,
                    "args": {
                        "lr": 0.0001,
                        "amsgrad": False,
                    },
                },
            },
            "lr_schedulers": None,  # Learning rate schedulers
            "eval_on_valid": True,  # Whether to evaluate and log metrics on the validation set during training
            "eval_on_valid_interval": 5,  # Interval (in epochs) between evaluations during training
            "focus_metric": "cer",  # Metric used to determine the best epoch
            "expected_metric_value": "low",  # ["high", "low"]: whether a high or a low value is best for the focus metric
            "set_name_focus_metric": "{}-val".format(
                dataset_name
            ),  # Which dataset to use to select the best weights
            "train_metrics": [
                "loss_ce",
                "cer",
                "wer",
                "wer_no_punct",
            ],  # Metric names for training
            "eval_metrics": [
                "cer",
                "wer",
                "wer_no_punct",
            ],  # Metric names for evaluation on the validation set during training
            "force_cpu": True,  # True for debugging purposes
            "max_char_prediction": 10,  # Maximum number of predicted tokens
            # Keep the teacher forcing rate at 20% during the whole training
            "label_noise_scheduler": {
                "min_error_rate": 0.2,
                "max_error_rate": 0.2,
                "total_num_steps": 5e4,
            },
        },
    }
    return params, dataset_name
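
# Illustrative override (hypothetical values, not part of this file): to train
# on all visible GPUs with DDP instead of the CPU debug setup, flip the two
# flags defined above before calling `start_training`:
#
#     params, dataset_name = get_config()
#     params["training_params"]["force_cpu"] = False
#     params["training_params"]["use_ddp"] = True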
def serialize_config(config):
    """
    Make every field of the configuration JSON-serializable and remove sensitive information.
    - Classes are replaced by their names
    - Functions are cast to strings
    """
    # Work on a copy so the original config is left untouched
    serialized_config = deepcopy(config)
    # Remove credentials from the config
    serialized_config["mlflow"]["s3_endpoint_url"] = ""
    serialized_config["mlflow"]["tracking_uri"] = ""
    serialized_config["mlflow"]["aws_access_key_id"] = ""
    serialized_config["mlflow"]["aws_secret_access_key"] = ""
    # Replace the classes by their names
    serialized_config["model_params"]["models"]["encoder"] = serialized_config[
        "model_params"
    ]["models"]["encoder"].__name__
    serialized_config["model_params"]["models"]["decoder"] = serialized_config[
        "model_params"
    ]["models"]["decoder"].__name__
    serialized_config["training_params"]["optimizers"]["all"][
        "class"
    ] = serialized_config["training_params"]["optimizers"]["all"]["class"].__name__
    # Cast the functions to str
    serialized_config["dataset_params"]["config"]["augmentation"] = str(
        serialized_config["dataset_params"]["config"]["augmentation"]
    )
    serialized_config["model_params"]["dropout_scheduler"]["function"] = str(
        serialized_config["model_params"]["dropout_scheduler"]["function"]
    )
    serialized_config["training_params"]["nb_gpu"] = str(
        serialized_config["training_params"]["nb_gpu"]
    )
    return serialized_config
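
# Illustrative example (not executed here): the serialized config can be
# dumped as JSON, whereas the raw config would raise a TypeError because it
# still holds classes (e.g. FCN_Encoder) and functions:
#
#     json.dumps(serialize_config(config))  # OK
#     json.dumps(config)                    # TypeError: not JSON serializable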
def start_training(config, mlflow_logging: bool) -> None:
    if (
        config["training_params"]["use_ddp"]
        and not config["training_params"]["force_cpu"]
    ):
        mp.spawn(
            train_and_test,
            args=(config, mlflow_logging),
            nprocs=config["training_params"]["nb_gpu"],
        )
    else:
        train_and_test(0, config, mlflow_logging)
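
# Note: `mp.spawn` invokes `train_and_test(rank, config, mlflow_logging)` once
# per process, passing the process index as the implicit first argument; the
# single-process branch mirrors this by passing rank 0 explicitly.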
def run():
    """
    Main program: train a new model using a valid configuration
    """
    config, dataset_name = get_config()

    if "mlflow" in config and not MLFLOW_AVAILABLE:
        logger.error(
            "Cannot log to MLflow. Please install the `mlflow` extra requirements."
        )
        raise MLflowNotInstalled()

    if "mlflow" not in config:
        start_training(config, mlflow_logging=False)
    else:
        labels_path = (
            Path(config["dataset_params"]["datasets"][dataset_name]) / "labels.json"
        )
        with start_mlflow_run(config["mlflow"]) as (run, created):
            if created:
                logger.info(f"Started MLflow run with ID ({run.info.run_id})")
            else:
                logger.info(f"Resumed MLflow run with ID ({run.info.run_id})")
            make_mlflow_request(
                mlflow_method=mlflow.set_tags, tags={"Dataset": dataset_name}
            )
            # Get the labels JSON file
            with open(labels_path) as json_file:
                labels_artifact = json.load(json_file)
            # Log MLflow artifacts
            for artifact, filename in [
                (serialize_config(config), "config.json"),
                (labels_artifact, "labels.json"),
            ]:
                make_mlflow_request(
                    mlflow_method=mlflow.log_dict,
                    dictionary=artifact,
                    artifact_file=filename,
                )
            start_training(config, mlflow_logging=True)
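
# Minimal usage sketch (assumption: the package may also expose `run` through
# its own CLI entry point; running this module directly would look like this):
#
#     if __name__ == "__main__":
#         run()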