Commit 5be2ec35 authored by Yoann Schneider, committed by Mélodie Boillet

Organize training configuration in sections

parent bb09ae0c
1 merge request: !242 Organize training configuration in sections
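This commit regroups the flat dataset_params / model_params / training_params dictionaries into nested sections. A minimal sketch of the resulting top-level layout, reconstructed from the hunks below (key names are taken from the diff; values are illustrative, the split definitions are omitted, and the exact nesting of one or two keys is hard to read back from the flattened rendering):

    # Sketch only: FCN_Encoder, GlobalHTADecoder and torch are assumed to be imported
    # as in the training entrypoint further down; every value here is illustrative.
    config = {
        "dataset": {
            "datasets": {"example": "path/to/example_dataset"},  # hypothetical name and path
            "config": {"max_char_prediction": 1000, "tokens": None},
        },
        "model": {
            "transfered_charset": True,
            "additional_tokens": 1,
            "encoder": {"class": FCN_Encoder, "dropout": 0.5, "nb_layers": 5},
            "decoder": {"class": GlobalHTADecoder, "l_max": 15000, "enc_dim": 256},
            "h_max": 500,
            "w_max": 1000,
        },
        "training": {
            "data": {"batch_size": 2, "load_in_memory": True, "worker_per_gpu": 4,
                     "preprocessings": [], "augmentation": True},
            "device": {"use_ddp": False, "ddp_port": "20027", "use_amp": True,
                       "nb_gpu": torch.cuda.device_count(), "force_cpu": False},
            "metrics": {"train": ["loss_ce", "cer", "wer"], "eval": ["cer", "wer"]},
            "validation": {"eval_on_valid": True, "eval_on_valid_interval": 5,
                           "set_name_focus_metric": "example-val"},
            "transfer_learning": None,
            "output_folder": "outputs/example",
            "load_epoch": "last",
        },
    }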
@@ -14,11 +14,15 @@ from dan.utils import pad_images, pad_sequences_1D
class OCRDatasetManager:
def __init__(self, params, device: str):
self.params = params
def __init__(
self, dataset_params: dict, training_params: dict, device: torch.device
):
self.params = dataset_params
self.training_params = training_params
self.device_params = training_params["device"]
# Whether data should be pinned in page-locked memory for faster copies to the GPU, see https://pytorch.org/docs/stable/generated/torch.Tensor.pin_memory.html
self.pin_memory = device != "cpu"
self.pin_memory = device != torch.device("cpu")
self.train_dataset = None
self.valid_datasets = dict()
@@ -32,37 +36,27 @@ class OCRDatasetManager:
self.valid_samplers = dict()
self.test_samplers = dict()
self.mean = (
np.array(params["config"]["mean"])
if "mean" in params["config"].keys()
else None
)
self.std = (
np.array(params["config"]["std"])
if "std" in params["config"].keys()
else None
)
self.mean = None
self.std = None
self.generator = torch.Generator()
self.generator.manual_seed(0)
self.load_in_memory = (
self.params["config"]["load_in_memory"]
if "load_in_memory" in self.params["config"]
else True
)
self.load_in_memory = self.training_params["data"].get("load_in_memory", True)
self.charset = self.get_charset()
self.tokens = self.get_tokens()
self.params["config"]["padding_token"] = self.tokens["pad"]
self.training_params["data"]["padding_token"] = self.tokens["pad"]
self.my_collate_function = OCRCollateFunction(self.params["config"])
self.my_collate_function = OCRCollateFunction(
padding_token=training_params["data"]["padding_token"]
)
self.augmentation = (
get_augmentation_transforms()
if self.params["config"]["augmentation"]
if self.training_params["data"]["augmentation"]
else None
)
self.preprocessing = get_preprocessing_transforms(
params["config"]["preprocessings"], to_pil_image=True
training_params["data"]["preprocessings"], to_pil_image=True
)
def load_datasets(self):
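For reference, a hypothetical construction of the manager under the new signature (the config dict follows the sectioned layout from the training entrypoint further down; this call is not shown in the diff):

    device = torch.device("cpu" if config["training"]["device"]["force_cpu"] else "cuda")
    dataset_manager = OCRDatasetManager(
        dataset_params=config["dataset"],
        training_params=config["training"],
        device=device,
    )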
@@ -100,18 +94,18 @@ class OCRDatasetManager:
"""
Load training and validation data samplers
"""
if self.params["use_ddp"]:
if self.device_params["use_ddp"]:
self.train_sampler = DistributedSampler(
self.train_dataset,
num_replicas=self.params["num_gpu"],
rank=self.params["ddp_rank"],
num_replicas=self.device_params["nb_gpu"],
rank=self.device_params["ddp_rank"],
shuffle=True,
)
for custom_name in self.valid_datasets.keys():
self.valid_samplers[custom_name] = DistributedSampler(
self.valid_datasets[custom_name],
num_replicas=self.params["num_gpu"],
rank=self.params["ddp_rank"],
num_replicas=self.device_params["nb_gpu"],
rank=self.device_params["ddp_rank"],
shuffle=False,
)
else:
@@ -124,11 +118,12 @@ class OCRDatasetManager:
"""
self.train_loader = DataLoader(
self.train_dataset,
batch_size=self.params["batch_size"],
batch_size=self.training_params["data"]["batch_size"],
shuffle=True if self.train_sampler is None else False,
drop_last=False,
sampler=self.train_sampler,
num_workers=self.params["num_gpu"] * self.params["worker_per_gpu"],
num_workers=self.device_params["nb_gpu"]
* self.training_params["data"]["worker_per_gpu"],
pin_memory=self.pin_memory,
collate_fn=self.my_collate_function,
worker_init_fn=self.seed_worker,
@@ -141,7 +136,8 @@ class OCRDatasetManager:
batch_size=1,
sampler=self.valid_samplers[key],
shuffle=False,
num_workers=self.params["num_gpu"] * self.params["worker_per_gpu"],
num_workers=self.device_params["nb_gpu"]
* self.training_params["data"]["worker_per_gpu"],
pin_memory=self.pin_memory,
drop_last=False,
collate_fn=self.my_collate_function,
@@ -181,11 +177,11 @@ class OCRDatasetManager:
std=self.std,
)
if self.params["use_ddp"]:
if self.device_params["use_ddp"]:
self.test_samplers[custom_name] = DistributedSampler(
self.test_datasets[custom_name],
num_replicas=self.params["num_gpu"],
rank=self.params["ddp_rank"],
num_replicas=self.device_params["nb_gpu"],
rank=self.device_params["ddp_rank"],
shuffle=False,
)
else:
@@ -196,7 +192,8 @@ class OCRDatasetManager:
batch_size=1,
sampler=self.test_samplers[custom_name],
shuffle=False,
num_workers=self.params["num_gpu"] * self.params["worker_per_gpu"],
num_workers=self.device_params["nb_gpu"]
* self.training_params["data"]["worker_per_gpu"],
pin_memory=self.pin_memory,
drop_last=False,
collate_fn=self.my_collate_function,
@@ -245,9 +242,8 @@ class OCRCollateFunction:
Merge samples data to mini-batch data for OCR task
"""
def __init__(self, config):
self.label_padding_value = config["padding_token"]
self.config = config
def __init__(self, padding_token):
self.label_padding_value = padding_token
def __call__(self, batch_data):
labels = [batch_data[i]["token_label"] for i in range(len(batch_data))]
......
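A minimal usage sketch for the slimmed-down collate function (the dataset and tokens are assumed to exist, as produced by the manager above; this snippet is not taken from the commit):

    collate = OCRCollateFunction(padding_token=tokens["pad"])
    loader = DataLoader(train_dataset, batch_size=2, collate_fn=collate)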
This diff is collapsed.
@@ -34,7 +34,7 @@ def train_and_test(rank, params, mlflow_logging=False):
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
params["training_params"]["ddp_rank"] = rank
params["training"]["device"]["ddp_rank"] = rank
model = Manager(params)
model.load_model()
@@ -44,11 +44,11 @@ def train_and_test(rank, params, mlflow_logging=False):
model.train(mlflow_logging=mlflow_logging)
# load weights giving best CER on valid set
model.params["training_params"]["load_epoch"] = "best"
model.params["training"]["load_epoch"] = "best"
model.load_model()
metrics = ["cer", "wer", "wer_no_punct", "time"]
for dataset_name in params["dataset_params"]["datasets"].keys():
for dataset_name in params["dataset"]["datasets"].keys():
for set_name in ["test", "val", "train"]:
model.predict(
"{}-{}".format(dataset_name, set_name),
@@ -79,7 +79,7 @@ def get_config():
"aws_access_key_id": "",
"aws_secret_access_key": "",
},
"dataset_params": {
"dataset": {
"datasets": {
dataset_name: "{}/{}_{}{}".format(
dataset_path, dataset_name, dataset_level, dataset_variant
@@ -101,7 +101,35 @@ def get_config():
(dataset_name, "test"),
],
},
"config": {
"max_char_prediction": 1000, # max number of token prediction
"tokens": None,
},
"model": {
"transfered_charset": True, # Transfer learning of the decision layer based on charset of the line HTR model
"additional_tokens": 1, # for decision layer = [<eot>, ], only for transferred charset
"encoder": {
"class": FCN_Encoder,
"dropout": 0.5, # dropout rate for encoder
"nb_layers": 5, # encoder
},
"h_max": 500, # maximum height for encoder output (for 2D positional embedding)
"w_max": 1000, # maximum width for encoder output (for 2D positional embedding)
"decoder": {
"class": GlobalHTADecoder,
"l_max": 15000, # max predicted sequence (for 1D positional embedding)
"dec_num_layers": 8, # number of transformer decoder layers
"dec_num_heads": 4, # number of heads in transformer decoder layers
"dec_res_dropout": 0.1, # dropout in transformer decoder layers
"dec_pred_dropout": 0.1, # dropout rate before decision layer
"dec_att_dropout": 0.1, # dropout rate in multi head attention
"dec_dim_feedforward": 256, # number of dimension for feedforward layer in transformer decoder layers
"attention_win": 100, # length of attention window
"enc_dim": 256, # dimension of extracted features
},
},
"training": {
"data": {
"batch_size": 2, # mini-batch size for training
"load_in_memory": True, # Load all images in CPU memory
"worker_per_gpu": 4, # Num of parallel processes per gpu for data loading
"preprocessings": [
@@ -113,54 +141,36 @@ def get_config():
],
"augmentation": True,
},
"tokens": None,
},
"model_params": {
"models": {
"encoder": FCN_Encoder,
"decoder": GlobalHTADecoder,
"device": {
"use_ddp": False, # Use DistributedDataParallel
"ddp_port": "20027",
"use_amp": True, # Enable automatic mix-precision
"nb_gpu": torch.cuda.device_count(),
"force_cpu": False, # True for debug purposes
},
# "transfer_learning": None,
"transfer_learning": {
# model_name: [state_dict_name, checkpoint_path, learnable, strict]
"encoder": [
"encoder",
"pretrained_models/dan_rimes_page.pt",
True,
True,
],
"decoder": [
"decoder",
"pretrained_models/dan_rimes_page.pt",
True,
False,
],
"metrics": {
"train": [
"loss_ce",
"cer",
"wer",
"wer_no_punct",
], # Metrics name for training
"eval": [
"cer",
"wer",
"wer_no_punct",
], # Metrics name for evaluation on validation set during training
},
"validation": {
"eval_on_valid": True, # Whether to eval and logs metrics on validation set during training or not
"eval_on_valid_interval": 5, # Interval (in epochs) to evaluate during training
"set_name_focus_metric": "{}-val".format(
dataset_name
), # Which dataset to focus on to select best weights
},
"transfered_charset": True, # Transfer learning of the decision layer based on charset of the line HTR model
"additional_tokens": 1, # for decision layer = [<eot>, ], only for transferred charset
"dropout": 0.5, # dropout rate for encoder
"enc_dim": 256, # dimension of extracted features
"nb_layers": 5, # encoder
"h_max": 500, # maximum height for encoder output (for 2D positional embedding)
"w_max": 1000, # maximum width for encoder output (for 2D positional embedding)
"l_max": 15000, # max predicted sequence (for 1D positional embedding)
"dec_num_layers": 8, # number of transformer decoder layers
"dec_num_heads": 4, # number of heads in transformer decoder layers
"dec_res_dropout": 0.1, # dropout in transformer decoder layers
"dec_pred_dropout": 0.1, # dropout rate before decision layer
"dec_att_dropout": 0.1, # dropout rate in multi head attention
"dec_dim_feedforward": 256, # number of dimension for feedforward layer in transformer decoder layers
"attention_win": 100, # length of attention window
},
"training_params": {
"output_folder": "outputs/dan_esposalles_record", # folder name for checkpoint and results
"max_nb_epochs": 800, # maximum number of epochs before to stop
"load_epoch": "last", # ["best", "last"]: last to continue training, best to evaluate
"batch_size": 2, # mini-batch size for training
"use_ddp": False, # Use DistributedDataParallel
"ddp_port": "20027",
"use_amp": True, # Enable automatic mix-precision
"nb_gpu": torch.cuda.device_count(),
"optimizers": {
"all": {
"class": Adam,
@@ -171,30 +181,28 @@ def get_config():
},
},
"lr_schedulers": None, # Learning rate schedulers
"eval_on_valid": True, # Whether to eval and logs metrics on validation set during training or not
"eval_on_valid_interval": 5, # Interval (in epochs) to evaluate during training
"set_name_focus_metric": "{}-val".format(
dataset_name
), # Which dataset to focus on to select best weights
"train_metrics": [
"loss_ce",
"cer",
"wer",
"wer_no_punct",
], # Metrics name for training
"eval_metrics": [
"cer",
"wer",
"wer_no_punct",
], # Metrics name for evaluation on validation set during training
"force_cpu": False, # True for debug purposes
"max_char_prediction": 1000, # max number of token prediction
# Keep teacher forcing rate to 20% during whole training
"label_noise_scheduler": {
"min_error_rate": 0.2,
"max_error_rate": 0.2,
"total_num_steps": 5e4,
},
# "transfer_learning": None,
"transfer_learning": {
# model_name: [state_dict_name, checkpoint_path, learnable, strict]
"encoder": [
"encoder",
"pretrained_models/dan_rimes_page.pt",
True,
True,
],
"decoder": [
"decoder",
"pretrained_models/dan_rimes_page.pt",
True,
False,
],
},
},
}
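As far as the hunks above show, the old flat keys map onto the new sections roughly as follows (reconstructed from this diff; a few placements are hard to confirm from the flattened rendering):

    # dataset_params  -> dataset
    # model_params    -> model, with encoder / decoder settings grouped in "encoder" / "decoder" sub-dicts
    # training_params -> training, itself split into sections:
    #   training["data"]        batch_size, load_in_memory, worker_per_gpu, preprocessings, augmentation
    #   training["device"]      use_ddp, ddp_port, use_amp, nb_gpu, force_cpu
    #   training["metrics"]     "train" / "eval" lists (formerly train_metrics / eval_metrics)
    #   training["validation"]  eval_on_valid, eval_on_valid_interval, set_name_focus_metric
    # In the dataset manager, the former "num_gpu" lookup becomes training["device"]["nb_gpu"], and the
    # DataLoader worker count stays nb_gpu * worker_per_gpu (for example, 2 GPUs * 4 workers = 8).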
@@ -218,22 +226,22 @@ def serialize_config(config):
serialized_config["mlflow"]["aws_secret_access_key"] = ""
# Get the name of the class
serialized_config["model_params"]["models"]["encoder"] = serialized_config[
"model_params"
]["models"]["encoder"].__name__
serialized_config["model_params"]["models"]["decoder"] = serialized_config[
"model_params"
]["models"]["decoder"].__name__
serialized_config["training_params"]["optimizers"]["all"][
"class"
] = serialized_config["training_params"]["optimizers"]["all"]["class"].__name__
serialized_config["model"]["models"]["encoder"] = serialized_config["model"][
"models"
]["encoder"].__name__
serialized_config["model"]["models"]["decoder"] = serialized_config["model"][
"models"
]["decoder"].__name__
serialized_config["training"]["optimizers"]["all"]["class"] = serialized_config[
"training"
]["optimizers"]["all"]["class"].__name__
# Cast the functions to str
serialized_config["dataset_params"]["config"]["augmentation"] = str(
serialized_config["dataset_params"]["config"]["augmentation"]
serialized_config["dataset"]["config"]["augmentation"] = str(
serialized_config["dataset"]["config"]["augmentation"]
)
serialized_config["training_params"]["nb_gpu"] = str(
serialized_config["training_params"]["nb_gpu"]
serialized_config["training"]["nb_gpu"] = str(
serialized_config["training"]["nb_gpu"]
)
return serialized_config
@@ -241,13 +249,13 @@ def serialize_config(config):
def start_training(config, mlflow_logging: bool) -> None:
if (
config["training_params"]["use_ddp"]
and not config["training_params"]["force_cpu"]
config["training"]["device"]["use_ddp"]
and not config["training"]["device"]["force_cpu"]
):
mp.spawn(
train_and_test,
args=(config, mlflow_logging),
nprocs=config["training_params"]["nb_gpu"],
nprocs=config["training"]["device"]["nb_gpu"],
)
else:
train_and_test(0, config, mlflow_logging)
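A hypothetical single-process invocation under the new layout (flag values assumed; with use_ddp enabled and force_cpu disabled, the branch above spawns one process per GPU through mp.spawn instead):

    config = get_config()
    config["training"]["device"]["use_ddp"] = False  # stay on the single-process branch
    start_training(config, mlflow_logging=False)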
@@ -269,9 +277,7 @@ def run():
if "mlflow" not in config:
start_training(config, mlflow_logging=False)
else:
labels_path = (
Path(config["dataset_params"]["datasets"][dataset_name]) / "labels.json"
)
labels_path = Path(config["dataset"]["datasets"][dataset_name]) / "labels.json"
with start_mlflow_run(config["mlflow"]) as (run, created):
if created:
logger.info(f"Started MLflow run with ID ({run.info.run_id})")
......
@@ -28,7 +28,7 @@ def demo_db(database_path):
@pytest.fixture
def training_config():
return {
"dataset_params": {
"dataset": {
"datasets": {
"training": "./tests/data/training/training_dataset",
},
@@ -48,48 +48,75 @@ def training_config():
("training", "test"),
],
},
"config": {
"max_char_prediction": 30, # max number of token prediction
"tokens": None,
},
"model": {
"transfered_charset": True, # Transfer learning of the decision layer based on charset of the line HTR model
"additional_tokens": 1, # for decision layer = [<eot>, ], only for transferred charset
"encoder": {
"class": FCN_Encoder,
"dropout": 0.5, # dropout rate for encoder
"nb_layers": 5, # encoder
},
"h_max": 500, # maximum height for encoder output (for 2D positional embedding)
"w_max": 1000, # maximum width for encoder output (for 2D positional embedding)
"decoder": {
"class": GlobalHTADecoder,
"l_max": 15000, # max predicted sequence (for 1D positional embedding)
"dec_num_layers": 8, # number of transformer decoder layers
"dec_num_heads": 4, # number of heads in transformer decoder layers
"dec_res_dropout": 0.1, # dropout in transformer decoder layers
"dec_pred_dropout": 0.1, # dropout rate before decision layer
"dec_att_dropout": 0.1, # dropout rate in multi head attention
"dec_dim_feedforward": 256, # number of dimension for feedforward layer in transformer decoder layers
"attention_win": 100, # length of attention window
"enc_dim": 256, # dimension of extracted features
},
},
"training": {
"data": {
"batch_size": 2, # mini-batch size for training
"load_in_memory": True, # Load all images in CPU memory
"worker_per_gpu": 4, # Num of parallel processes per gpu for data loading
"preprocessings": [
{
"type": Preprocessing.MaxResize,
"max_width": 2000,
"max_height": 2000,
},
}
],
"augmentation": True,
},
"tokens": None,
},
"model_params": {
"models": {
"encoder": FCN_Encoder,
"decoder": GlobalHTADecoder,
"device": {
"use_ddp": False, # Use DistributedDataParallel
"ddp_port": "20027",
"use_amp": True, # Enable automatic mix-precision
"nb_gpu": 0,
"force_cpu": True, # True for debug purposes
},
"metrics": {
"train": [
"loss_ce",
"cer",
"wer",
"wer_no_punct",
], # Metrics name for training
"eval": [
"cer",
"wer",
"wer_no_punct",
], # Metrics name for evaluation on validation set during training
},
"validation": {
"eval_on_valid": True, # Whether to eval and logs metrics on validation set during training or not
"eval_on_valid_interval": 2, # Interval (in epochs) to evaluate during training
"set_name_focus_metric": "training-val",
},
"transfer_learning": None,
"transfered_charset": True, # Transfer learning of the decision layer based on charset of the line HTR model
"additional_tokens": 1, # for decision layer = [<eot>, ], only for transferred charset
"dropout": 0.5, # dropout rate for encoder
"enc_dim": 256, # dimension of extracted features
"nb_layers": 5, # encoder
"h_max": 500, # maximum height for encoder output (for 2D positional embedding)
"w_max": 1000, # maximum width for encoder output (for 2D positional embedding)
"l_max": 15000, # max predicted sequence (for 1D positional embedding)
"dec_num_layers": 8, # number of transformer decoder layers
"dec_num_heads": 4, # number of heads in transformer decoder layers
"dec_res_dropout": 0.1, # dropout in transformer decoder layers
"dec_pred_dropout": 0.1, # dropout rate before decision layer
"dec_att_dropout": 0.1, # dropout rate in multi head attention
"dec_dim_feedforward": 256, # number of dimension for feedforward layer in transformer decoder layers
"attention_win": 100, # length of attention window
},
"training_params": {
"output_folder": "dan_trained_model", # folder name for checkpoint and results
"gradient_clipping": {},
"max_nb_epochs": 4, # maximum number of epochs before to stop
"load_epoch": "last", # ["best", "last"]: last to continue training, best to evaluate
"batch_size": 2, # mini-batch size for training
"use_ddp": False, # Use DistributedDataParallel
"nb_gpu": 0,
"optimizers": {
"all": {
"class": Adam,
@@ -100,28 +127,13 @@ def training_config():
},
},
"lr_schedulers": None, # Learning rate schedulers
"eval_on_valid": True, # Whether to eval and logs metrics on validation set during training or not
"eval_on_valid_interval": 2, # Interval (in epochs) to evaluate during training
"set_name_focus_metric": "training-val", # Which dataset to focus on to select best weights
"train_metrics": [
"loss_ce",
"cer",
"wer",
"wer_no_punct",
], # Metrics name for training
"eval_metrics": [
"cer",
"wer",
"wer_no_punct",
], # Metrics name for evaluation on validation set during training
"force_cpu": True, # True for debug purposes
"max_char_prediction": 30, # max number of token prediction
# Keep teacher forcing rate to 20% during whole training
"label_noise_scheduler": {
"min_error_rate": 0.2,
"max_error_rate": 0.2,
"total_num_steps": 5e4,
},
"transfer_learning": None,
},
}
......
@@ -88,8 +88,8 @@ def test_train_and_test(
tmp_path,
):
# Use the tmp_path as base folder
training_config["training_params"]["output_folder"] = str(
tmp_path / training_config["training_params"]["output_folder"]
training_config["training"]["output_folder"] = str(
tmp_path / training_config["training"]["output_folder"]
)
train_and_test(0, training_config)
@@ -99,7 +99,7 @@ def test_train_and_test(
expected_model = torch.load(FIXTURES / "training" / "models" / model_name)
trained_model = torch.load(
tmp_path
/ training_config["training_params"]["output_folder"]
/ training_config["training"]["output_folder"]
/ "checkpoints"
/ model_name,
)
@@ -114,7 +114,9 @@ def test_train_and_test(
expected_tensor,
) in zip(trained.items(), expected.items()):
assert trained_param == expected_param
assert torch.allclose(trained_tensor, expected_tensor, atol=1e-03)
assert torch.allclose(
trained_tensor, expected_tensor, rtol=1e-05, atol=1e-03
)
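# torch.allclose defaults to rtol=1e-05 and atol=1e-08, so the explicit rtol above keeps
# the default relative tolerance while the absolute tolerance stays relaxed at 1e-03.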
# Check the optimizer encoder and decoder state dicts
for optimizer_part in [
@@ -169,7 +171,7 @@ def test_train_and_test(
):
with (
tmp_path
/ training_config["training_params"]["output_folder"]
/ training_config["training"]["output_folder"]
/ "results"
/ f"predict_training-{split_name}_0.yaml"
).open() as f:
@@ -184,7 +186,7 @@ def test_train_and_test(
# Check that the inference parameters file is correct
with (
tmp_path
/ training_config["training_params"]["output_folder"]
/ training_config["training"]["output_folder"]
/ "results"
/ "inference_parameters.yml"
).open() as f:
......