Compare revisions (atr/dan)
Commits on Source (2)
```diff
@@ -71,7 +71,7 @@
         "ddp_port": "20027",
         "use_amp": true,
         "nb_gpu": null,
-        "force_cpu": false
+        "force": null
     },
     "metrics": {
         "train": [
```
```diff
@@ -100,8 +100,10 @@ class GenericTrainingManager:
         self.dataset.load_dataloaders()

     def init_hardware_config(self):
+        cuda_is_available = torch.cuda.is_available()
+
         # Debug mode
-        if self.device_params["force_cpu"]:
+        if self.device_params["force"] not in [None, "cuda"] or not cuda_is_available:
             self.device_params["use_ddp"] = False
             self.device_params["use_amp"] = False
```
```diff
@@ -116,17 +118,14 @@ class GenericTrainingManager:
             "rank": self.device_params["ddp_rank"],
         }
         self.is_master = self.ddp_config["master"] or not self.device_params["use_ddp"]
-        if self.device_params["force_cpu"]:
-            self.device = torch.device("cpu")
-        else:
-            if self.device_params["use_ddp"]:
-                self.device = torch.device(self.ddp_config["rank"])
-                self.device_params["ddp_rank"] = self.ddp_config["rank"]
-                self.launch_ddp()
-            else:
-                self.device = torch.device(
-                    "cuda:0" if torch.cuda.is_available() else "cpu"
-                )
+        if self.device_params["use_ddp"]:
+            self.device = torch.device(self.ddp_config["rank"])
+            self.device_params["ddp_rank"] = self.ddp_config["rank"]
+            self.launch_ddp()
+        else:
+            self.device = torch.device(
+                self.device_params["force"] or "cuda" if cuda_is_available else "cpu"
+            )
         if self.device == torch.device("cpu"):
             self.params["model"]["device"] = "cpu"
         else:
```
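To make the non-DDP branch easier to follow, here is a hedged, standalone sketch of how the `force` value and CUDA availability combine into the final `torch.device`. Note that `x or "cuda" if cond else "cpu"` parses as `(x or "cuda") if cond else "cpu"`:

```python
import torch

def resolve_device(force, cuda_is_available):
    # Same expression as the non-DDP branch above, with explicit
    # parentheses to show the operator precedence.
    return torch.device((force or "cuda") if cuda_is_available else "cpu")

print(resolve_device(None, cuda_is_available=True))       # cuda   (default GPU training)
print(resolve_device("cuda:1", cuda_is_available=True))   # cuda:1 (pin a specific GPU)
print(resolve_device("cpu", cuda_is_available=True))      # cpu    (debug on a GPU machine)
print(resolve_device("cuda:1", cuda_is_available=False))  # cpu    (no CUDA: always fall back)
```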
```diff
@@ -413,7 +413,7 @@ def process_batch(
         )
         result["attention_gif"] = gif_filename

-    json_filename = Path(output, image_path.stem).with_suffix(".json")
+    json_filename = Path(output, f"{image_path.stem}.json")
     logger.info(f"Saving JSON prediction in {json_filename}")
     json_filename.write_text(json.dumps(result, indent=2))
```
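The filename change matters for image stems that contain dots: `Path.with_suffix` replaces everything after the last dot, while the f-string only appends the extension. A small pathlib illustration (file names are hypothetical):

```python
from pathlib import Path

stem = "page_01.recto"  # hypothetical stem of an image named page_01.recto.jpg

# Old behaviour: .with_suffix() treats ".recto" as a suffix and drops it.
print(Path("out", stem).with_suffix(".json"))  # out/page_01.json

# New behaviour: the extension is simply appended to the full stem.
print(Path("out", f"{stem}.json"))             # out/page_01.recto.json
```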
```diff
@@ -146,7 +146,8 @@ def serialize_config(config):
 def start_training(config, mlflow_logging: bool) -> None:
     if (
         config["training"]["device"]["use_ddp"]
-        and not config["training"]["device"]["force_cpu"]
+        and config["training"]["device"]["force"] in [None, "cuda"]
+        and torch.cuda.is_available()
     ):
         mp.spawn(
             train_and_test,
```
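A hedged sketch of the updated spawn decision rewritten as a standalone predicate (the config keys follow the hunk above; everything else is illustrative): multi-process DDP is only launched when it is enabled, the forced device is CUDA-compatible, and CUDA is actually present.

```python
import torch

def should_spawn_ddp(device_cfg: dict) -> bool:
    # Same three conditions as the guard around mp.spawn(...) above.
    return (
        device_cfg["use_ddp"]
        and device_cfg["force"] in [None, "cuda"]
        and torch.cuda.is_available()
    )

# Forcing "cpu" (or a pinned device such as "cuda:1") keeps training in a
# single process, even if use_ddp is left at True in the configuration.
print(should_spawn_ddp({"use_ddp": True, "force": "cpu"}))  # False
```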
```diff
@@ -37,34 +37,34 @@ To determine the value to use for `dataset.max_char_prediction`, you can use the
 ## Training parameters

 | Name | Description | Type | Default |
 | ---- | ----------- | ---- | ------- |
 | `training.data.batch_size` | Mini-batch size for the training loop. | `int` | `2` |
 | `training.data.load_in_memory` | Load all images in CPU memory. | `bool` | `True` |
 | `training.data.worker_per_gpu` | Number of parallel processes per gpu for data loading. | `int` | `4` |
 | `training.data.preprocessings` | List of pre-processing functions to apply to input images. | `list` | (see [dedicated section](#data-preprocessing)) |
 | `training.data.augmentation` | Whether to use data augmentation on the training set. | `bool` | `True` (see [dedicated section](#data-augmentation)) |
 | `training.output_folder` | Directory for checkpoint and results. | `str` | |
 | `training.max_nb_epochs` | Maximum number of epochs before stopping training. | `int` | `800` |
 | `training.load_epoch` | Model to load. Should be either `"best"` (evaluation) or `last` (training). | `str` | `"last"` |
 | `training.device.use_ddp` | Whether to use DistributedDataParallel. | `bool` | `False` |
 | `training.device.ddp_port` | DDP port. | `int` | `20027` |
 | `training.device.use_amp` | Whether to enable automatic mix-precision. | `bool` | `True` |
 | `training.device.nb_gpu` | Number of GPUs to train DAN. Set to `null` to use all GPUs available. | `int` | |
-| `training.device.force_cpu` | Whether to train on CPU (for debugging). | `bool` | `False` |
+| `training.device.force` | Use a specific device if available. Use `cpu` to train on CPU (for debugging) or `cuda`/`cuda:$gpu_device` to train on GPU. | `str` | |
 | `training.optimizers.all.args.lr` | Learning rate for the optimizer. | `float` | `0.0001` |
 | `training.optimizers.all.args.amsgrad` | Whether to use AMSGrad optimization. | `bool` | `False` |
 | `training.lr_schedulers` | Learning rate schedulers. | custom class | |
 | `training.validation.eval_on_valid` | Whether to evaluate and log metrics on the validation set during training. | `bool` | `True` |
 | `training.validation.eval_on_valid_interval` | Interval (in epochs) to evaluate during training. | `int` | `5` |
 | `training.validation.set_name_focus_metric` | Dataset to focus on to select best weights. | `str` | |
 | `training.metrics.train` | List of metrics to compute during training. | `list` | `["loss_ce", "cer", "wer", "wer_no_punct"]` |
 | `training.metrics.eval` | List of metrics to compute during validation. | `list` | `["cer", "wer", "wer_no_punct"]` |
 | `training.label_noise_scheduler.min_error_rate` | Minimum ratio of teacher forcing. | `float` | `0.2` |
 | `training.label_noise_scheduler.max_error_rate` | Maximum ratio of teacher forcing. | `float` | `0.2` |
 | `training.label_noise_scheduler.total_num_steps` | Number of steps before stopping teacher forcing. | `float` | `5e4` |
 | `training.transfer_learning.encoder` | Model to load for the encoder \[state_dict_name, checkpoint_path, learnable, strict\]. | `list` | `["encoder", "pretrained_models/dan_rimes_page.pt", True, True]` |
 | `training.transfer_learning.decoder` | Model to load for the decoder \[state_dict_name, checkpoint_path, learnable, strict\]. | `list` | `["encoder", "pretrained_models/dan_rimes_page.pt", True, False]` |

 - To train on several GPUs, simply set the `training.use_ddp` parameter to `True`. By default, the model will use all available GPUs. To restrict access to fewer GPUs, one can modify the `training.nb_gpu` parameter.
 - During the validation stage, the batch size is set to 1. This avoids problems associated with image sizes that can be very different inside batches and lead to significant padding, resulting in performance degradations.
```
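Putting the documented values together, a hypothetical `training.device` block (in the Python dict style of the test configuration hunk below; the comments summarise the table above) could pin the run to a named GPU or force CPU for debugging:

```python
# Hypothetical `training.device` section; only keys documented above are used.
device_config = {
    "use_ddp": False,    # DistributedDataParallel off for a single-device run
    "ddp_port": "20027",
    "use_amp": True,     # automatic mixed precision
    "nb_gpu": None,      # None: use all available GPUs when DDP is enabled
    "force": "cuda:1",   # pin GPU 1; use "cpu" for debugging, None for automatic choice
}
```

Note that, per the `init_hardware_config` change above, any `force` value other than `None` or `"cuda"` (including a pinned device such as `cuda:1`) also switches the run into the single-process, full-precision debug path.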
```diff
@@ -248,7 +248,7 @@ def training_config():
             "ddp_port": "20027",
             "use_amp": True,  # Enable automatic mix-precision
             "nb_gpu": 0,
-            "force_cpu": True,  # True for debug purposes
+            "force": "cpu",  # `cpu` for debug purposes
         },
         "metrics": {
             "train": [
```