From ccce90585b69a7a8e29d0ce0432904dcdb1261e8 Mon Sep 17 00:00:00 2001
From: Manon Blanco <blanco@teklia.com>
Date: Wed, 25 Oct 2023 13:35:17 +0000
Subject: [PATCH] Allow specifying device to use when training

---
 configs/quickstart.json     |  2 +-
 dan/ocr/manager/training.py | 21 +++++++-------
 dan/ocr/train.py            |  3 +-
 docs/usage/train/config.md  | 56 ++++++++++++++++++-------------------
 tests/conftest.py           |  2 +-
 5 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/configs/quickstart.json b/configs/quickstart.json
index b420fca5..6c8a00fa 100644
--- a/configs/quickstart.json
+++ b/configs/quickstart.json
@@ -71,7 +71,7 @@
             "ddp_port": "20027",
             "use_amp": true,
             "nb_gpu": null,
-            "force_cpu": false
+            "force": null
         },
         "metrics": {
             "train": [
diff --git a/dan/ocr/manager/training.py b/dan/ocr/manager/training.py
index 894e489d..ff35cff5 100644
--- a/dan/ocr/manager/training.py
+++ b/dan/ocr/manager/training.py
@@ -100,8 +100,10 @@ class GenericTrainingManager:
         self.dataset.load_dataloaders()
 
     def init_hardware_config(self):
+        cuda_is_available = torch.cuda.is_available()
+
         # Debug mode
-        if self.device_params["force_cpu"]:
+        if self.device_params["force"] not in [None, "cuda"] or not cuda_is_available:
             self.device_params["use_ddp"] = False
             self.device_params["use_amp"] = False
 
@@ -116,17 +118,14 @@ class GenericTrainingManager:
             "rank": self.device_params["ddp_rank"],
         }
         self.is_master = self.ddp_config["master"] or not self.device_params["use_ddp"]
-        if self.device_params["force_cpu"]:
-            self.device = torch.device("cpu")
+        if self.device_params["use_ddp"]:
+            self.device = torch.device(self.ddp_config["rank"])
+            self.device_params["ddp_rank"] = self.ddp_config["rank"]
+            self.launch_ddp()
         else:
-            if self.device_params["use_ddp"]:
-                self.device = torch.device(self.ddp_config["rank"])
-                self.device_params["ddp_rank"] = self.ddp_config["rank"]
-                self.launch_ddp()
-            else:
-                self.device = torch.device(
-                    "cuda:0" if torch.cuda.is_available() else "cpu"
-                )
+            self.device = torch.device(
+                self.device_params["force"] or "cuda" if cuda_is_available else "cpu"
+            )
         if self.device == torch.device("cpu"):
             self.params["model"]["device"] = "cpu"
         else:
diff --git a/dan/ocr/train.py b/dan/ocr/train.py
index b616effa..d4bf423d 100644
--- a/dan/ocr/train.py
+++ b/dan/ocr/train.py
@@ -146,7 +146,8 @@ def serialize_config(config):
 def start_training(config, mlflow_logging: bool) -> None:
     if (
         config["training"]["device"]["use_ddp"]
-        and not config["training"]["device"]["force_cpu"]
+        and config["training"]["device"]["force"] in [None, "cuda"]
+        and torch.cuda.is_available()
     ):
         mp.spawn(
             train_and_test,
diff --git a/docs/usage/train/config.md b/docs/usage/train/config.md
index abf7df75..148f0fa9 100644
--- a/docs/usage/train/config.md
+++ b/docs/usage/train/config.md
@@ -37,34 +37,34 @@ To determine the value to use for `dataset.max_char_prediction`, you can use the
 
 ## Training parameters
 
-| Name                                             | Description                                                                            | Type         | Default                                                           |
-| ------------------------------------------------ | -------------------------------------------------------------------------------------- | ------------ | ----------------------------------------------------------------- |
-| `training.data.batch_size`                       | Mini-batch size for the training loop.                                                 | `int`        | `2`                                                               |
-| `training.data.load_in_memory`                   | Load all images in CPU memory.                                                         | `bool`       | `True`                                                            |
-| `training.data.worker_per_gpu`                   | Number of parallel processes per gpu for data loading.                                 | `int`        | `4`                                                               |
-| `training.data.preprocessings`                   | List of pre-processing functions to apply to input images.                             | `list`       | (see [dedicated section](#data-preprocessing))                    |
-| `training.data.augmentation`                     | Whether to use data augmentation on the training set.                                  | `bool`       | `True` (see [dedicated section](#data-augmentation))              |
-| `training.output_folder`                         | Directory for checkpoint and results.                                                  | `str`        |                                                                   |
-| `training.max_nb_epochs`                         | Maximum number of epochs before stopping training.                                     | `int`        | `800`                                                             |
-| `training.load_epoch`                            | Model to load. Should be either `"best"` (evaluation) or `last` (training).            | `str`        | `"last"`                                                          |
-| `training.device.use_ddp`                        | Whether to use DistributedDataParallel.                                                | `bool`       | `False`                                                           |
-| `training.device.ddp_port`                       | DDP port.                                                                              | `int`        | `20027`                                                           |
-| `training.device.use_amp`                        | Whether to enable automatic mix-precision.                                             | `bool`       | `True`                                                            |
-| `training.device.nb_gpu`                         | Number of GPUs to train DAN. Set to `null` to use all GPUs available.                  | `int`        |                                                                   |
-| `training.device.force_cpu`                      | Whether to train on CPU (for debugging).                                               | `bool`       | `False`                                                           |
-| `training.optimizers.all.args.lr`                | Learning rate for the optimizer.                                                       | `float`      | `0.0001`                                                          |
-| `training.optimizers.all.args.amsgrad`           | Whether to use AMSGrad optimization.                                                   | `bool`       | `False`                                                           |
-| `training.lr_schedulers`                         | Learning rate schedulers.                                                              | custom class |                                                                   |
-| `training.validation.eval_on_valid`              | Whether to evaluate and log metrics on the validation set during training.             | `bool`       | `True`                                                            |
-| `training.validation.eval_on_valid_interval`     | Interval (in epochs) to evaluate during training.                                      | `int`        | `5`                                                               |
-| `training.validation.set_name_focus_metric`      | Dataset to focus on to select best weights.                                            | `str`        |                                                                   |
-| `training.metrics.train`                         | List of metrics to compute during training.                                            | `list`       | `["loss_ce", "cer", "wer", "wer_no_punct"]`                       |
-| `training.metrics.eval`                          | List of metrics to compute during validation.                                          | `list`       | `["cer", "wer", "wer_no_punct"]`                                  |
-| `training.label_noise_scheduler.min_error_rate`  | Minimum ratio of teacher forcing.                                                      | `float`      | `0.2`                                                             |
-| `training.label_noise_scheduler.max_error_rate`  | Maximum ratio of teacher forcing.                                                      | `float`      | `0.2`                                                             |
-| `training.label_noise_scheduler.total_num_steps` | Number of steps before stopping teacher forcing.                                       | `float`      | `5e4`                                                             |
-| `training.transfer_learning.encoder`             | Model to load for the encoder \[state_dict_name, checkpoint_path, learnable, strict\]. | `list`       | `["encoder", "pretrained_models/dan_rimes_page.pt", True, True]`  |
-| `training.transfer_learning.decoder`             | Model to load for the decoder \[state_dict_name, checkpoint_path, learnable, strict\]. | `list`       | `["encoder", "pretrained_models/dan_rimes_page.pt", True, False]` |
+| Name                                             | Description                                                                                                                 | Type         | Default                                                           |
+| ------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------- | ------------ | ----------------------------------------------------------------- |
+| `training.data.batch_size`                       | Mini-batch size for the training loop.                                                                                      | `int`        | `2`                                                               |
+| `training.data.load_in_memory`                   | Load all images in CPU memory.                                                                                              | `bool`       | `True`                                                            |
+| `training.data.worker_per_gpu`                   | Number of parallel processes per gpu for data loading.                                                                      | `int`        | `4`                                                               |
+| `training.data.preprocessings`                   | List of pre-processing functions to apply to input images.                                                                  | `list`       | (see [dedicated section](#data-preprocessing))                    |
+| `training.data.augmentation`                     | Whether to use data augmentation on the training set.                                                                       | `bool`       | `True` (see [dedicated section](#data-augmentation))              |
+| `training.output_folder`                         | Directory for checkpoint and results.                                                                                       | `str`        |                                                                   |
+| `training.max_nb_epochs`                         | Maximum number of epochs before stopping training.                                                                          | `int`        | `800`                                                             |
+| `training.load_epoch`                            | Model to load. Should be either `"best"` (evaluation) or `"last"` (training).                                               | `str`        | `"last"`                                                          |
+| `training.device.use_ddp`                        | Whether to use DistributedDataParallel.                                                                                     | `bool`       | `False`                                                           |
+| `training.device.ddp_port`                       | DDP port.                                                                                                                   | `int`        | `20027`                                                           |
+| `training.device.use_amp`                        | Whether to enable automatic mix-precision.                                                                                  | `bool`       | `True`                                                            |
+| `training.device.nb_gpu`                         | Number of GPUs to train DAN. Set to `null` to use all GPUs available.                                                       | `int`        |                                                                   |
+| `training.device.force`                          | Device to use if available: `cpu` (for debugging), `cuda` or `cuda:$gpu_device`. `null` selects `cuda` when available.      | `str`        |                                                                   |
+| `training.optimizers.all.args.lr`                | Learning rate for the optimizer.                                                                                            | `float`      | `0.0001`                                                          |
+| `training.optimizers.all.args.amsgrad`           | Whether to use AMSGrad optimization.                                                                                        | `bool`       | `False`                                                           |
+| `training.lr_schedulers`                         | Learning rate schedulers.                                                                                                   | custom class |                                                                   |
+| `training.validation.eval_on_valid`              | Whether to evaluate and log metrics on the validation set during training.                                                  | `bool`       | `True`                                                            |
+| `training.validation.eval_on_valid_interval`     | Interval (in epochs) to evaluate during training.                                                                           | `int`        | `5`                                                               |
+| `training.validation.set_name_focus_metric`      | Dataset to focus on to select best weights.                                                                                 | `str`        |                                                                   |
+| `training.metrics.train`                         | List of metrics to compute during training.                                                                                 | `list`       | `["loss_ce", "cer", "wer", "wer_no_punct"]`                       |
+| `training.metrics.eval`                          | List of metrics to compute during validation.                                                                               | `list`       | `["cer", "wer", "wer_no_punct"]`                                  |
+| `training.label_noise_scheduler.min_error_rate`  | Minimum ratio of teacher forcing.                                                                                           | `float`      | `0.2`                                                             |
+| `training.label_noise_scheduler.max_error_rate`  | Maximum ratio of teacher forcing.                                                                                           | `float`      | `0.2`                                                             |
+| `training.label_noise_scheduler.total_num_steps` | Number of steps before stopping teacher forcing.                                                                            | `float`      | `5e4`                                                             |
+| `training.transfer_learning.encoder`             | Model to load for the encoder \[state_dict_name, checkpoint_path, learnable, strict\].                                      | `list`       | `["encoder", "pretrained_models/dan_rimes_page.pt", True, True]`  |
+| `training.transfer_learning.decoder`             | Model to load for the decoder \[state_dict_name, checkpoint_path, learnable, strict\].                                      | `list`       | `["encoder", "pretrained_models/dan_rimes_page.pt", True, False]` |
 
 - To train on several GPUs, simply set the `training.use_ddp` parameter to `True`. By default, the model will use all available GPUs. To restrict access to fewer GPUs, one can modify the `training.nb_gpu` parameter.
 - During the validation stage, the batch size is set to 1. This avoids problems associated with image sizes that can be very different inside batches and lead to significant padding, resulting in performance degradations.
diff --git a/tests/conftest.py b/tests/conftest.py
index 93f5870a..93bae16c 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -248,7 +248,7 @@ def training_config():
                 "ddp_port": "20027",
                 "use_amp": True,  # Enable automatic mix-precision
                 "nb_gpu": 0,
-                "force_cpu": True,  # True for debug purposes
+                "force": "cpu",  # `cpu` for debug purposes
             },
             "metrics": {
                 "train": [
-- 
GitLab