Memory leak: Catch error and retry

Merged: Manon Blanco requested to merge memory-leak-catch-error-and-retry into main
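
This merge request makes CUDA out-of-memory failures recoverable: the training worker now catches torch.cuda.OutOfMemoryError and exits with a dedicated exit code (142), while run() always launches training through mp.spawn, detects that exit code via ProcessExitedException, and retries with progressively smaller batch sizes until an attempt succeeds or no smaller size is left. Because every attempt runs in a freshly spawned process, the GPU memory held by a failed attempt is released before the retry.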
@@ -5,16 +5,20 @@
 import json
 import logging
 import random
+import sys
 from copy import deepcopy
+from itertools import pairwise
 
 import numpy as np
 import torch
 import torch.multiprocessing as mp
+from torch.cuda import OutOfMemoryError
+from torch.multiprocessing.spawn import ProcessExitedException
 
 from dan.ocr import wandb
 from dan.ocr.manager.training import Manager
 from dan.ocr.mlflow import MLFLOW_AVAILABLE
-from dan.ocr.utils import update_config
+from dan.ocr.utils import build_batch_sizes, update_config
 from dan.utils import MLflowNotInstalled
 
 if MLFLOW_AVAILABLE:
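
The new import build_batch_sizes comes from dan.ocr.utils and its implementation is not part of this diff. As a rough sketch only, assuming it yields the starting batch size followed by successively halved values down to 1 (the real helper may behave differently):

    def build_batch_sizes(batch_size: int):
        # Hypothetical stand-in for dan.ocr.utils.build_batch_sizes, which is not shown here:
        # yield the starting batch size, then keep halving it until reaching 1.
        yield batch_size
        while batch_size > 1:
            batch_size //= 2
            yield batch_size

    # Under this assumption, list(build_batch_sizes(16)) == [16, 8, 4, 2, 1],
    # which is the list the retry loop at the end of this diff walks through.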

@@ -25,8 +29,18 @@ if MLFLOW_AVAILABLE:
 logger = logging.getLogger(__name__)
 
+# Special exit code used when the training stopped because of a `torch.cuda.OutOfMemoryError`
+EXIT_CODE_OUT_OF_MEMORY_ERROR = 142
+
 
 def train(rank, params, mlflow_logging=False):
+    # Start "Weights & Biases" as soon as possible
+    wandb.init(
+        wandb_params=params.get("wandb", {}).get("init", {}),
+        config={wandb.Config.TRAINING.value: params},
+        output_folder=params["training"]["output_folder"],
+    )
+
     torch.manual_seed(0)
     torch.cuda.manual_seed(0)
     np.random.seed(0)

@@ -47,7 +61,11 @@ def train(rank, params, mlflow_logging=False):
     if mlflow_logging:
         logger.info("MLflow logging enabled")
 
-    model.train(mlflow_logging=mlflow_logging)
+    try:
+        model.train(mlflow_logging=mlflow_logging)
+    except OutOfMemoryError as e:
+        logger.error(repr(e))
+        sys.exit(EXIT_CODE_OUT_OF_MEMORY_ERROR)
 
 
 def serialize_config(config):
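
The sys.exit() above is only half of the mechanism: torch.multiprocessing.spawn raises ProcessExitedException in the parent when a worker exits with a non-zero status, and the exception exposes that status as exit_code. A minimal, self-contained sketch of the handshake, separate from the DAN code (the worker function and message below are illustrative only):

    import sys

    import torch.multiprocessing as mp
    from torch.multiprocessing.spawn import ProcessExitedException

    EXIT_CODE_OUT_OF_MEMORY_ERROR = 142  # same convention as this MR


    def worker(rank):
        # Stand-in for the real training loop: pretend CUDA ran out of memory.
        sys.exit(EXIT_CODE_OUT_OF_MEMORY_ERROR)


    if __name__ == "__main__":
        try:
            mp.spawn(worker, nprocs=1)
        except ProcessExitedException as e:
            # The parent sees the worker's exit code and can decide to retry.
            assert e.exit_code == EXIT_CODE_OUT_OF_MEMORY_ERROR
            print(f"Worker exited with code {e.exit_code}, retry with a smaller batch size")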

@@ -88,21 +106,6 @@ def serialize_config(config):
     return serialized_config
 
 
-def start_training(config, mlflow_logging: bool) -> None:
-    if (
-        config["training"]["device"]["use_ddp"]
-        and config["training"]["device"]["force"] in [None, "cuda"]
-        and torch.cuda.is_available()
-    ):
-        mp.spawn(
-            train,
-            args=(config, mlflow_logging),
-            nprocs=config["training"]["device"]["nb_gpu"],
-        )
-    else:
-        train(0, config, mlflow_logging)
-
-
 def run(config: dict):
     """
     Main program, training a new model, using a valid configuration

@@ -114,28 +117,19 @@ def run(config: dict):
     dataset_name = names.pop()
 
     update_config(config)
 
-    # Start "Weights & Biases" as soon as possible
-    wandb.init(
-        wandb_params=config.get("wandb", {}).get("init", {}),
-        config={wandb.Config.TRAINING.value: config},
-        output_folder=config["training"]["output_folder"],
-    )
-
-    if config.get("mlflow") and not MLFLOW_AVAILABLE:
-        logger.error(
-            "Cannot log to MLflow. Please install the `mlflow` extra requirements."
-        )
-        raise MLflowNotInstalled()
-
-    if not config.get("mlflow"):
-        start_training(config, mlflow_logging=False)
-    else:
+    if config.get("mlflow"):
+        if not MLFLOW_AVAILABLE:
+            logger.error(
+                "Cannot log to MLflow. Please install the `mlflow` extra requirements."
+            )
+            raise MLflowNotInstalled()
+
         labels_path = config["dataset"]["datasets"][dataset_name] / "labels.json"
-        with start_mlflow_run(config["mlflow"]) as (run, created):
+        with start_mlflow_run(config["mlflow"]) as (mlflow_run, created):
             if created:
-                logger.info(f"Started MLflow run with ID ({run.info.run_id})")
+                logger.info(f"Started MLflow run with ID ({mlflow_run.info.run_id})")
             else:
-                logger.info(f"Resumed MLflow run with ID ({run.info.run_id})")
+                logger.info(f"Resumed MLflow run with ID ({mlflow_run.info.run_id})")
 
             make_mlflow_request(
                 mlflow_method=mlflow.set_tags, tags={"Dataset": dataset_name}

@@ -153,4 +147,41 @@ def run(config: dict):
                     dictionary=artifact,
                     artifact_file=filename,
                 )
-            start_training(config, mlflow_logging=True)
+
+    initial_batch_size = config["training"]["data"]["batch_size"]
+    batch_sizes = list(build_batch_sizes(initial_batch_size))
+    logger.info(
+        f"Training will start with a batch size of {initial_batch_size}. "
+        f"If training requires too much memory, it will be stopped and will be restarted with a smaller batch: {batch_sizes[1:]}."
+    )
+    for batch_size, next_batch_size in pairwise(batch_sizes + [None]):
+        config["training"]["data"]["batch_size"] = batch_size
+        try:
+            mp.spawn(
+                train,
+                args=(config, bool(config.get("mlflow"))),
+                nprocs=(
+                    config["training"]["device"]["nb_gpu"]
+                    if (
+                        config["training"]["device"]["use_ddp"]
+                        and config["training"]["device"]["force"] in [None, "cuda"]
+                        and torch.cuda.is_available()
+                    )
+                    else 1
+                ),
+            )
+            return
+        except ProcessExitedException as e:
+            # Training stopped for another reason
+            if e.exit_code != EXIT_CODE_OUT_OF_MEMORY_ERROR:
+                raise
+
+            # No more batch size available
+            if not next_batch_size:
+                raise Exception(
+                    "torch.cuda.OutOfMemoryError: No more batch size available"
+                )
+
+            logger.warning(
+                f"Failed to train with batch size of {batch_size}. Trying with smaller batch size of {next_batch_size}..."
+            )
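
A note on the retry loop's pairwise(batch_sizes + [None]) construction: it pairs each attempted batch size with the next smaller one, with None marking that nothing smaller remains. A quick illustration, using the batch sizes from the earlier sketch:

    from itertools import pairwise  # Python 3.10+

    batch_sizes = [16, 8, 4, 2, 1]
    print(list(pairwise(batch_sizes + [None])))
    # [(16, 8), (8, 4), (4, 2), (2, 1), (1, None)]
    # When next_batch_size is None and the worker still exits with the OOM code,
    # the loop raises instead of retrying.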