Memory leak: Catch error and retry

Merged: Manon Blanco requested to merge memory-leak-catch-error-and-retry into main
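
This merge request makes CUDA out-of-memory failures recoverable: the training worker now catches torch.cuda.OutOfMemoryError and exits with a dedicated exit code (142), while run() always launches training through mp.spawn, detects that exit code via ProcessExitedException, and retries with progressively smaller batch sizes until an attempt succeeds or no smaller size is left. Because every attempt runs in a freshly spawned process, the GPU memory held by a failed attempt is released before the retry.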
@@ -5,16 +5,20 @@
 import json
 import logging
 import random
+import sys
 from copy import deepcopy
+from itertools import pairwise
 
 import numpy as np
 import torch
 import torch.multiprocessing as mp
+from torch.cuda import OutOfMemoryError
+from torch.multiprocessing.spawn import ProcessExitedException
 
 from dan.ocr import wandb
 from dan.ocr.manager.training import Manager
 from dan.ocr.mlflow import MLFLOW_AVAILABLE
-from dan.ocr.utils import update_config
+from dan.ocr.utils import build_batch_sizes, update_config
 from dan.utils import MLflowNotInstalled
 
 if MLFLOW_AVAILABLE:
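
The new import build_batch_sizes comes from dan.ocr.utils and its implementation is not part of this diff. As a rough sketch only, assuming it yields the starting batch size followed by successively halved values down to 1 (the real helper may behave differently):

    def build_batch_sizes(batch_size: int):
        # Hypothetical stand-in for dan.ocr.utils.build_batch_sizes, which is not shown here:
        # yield the starting batch size, then keep halving it until reaching 1.
        yield batch_size
        while batch_size > 1:
            batch_size //= 2
            yield batch_size

    # Under this assumption, list(build_batch_sizes(16)) == [16, 8, 4, 2, 1],
    # which is the list the retry loop at the end of this diff walks through.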

@@ -25,8 +29,18 @@ if MLFLOW_AVAILABLE:
 logger = logging.getLogger(__name__)
 
+# Special exit code used when the training stopped because of a `torch.cuda.OutOfMemoryError`
+EXIT_CODE_OUT_OF_MEMORY_ERROR = 142
+
 
 def train(rank, params, mlflow_logging=False):
+    # Start "Weights & Biases" as soon as possible
+    wandb.init(
+        wandb_params=params.get("wandb", {}).get("init", {}),
+        config={wandb.Config.TRAINING.value: params},
+        output_folder=params["training"]["output_folder"],
+    )
+
     torch.manual_seed(0)
     torch.cuda.manual_seed(0)
     np.random.seed(0)

@@ -47,7 +61,11 @@ def train(rank, params, mlflow_logging=False):
     if mlflow_logging:
         logger.info("MLflow logging enabled")
 
-    model.train(mlflow_logging=mlflow_logging)
+    try:
+        model.train(mlflow_logging=mlflow_logging)
+    except OutOfMemoryError as e:
+        logger.error(repr(e))
+        sys.exit(EXIT_CODE_OUT_OF_MEMORY_ERROR)
 
 
 def serialize_config(config):
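
The sys.exit() above is only half of the mechanism: torch.multiprocessing.spawn raises ProcessExitedException in the parent when a worker exits with a non-zero status, and the exception exposes that status as exit_code. A minimal, self-contained sketch of the handshake, separate from the DAN code (the worker function and message below are illustrative only):

    import sys

    import torch.multiprocessing as mp
    from torch.multiprocessing.spawn import ProcessExitedException

    EXIT_CODE_OUT_OF_MEMORY_ERROR = 142  # same convention as this MR


    def worker(rank):
        # Stand-in for the real training loop: pretend CUDA ran out of memory.
        sys.exit(EXIT_CODE_OUT_OF_MEMORY_ERROR)


    if __name__ == "__main__":
        try:
            mp.spawn(worker, nprocs=1)
        except ProcessExitedException as e:
            # The parent sees the worker's exit code and can decide to retry.
            assert e.exit_code == EXIT_CODE_OUT_OF_MEMORY_ERROR
            print(f"Worker exited with code {e.exit_code}, retry with a smaller batch size")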

@@ -88,21 +106,6 @@ def serialize_config(config):
     return serialized_config
 
 
-def start_training(config, mlflow_logging: bool) -> None:
-    if (
-        config["training"]["device"]["use_ddp"]
-        and config["training"]["device"]["force"] in [None, "cuda"]
-        and torch.cuda.is_available()
-    ):
-        mp.spawn(
-            train,
-            args=(config, mlflow_logging),
-            nprocs=config["training"]["device"]["nb_gpu"],
-        )
-    else:
-        train(0, config, mlflow_logging)
-
-
 def run(config: dict):
     """
     Main program, training a new model, using a valid configuration

@@ -114,28 +117,19 @@ def run(config: dict):
     dataset_name = names.pop()
 
     update_config(config)
 
-    # Start "Weights & Biases" as soon as possible
-    wandb.init(
-        wandb_params=config.get("wandb", {}).get("init", {}),
-        config={wandb.Config.TRAINING.value: config},
-        output_folder=config["training"]["output_folder"],
-    )
-
-    if config.get("mlflow") and not MLFLOW_AVAILABLE:
-        logger.error(
-            "Cannot log to MLflow. Please install the `mlflow` extra requirements."
-        )
-        raise MLflowNotInstalled()
-
-    if not config.get("mlflow"):
-        start_training(config, mlflow_logging=False)
-    else:
+    if config.get("mlflow"):
+        if not MLFLOW_AVAILABLE:
+            logger.error(
+                "Cannot log to MLflow. Please install the `mlflow` extra requirements."
+            )
+            raise MLflowNotInstalled()
+
         labels_path = config["dataset"]["datasets"][dataset_name] / "labels.json"
-        with start_mlflow_run(config["mlflow"]) as (run, created):
+        with start_mlflow_run(config["mlflow"]) as (mlflow_run, created):
             if created:
-                logger.info(f"Started MLflow run with ID ({run.info.run_id})")
+                logger.info(f"Started MLflow run with ID ({mlflow_run.info.run_id})")
             else:
-                logger.info(f"Resumed MLflow run with ID ({run.info.run_id})")
+                logger.info(f"Resumed MLflow run with ID ({mlflow_run.info.run_id})")
 
             make_mlflow_request(
                 mlflow_method=mlflow.set_tags, tags={"Dataset": dataset_name}

@@ -153,4 +147,41 @@ def run(config: dict):
                     dictionary=artifact,
                     artifact_file=filename,
                 )
-            start_training(config, mlflow_logging=True)
+
+    initial_batch_size = config["training"]["data"]["batch_size"]
+    batch_sizes = list(build_batch_sizes(initial_batch_size))
+    logger.info(
+        f"Training will start with a batch size of {initial_batch_size}. "
+        f"If training requires too much memory, it will be stopped and will be restarted with a smaller batch: {batch_sizes[1:]}."
+    )
+    for batch_size, next_batch_size in pairwise(batch_sizes + [None]):
+        config["training"]["data"]["batch_size"] = batch_size
+        try:
+            mp.spawn(
+                train,
+                args=(config, bool(config.get("mlflow"))),
+                nprocs=(
+                    config["training"]["device"]["nb_gpu"]
+                    if (
+                        config["training"]["device"]["use_ddp"]
+                        and config["training"]["device"]["force"] in [None, "cuda"]
+                        and torch.cuda.is_available()
+                    )
+                    else 1
+                ),
+            )
+            return
+        except ProcessExitedException as e:
+            # Training stopped for another reason
+            if e.exit_code != EXIT_CODE_OUT_OF_MEMORY_ERROR:
+                raise
+
+            # No more batch size available
+            if not next_batch_size:
+                raise Exception(
+                    "torch.cuda.OutOfMemoryError: No more batch size available"
+                )
+
+            logger.warning(
+                f"Failed to train with batch size of {batch_size}. Trying with smaller batch size of {next_batch_size}..."
+            )
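
A note on the retry loop's pairwise(batch_sizes + [None]) construction: it pairs each attempted batch size with the next smaller one, with None marking that nothing smaller remains. A quick illustration, using the batch sizes from the earlier sketch:

    from itertools import pairwise  # Python 3.10+

    batch_sizes = [16, 8, 4, 2, 1]
    print(list(pairwise(batch_sizes + [None])))
    # [(16, 8), (8, 4), (4, 2), (2, 1), (1, None)]
    # When next_batch_size is None and the worker still exits with the OOM code,
    # the loop raises instead of retrying.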