Skip to content
Snippets Groups Projects

Memory leak: Catch error and retry

Merged Manon Blanco requested to merge memory-leak-catch-error-and-retry into main
All threads resolved!
1 file
+2 -3
Compare changes
  • Side-by-side
  • Inline
+2 -3
@@ -36,7 +36,6 @@ def train(rank, params, mlflow_logging=False):
params["training"]["device"]["ddp_rank"] = rank
model = Manager(params)
model.load_model()
if params["dataset"]["tokens"] is not None:
@@ -93,7 +92,7 @@ def run(config: dict):
"""
Main program, training a new model, using a valid configuration
"""
- initial_config = config.copy() # Config will be updated, save its initial value
+ initial_config = config.copy() # Config will be updated, save its initial values
names = list(config["dataset"]["datasets"].keys())
# We should only have one dataset
@@ -167,6 +166,6 @@ def run(config: dict):
initial_config["training"]["data"]["batch_size"] = new_batch_size
logger.warning(
- f"Failed to train with batch size of {batch_size} images. Trying with smaller batches of {new_batch_size}..."
+ f"Failed to train with batch size of {batch_size}. Trying with smaller batches of {new_batch_size}..."
)
run(initial_config)
Loading