Commit 803610d3 authored by Manon Blanco, committed by Yoann Schneider

Always use the same max_training_time

parent 158252be
Merge request !220: Do not limit training by time
@@ -33,7 +33,6 @@ class GenericTrainingManager:
         self.params = params
         self.dropout_scheduler = None
         self.models = {}
-        self.begin_time = None
         self.dataset = None
         self.dataset_name = list(self.params["dataset_params"]["datasets"].values())[0]
         self.paths = None
@@ -534,7 +533,6 @@ class GenericTrainingManager:
         self.writer = SummaryWriter(self.paths["results"])
         self.save_params()
         # init variables
-        self.begin_time = time()
         nb_epochs = self.params["training_params"]["max_nb_epochs"]
         metric_names = self.params["training_params"]["train_metrics"]
@@ -547,13 +545,6 @@ class GenericTrainingManager:
         self.init_curriculum()
         # perform epochs
         for num_epoch in range(self.latest_epoch + 1, nb_epochs):
-            # Check maximum training time stop condition
-            if (
-                self.params["training_params"]["max_training_time"]
-                and time() - self.begin_time
-                > self.params["training_params"]["max_training_time"]
-            ):
-                break
             # set models trainable
             for model_name in self.models.keys():
                 self.models[model_name].train()
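Taken together, the three hunks above drop the wall-clock stop condition: with `begin_time` and the `max_training_time` check gone, the epoch loop is bounded only by `max_nb_epochs`. A minimal sketch of the resulting control flow, with the trainer reduced to illustrative stubs (`EpochOnlyTrainer` and `train_one_epoch` are hypothetical names, not part of this codebase):

# Sketch only: after this commit, training stops when the epoch counter
# reaches max_nb_epochs; there is no time() bookkeeping left.
class EpochOnlyTrainer:
    def __init__(self, max_nb_epochs: int):
        self.latest_epoch = -1  # index of the last completed epoch, as in the loop above
        self.max_nb_epochs = max_nb_epochs

    def train_one_epoch(self, num_epoch: int) -> None:
        print(f"epoch {num_epoch}")  # stand-in for the real epoch body

    def train(self) -> None:
        # The only stop condition is the range itself.
        for num_epoch in range(self.latest_epoch + 1, self.max_nb_epochs):
            self.train_one_epoch(num_epoch)
            self.latest_epoch = num_epoch


if __name__ == "__main__":
    EpochOnlyTrainer(max_nb_epochs=4).train()  # prints epoch 0..3

Resuming is unchanged: `latest_epoch` is restored from a checkpoint (`load_epoch: "last"` below) and the range simply continues from there.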
@@ -160,9 +160,6 @@ def get_config():
         "training_params": {
             "output_folder": "outputs/dan_esposalles_record",  # folder name for checkpoint and results
             "max_nb_epochs": 800,  # maximum number of epochs before stopping
-            "max_training_time": 3600
-            * 24
-            * 1.9,  # maximum time before stopping (in seconds)
             "load_epoch": "last",  # ["best", "last"]: last to continue training, best to evaluate
             "batch_size": 2,  # mini-batch size for training
             "use_ddp": False,  # Use DistributedDataParallel
@@ -145,7 +145,6 @@ For a detailed description of all augmentation transforms, see the [dedicated pa
 | ------------------------------------------------------- | --------------------------------------------------------------------------- | ------------ | ------------------------------------------- |
 | `training_params.output_folder` | Directory for checkpoint and results. | `str` | |
 | `training_params.max_nb_epochs` | Maximum number of epochs before stopping training. | `int` | `800` |
-| `training_params.max_training_time` | Maximum time (in seconds) before stopping training. | `int` | `164160` |
 | `training_params.load_epoch` | Model to load. Should be either `"best"` (evaluation) or `"last"` (training). | `str` | `"last"` |
 | `training_params.batch_size` | Mini-batch size for the training loop. | `int` | `2` |
 | `training_params.use_ddp` | Whether to use DistributedDataParallel. | `bool` | `False` |
@@ -108,7 +108,6 @@ def training_config():
         "training_params": {
             "output_folder": "dan_trained_model",  # folder name for checkpoint and results
             "max_nb_epochs": 4,  # maximum number of epochs before stopping
-            "max_training_time": 1200,  # maximum time before stopping (in seconds)
             "load_epoch": "last",  # ["best", "last"]: last to continue training, best to evaluate
             "batch_size": 2,  # mini-batch size for training
             "use_ddp": False,  # Use DistributedDataParallel