Skip to content
Snippets Groups Projects

Robust mlflow requests

Merged Yoann Schneider requested to merge robust-mlflow-requests into main
All threads resolved!
1 file
+ 0
1
Compare changes
  • Side-by-side
  • Inline
+ 29
16
@@ -3,11 +3,22 @@ import os
from contextlib import contextmanager
import mlflow
from mlflow.exceptions import MlflowException
import requests
from mlflow.environment_variables import MLFLOW_HTTP_REQUEST_MAX_RETRIES
from dan import logger
def make_mlflow_request(mlflow_method, *args, **kwargs):
    """
    Encapsulate MLflow HTTP requests to prevent them from crashing the whole training process.

    :param mlflow_method: callable performing the MLflow request (e.g. `mlflow.log_metrics`)
    :param args: positional arguments forwarded to `mlflow_method`
    :param kwargs: keyword arguments forwarded to `mlflow_method`
    :return: the value returned by `mlflow_method`, or None if the call failed with a connection error
    """
    try:
        # Hand the result back so that wrapping a value-returning call does not lose its output.
        return mlflow_method(*args, **kwargs)
    except requests.exceptions.ConnectionError as e:
        # Only connection failures are swallowed and logged; any other exception still propagates.
        logger.error(f"Call to `{str(mlflow_method)}` failed with error: {str(e)}")
        return None
def setup_environment(config: dict):
"""
Get the necessary variables from the config file and put them in the environment variables
@@ -24,6 +35,13 @@ def setup_environment(config: dict):
if config_key in config:
os.environ[variable_name] = config[config_key]
# Check max retry setting
max_retries = MLFLOW_HTTP_REQUEST_MAX_RETRIES.get()
if max_retries and int(max_retries) <= 1:
logger.warning(
f"The maximum number of retries for MLflow HTTP requests is set to {max_retries}, which is low. Consider using a higher value."
)
def logging_metrics(
display_values: dict,
@@ -42,10 +60,11 @@ def logging_metrics(
:param is_master: bool, makes sure you're on the right thread, defaults to False
"""
if mlflow_logging and is_master:
mlflow_values = {
f"{step}_{name}": value for name, value in display_values.items()
}
mlflow.log_metrics(mlflow_values, epoch)
make_mlflow_request(
mlflow_method=mlflow.log_metrics,
metrics={f"{step}_{name}": value for name, value in display_values.items()},
step=epoch,
)
def logging_tags_metrics(
@@ -63,10 +82,10 @@ def logging_tags_metrics(
:param is_master: bool, makes sure you're on the right thread, defaults to False
"""
if mlflow_logging and is_master:
mlflow_values = {
f"{step}_{name}": value for name, value in display_values.items()
}
mlflow.set_tags(mlflow_values)
make_mlflow_request(
mlflow_method=mlflow.set_tags,
tags={f"{step}_{name}": value for name, value in display_values.items()},
)
@contextmanager
@@ -83,13 +102,7 @@ def start_mlflow_run(config: dict):
# Set experiment from config
experiment_id = config.get("experiment_id")
assert experiment_id, "Missing MLflow experiment ID in the configuration"
try:
mlflow.set_experiment(experiment_id=experiment_id)
logger.info(f"Run Experiment ID : {experiment_id} on MLFlow")
except MlflowException as e:
logger.error(f"Couldn't set Mlflow experiment with ID: {experiment_id}")
raise e
# Start run
yield mlflow.start_run(run_name=config.get("run_name"))
yield mlflow.start_run(run_name=config.get("run_name"), experiment_id=experiment_id)
mlflow.end_run()
Loading