Yoann Schneider
--- a/dan/mlflow.py

+ 29

− 16
+++ b/dan/mlflow.py

+ 29

− 16
 @@ -3,11 +3,22 @@ import os
 from contextlib import contextmanager

 import mlflow
-from mlflow.exceptions import MlflowException
+import requests
+from mlflow.environment_variables import MLFLOW_HTTP_REQUEST_MAX_RETRIES

 from dan import logger


+def make_mlflow_request(mlflow_method, *args, **kwargs):
+    """
+    Call `mlflow_method(*args, **kwargs)` and return its result; on a `requests` connection error, log it and return None so the training process keeps running.
+    """
+    try:
+        return mlflow_method(*args, **kwargs)
+    except requests.exceptions.ConnectionError as e:
+        logger.error(f"Call to `{str(mlflow_method)}` failed with error: {str(e)}")
+
+
 def setup_environment(config: dict):
    """
    Get the necessary variables from the config file and put them in the environment variables
 @@ -24,6 +35,13 @@ def setup_environment(config: dict):
        if config_key in config:
            os.environ[variable_name] = config[config_key]

+    # Check max retry setting; compare against None explicitly because a plain truthiness test would skip an integer 0
+    max_retries = MLFLOW_HTTP_REQUEST_MAX_RETRIES.get()
+    if max_retries is not None and int(max_retries) <= 1:
+        logger.warning(
+            f"The maximum number of retries for MLflow HTTP requests is set to {max_retries}, which is low. Consider using a higher value."
+        )
+

 def logging_metrics(
    display_values: dict,
 @@ -42,10 +60,11 @@ def logging_metrics(
    :param is_master: bool, makes sure you're on the right thread, defaults to False
    """
    if mlflow_logging and is_master:
-        mlflow_values = {
-            f"{step}_{name}": value for name, value in display_values.items()
-        }
-        mlflow.log_metrics(mlflow_values, epoch)
+        # Prefix each metric name with the step, then log the metrics for this
+        # epoch through make_mlflow_request (which logs connection errors
+        # instead of raising)
+        prefixed_metrics = {f"{step}_{name}": value for name, value in display_values.items()}
+        make_mlflow_request(mlflow.log_metrics, metrics=prefixed_metrics, step=epoch)


 def logging_tags_metrics(
 @@ -63,10 +82,10 @@ def logging_tags_metrics(
    :param is_master: bool, makes sure you're on the right thread, defaults to False
    """
    if mlflow_logging and is_master:
-        mlflow_values = {
-            f"{step}_{name}": value for name, value in display_values.items()
-        }
-        mlflow.set_tags(mlflow_values)
+        # Prefix each tag name with the step, then set the tags through
+        # make_mlflow_request (which logs connection errors instead of raising)
+        prefixed_tags = {f"{step}_{name}": value for name, value in display_values.items()}
+        make_mlflow_request(mlflow.set_tags, tags=prefixed_tags)


 @contextmanager
 @@ -83,13 +102,7 @@ def start_mlflow_run(config: dict):
    # Set experiment from config
    experiment_id = config.get("experiment_id")
    assert experiment_id, "Missing MLflow experiment ID in the configuration"
-    try:
-        mlflow.set_experiment(experiment_id=experiment_id)
-        logger.info(f"Run Experiment ID : {experiment_id} on MLFlow")
-    except MlflowException as e:
-        logger.error(f"Couldn't set Mlflow experiment with ID: {experiment_id}")
-        raise e

    # Start run
-    yield mlflow.start_run(run_name=config.get("run_name"))
+    yield mlflow.start_run(run_name=config.get("run_name"), experiment_id=experiment_id)  # NOTE(review): presumably wrapped by @contextmanager; if the caller's with-body raises, the end_run() below is never reached - consider try/finally
    mlflow.end_run()