diff --git a/dan/mlflow.py b/dan/mlflow.py
index c1d6cd3f9d8cfd5e7b05deb6a967ca3a0a472d3c..337b7837bbe1bdffedd6b28075f9eaa1ac6b14f6 100644
--- a/dan/mlflow.py
+++ b/dan/mlflow.py
@@ -10,6 +10,9 @@ from dan import logger
 
 
 def make_mlflow_request(mlflow_method, *args, **kwargs):
+    """
+    Encapsulate MLflow HTTP requests to prevent them from crashing the whole training process.
+    """
     try:
         mlflow_method(*args, **kwargs)
     except requests.exceptions.ConnectionError as e:
@@ -50,11 +53,10 @@ def logging_metrics(
     :param is_master: bool, makes sure you're on the right thread, defaults to False
     """
     if mlflow_logging and is_master:
-        mlflow_values = {
-            f"{step}_{name}": value for name, value in display_values.items()
-        }
         make_mlflow_request(
-            mlflow_method=mlflow.log_metrics, metrics=mlflow_values, step=epoch
+            mlflow_method=mlflow.log_metrics, metrics={
+                f"{step}_{name}": value for name, value in display_values.items()
+            }, step=epoch
         )
 
 
@@ -73,10 +75,10 @@ def logging_tags_metrics(
     :param is_master: bool, makes sure you're on the right thread, defaults to False
     """
     if mlflow_logging and is_master:
-        mlflow_values = {
+        make_mlflow_request(mlflow_method=mlflow.set_tags, tags=
+        {
             f"{step}_{name}": value for name, value in display_values.items()
-        }
-        make_mlflow_request(mlflow_method=mlflow.set_tags, tags=mlflow_values)
+        })
 
 
 @contextmanager
@@ -93,15 +95,7 @@ def start_mlflow_run(config: dict):
     # Set experiment from config
     experiment_id = config.get("experiment_id")
     assert experiment_id, "Missing MLflow experiment ID in the configuration"
-    try:
-        make_mlflow_request(
-            mlflow_method=mlflow.set_experiment, experiment_id=experiment_id
-        )
-        logger.info(f"Run Experiment ID : {experiment_id} on MLFlow")
-    except MlflowException as e:
-        logger.error(f"Couldn't set Mlflow experiment with ID: {experiment_id}")
-        raise e
 
     # Start run
-    yield mlflow.start_run(run_name=config.get("run_name"))
+    yield mlflow.start_run(run_name=config.get("run_name"), experiment_id=experiment_id)
     mlflow.end_run()
diff --git a/dan/ocr/document/train.py b/dan/ocr/document/train.py
index 6a392dc2de3f476d5ac8d07b880345498775cea8..550e7e7d57b1ec322c7cf26eff82d12a187906fc 100644
--- a/dan/ocr/document/train.py
+++ b/dan/ocr/document/train.py
@@ -24,7 +24,7 @@ try:
     MLFLOW = True
     logger.info("MLflow Logging available.")
 
-    from dan.mlflow import start_mlflow_run
+    from dan.mlflow import start_mlflow_run, make_mlflow_request
 except ImportError:
     MLFLOW = False
 
@@ -70,17 +70,17 @@ def get_config():
     """
     Retrieve model configuration
     """
-    dataset_name = "esposalles"
-    dataset_level = "page"
+    dataset_name = "synist"
+    dataset_level = "manual_text_lines"
     dataset_variant = ""
-    dataset_path = "/home/training_data/ATR_paragraph/Esposalles"
+    dataset_path = "."
     params = {
         "mlflow": {
             "dataset_name": dataset_name,
             "run_name": "Test log DAN",
             "s3_endpoint_url": "",
             "tracking_uri": "",
-            "experiment_id": "9",
+            "experiment_id": "0",
             "aws_access_key_id": "",
             "aws_secret_access_key": "",
         },
@@ -287,18 +287,25 @@ def run():
             / "labels.json"
         )
         with start_mlflow_run(config["mlflow"]) as run:
-            logger.info(f"Set tags to MLflow on {config['mlflow']['run_name']}")
-            mlflow.set_tags({"Dataset": config["mlflow"]["dataset_name"]})
+            logger.info(f"Started MLflow run with ID ({run.info.run_id})")
+
+            make_mlflow_request(
+                mlflow_method=mlflow.set_tags,
+                tags={"Dataset": dataset_name}
+            )
 
             # Get the labels json file
             with open(labels_path) as json_file:
                 labels_artifact = json.load(json_file)
 
             # Log MLflow artifacts
-            mlflow.log_dict(config_artifact, "config.json")
-            mlflow.log_dict(labels_artifact, "labels.json")
-
-            logger.info(f"Started MLflow run with ID ({run.info.run_id})")
+            for artifact, filename in [(config_artifact, "config.json"), (labels_artifact, "labels.json")]:
+                make_mlflow_request(
+                    mlflow_method=mlflow.log_dict,
+                    dictionary=artifact,
+                    artifact_file=filename,
+                )
+
         if (
             config["training_params"]["use_ddp"]
             and not config["training_params"]["force_cpu"]