MLflow

Experiment Tracking with MLflow

from detectron2.engine import HookBase
from aim import Run
import torch
import os
import detectron2.utils.comm as comm
from datetime import datetime

# Tracking location taken from the "MLFlow_URL" environment variable when the
# module is imported; the lookup raises KeyError if the variable is not set.
MLFLow_URL = os.environ["MLFlow_URL"]

class MLFlowHook(HookBase):
    """
    A custom hook class that logs hyperparameters and metrics to an Aim run.

    Despite the class name, tracking goes through ``aim.Run``; the structure
    is taken from the MLflow hook described at
    https://philenius.github.io/machine%20learning/2022/01/09/how-to-log-artifacts-metrics-and-parameters-of-your-detectron2-model-training-to-mlflow.html
    and adapted for Aim.

    Looking at write_metrics in this file can help with further development.
    https://github.com/facebookresearch/detectron2/blob/80e2673da161f57afe37ef769836a61976108ef1/detectron2/engine/train_loop.py#LL346
    """

    def __init__(self, cfg):
        """Store a private clone of ``cfg`` so later edits by the caller do not leak in."""
        super().__init__()
        self.cfg = cfg.clone()
        # Created in before_train, and only on the main process.
        self.run = None

    def before_train(self):
        """Open the tracked run (main process only) and attach the config as hparams."""
        # Have to check if it's the main process so that you don't
        # get multiple tracked runs for a single, multi-GPU process.
        if not comm.is_main_process():
            return

        # Timestamp with spaces and colons replaced, used as the experiment name.
        clean_datetime = str(datetime.now()).replace(' ', '_').replace(':', '-')

        # The original referenced an undefined ``AIM_URL``; the only URL
        # constant this module defines is ``MLFLow_URL``.
        self.run = Run(
            repo=MLFLow_URL,
            experiment=clean_datetime,
        )
        self.run['hparams'] = self.cfg

    def after_step(self):
        """Track the latest value of every metric in the trainer's storage."""
        # Only write metrics if it's the main process
        if comm.is_main_process():
            with torch.no_grad():
                latest_metrics = self.trainer.storage.latest()
                for k, v in latest_metrics.items():
                    # v[0] is the metric value; the step is the storage's current iteration.
                    self.run.track(name=k, value=v[0], step=self.trainer.storage.iter)

    def after_train(self):
        """Dump the config to ``model-config.yaml`` inside ``cfg.OUTPUT_DIR``."""
        # Guarded like the other callbacks so that multiple processes do not
        # write the same file concurrently.
        if comm.is_main_process():
            with torch.no_grad():
                with open(os.path.join(self.cfg.OUTPUT_DIR, "model-config.yaml"), "w") as f:
                    f.write(self.cfg.dump())

from detectron2.engine import HookBase
import detectron2.utils.comm as comm
import mlflow


class MLflowHook(HookBase):
    """
    Training hook that forwards parameters, metrics and output artifacts to MLflow.

    Every callback does its work on the main process only.
    """

    def __init__(self, cfg):
        """Keep a private clone of the given config."""
        super().__init__()
        self.cfg = cfg.clone()

    def before_train(self):
        """Configure MLflow, start the run and log each top-level config entry as a param."""
        if not comm.is_main_process():
            return

        with torch.no_grad():
            mlflow.set_tracking_uri(self.cfg.MLFLOW.TRACKING_URI)
            mlflow.set_experiment(self.cfg.MLFLOW.EXPERIMENT_NAME)
            mlflow.start_run(run_name=self.cfg.MLFLOW.RUN_NAME)
            # The run description is stored under the "mlflow.note.content" tag.
            mlflow.set_tag(
                "mlflow.note.content",
                self.cfg.MLFLOW.RUN_DESCRIPTION,
            )
            for param_name, param_value in self.cfg.items():
                mlflow.log_param(param_name, param_value)

    def after_step(self):
        """Log the most recent entry of every metric held in the trainer's storage."""
        if not comm.is_main_process():
            return

        with torch.no_grad():
            newest = self.trainer.storage.latest()
            for metric_name, entry in newest.items():
                # entry[0] is passed as the metric value, entry[1] as its step.
                mlflow.log_metric(key=metric_name, value=entry[0], step=entry[1])

    def after_train(self):
        """Write the config to model-config.yaml, then log all of OUTPUT_DIR as artifacts."""
        if not comm.is_main_process():
            return

        with torch.no_grad():
            config_path = os.path.join(self.cfg.OUTPUT_DIR, "model-config.yaml")
            with open(config_path, "w") as config_file:
                config_file.write(self.cfg.dump())
            mlflow.log_artifacts(self.cfg.OUTPUT_DIR)

You then need to register the hook with your trainer:

# Usage example: `cfg` is the detectron2 config object, built elsewhere
# (not shown in this snippet).
from detectron2.engine import DefaultTrainer

mlflow_hook = MLFlowHook(cfg)
# DefaultTrainer takes the config as its constructor argument.
trainer = DefaultTrainer(cfg)
trainer.register_hooks(hooks=[mlflow_hook])