Execute docker tasks in RQ

Merged Valentin Rigal requested to merge dummy-tasks into community
9 files  +156 −1
import logging
import tempfile
from io import BytesIO
from pathlib import Path
from urllib.parse import urljoin
from django.conf import settings
@@ -8,7 +11,11 @@ from django.db.models.functions import Round
from django.shortcuts import reverse
from django.template.loader import render_to_string
from django_rq import job
from rq.job import Dependency
import docker
from arkindex.ponos.models import State, Task
from arkindex.ponos.utils import upload_artifacts
from arkindex.process.models import Process, WorkerActivityState
logger = logging.getLogger(__name__)
@@ -64,3 +71,117 @@ def notify_process_completion(
recipient_list=[process.creator.email],
fail_silently=False,
)
@job("tasks", timeout=None)
def schedule_tasks(process: Process, run: int):
# Build a simple dependency scheme between tasks, based on depth
tasks = process.tasks.filter(run=run).order_by("depth", "id")
tasks.update(state=State.Pending)
# Run tasks in RQ, one by one
parent_job = None
for task in tasks:
kwargs = {}
if parent_job:
kwargs["depends_on"] = Dependency(jobs=[parent_job], allow_failure=True)
parent_job = run_task_rq.delay(task, **kwargs)
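
For reference, a minimal standalone sketch of the same chaining pattern in plain RQ (queue name and payloads are illustrative, not part of this MR). With allow_failure=True each job still starts after its parent failed, so failure handling can happen inside the task itself:

import time

from redis import Redis
from rq import Queue
from rq.job import Dependency

queue = Queue("tasks", connection=Redis())
previous = None
for delay in (1, 2, 3):
    kwargs = {}
    if previous:
        # Wait for the previous job, but run even if it raised
        kwargs["depends_on"] = Dependency(jobs=[previous], allow_failure=True)
    previous = queue.enqueue(time.sleep, delay, **kwargs)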

def run_docker_task(client, task, temp_dir):
    # 1. Pull the docker image
    logger.debug(f"Pulling docker image '{task.image}'")
    client.images.pull(task.image)

    # 2. Run the container asynchronously
    logger.debug("Running container")
    kwargs = {
        "environment": {
            **task.env,
            "PONOS_DATA": "/data",
        },
        "detach": True,
        "network": "host",
        "volumes": {temp_dir: {"bind": "/data/current", "mode": "rw"}},
    }
    if task.requires_gpu:
        # Expose all GPUs to that container, without overwriting the base environment
        # TODO: Make sure this works
        kwargs["environment"]["NVIDIA_VISIBLE_DEVICES"] = "all"
        logger.info("Starting container with GPU support")
    if task.command is not None:
        kwargs["command"] = task.command
    container = client.containers.run(task.image, **kwargs)
    task.state = State.Running
    task.save()

    # 3. Read logs (see agent.setup_logging)
    logger.debug("Reading logs from the docker container")
    data = b""
    for line in container.logs(stream=True):
        data += line
    try:
        task.logs.s3_object.upload_fileobj(
            BytesIO(data),
            ExtraArgs={"ContentType": "text/plain; charset=utf-8"},
        )
    except Exception as e:
        logger.warning(f"Failed uploading logs for task {task}: {e}")

    # 4. Retrieve the state of the container
    container.reload()
    exit_code = container.attrs["State"]["ExitCode"]
    if exit_code != 0:
        logger.info("Task failed")
        task.state = State.Failed
        task.save()
        return
    task.state = State.Completed
    task.save()

    # 5. Upload artifacts
    logger.info(f"Uploading artifacts for task {task}")
    for path in Path(temp_dir).glob("**/*"):
        if path.is_dir():
            continue
        try:
            upload_artifacts(task, path, temp_dir)
        except Exception as e:
            logger.warning(f"Failed uploading artifacts for task {task}: {e}")
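
run_docker_task follows the usual docker SDK lifecycle for a detached container. A condensed sketch of that lifecycle outside Arkindex (image and command are placeholders):

import docker

client = docker.from_env()
client.images.pull("alpine:latest")
container = client.containers.run(
    "alpine:latest",
    ["sh", "-c", "echo hello; exit 3"],
    detach=True,
)
# The log stream ends once the container stops
for line in container.logs(stream=True):
    print(line.decode(), end="")
# Refresh cached attributes before reading the exit code
container.reload()
assert container.attrs["State"]["ExitCode"] == 3

container.wait() would also return the status code, but reload() mirrors what run_docker_task does above.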
@job("tasks", timeout=settings.RQ_TIMEOUTS["task"])
def run_task_rq(task: Task):
"""Run a single task in RQ"""
# Update task and parents from the DB
task.refresh_from_db()
parents = list(task.parents.order_by("depth", "id"))
client = docker.from_env()
if not task.image:
raise ValueError("The task must have a docker image.")
if task.state != State.Pending:
raise ValueError("The task must be in pending state run in RQ.")
# Automatically update children in case an error occurred
if (parent_state := next(
(parent.state for parent in parents if parent.state in (State.Stopped, State.Error, State.Failed)),
None
)) is not None:
task.state = parent_state
task.save()
return
with tempfile.TemporaryDirectory(suffix=f"_{task.id}") as temp_dir:
try:
run_docker_task(client, task, Path(temp_dir))
except Exception as e:
logger.error("An unexpected error occurred, updating state to Error.")
task.state = State.Error
task.save()
raise e
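
Both entry points are wrapped with django_rq's @job decorator, so scheduling a full run is a single delayed call, provided a worker listens on the tasks queue. A hypothetical caller (process is an existing Process, run 0 is illustrative):

# Requires a running worker, e.g.: python manage.py rqworker tasks
schedule_tasks.delay(process, 0)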