diff --git a/arkindex/metrics/tests/test_metrics_api.py b/arkindex/metrics/tests/test_metrics_api.py
index 33d401fe87c40f93eb3ff4a980454131fc30fe8e..688fb9db23f9fbe7a099b3ea0602873495aafc49 100644
--- a/arkindex/metrics/tests/test_metrics_api.py
+++ b/arkindex/metrics/tests/test_metrics_api.py
@@ -1,6 +1,9 @@
 from django.test import override_settings
 from django.urls import reverse
+from django.utils import timezone
 
+from arkindex.ponos.models import Agent, AgentMode, Farm, State
+from arkindex.process.models import Process, ProcessMode
 from arkindex.project.tests import FixtureAPITestCase
 
 
@@ -15,3 +18,43 @@ class TestMetricsAPI(FixtureAPITestCase):
         response = self.client.get(reverse("metrics:base-metrics"), SERVER_PORT=42)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.content, b'arkindex_instance{hostname="hostname", env="test"} 1')
+
+    @override_settings(PROMETHEUS_METRICS_PORT=42, PUBLIC_HOSTNAME="hostname", ARKINDEX_ENV="test")
+    def test_metrics_tasks(self):
+        # Check that the endpoint also reports the Pending/Unscheduled task counts
+        # per farm and the last ping of each agent, after the instance metric
+
+        # Retrieve objects
+        process = Process.objects.get(mode=ProcessMode.Workers)
+        process.run()
+        task1, task2, task3 = process.tasks.all().order_by("depth")
+
+        farm = Farm.objects.first()
+        agent = Agent.objects.create(
+            mode=AgentMode.Docker,
+            hostname="Demo Agent",
+            farm=farm,
+            # Follows the USE_TZ setting, so no naive datetime is stored when it is enabled
+            last_ping=timezone.now(),
+            cpu_cores=42,
+            cpu_frequency=42e8,
+            ram_total=42e3
+        )
+        last_ping = int(agent.last_ping.timestamp())
+
+        # Put the tasks in various states; only Pending and Unscheduled are expected below
+        task1.state = State.Pending
+        task1.save()
+        task2.state = State.Error
+        task2.save()
+        task3.state = State.Unscheduled
+        task3.save()
+
+        response = self.client.get(reverse("metrics:base-metrics"), SERVER_PORT=42)
+        self.assertEqual(response.status_code, 200)
+
+        metrics = """arkindex_instance{hostname="hostname", env="test"} 1
+arkindex_tasks{hostname="hostname", env="test", state="Pending", farm="Wheat farm"} 1
+arkindex_tasks{hostname="hostname", env="test", state="Unscheduled", farm="Wheat farm"} 1
+arkindex_agent_ping{hostname="hostname", env="test", agent_name="Demo Agent", farm="Wheat farm"} """ + str(last_ping)
+        self.assertEqual(response.content, metrics.encode("utf-8"))
diff --git a/arkindex/metrics/utils.py b/arkindex/metrics/utils.py
index b9e39c9863618845cad446d1ef61524feec82888..c3dae1e3402a34c66711971cdad511c5f0d8d257 100644
--- a/arkindex/metrics/utils.py
+++ b/arkindex/metrics/utils.py
@@ -1,5 +1,23 @@
+def _render_attribute(key, value):
+    """
+    Render a single label as key="value" for the text exposition format.
+
+    A None value is rendered as "null". Backslashes, double quotes and line feeds
+    are escaped so that a value cannot break out of the quoted label.
+    """
+    if value is None:
+        value = "null"
+    escaped = (
+        f"{value}"
+        .replace("\\", "\\\\")
+        .replace('"', '\\"')
+        .replace("\n", "\\n")
+    )
+    return f'{key}="{escaped}"'
+
+
 def build_metric(label, attributes={}, value=1, timestamp=None):
-    attrs_fmt = ", ".join(["=".join((k, f'"{v}"')) for k, v in attributes.items()])
+    attrs_fmt = ", ".join([_render_attribute(k, v) for k, v in attributes.items()])
     metric = f"{label}{{{attrs_fmt}}} {value}"
     if timestamp:
         metric = f"{metric} {timestamp}"
diff --git a/arkindex/metrics/views.py b/arkindex/metrics/views.py
index deca4c0b64827a771017c9bdc8e760fa339d6e80..b1564f52818538f3836a813a7945fba9d3773c41 100644
--- a/arkindex/metrics/views.py
+++ b/arkindex/metrics/views.py
@@ -1,21 +1,70 @@
 from django.conf import settings
+from django.db.models import Count
 from django.http import Http404, HttpResponse
 from django.views import View
 
 from arkindex.metrics.utils import build_metric
+from arkindex.ponos.models import Agent, Farm, State, Task
 
 
 class MetricsView(View):
     def get(self, request, *args, **kwargs):
+        """
+        Expose the instance, tasks and agents metrics as a plain text response.
+
+        Raises Http404 when the port of the request differs from the
+        PROMETHEUS_METRICS_PORT setting.
+        """
         if settings.PROMETHEUS_METRICS_PORT != int(request.get_port()):
             raise Http404()
-        return HttpResponse(
+
+        common_attributes = {
+            "hostname": settings.PUBLIC_HOSTNAME,
+            "env": settings.ARKINDEX_ENV
+        }
+
+        # Count nb of tasks in pending or unscheduled states
+        # grouped by farm
+        tasks_count = (
+            Task.objects
+            .filter(state__in=(State.Pending, State.Unscheduled))
+            .values("process__farm", "state")
+            .annotate(nb=Count("id"))
+        )
+
+        # Load all ponos agents details
+        agents = Agent.objects.all().values("hostname", "farm_id", "last_ping")
+
+        # Load all farms to use in attributes of agents & tasks
+        farms = dict(Farm.objects.all().values_list("id", "name"))
+
+        metrics = [
+            # Backend ping
             build_metric(
                 "arkindex_instance",
-                {
-                    "hostname": settings.PUBLIC_HOSTNAME,
-                    "env": settings.ARKINDEX_ENV
-                }
+                attributes=common_attributes
             ),
+        ] + [
+
+            # Pending or Unscheduled tasks count
+            build_metric(
+                "arkindex_tasks",
+                attributes={**common_attributes, "state": t["state"], "farm": farms.get(t["process__farm"])},
+                value=t["nb"]
+            )
+            for t in tasks_count
+        ] + [
+            # Ponos agent last known ping
+            build_metric(
+                "arkindex_agent_ping",
+                attributes={**common_attributes, "agent_name": a["hostname"], "farm": farms.get(a["farm_id"])},
+                value=int(a["last_ping"].timestamp())
+            )
+            for a in agents
+        ]
+
+        # Render text response with all metrics
+        return HttpResponse(
+            "\n".join(metrics),
             content_type="text/plain"
         )