From fe0fc10bd0774909922a8fd71dc45b77dfe132c8 Mon Sep 17 00:00:00 2001 From: Valentin Rigal <rigal@teklia.com> Date: Tue, 10 Oct 2023 13:05:16 +0000 Subject: [PATCH] Fix ponos tasks assignment failures --- arkindex/ponos/models.py | 3 ++ arkindex/ponos/tests/test_api.py | 55 ++++++++++++++++++++++++-------- 2 files changed, 45 insertions(+), 13 deletions(-) diff --git a/arkindex/ponos/models.py b/arkindex/ponos/models.py index 907aee6680..bcf831a13b 100644 --- a/arkindex/ponos/models.py +++ b/arkindex/ponos/models.py @@ -159,6 +159,9 @@ class Agent(models.Model): :param tasks: Number of tasks to estimate the cost for. :returns: A cost expressed as a percentage. If > 1, the agent would be overloaded. """ + if self.cpu_load is None or self.ram_load is None: + # The agent has not shared its state yet + return 1 current_tasks_count = getattr(self, 'current_tasks', 0) if current_tasks_count + AGENT_SLOT["cpu"] >= self.cpu_cores: return 1 diff --git a/arkindex/ponos/tests/test_api.py b/arkindex/ponos/tests/test_api.py index 72b96030d1..2c88c3e28b 100644 --- a/arkindex/ponos/tests/test_api.py +++ b/arkindex/ponos/tests/test_api.py @@ -56,6 +56,8 @@ class TestAPI(FixtureAPITestCase): public_key=pubkey, ram_total=2e9, last_ping=timezone.now(), + cpu_load=.1, + ram_load=.1e9, ) cls.rev = Revision.objects.first() cls.process = Process.objects.get(mode=ProcessMode.Workers) @@ -834,7 +836,7 @@ class TestAPI(FixtureAPITestCase): "agent": { "cpu_cores": 2, "cpu_frequency": 1000000000, - "cpu_load": None, + "cpu_load": .1, "farm": {"id": str(self.agent.farm_id), "name": "Wheat farm"}, "gpus": [ { @@ -853,7 +855,7 @@ class TestAPI(FixtureAPITestCase): "hostname": "ghostname", "id": str(self.agent.id), "last_ping": str_date(self.agent.last_ping), - "ram_load": None, + "ram_load": 100000000, "ram_total": 2000000000, }, "gpu": { @@ -890,7 +892,7 @@ class TestAPI(FixtureAPITestCase): "agent": { "cpu_cores": 2, "cpu_frequency": 1000000000, - "cpu_load": None, + "cpu_load": .1, "farm": {"id": str(self.agent.farm_id), "name": "Wheat farm"}, "gpus": [ { @@ -909,7 +911,7 @@ class TestAPI(FixtureAPITestCase): "hostname": "ghostname", "id": str(self.agent.id), "last_ping": str_date(self.agent.last_ping), - "ram_load": None, + "ram_load": 100000000, "ram_total": 2000000000, }, "gpu": { @@ -1650,7 +1652,7 @@ class TestAPI(FixtureAPITestCase): { "cpu_cores": 12, "cpu_frequency": 1000000000, - "cpu_load": None, + "cpu_load": .1, "farm": str(self.wheat_farm.id), "gpus": [ { @@ -1663,7 +1665,7 @@ class TestAPI(FixtureAPITestCase): "hostname": "ghostname", "id": str(self.agent.id), "last_ping": "2000-01-01T12:00:00Z", - "ram_load": None, + "ram_load": 100000000, "ram_total": 32000000000, }, ) @@ -1859,25 +1861,52 @@ class TestAPI(FixtureAPITestCase): }, ) + def test_agent_null_state(self): + """ + Agents with unknown CPU or RAM load are excluded by the assignation algorithm + """ + self.agent.cpu_load = None + self.agent.ram_load = None + self.agent.save() + pubkey = build_public_key() + second_agent = AgentUser.objects.create( + id=hashlib.md5(pubkey.encode("utf-8")).hexdigest(), + farm=self.wheat_farm, + hostname="new agent", + cpu_cores=2, + cpu_frequency=1e9, + public_key=pubkey, + ram_total=1e9, + last_ping=timezone.now(), + ) + with self.assertNumQueries(6): + resp = self.client.get( + reverse("api:agent-actions"), + HTTP_AUTHORIZATION=f'Bearer {second_agent.token.access_token}', + data={"cpu_load": 1.9, "ram_load": 0.49}, + ) + self.assertEqual(resp.status_code, status.HTTP_200_OK) + self.assertDictEqual(resp.json(), {"actions": []}) + def test_agent_non_pending_actions(self): """ Only pending tasks may be retrieved as new actions """ - self.process.tasks.update(state=State.Error) + Task.objects.filter(process__farm=self.agent.farm_id).update(state=State.Error) with self.assertNumQueries(7): resp = self.client.get( reverse("api:agent-actions"), HTTP_AUTHORIZATION=f'Bearer {self.agent.token.access_token}', data={"cpu_load": 0.9, "ram_load": 0.49}, ) - self.assertEqual(resp.status_code, status.HTTP_200_OK) + self.assertEqual(resp.status_code, status.HTTP_200_OK) self.assertDictEqual(resp.json(), {"actions": []}) def test_agent_no_stealing(self): """ An agent may not take another agent's tasks """ - self.process.tasks.update(agent=self.agent, state=State.Pending) + Task.objects.filter(process__farm=self.agent.farm_id).update(agent=self.agent, state=State.Pending) pubkey = build_public_key() agent2 = AgentUser.objects.create( id=uuid.UUID(hashlib.md5(pubkey.encode("utf-8")).hexdigest()), @@ -2377,7 +2406,7 @@ class TestAPI(FixtureAPITestCase): "active": True, "cpu_cores": 2, "cpu_frequency": 1000000000, - "cpu_load": None, + "cpu_load": .1, "farm": { "id": str(self.wheat_farm.id), "name": "Wheat farm", @@ -2397,7 +2426,7 @@ class TestAPI(FixtureAPITestCase): }, ], "hostname": "ghostname", - "ram_load": None, + "ram_load": 100000000, "ram_total": 2000000000, "running_tasks_count": 1, }, @@ -2458,7 +2487,7 @@ class TestAPI(FixtureAPITestCase): "active": True, "cpu_cores": 2, "cpu_frequency": 1000000000, - "cpu_load": None, + "cpu_load": .1, "farm": { "id": str(self.wheat_farm.id), "name": "Wheat farm", @@ -2478,7 +2507,7 @@ class TestAPI(FixtureAPITestCase): }, ], "hostname": "ghostname", - "ram_load": None, + "ram_load": 100000000, "ram_total": 2000000000, "running_tasks": [ { -- GitLab