diff --git a/arkindex/dataimport/admin.py b/arkindex/dataimport/admin.py
index c6fc5408ac25c29bccc77956ffee26b71382374c..9188f0d2a2d9594bb342a87ce51783e09394217a 100644
--- a/arkindex/dataimport/admin.py
+++ b/arkindex/dataimport/admin.py
@@ -38,7 +38,7 @@ class DataImportAdmin(admin.ModelAdmin):
     fieldsets = (
         (None, {'fields': ('id', 'name', 'creator', 'corpus', 'state', 'mode', 'workflow', 'activity_state', 'template')}),
         ('Elements filters', {
-            'fields': ('element', 'element_type', 'folder_type', 'name_contains', 'best_class')
+            'fields': ('element', 'element_type', 'folder_type', 'name_contains')
         }),
     )
     readonly_fields = ('id', 'name', 'workflow', 'state', 'activity_state', 'template')
diff --git a/arkindex/dataimport/api.py b/arkindex/dataimport/api.py
index 35cc10699ee2daac3df4cbffd6d1daefe3285f3c..69f8a1943e4d40669711ea6eed8093ec18dbfd43 100644
--- a/arkindex/dataimport/api.py
+++ b/arkindex/dataimport/api.py
@@ -392,7 +392,7 @@ class CorpusWorkflow(SelectionMixin, CorpusACLMixin, CreateAPIView):
         serializer = self.get_serializer(data=request.data)
         serializer.is_valid(raise_exception=True)
 
-        corpus, element, process_name, name_contains, element_type, selection, best_class, load_children, use_cache = map(
+        corpus, element, process_name, name_contains, element_type, selection, load_children, use_cache = map(
             lambda key: serializer.validated_data.pop(key, None),
             (
                 'corpus',
@@ -401,7 +401,6 @@ class CorpusWorkflow(SelectionMixin, CorpusACLMixin, CreateAPIView):
                 'element_name_contains',
                 'element_type',
                 'selection',
-                'best_class',
                 'load_children',
                 'use_cache',
             )
@@ -414,7 +413,6 @@ class CorpusWorkflow(SelectionMixin, CorpusACLMixin, CreateAPIView):
             name_contains=name_contains,
             element_type=element_type,
             element=element,
-            best_class=best_class,
             load_children=load_children,
             use_cache=use_cache,
         )
diff --git a/arkindex/dataimport/migrations/0045_remove_dataimport_best_class.py b/arkindex/dataimport/migrations/0045_remove_dataimport_best_class.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c718179ed51af5f916c951b7f4b21acacda81d1
--- /dev/null
+++ b/arkindex/dataimport/migrations/0045_remove_dataimport_best_class.py
@@ -0,0 +1,17 @@
+# Generated by Django 3.2.6 on 2021-12-14 14:32
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('dataimport', '0044_alter_workerconfiguration_configuration'),
+    ]
+
+    operations = [
+        migrations.RemoveField(
+            model_name='dataimport',
+            name='best_class',
+        ),
+    ]
diff --git a/arkindex/dataimport/models.py b/arkindex/dataimport/models.py
index f19f614ba66379dd127d850d781844d48cdc5407..67b9e439f23b8874a9ceeccfda2892d82d017807 100644
--- a/arkindex/dataimport/models.py
+++ b/arkindex/dataimport/models.py
@@ -2,7 +2,6 @@ import shlex
 import urllib.parse
 import uuid
 from os import path
-from uuid import UUID
 
 import yaml
 from django.conf import settings
@@ -16,7 +15,7 @@ from rest_framework.exceptions import ValidationError
 from arkindex.dataimport.managers import ActivityManager, CorpusWorkerVersionManager
 from arkindex.dataimport.providers import get_provider, git_providers
 from arkindex.dataimport.utils import get_default_farm_id
-from arkindex.documents.models import ClassificationState, Element
+from arkindex.documents.models import Element
 from arkindex.project.aws import S3FileMixin, S3FileStatus
 from arkindex.project.fields import ArrayField, MD5HashField
 from arkindex.project.models import IndexableModel
@@ -86,10 +85,6 @@ class DataImport(IndexableModel):
     # Used to filter out elements with a name that doesn't contain the substring, only on Workers imports
     name_contains = models.CharField(null=True, blank=True, max_length=250)
 
-    # Used to filter elements by best class, either having or not having a best class (boolean)
-    # or having a specific best class (UUID), only on Workers imports
-    best_class = models.CharField(null=True, blank=True, max_length=36)
-
     # Used to save a user's selection for Element workflows
     elements = models.ManyToManyField(
         'documents.Element',
@@ -150,28 +145,6 @@ class DataImport(IndexableModel):
 
         return filters
 
-    def _get_classifications_filters(self):
-        if self.best_class is None:
-            return
-
-        # Generic ORM query to find best classes:
-        # - elements with a validated classification
-        # - OR where high confidence is True
-        best_classifications = Q(classifications__state=ClassificationState.Validated) \
-            | Q(classifications__high_confidence=True)
-
-        # List elements without any best classes, by inverting the query above
-        if self.best_class in ('false', '0'):
-            return ~best_classifications
-
-        try:
-            # Filter on a specific class
-            class_filter = UUID(self.best_class)
-            return best_classifications & Q(classifications__ml_class_id=class_filter)
-        except (TypeError, ValueError):
-            # By default, use all best classifications
-            return best_classifications
-
     def copy_runs(self, new_process):
         """
         Copies this process' WorkerRuns to another process.
@@ -237,13 +210,7 @@ class DataImport(IndexableModel):
             elements = Element.objects.filter(corpus=self.corpus_id)
 
         # Filter elements depending on process properties
-        elements = elements.filter(**self._get_filters())
-        class_filters = self._get_classifications_filters()
-        if class_filters is not None:
-            # Distinct is required because multiple classes may match the filter
-            elements = elements.filter(class_filters).distinct()
-
-        return elements
+        return elements.filter(**self._get_filters())
 
     def build_workflow(self, farm=None, chunks=None, thumbnails=False, corpus_id=None):
         '''
diff --git a/arkindex/dataimport/serializers/imports.py b/arkindex/dataimport/serializers/imports.py
index 3b6d47fd4dbcb5f9e6f666508367160d1be7ff5b..cadeb7f46a0c7a46225ea67aa09efd494f235922 100644
--- a/arkindex/dataimport/serializers/imports.py
+++ b/arkindex/dataimport/serializers/imports.py
@@ -1,5 +1,3 @@
-from uuid import UUID
-
 from django.conf import settings
 from rest_framework import serializers
 from rest_framework.exceptions import PermissionDenied, ValidationError
@@ -17,7 +15,7 @@ from arkindex.dataimport.serializers.workers import WorkerLightSerializer
 from arkindex.documents.models import Corpus, Element, ElementType
 from arkindex.documents.serializers.elements import ElementSlimSerializer
 from arkindex.project.mixins import ProcessACLMixin
-from arkindex.project.serializer_fields import BestClassField, EnumField, LinearRingField
+from arkindex.project.serializer_fields import EnumField, LinearRingField
 from arkindex.users.models import Role
 from arkindex.users.utils import get_max_level
 from ponos.models import Farm, State
@@ -315,7 +313,6 @@ class ElementsWorkflowSerializer(serializers.Serializer):
     element = serializers.UUIDField(required=False)
     element_name_contains = serializers.CharField(required=False, max_length=250)
     element_type = serializers.SlugField(required=False, max_length=50)
-    best_class = BestClassField(required=False)
     selection = serializers.BooleanField(default=False)
     load_children = serializers.BooleanField(default=False)
     use_cache = serializers.BooleanField(default=False)
@@ -350,7 +347,6 @@ class ElementsWorkflowSerializer(serializers.Serializer):
 
         element = data.get('element')
         elt_type = data.get('element_type')
-        best_class = data.get('best_class')
         selection = data.get('selection')
 
         # Ensure element is a folder or has a zone
@@ -381,11 +377,6 @@ class ElementsWorkflowSerializer(serializers.Serializer):
                 'element_type': [f'This type does not exist in corpus "{corpus.name}"']
             })
 
-        if best_class is not None and isinstance(best_class, UUID) and not corpus.ml_classes.filter(id=best_class).exists():
-            raise ValidationError({
-                'best_class': [f'MLClass with ID {best_class} does not exist in corpus "{corpus.name}"']
-            })
-
         return data
 
 
diff --git a/arkindex/dataimport/tests/test_process_elements.py b/arkindex/dataimport/tests/test_process_elements.py
index 469bcf5e1c03b59d04a20bce70a2176c348ae265..fab9e0f1a9ec578162489e5082ddfeeac856992e 100644
--- a/arkindex/dataimport/tests/test_process_elements.py
+++ b/arkindex/dataimport/tests/test_process_elements.py
@@ -3,8 +3,8 @@ import uuid
 
 from django.urls import reverse
 from rest_framework import status
-from arkindex.dataimport.models import DataImport, DataImportMode, WorkerVersion
-from arkindex.documents.models import Classification, ClassificationState, Corpus, Element, MLClass
+from arkindex.dataimport.models import DataImport, DataImportMode
+from arkindex.documents.models import Corpus, Element
 from arkindex.images.models import Image
 from arkindex.project.tests import FixtureAPITestCase
 from arkindex.users.models import User
@@ -132,47 +132,6 @@ class TestProcessElements(FixtureAPITestCase):
         cls.line_4.add_parent(cls.page_4)
         cls.line_5.add_parent(cls.page_5)
 
-        # Create best classes
-        worker_version = WorkerVersion.objects.get(worker__slug='reco')
-        cls.coffee_class = MLClass.objects.create(name='C0FFEE', corpus=cls.private_corpus)
-        cls.food_class = MLClass.objects.create(name='F00D', corpus=cls.private_corpus)
-        Classification.objects.create(
-            element=cls.folder_2,
-            state=ClassificationState.Validated,
-            ml_class=cls.food_class,
-            worker_version=worker_version,
-        )
-        Classification.objects.create(
-            element=cls.page_1,
-            state=ClassificationState.Validated,
-            ml_class=cls.coffee_class,
-            worker_version=worker_version,
-        )
-        Classification.objects.create(
-            element=cls.page_2,
-            high_confidence=True,
-            ml_class=cls.food_class,
-            worker_version=worker_version,
-        )
-        Classification.objects.create(
-            element=cls.page_3,
-            state=ClassificationState.Validated,
-            ml_class=cls.food_class,
-            worker_version=worker_version,
-        )
-        Classification.objects.create(
-            element=cls.page_5,
-            high_confidence=True,
-            ml_class=cls.food_class,
-            worker_version=worker_version,
-        )
-        Classification.objects.create(
-            element=cls.page_5,
-            state=ClassificationState.Validated,
-            ml_class=cls.coffee_class,
-            worker_version=worker_version,
-        )
-
     def setUp(self):
         super().setUp()
         self.dataimport = DataImport.objects.create(
@@ -338,69 +297,6 @@ class TestProcessElements(FixtureAPITestCase):
             for element in elements
         ])
 
-    def test_filter_best_class_by_id(self):
-        self.dataimport.best_class = self.food_class.id
-        self.dataimport.save()
-        elements = [self.page_5, self.page_3, self.folder_2, self.page_2]
-
-        self.client.force_login(self.superuser)
-        with self.assertNumQueries(6):
-            response = self.client.get(reverse('api:process-elements-list', kwargs={'pk': self.dataimport.id}))
-        self.assertEqual(response.status_code, status.HTTP_200_OK)
-        data = response.json()
-        self.assertEqual(data["count"], None)
-        self.assertEqual(data["next"], None)
-        self.assertCountEqual(data["results"], [
-            {
-                'id': str(element.id),
-                'type': element.type.slug,
-                'name': element.name
-            }
-            for element in elements
-        ])
-
-    def test_filter_any_best_class(self):
-        self.dataimport.best_class = "true"
-        self.dataimport.save()
-        elements = [self.page_1, self.page_5, self.page_3, self.folder_2, self.page_2]
-
-        self.client.force_login(self.superuser)
-        with self.assertNumQueries(6):
-            response = self.client.get(reverse('api:process-elements-list', kwargs={'pk': self.dataimport.id}))
-        self.assertEqual(response.status_code, status.HTTP_200_OK)
-        data = response.json()
-        self.assertEqual(data["count"], None)
-        self.assertEqual(data["next"], None)
-        self.assertCountEqual(data["results"], [
-            {
-                'id': str(element.id),
-                'type': element.type.slug,
-                'name': element.name
-            }
-            for element in elements
-        ])
-
-    def test_filter_no_best_class(self):
-        self.dataimport.best_class = "false"
-        self.dataimport.save()
-        elements = [self.folder_1, self.line_1, self.line_2, self.line_3, self.line_4, self.line_5, self.page_4]
-
-        self.client.force_login(self.superuser)
-        with self.assertNumQueries(6):
-            response = self.client.get(reverse('api:process-elements-list', kwargs={'pk': self.dataimport.id}))
-        self.assertEqual(response.status_code, status.HTTP_200_OK)
-        data = response.json()
-        self.assertEqual(data["count"], None)
-        self.assertEqual(data["next"], None)
-        self.assertCountEqual(data["results"], [
-            {
-                'id': str(element.id),
-                'type': element.type.slug,
-                'name': element.name
-            }
-            for element in elements
-        ])
-
     def test_filter_element(self):
         self.dataimport.element = self.page_1
         self.dataimport.save()
@@ -487,73 +383,6 @@ class TestProcessElements(FixtureAPITestCase):
             for element in elements
         ])
 
-    def test_load_children_and_filter_best_class_by_id(self):
-        self.dataimport.best_class = self.food_class.id
-        self.dataimport.load_children = True
-        self.dataimport.save()
-        elements = [self.folder_2, self.page_2, self.page_3, self.page_5]
-
-        self.client.force_login(self.superuser)
-        with self.assertNumQueries(6):
-            response = self.client.get(reverse('api:process-elements-list', kwargs={'pk': self.dataimport.id}))
-        self.assertEqual(response.status_code, status.HTTP_200_OK)
-        data = response.json()
-        self.assertEqual(data["count"], None)
-        self.assertEqual(data["next"], None)
-        self.assertCountEqual(data["results"], [
-            {
-                'id': str(element.id),
-                'type': element.type.slug,
-                'name': element.name
-            }
-            for element in elements
-        ])
-
-    def test_load_children_and_filter_best_class(self):
-        self.dataimport.best_class = "true",
-        self.dataimport.load_children = True
-        self.dataimport.save()
-        elements = [self.page_1, self.page_5, self.page_3, self.folder_2, self.page_2]
-
-        self.client.force_login(self.superuser)
-        with self.assertNumQueries(6):
-            response = self.client.get(
-                reverse('api:process-elements-list', kwargs={'pk': self.dataimport.id}))
-        self.assertEqual(response.status_code, status.HTTP_200_OK)
-        data = response.json()
-        self.assertEqual(data["count"], None)
-        self.assertEqual(data["next"], None)
-        self.assertCountEqual(data["results"], [
-            {
-                'id': str(element.id),
-                'type': element.type.slug,
-                'name': element.name
-            }
-            for element in elements
-        ])
-
-    def test_load_children_and_filter_no_best_class(self):
-        self.dataimport.best_class = "false"
-        self.dataimport.load_children = True
-        self.dataimport.save()
-        elements = [self.folder_1, self.line_1, self.line_2, self.line_3, self.line_4, self.line_5, self.page_4]
-
-        self.client.force_login(self.superuser)
-        with self.assertNumQueries(6):
-            response = self.client.get(reverse('api:process-elements-list', kwargs={'pk': self.dataimport.id}))
-        self.assertEqual(response.status_code, status.HTTP_200_OK)
-        data = response.json()
-        self.assertEqual(data["count"], None)
-        self.assertEqual(data["next"], None)
-        self.assertCountEqual(data["results"], [
-            {
-                'id': str(element.id),
-                'type': element.type.slug,
-                'name': element.name
-            }
-            for element in elements
-        ])
-
     def test_load_children_and_filter_element(self):
         self.dataimport.element = self.folder_1
         self.dataimport.load_children = True
diff --git a/arkindex/dataimport/tests/test_workeractivity.py b/arkindex/dataimport/tests/test_workeractivity.py
index 3b7bf82cd76561f0456231848a18c18ffebafee0..5616ef68efd7b0a6ded8887e050ddc6a8665a724 100644
--- a/arkindex/dataimport/tests/test_workeractivity.py
+++ b/arkindex/dataimport/tests/test_workeractivity.py
@@ -15,7 +15,7 @@ from arkindex.dataimport.models import (
     WorkerVersion,
 )
 from arkindex.dataimport.tasks import initialize_activity
-from arkindex.documents.models import Classification, ClassificationState, Corpus, Element, MLClass
+from arkindex.documents.models import Corpus, Element
 from arkindex.project.tests import FixtureTestCase
 from arkindex.users.models import User
 
@@ -101,35 +101,6 @@ class TestWorkerActivity(FixtureTestCase):
         self.assertEqual(WorkerActivity.objects.filter(state=WorkerActivityState.Started).count(), 3)
         self.assertEqual(WorkerActivity.objects.filter(process=old_process).count(), 2)
 
-    @patch('arkindex.project.triggers.dataimport_tasks.initialize_activity.delay')
-    def test_bulk_insert_children_class_filter(self, activities_delay_mock):
-        """
-        Worker activities creation should work with complex elements selection (e.g. with a class filter)
-        """
-        # Mock workers activity to make it a synchronous job
-        activities_delay_mock.side_effect = initialize_activity
-
-        agent_class = MLClass.objects.create(name='James', corpus=self.corpus)
-        Classification.objects.bulk_create(
-            Classification(ml_class=agent_class, state=ClassificationState.Validated, element=e)
-            for e in self.corpus.elements.filter(type__slug='page')
-        )
-        dataimport = self.corpus.imports.create(
-            activity_state=ActivityState.Pending,
-            creator=self.user,
-            mode=DataImportMode.Workers,
-            corpus=self.corpus,
-            best_class=agent_class.name
-        )
-        dataimport.worker_runs.create(version=self.worker_version, parents=[], configuration=self.configuration)
-        with self.assertNumQueries(22):
-            dataimport.start()
-
-        self.assertCountEqual(
-            WorkerActivity.objects.filter(worker_version=self.worker_version).values_list('element_id', flat=True),
-            self.corpus.elements.filter(type__slug='page').values_list('id', flat=True)
-        )
-
     def test_put_activity_requires_internal(self):
         """
         Only internal users (workers) are able to update the state of a worker activity
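
Note: this diff removes best-class filtering from DataImport entirely. For anyone who still needs the old behaviour outside the process model, the logic deleted from DataImport._get_classifications_filters() above can be reproduced as a standalone queryset helper. The sketch below is illustrative only: the helper name filter_by_best_class is hypothetical, while the Q filters, the 'false'/'0' convention, and the distinct() call are taken from the removed code.

from uuid import UUID

from django.db.models import Q

from arkindex.documents.models import ClassificationState


def filter_by_best_class(elements, best_class):
    """
    Hypothetical helper mirroring the removed DataImport.best_class filter.
    `elements` is an Element queryset; `best_class` follows the old CharField
    convention: None (no filtering), 'false'/'0' (elements without a best
    class), an MLClass UUID string (a specific best class), or any other
    value (elements with any best class).
    """
    if best_class is None:
        return elements

    # A "best" classification is either validated or flagged high-confidence,
    # as in the removed _get_classifications_filters() method
    best_classifications = Q(classifications__state=ClassificationState.Validated) \
        | Q(classifications__high_confidence=True)

    if best_class in ('false', '0'):
        # Keep only elements without any best class, by inverting the query above
        filters = ~best_classifications
    else:
        try:
            # Keep elements whose best class is a specific MLClass
            filters = best_classifications & Q(classifications__ml_class_id=UUID(best_class))
        except (TypeError, ValueError):
            # By default, keep any element that has a best class
            filters = best_classifications

    # Distinct is required because multiple classifications may match the filter
    return elements.filter(filters).distinct()

For example, filter_by_best_class(corpus.elements.all(), str(ml_class.id)) would keep only the elements whose best class is that MLClass, matching what the removed test_filter_best_class_by_id test exercised through the API.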