From c35215fb91f51d5d4b2d8f364320530ff588fc3c Mon Sep 17 00:00:00 2001 From: mlbonhomme <bonhomme@teklia.com> Date: Fri, 13 May 2022 15:21:03 +0000 Subject: [PATCH] support worker runs on metadata --- arkindex/documents/admin.py | 4 +- .../0053_metadata_worker_run_and_more.py | 24 ++++ arkindex/documents/models.py | 14 +++ arkindex/documents/serializers/elements.py | 33 ++++- arkindex/documents/tests/test_metadata.py | 116 ++++++++++++++++-- arkindex/sql_validation/indexer_prefetch.sql | 3 +- 6 files changed, 178 insertions(+), 16 deletions(-) create mode 100644 arkindex/documents/migrations/0053_metadata_worker_run_and_more.py diff --git a/arkindex/documents/admin.py b/arkindex/documents/admin.py index d77a8bef34..d7228c4abb 100644 --- a/arkindex/documents/admin.py +++ b/arkindex/documents/admin.py @@ -76,12 +76,12 @@ class MetaDataAdmin(admin.ModelAdmin): list_display = ('id', 'name', 'type', ) list_filter = [('type', EnumFieldListFilter), ] readonly_fields = ('id', ) - raw_id_fields = ('element', 'entity', 'worker_version') + raw_id_fields = ('element', 'entity', 'worker_version', 'worker_run') class MetaDataInline(admin.TabularInline): model = MetaData - raw_id_fields = ('entity', 'worker_version', ) + raw_id_fields = ('entity', 'worker_version', 'worker_run') class ElementAdmin(admin.ModelAdmin): diff --git a/arkindex/documents/migrations/0053_metadata_worker_run_and_more.py b/arkindex/documents/migrations/0053_metadata_worker_run_and_more.py new file mode 100644 index 0000000000..e87139d38e --- /dev/null +++ b/arkindex/documents/migrations/0053_metadata_worker_run_and_more.py @@ -0,0 +1,24 @@ +# Generated by Django 4.0.2 on 2022-05-13 14:54 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('dataimport', '0047_workerversion_model_usage'), + ('documents', '0052_element_worker_run_and_more'), + ] + + operations = [ + migrations.AddField( + model_name='metadata', + name='worker_run', + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.DO_NOTHING, related_name='metadatas', to='dataimport.workerrun'), + ), + migrations.AddConstraint( + model_name='metadata', + constraint=models.CheckConstraint(check=models.Q(('worker_version_id__isnull', False), ('worker_run_id__isnull', True), _connector='OR'), name='metadata_worker_run_requires_worker_version'), + ), + ] diff --git a/arkindex/documents/models.py b/arkindex/documents/models.py index 8610e87b3e..d5f58c9e91 100644 --- a/arkindex/documents/models.py +++ b/arkindex/documents/models.py @@ -772,6 +772,13 @@ class MetaData(InterpretedDateMixin, models.Model): null=True, blank=True, ) + worker_run = models.ForeignKey( + 'dataimport.WorkerRun', + related_name='metadatas', + on_delete=models.DO_NOTHING, + null=True, + blank=True + ) class Meta: ordering = ('element', 'name', 'id') @@ -785,6 +792,13 @@ class MetaData(InterpretedDateMixin, models.Model): # have to use RawSQL. https://code.djangoproject.com/ticket/31646 check=~Q(type=MetaType.Numeric) | RawSQL('value::double precision IS NOT NULL', params=(), output_field=models.BooleanField()), name='metadata_numeric_values' + ), + # There can be a worker run ID only if there is a worker version ID, + # but there can be a worker version ID without a worker run ID (backwards compatibility). + # In other words, either the worker run ID is null, or the worker version ID is not null. + models.CheckConstraint( + check=Q(worker_version_id__isnull=False) | Q(worker_run_id__isnull=True), + name='metadata_worker_run_requires_worker_version', ) ] indexes = [ diff --git a/arkindex/documents/serializers/elements.py b/arkindex/documents/serializers/elements.py index 5987f7f88e..d50399d368 100644 --- a/arkindex/documents/serializers/elements.py +++ b/arkindex/documents/serializers/elements.py @@ -110,11 +110,12 @@ class MetaDataUpdateSerializer(MetaDataSerializer): class Meta: model = MetaData - fields = MetaDataSerializer.Meta.fields + ('entity_id', 'worker_version', ) - read_only_fields = ('worker_version', ) + fields = MetaDataSerializer.Meta.fields + ('entity_id', 'worker_version', 'worker_run_id', ) + read_only_fields = ('worker_version', 'worker_run_id', ) validators = [ metadata_number_validator, metadata_url_validator, + WorkerRunOrVersionValidator(worker_version_field='worker_version', ), ] def __init__(self, *args, **kwargs): @@ -126,10 +127,15 @@ class MetaDataUpdateSerializer(MetaDataSerializer): # self.instance can be a queryset if this serializer is used for a list; just ignore anything that isn't a metadata elif isinstance(self.instance, MetaData): corpus_id = self.instance.element.corpus_id - if corpus_id: self.fields['entity_id'].queryset = Entity.objects.filter(corpus_id=corpus_id) + def validate(self, data): + worker_run = data.get('worker_run') + if worker_run is not None: + data['worker_version'] = WorkerVersion(id=worker_run.version_id) + return data + class MetaDataCreateSerializer(MetaDataUpdateSerializer): worker_version = serializers.PrimaryKeyRelatedField( @@ -138,6 +144,13 @@ class MetaDataCreateSerializer(MetaDataUpdateSerializer): allow_null=True, style={'base_template': 'input.html'}, ) + worker_run_id = serializers.PrimaryKeyRelatedField( + queryset=WorkerRun.objects.all(), + required=False, + allow_null=True, + style={'base_template': 'input.html'}, + source='worker_run' + ) class Meta(MetaDataUpdateSerializer.Meta): read_only_fields = () @@ -179,6 +192,16 @@ class MetaDataBulkSerializer(serializers.Serializer): allow_null=True, style={'base_template': 'input.html'}, ) + worker_run_id = serializers.PrimaryKeyRelatedField( + queryset=WorkerRun.objects.all(), + required=False, + allow_null=True, + style={'base_template': 'input.html'}, + source='worker_run' + ) + + class Meta: + validators = [WorkerRunOrVersionValidator(worker_version_field='worker_version')] def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -197,6 +220,9 @@ class MetaDataBulkSerializer(serializers.Serializer): def validate(self, data): data = super().validate(data) + worker_run = data.get('worker_run') + if worker_run is not None: + data['worker_version'] = WorkerVersion(id=worker_run.version_id) request_metadata = self.make_metadata_tuples(data['metadata_list']) unique_metadata = set(request_metadata) if len(unique_metadata) != len(request_metadata): @@ -216,6 +242,7 @@ class MetaDataBulkSerializer(serializers.Serializer): id=uuid.uuid4(), element=self.context['element'], worker_version=validated_data.get('worker_version', None), + worker_run=validated_data.get('worker_run', None), **m ) for m in validated_data['metadata_list'] diff --git a/arkindex/documents/tests/test_metadata.py b/arkindex/documents/tests/test_metadata.py index 0df3d4402e..4313dc4239 100644 --- a/arkindex/documents/tests/test_metadata.py +++ b/arkindex/documents/tests/test_metadata.py @@ -28,6 +28,7 @@ class TestMetaData(FixtureAPITestCase): ) ) cls.worker_version = WorkerVersion.objects.first() + cls.worker_run = cls.worker_version.worker_runs.first() def setUp(self): super().setUp() @@ -47,7 +48,8 @@ class TestMetaData(FixtureAPITestCase): 'value': '123', 'dates': [], 'entity': None, - 'worker_version': None + 'worker_version': None, + 'worker_run_id': None }, ]) @@ -64,7 +66,8 @@ class TestMetaData(FixtureAPITestCase): 'value': '123', 'dates': [], 'entity': None, - 'worker_version': None + 'worker_version': None, + 'worker_run_id': None }, ]) @@ -98,6 +101,7 @@ class TestMetaData(FixtureAPITestCase): 'value': '123', 'entity': None, 'worker_version': None, + 'worker_run_id': None }, { 'id': str(entity_meta.id), @@ -115,6 +119,7 @@ class TestMetaData(FixtureAPITestCase): 'worker_version_id': str(self.worker_version.id), }, 'worker_version': None, + 'worker_run_id': None }, ]) @@ -134,6 +139,7 @@ class TestMetaData(FixtureAPITestCase): 'value': '123', 'entity': None, 'worker_version': str(self.worker_version.id), + 'worker_run_id': None }, ]) @@ -201,6 +207,38 @@ class TestMetaData(FixtureAPITestCase): self.assertEqual(md.value, 'Texas') self.assertEqual(md.worker_version, self.worker_version) + def test_create_metadata_worker_run(self): + self.client.force_login(self.user) + response = self.client.post( + reverse('api:element-metadata', kwargs={'pk': str(self.vol.id)}), + data={'type': 'location', 'name': 'location', 'value': 'Texas', 'worker_run_id': str(self.worker_run.id)} + ) + self.assertEqual(response.status_code, status.HTTP_201_CREATED) + md = self.vol.metadatas.get(type=MetaType.Location, name='location') + self.assertEqual(md.value, 'Texas') + self.assertEqual(md.worker_run_id, self.worker_run.id) + self.assertEqual(md.worker_version, self.worker_version) + + def test_create_metadata_worker_run_or_version(self): + self.client.force_login(self.user) + response = self.client.post( + reverse('api:element-metadata', kwargs={'pk': str(self.vol.id)}), + data={'type': 'location', 'name': 'location', 'value': 'Texas', 'worker_run_id': str(self.worker_run.id), 'worker_version': str(self.worker_version.id)} + ) + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + self.assertDictEqual(response.json(), {'non_field_errors': ['Only one of `worker_version` and `worker_run_id` may be set.']}) + + def test_create_metadata_bad_worker_run(self): + self.client.force_login(self.user) + non_existent_worker_run_id = "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa" + response = self.client.post( + reverse('api:element-metadata', kwargs={'pk': str(self.vol.id)}), + data={'type': 'location', 'name': 'location', 'value': 'Texas', 'worker_run_id': non_existent_worker_run_id}, + format='json' + ) + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + self.assertDictEqual(response.json(), {'worker_run_id': [f'Invalid pk "{non_existent_worker_run_id}" - object does not exist.']}) + def test_create_metadata_date(self): self.client.force_login(self.user) response = self.client.post( @@ -221,7 +259,8 @@ class TestMetaData(FixtureAPITestCase): 'type': 'exact', 'year': 1885 }], - 'worker_version': None + 'worker_version': None, + 'worker_run_id': None }) def test_create_metadata_empty(self): @@ -388,6 +427,7 @@ class TestMetaData(FixtureAPITestCase): 'value': '2019-12-04', 'entity': None, 'worker_version': None, + 'worker_run_id': None }) self.metadata.refresh_from_db() self.assertEqual(self.metadata.type, MetaType.Date) @@ -413,6 +453,7 @@ class TestMetaData(FixtureAPITestCase): 'value': '123', 'entity': None, 'worker_version': None, + 'worker_run_id': None }) self.metadata.refresh_from_db() self.assertEqual(self.metadata.type, MetaType.Date) @@ -437,6 +478,7 @@ class TestMetaData(FixtureAPITestCase): 'value': '2019-12-04', 'entity': None, 'worker_version': None, + 'worker_run_id': None }) self.metadata.refresh_from_db() self.assertEqual(self.metadata.type, MetaType.Text) @@ -570,7 +612,8 @@ class TestMetaData(FixtureAPITestCase): 'value': '123', 'entity': None, 'dates': [], - 'worker_version': None + 'worker_version': None, + 'worker_run_id': None } ) @@ -592,7 +635,8 @@ class TestMetaData(FixtureAPITestCase): 'value': 123.0, 'entity': None, 'dates': [], - 'worker_version': None + 'worker_version': None, + 'worker_run_id': None } ) @@ -686,7 +730,8 @@ class TestMetaData(FixtureAPITestCase): 'value': '# Title\n## Subtitle\nbla', 'entity': None, 'dates': [], - 'worker_version': None + 'worker_version': None, + 'worker_run_id': None } ) @@ -704,7 +749,8 @@ class TestMetaData(FixtureAPITestCase): 'value': '<h1>Title</h1>', 'entity': None, 'dates': [], - 'worker_version': None + 'worker_version': None, + 'worker_run_id': None } ) @@ -722,7 +768,8 @@ class TestMetaData(FixtureAPITestCase): 'value': '<style type="text/css">* { display: none !important; }</style>', 'entity': None, 'dates': [], - 'worker_version': None + 'worker_version': None, + 'worker_run_id': None } ) @@ -893,7 +940,8 @@ class TestMetaData(FixtureAPITestCase): 'dates': [] } ], - 'worker_version': None + 'worker_version': None, + 'worker_run_id': None }) def test_bulk_create_metadata_worker_version(self): @@ -914,6 +962,53 @@ class TestMetaData(FixtureAPITestCase): self.assertEqual(md.value, 'Mandarin') self.assertEqual(md.worker_version, self.worker_version) + def test_bulk_create_metadata_worker_run(self): + self.client.force_login(self.user) + response = self.client.post( + reverse('api:element-metadata-bulk', kwargs={'pk': str(self.vol.id)}), + data={'metadata_list': [ + {'type': 'text', 'name': 'language', 'value': 'Chinese'}, + {'type': 'text', 'name': 'alt_language', 'value': 'Mandarin'} + ], 'worker_run_id': str(self.worker_run.id)}, + format='json' + ) + self.assertEqual(response.status_code, status.HTTP_201_CREATED) + md = self.vol.metadatas.get(type=MetaType.Text, name='language') + self.assertEqual(md.value, 'Chinese') + self.assertEqual(md.worker_run_id, self.worker_run.id) + self.assertEqual(md.worker_version, self.worker_version) + md = self.vol.metadatas.get(type=MetaType.Text, name='alt_language') + self.assertEqual(md.value, 'Mandarin') + self.assertEqual(md.worker_run_id, self.worker_run.id) + self.assertEqual(md.worker_version, self.worker_version) + + def test_bulk_create_metadata_worker_run_or_version(self): + self.client.force_login(self.user) + response = self.client.post( + reverse('api:element-metadata-bulk', kwargs={'pk': str(self.vol.id)}), + data={'metadata_list': [ + {'type': 'text', 'name': 'language', 'value': 'Chinese'}, + {'type': 'text', 'name': 'alt_language', 'value': 'Mandarin'} + ], 'worker_run_id': str(self.worker_run.id), 'worker_version': str(self.worker_version.id)}, + format='json' + ) + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + self.assertDictEqual(response.json(), {'non_field_errors': ['Only one of `worker_version` and `worker_run_id` may be set.']}) + + def test_bulk_create_metadata_bad_worker_run(self): + self.client.force_login(self.user) + non_existent_worker_run_id = "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa" + response = self.client.post( + reverse('api:element-metadata-bulk', kwargs={'pk': str(self.vol.id)}), + data={'metadata_list': [ + {'type': 'text', 'name': 'language', 'value': 'Chinese'}, + {'type': 'text', 'name': 'alt_language', 'value': 'Mandarin'} + ], 'worker_run_id': non_existent_worker_run_id}, + format='json' + ) + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + self.assertDictEqual(response.json(), {'worker_run_id': [f'Invalid pk "{non_existent_worker_run_id}" - object does not exist.']}) + def test_bulk_create_metadata_empty(self): self.client.force_login(self.user) response = self.client.post(reverse('api:element-metadata-bulk', kwargs={'pk': str(self.vol.id)})) @@ -974,7 +1069,8 @@ class TestMetaData(FixtureAPITestCase): 'entity_id': None, }, ], - 'worker_version': None + 'worker_version': None, + 'worker_run_id': None }) self.vol.metadatas.all().delete() diff --git a/arkindex/sql_validation/indexer_prefetch.sql b/arkindex/sql_validation/indexer_prefetch.sql index beb012cbdb..3e68a122c9 100644 --- a/arkindex/sql_validation/indexer_prefetch.sql +++ b/arkindex/sql_validation/indexer_prefetch.sql @@ -94,7 +94,8 @@ SELECT "documents_metadata"."id", "documents_metadata"."type", "documents_metadata"."value", "documents_metadata"."entity_id", - "documents_metadata"."worker_version_id" + "documents_metadata"."worker_version_id", + "documents_metadata"."worker_run_id" FROM "documents_metadata" WHERE "documents_metadata"."element_id" IN ('{page_id}'::uuid) ORDER BY "documents_metadata"."element_id" ASC, -- GitLab