From 2ce49f7974caa8d2b4f867ca57315fe10231de8c Mon Sep 17 00:00:00 2001 From: blancoma <blanco@teklia.com> Date: Fri, 4 Sep 2020 11:34:06 +0200 Subject: [PATCH] Build Transkribus entities --- arkindex/dataimport/api.py | 3 +- .../0018_dataimport_build_entities.py | 18 +++++ arkindex/dataimport/models.py | 12 +++- arkindex/dataimport/serializers/imports.py | 1 + .../tests/test_transkribus_import.py | 66 ++++++++++++++++++- 5 files changed, 97 insertions(+), 3 deletions(-) create mode 100644 arkindex/dataimport/migrations/0018_dataimport_build_entities.py diff --git a/arkindex/dataimport/api.py b/arkindex/dataimport/api.py index 65ab011c30..ad5822ab9f 100644 --- a/arkindex/dataimport/api.py +++ b/arkindex/dataimport/api.py @@ -738,6 +738,7 @@ class ImportTranskribus(CreateAPIView): self.dataimport = corpus.imports.create( creator=self.request.user, mode=DataImportMode.Transkribus, - collection_id=collection_id + collection_id=collection_id, + build_entities=serializer.validated_data['build_entities'] ) self.dataimport.start() diff --git a/arkindex/dataimport/migrations/0018_dataimport_build_entities.py b/arkindex/dataimport/migrations/0018_dataimport_build_entities.py new file mode 100644 index 0000000000..e9b2a74aa8 --- /dev/null +++ b/arkindex/dataimport/migrations/0018_dataimport_build_entities.py @@ -0,0 +1,18 @@ +# Generated by Django 3.1 on 2020-09-02 13:10 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('dataimport', '0017_dataimport_collection_id'), + ] + + operations = [ + migrations.AddField( + model_name='dataimport', + name='build_entities', + field=models.BooleanField(default=False), + ), + ] diff --git a/arkindex/dataimport/models.py b/arkindex/dataimport/models.py index 1b32ce999e..8d166a2cb0 100644 --- a/arkindex/dataimport/models.py +++ b/arkindex/dataimport/models.py @@ -69,8 +69,9 @@ class DataImport(IndexableModel): related_name='imports', ) - # Used to define the collection ID for 
Transkribus import + # Used to define the collection ID and entities import for Transkribus import collection_id = models.PositiveIntegerField(null=True, blank=True) + build_entities = models.BooleanField(default=False) class Meta: ordering = ['corpus', '-created'] @@ -177,6 +178,15 @@ class DataImport(IndexableModel): }, } + # Import entities directly after import step + if self.mode == DataImportMode.Transkribus and self.build_entities: + transcriptions_path = shlex.quote(path.join('/data', import_task_name, str(self.collection_id), 'transcriptions.json')) + tasks['build_entities'] = { + 'image': settings.ARKINDEX_TASKS_IMAGE, + 'command': 'python -m arkindex_tasks.build_entities {}'.format(transcriptions_path), + 'parents': [import_task_name], + } + elts_chunk_files = ['elements.json'] if ml_workflow_chunks > 1: elts_chunk_files = ['elements_chunk_{}.json'.format(n) for n in range(1, ml_workflow_chunks + 1)] diff --git a/arkindex/dataimport/serializers/imports.py b/arkindex/dataimport/serializers/imports.py index 824d4b983d..bba42420e8 100644 --- a/arkindex/dataimport/serializers/imports.py +++ b/arkindex/dataimport/serializers/imports.py @@ -309,6 +309,7 @@ class ImportTranskribusSerializer(serializers.Serializer): Serialize a Transkribus import """ collection_id = serializers.IntegerField(min_value=1) + build_entities = serializers.BooleanField(default=False) def validate(self, data): collection_id = data.get('collection_id') diff --git a/arkindex/dataimport/tests/test_transkribus_import.py b/arkindex/dataimport/tests/test_transkribus_import.py index 4f8c119f8c..1eb011a807 100644 --- a/arkindex/dataimport/tests/test_transkribus_import.py +++ b/arkindex/dataimport/tests/test_transkribus_import.py @@ -111,7 +111,7 @@ class TestTranskribusImport(FixtureAPITestCase): }, 'tasks': { 'import': { - 'command': 'python -m arkindex_tasks.import_transkribus 12345 --corpus {}'.format(corpus.id), + 'command': 'python -m arkindex_tasks.import_transkribus {} --corpus 
{}'.format(dataimport.collection_id, corpus.id), 'image': 'registry.gitlab.com/arkindex/tasks' }, 'thumbnails': { @@ -121,3 +121,67 @@ class TestTranskribusImport(FixtureAPITestCase): } } }) + + @override_settings( + PONOS_RECIPE={} + ) + @patch("transkribus.TranskribusAPI.list_user_collection") + def test_create_import_build_entities(self, mock_transkribus): + mock_transkribus.return_value = [{"email": "nope@nope.fr"}, {"email": "arkindex@teklia.com"}] + + self.client.force_login(self.user) + response = self.client.post(reverse("api:import-transkribus"), { + "collection_id": "12345", + "build_entities": True + }, format="json") + self.assertEqual(response.status_code, status.HTTP_201_CREATED) + data = response.json() + dataimport = DataImport.objects.get(id=data["id"]) + self.assertEqual(dataimport.mode, DataImportMode.Transkribus) + self.assertEqual(dataimport.collection_id, 12345) + corpus = dataimport.corpus + right = corpus.corpus_right.get(user=self.user) + self.assertTrue(right.can_write) + self.assertTrue(right.can_admin) + self.assertEqual(corpus.name, "Transkribus collection n°12345") + self.assertEqual(corpus.description, "") + self.assertEqual(corpus.public, False) + # Assert default types are set on the new corpus + self.assertCountEqual( + list(corpus.types.values( + "slug", + "display_name", + "folder", + "allowed_transcription" + )), + [{ + "folder": False, + "allowed_transcription": None, + **values + } for values in DEFAULT_TRANSKRIBUS_TYPES] + ) + self.assertEqual(dataimport.state, State.Unscheduled) + self.assertIsNotNone(dataimport.workflow) + recipe = yaml.safe_load(dataimport.workflow.recipe) + self.assertDictEqual(recipe, { + 'env': { + 'TRANSKRIBUS_EMAIL': settings.TRANSKRIBUS_EMAIL, + 'TRANSKRIBUS_PASSWORD': settings.TRANSKRIBUS_PASSWORD + }, + 'tasks': { + 'import': { + 'command': 'python -m arkindex_tasks.import_transkribus {} --corpus {}'.format(dataimport.collection_id, corpus.id), + 'image': 'registry.gitlab.com/arkindex/tasks' + }, 
+ 'build_entities': { + 'command': 'python -m arkindex_tasks.build_entities /data/import/{}/transcriptions.json'.format(dataimport.collection_id), + 'image': 'registry.gitlab.com/arkindex/tasks', + 'parents': ['import'] + }, + 'thumbnails': { + 'command': 'python3 -m arkindex_tasks.generate_thumbnails /data/import/elements.json', + 'image': 'registry.gitlab.com/arkindex/tasks', + 'parents': ['import'] + } + } + }) -- GitLab