From 2ce49f7974caa8d2b4f867ca57315fe10231de8c Mon Sep 17 00:00:00 2001
From: blancoma <blanco@teklia.com>
Date: Fri, 4 Sep 2020 11:34:06 +0200
Subject: [PATCH] Build Transkribus entities

---
 arkindex/dataimport/api.py                    |  3 +-
 .../0018_dataimport_build_entities.py         | 18 +++++
 arkindex/dataimport/models.py                 | 12 +++-
 arkindex/dataimport/serializers/imports.py    |  1 +
 .../tests/test_transkribus_import.py          | 66 ++++++++++++++++++-
 5 files changed, 97 insertions(+), 3 deletions(-)
 create mode 100644 arkindex/dataimport/migrations/0018_dataimport_build_entities.py

diff --git a/arkindex/dataimport/api.py b/arkindex/dataimport/api.py
index 65ab011c30..ad5822ab9f 100644
--- a/arkindex/dataimport/api.py
+++ b/arkindex/dataimport/api.py
@@ -738,6 +738,7 @@ class ImportTranskribus(CreateAPIView):
         self.dataimport = corpus.imports.create(
             creator=self.request.user,
             mode=DataImportMode.Transkribus,
-            collection_id=collection_id
+            collection_id=collection_id,
+            build_entities=serializer.validated_data['build_entities']
         )
         self.dataimport.start()
diff --git a/arkindex/dataimport/migrations/0018_dataimport_build_entities.py b/arkindex/dataimport/migrations/0018_dataimport_build_entities.py
new file mode 100644
index 0000000000..e9b2a74aa8
--- /dev/null
+++ b/arkindex/dataimport/migrations/0018_dataimport_build_entities.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.1 on 2020-09-02 13:10
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('dataimport', '0017_dataimport_collection_id'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='dataimport',
+            name='build_entities',
+            field=models.BooleanField(default=False),
+        ),
+    ]
diff --git a/arkindex/dataimport/models.py b/arkindex/dataimport/models.py
index 1b32ce999e..8d166a2cb0 100644
--- a/arkindex/dataimport/models.py
+++ b/arkindex/dataimport/models.py
@@ -69,8 +69,9 @@ class DataImport(IndexableModel):
         related_name='imports',
     )
 
-    # Used to define the collection ID for Transkribus import
+    # Transkribus import options: the collection ID to import and whether to build entities
     collection_id = models.PositiveIntegerField(null=True, blank=True)
+    build_entities = models.BooleanField(default=False)
 
     class Meta:
         ordering = ['corpus', '-created']
@@ -177,6 +178,15 @@ class DataImport(IndexableModel):
                 },
             }
 
+        # Import entities directly after the import step
+        if self.mode == DataImportMode.Transkribus and self.build_entities:
+            transcriptions_path = shlex.quote(path.join('/data', import_task_name, str(self.collection_id), 'transcriptions.json'))
+            tasks['build_entities'] = {
+                'image': settings.ARKINDEX_TASKS_IMAGE,
+                'command': 'python -m arkindex_tasks.build_entities {}'.format(transcriptions_path),
+                'parents': [import_task_name],
+            }
+
         elts_chunk_files = ['elements.json']
         if ml_workflow_chunks > 1:
             elts_chunk_files = ['elements_chunk_{}.json'.format(n) for n in range(1, ml_workflow_chunks + 1)]
diff --git a/arkindex/dataimport/serializers/imports.py b/arkindex/dataimport/serializers/imports.py
index 824d4b983d..bba42420e8 100644
--- a/arkindex/dataimport/serializers/imports.py
+++ b/arkindex/dataimport/serializers/imports.py
@@ -309,6 +309,7 @@ class ImportTranskribusSerializer(serializers.Serializer):
     Serialize a Transkribus import
     """
     collection_id = serializers.IntegerField(min_value=1)
+    build_entities = serializers.BooleanField(default=False)
 
     def validate(self, data):
         collection_id = data.get('collection_id')
diff --git a/arkindex/dataimport/tests/test_transkribus_import.py b/arkindex/dataimport/tests/test_transkribus_import.py
index 4f8c119f8c..1eb011a807 100644
--- a/arkindex/dataimport/tests/test_transkribus_import.py
+++ b/arkindex/dataimport/tests/test_transkribus_import.py
@@ -111,7 +111,7 @@ class TestTranskribusImport(FixtureAPITestCase):
             },
             'tasks': {
                 'import': {
-                    'command': 'python -m arkindex_tasks.import_transkribus 12345 --corpus {}'.format(corpus.id),
+                    'command': 'python -m arkindex_tasks.import_transkribus {} --corpus {}'.format(dataimport.collection_id, corpus.id),
                     'image': 'registry.gitlab.com/arkindex/tasks'
                 },
                 'thumbnails': {
@@ -121,3 +121,67 @@ class TestTranskribusImport(FixtureAPITestCase):
                 }
             }
         })
+
+    @override_settings(
+        PONOS_RECIPE={}
+    )
+    @patch("transkribus.TranskribusAPI.list_user_collection")
+    def test_create_import_build_entities(self, mock_transkribus):
+        mock_transkribus.return_value = [{"email": "nope@nope.fr"}, {"email": "arkindex@teklia.com"}]
+
+        self.client.force_login(self.user)
+        response = self.client.post(reverse("api:import-transkribus"), {
+            "collection_id": "12345",
+            "build_entities": True
+        }, format="json")
+        self.assertEqual(response.status_code, status.HTTP_201_CREATED)
+        data = response.json()
+        dataimport = DataImport.objects.get(id=data["id"])
+        self.assertEqual(dataimport.mode, DataImportMode.Transkribus)
+        self.assertEqual(dataimport.collection_id, 12345)
+        corpus = dataimport.corpus
+        right = corpus.corpus_right.get(user=self.user)
+        self.assertTrue(right.can_write)
+        self.assertTrue(right.can_admin)
+        self.assertEqual(corpus.name, "Transkribus collection n°12345")
+        self.assertEqual(corpus.description, "")
+        self.assertEqual(corpus.public, False)
+        # Assert default types are set on the new corpus
+        self.assertCountEqual(
+            list(corpus.types.values(
+                "slug",
+                "display_name",
+                "folder",
+                "allowed_transcription"
+            )),
+            [{
+                "folder": False,
+                "allowed_transcription": None,
+                **values
+            } for values in DEFAULT_TRANSKRIBUS_TYPES]
+        )
+        self.assertEqual(dataimport.state, State.Unscheduled)
+        self.assertIsNotNone(dataimport.workflow)
+        recipe = yaml.safe_load(dataimport.workflow.recipe)
+        self.assertDictEqual(recipe, {
+            'env': {
+                'TRANSKRIBUS_EMAIL': settings.TRANSKRIBUS_EMAIL,
+                'TRANSKRIBUS_PASSWORD': settings.TRANSKRIBUS_PASSWORD
+            },
+            'tasks': {
+                'import': {
+                    'command': 'python -m arkindex_tasks.import_transkribus {} --corpus {}'.format(dataimport.collection_id, corpus.id),
+                    'image': 'registry.gitlab.com/arkindex/tasks'
+                },
+                'build_entities': {
+                    'command': 'python -m arkindex_tasks.build_entities /data/import/{}/transcriptions.json'.format(dataimport.collection_id),
+                    'image': 'registry.gitlab.com/arkindex/tasks',
+                    'parents': ['import']
+                },
+                'thumbnails': {
+                    'command': 'python3 -m arkindex_tasks.generate_thumbnails /data/import/elements.json',
+                    'image': 'registry.gitlab.com/arkindex/tasks',
+                    'parents': ['import']
+                }
+            }
+        })
-- 
GitLab