From ea03418bb5697db205dcb33f13ac0f25239dc0fa Mon Sep 17 00:00:00 2001
From: Bastien Abadie <bastien@nextcairn.com>
Date: Fri, 29 Jun 2018 10:54:09 +0200
Subject: [PATCH] Expose bulk_transcriptions through API

---
 arkindex/documents/api.py         | 44 +++++++++++++++++++++++++++++++++++++++++++++---
 arkindex/documents/models.py      |  6 ------
 arkindex/documents/serializers.py | 31 ++++++++++++++++++++++++++++++-
 arkindex/images/importer.py       | 22 +++++++++++++++-------
 arkindex/project/api_v1.py        |  5 +++--
 5 files changed, 89 insertions(+), 19 deletions(-)

diff --git a/arkindex/documents/api.py b/arkindex/documents/api.py
index 395a8352e5..7c638bf723 100644
--- a/arkindex/documents/api.py
+++ b/arkindex/documents/api.py
@@ -14,7 +14,7 @@ from arkindex.documents.serializers import \
     PageAnnotationListSerializer, PageActAnnotationListSerializer, \
     SurfaceAnnotationListSerializer, TranscriptionSearchAnnotationListSerializer, \
     ActSerializer, ElementLinkSerializer, SurfaceSerializer, \
-    TextCreationSerializer
+    TranscriptionCreateSerializer, TranscriptionsSerializer
 from arkindex.documents.models import \
     Element, ElementType, Page, Act, Transcription, ElementLink, Corpus
 from arkindex.documents.search import \
@@ -22,6 +22,7 @@ from arkindex.documents.search import \
 from arkindex.documents.tasks import refresh_db_cache
 from arkindex.documents.indexer import Indexer
 from arkindex.images.models import Zone
+from arkindex.images.importer import bulk_transcriptions
 from arkindex.project.elastic import ESQuerySet
 
 
@@ -141,8 +142,45 @@ class LinkRetrieveDestroy(RetrieveDestroyAPIView):
         refresh_db_cache.delay()
 
 
-class TextElementCreate(CreateAPIView):
-    serializer_class = TextCreationSerializer
+class TranscriptionBulk(CreateAPIView):
+    """
+    Create transcriptions in bulk, all linked to the same image
+    and parent element
+    """
+    serializer_class = TranscriptionsSerializer
+    permission_classes = (IsAuthenticated, )
+
+    def perform_create(self, serializer):
+        parent = serializer.validated_data['parent']
+        image = serializer.validated_data['image']
+        transcriptions = bulk_transcriptions(
+            image=image,
+            parent=parent,
+            items=[
+                {
+                    'polygon': tr['polygon'],
+                    'text': tr['text'],
+                    'score': tr['score'],
+                    'type': tr['type'],
+                }
+                for tr in serializer.validated_data['transcriptions']
+            ],
+            use_polygons=True,
+        )
+
+        # Reindex all created transcriptions
+        if transcriptions:
+            Indexer().run_index(
+                settings.ES_INDEX_TRANSCRIPTIONS,
+                Transcription.INDEX_TYPE,
+                Transcription.objects.filter(
+                    id__in=[t.id for t in transcriptions]
+                ),
+            )
+
+
+class TranscriptionCreate(CreateAPIView):
+    serializer_class = TranscriptionCreateSerializer
     permission_classes = (IsAuthenticated, )
 
     def perform_create(self, serializer):
diff --git a/arkindex/documents/models.py b/arkindex/documents/models.py
index 2a8722674d..413743c95c 100644
--- a/arkindex/documents/models.py
+++ b/arkindex/documents/models.py
@@ -334,12 +334,6 @@ class Transcription(Element):
     def __str__(self):
         return 'Transcription: {}'.format(self.text[:20])
 
-    def save(self, *args, **kwargs):
-        # TODO: move this in Element through introspection
-        if self.type is None:
-            self.type = ElementType.Word
-        super().save(*args, **kwargs)
-
     def build_search_index(self):
         """
         Structure indexed into Elastic Search
diff --git a/arkindex/documents/serializers.py b/arkindex/documents/serializers.py
index 8a29748dda..e4c3424a1d 100644
--- a/arkindex/documents/serializers.py
+++ b/arkindex/documents/serializers.py
@@ -149,7 +149,7 @@ class CorpusSerializer(serializers.ModelSerializer):
         fields = ('id', 'name')
 
 
-class TextCreationSerializer(serializers.Serializer):
+class TranscriptionCreateSerializer(serializers.Serializer):
     """
     Allows for insertion of new transcriptions and zones
     """
@@ -167,6 +167,35 @@ class TextCreationSerializer(serializers.Serializer):
     type = EnumField(ElementType)
 
 
+class TranscriptionBulkSerializer(serializers.Serializer):
+    """
+    Allows for insertion of new transcriptions and zones
+    in bulk (used by TranscriptionsSerializer below).
+    Note: no element field here!
+    """
+    polygon = serializers.ListField(
+        child=serializers.ListField(
+            child=serializers.IntegerField(min_value=0),
+            min_length=2,
+            max_length=2
+        ),
+        min_length=3
+    )
+    text = serializers.CharField()
+    score = serializers.FloatField(min_value=0, max_value=1)
+    type = EnumField(ElementType)
+
+
+class TranscriptionsSerializer(serializers.Serializer):
+    """
+    Allows for insertion of new transcriptions and zones
+    in bulk (uses TranscriptionBulkSerializer above) on a common parent.
+    """
+    transcriptions = TranscriptionBulkSerializer(many=True)
+    parent = serializers.PrimaryKeyRelatedField(queryset=Element.objects.all())
+    image = serializers.PrimaryKeyRelatedField(queryset=Image.objects.all())
+
+
 class TranscriptionSearchResultSerializer(serializers.ModelSerializer):
     """
     Link between objects & their search indexation
diff --git a/arkindex/images/importer.py b/arkindex/images/importer.py
index 988c444995..dda625b2f3 100644
--- a/arkindex/images/importer.py
+++ b/arkindex/images/importer.py
@@ -60,7 +60,7 @@ def import_indexes(image, page, index_path, extension='jpg'):
     logger.info('Created {} transcriptions '.format(len(new_transcriptions)))
 
 
-def bulk_transcriptions(image, parent, items):
+def bulk_transcriptions(image, parent, items, use_polygons=False):
     """
     Create transcriptions and zones in bulk
     """
@@ -68,16 +68,23 @@ def bulk_transcriptions(image, parent, items):
 
     # Link a transcription data with a bounding box
     # This is hashable (box is hashable)
-    TrBox = namedtuple('TrBox', 'box, line, text, score')
+    TrBox = namedtuple('TrBox', 'type, box, line, text, score')
+
+    # TODO: hash raw polygons
+    def _tuple(data):
+        if isinstance(data, list):
+            return tuple(_tuple(x) for x in data)
+        return data
 
     # Build all TrBox from items
     required = {
         TrBox(
-            BoundingBox(
+            i.get('type', ElementType.Word),
+            _tuple(i['polygon']) if use_polygons else BoundingBox(
                 [[i['x'], i['y']],
                  [i['x'] + i['width'], i['y'] + i['height']]]
             ),
-            int(i['line']),
+            int(i.get('line', 0)),
            i['text'],
             float(i['score']),
         )
@@ -93,7 +100,8 @@ def bulk_transcriptions(image, parent, items):
     # Build all TrBox from existing
     existing = {
         TrBox(
-            BoundingBox(tr.zone.polygon),
+            tr.type,
+            _tuple(tr.zone.polygon) if use_polygons else BoundingBox(tr.zone.polygon),
             tr.line,
             tr.text,
             tr.score,
@@ -112,7 +120,7 @@ def bulk_transcriptions(image, parent, items):
     elements = Element.objects.bulk_create(
         Element(
             corpus=parent.corpus,
-            type=ElementType.Word,
+            type=n.type,
             name=n.text,
             zone_id=uuid.uuid4()
         )
@@ -131,7 +139,7 @@ def bulk_transcriptions(image, parent, items):
             Zone(
                 id=elt.zone_id,
                 image=image,
-                polygon=n.box.to_polygon(),
+                polygon=n.box if use_polygons else n.box.to_polygon(),
             )
         )
         for elt, n in zip(elements, needed)
diff --git a/arkindex/project/api_v1.py b/arkindex/project/api_v1.py
index 4acafaa1b6..addbbd8b4f 100644
--- a/arkindex/project/api_v1.py
+++ b/arkindex/project/api_v1.py
@@ -5,7 +5,7 @@ from arkindex.documents.api import \
     VolumeManifest, ActManifest, \
     PageAnnotationList, PageActAnnotationList, SurfaceAnnotationList, \
     TranscriptionSearch, ActSearch, TranscriptionSearchAnnotationList, \
-    ActEdit, LinkCreate, LinkRetrieveDestroy, TextElementCreate, SurfaceDetails
+    ActEdit, LinkCreate, LinkRetrieveDestroy, TranscriptionCreate, TranscriptionBulk, SurfaceDetails
 
 
 api = [
@@ -66,5 +66,6 @@ api = [
     url(r'^link/(?P<pk>[\w\-]+)/?$', LinkRetrieveDestroy.as_view(), name='link-manage'),
 
     # Ingest transcriptions
-    url(r'^element/?$', TextElementCreate.as_view(), name='element'),
+    url(r'^transcription/?$', TranscriptionCreate.as_view(), name='transcription-create'),
+    url(r'^transcription/bulk/?$', TranscriptionBulk.as_view(), name='transcription-bulk'),
 ]
-- 
GitLab
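
Usage sketch (illustration only, not part of the commit): the new endpoint validates its payload with TranscriptionsSerializer, i.e. a parent element id, an image id, and a list of transcriptions where each item carries a polygon of at least three [x, y] integer points, a text, a score between 0 and 1, and an element type. In the client sketch below, the host, the /api/v1/ prefix, the token authentication scheme and the serialized enum value 'word' are assumptions not defined by this patch, and the UUIDs are placeholders:

    import requests

    # Placeholder ids: substitute a real parent Element UUID and Image UUID.
    payload = {
        'parent': '11111111-1111-1111-1111-111111111111',
        'image': '22222222-2222-2222-2222-222222222222',
        'transcriptions': [
            {
                # At least 3 points, each an [x, y] pair of non-negative
                # integers, as enforced by TranscriptionBulkSerializer
                'polygon': [[0, 0], [120, 0], [120, 40], [0, 40]],
                'text': 'Lorem ipsum',
                'score': 0.92,
                'type': 'word',  # assumed serialized form of ElementType.Word
            },
        ],
    }

    resp = requests.post(
        'https://arkindex.example/api/v1/transcription/bulk/',  # assumed host and prefix
        json=payload,
        headers={'Authorization': 'Token <api-token>'},  # endpoint requires an authenticated user
    )
    resp.raise_for_status()

Server-side, perform_create forwards the validated items to bulk_transcriptions with use_polygons=True. Polygons arrive as lists of lists, which are not hashable, so the _tuple helper turns them into nested tuples; the TrBox namedtuples therefore stay hashable, the set comparison between required and existing transcriptions keeps working, and only the missing ones are created and reindexed into ElasticSearch.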