From e42911d43d4f7baa29246d9fa0bda65e868ad343 Mon Sep 17 00:00:00 2001
From: Bastien Abadie <bastien@nextcairn.com>
Date: Wed, 16 May 2018 23:56:26 +0200
Subject: [PATCH] Remove bulk_zones

---
 arkindex/documents/importer.py                |   6 +-
 arkindex/images/importer.py                   | 127 +++++++++---------
 .../migrations/0003_auto_20180516_2111.py     |  37 +++++
 arkindex/images/models.py                     |   7 -
 arkindex/project/tools.py                     |  20 ++-
 5 files changed, 116 insertions(+), 81 deletions(-)
 create mode 100644 arkindex/images/migrations/0003_auto_20180516_2111.py

diff --git a/arkindex/documents/importer.py b/arkindex/documents/importer.py
index c84609f617..10c62c7d8d 100644
--- a/arkindex/documents/importer.py
+++ b/arkindex/documents/importer.py
@@ -1,6 +1,6 @@
 from arkindex.documents.models import PageType, PageDirection, Page, ElementType, Element, ElementLink
 from arkindex.images.models import Image, ImageServer
-from arkindex.images.importer import bulk_zones, bulk_transcriptions
+from arkindex.images.importer import bulk_transcriptions
 from abc import ABC, abstractmethod
 from urllib.parse import urlsplit
 import re
@@ -295,10 +295,6 @@ class ManifestsImporter(ABC):
         total_zones, total_transcriptions, total_indexes = 0, 0, 0
 
         for (image, page), data in self.images_transcription_data.items():
-            new_zones = bulk_zones(image, data)
-            total_zones += len(new_zones)
-            logger.debug("Created {0} zones for image {1}".format(len(new_zones), image.path))
-
             new_transcriptions = bulk_transcriptions(image, page, data)
             total_transcriptions += len(new_transcriptions)
             logger.debug("Created {0} transcriptions for image {1}".format(len(new_transcriptions), image.path))
diff --git a/arkindex/images/importer.py b/arkindex/images/importer.py
index eb5be45a8c..333731fd5e 100644
--- a/arkindex/images/importer.py
+++ b/arkindex/images/importer.py
@@ -1,6 +1,8 @@
 from arkindex.images.models import Zone, Image
 from arkindex.documents.models import Transcription, ElementLink, ElementType, Element, Page
 from arkindex.project.tools import BoundingBox
+from collections import namedtuple
+from django.db import transaction
 import os
 import re
 import gzip
@@ -51,10 +53,6 @@ def import_indexes(image, page, index_path, extension='jpg'):
 
     logger.info('Parsed {} lines'.format(len(lines)))
 
-    # Create zones
-    new_zones = bulk_zones(image, lines)
-    logger.info('Created {} zones'.format(len(new_zones)))
-
     # Create transcriptions
     new_transcriptions = bulk_transcriptions(image, page, lines)
     logger.info('Created {} transcriptions '.format(len(new_transcriptions)))
@@ -64,79 +62,80 @@ def import_indexes(image, page, index_path, extension='jpg'):
     logger.info('Added {} ES indexes'.format(nb_inserts))
 
 
-def bulk_zones(image, items):
-    """
-    Create zones in bulk (one SQL statement)
-    This is WAY faster
-    """
-    # Load existing zones in images
-    existing = [
-        BoundingBox(p)
-        for p in image.zones.all().values_list('polygon', flat=True)
-    ]
-    existing = [
-        (b.x, b.y, b.x + b.width, b.y + b.height) for b in existing
-    ]
-
-    # Calc needed insert
-    needed = set([
-        (item['x'], item['y'], item['x'] + item['width'], item['y'] + item['height'])
-        for item in items
-    ]).difference(existing)
-
-    # Bulk create in db
-    return Zone.objects.bulk_create([
-        Zone(image=image, polygon=[[z[0], z[1]], [z[2], z[1]], [z[2], z[3]], [z[0], z[3]]])
-        for z in needed
-    ])
-
-
 def bulk_transcriptions(image, page, items):
     """
-    Create transcriptions in bulk
+    Create transcriptions and zones in bulk
+
+    Each item is a dict with x, y, width, height, line, text and score.
+    Only transcriptions not already stored with the same box on this image
+    are created, each with its own zone, then linked to the page through
+    ElementLink rows ordered after the existing ones.
+    Returns the list of newly created transcriptions.
     """
-
-    # Index existing zones, by unique keys
-    zones = {
-        repr(BoundingBox(z[1])): z[0]
-        for z in image.zones.all().values_list('id', 'polygon')
-    }
-
-    # Load existing transcriptions
-    existing = Transcription.objects.filter(
-        zones__image=image
-    ).values_list('zones__id', 'line', 'text', 'score')
-
-    # Calc needed insert
-    needed = set([
-        (
-            zones[repr(BoundingBox(
+    # Link a transcription data with a bounding box
+    # This is hashable (box is hashable)
+    TrBox = namedtuple('TrBox', 'box, line, text, score')
+
+    # Build all TrBox from items
+    required = {
+        TrBox(
+            BoundingBox(
                 [[i['x'], i['y']],
                  [i['x'] + i['width'], i['y'] + i['height']]]
-            ))],
-            i['line'],
+            ),
+            int(i['line']),
             i['text'],
-            i['score'],
+            float(i['score']),
         )
         for i in items
-    ]).difference(existing)
-
-    # Create transcriptions and associate zones
-    all_ts = []
-    for t in needed:
-        ts = Transcription.objects.create(line=t[1], text=t[2], score=t[3])
-        all_ts.append(ts)
-        z = image.zones.get(id=t[0])
-        z.element_id = ts.id
-        z.save()
+    }
+
+    # Build all TrBox from existing
+    existing = {
+        TrBox(
+            BoundingBox(zone.polygon),
+            tr.line,
+            tr.text,
+            tr.score,
+        )
+        for tr in Transcription.objects.filter(zones__image=image).prefetch_related('zones')
+        for zone in tr.zones.all()
+    }
+
+    # Calc needed TrBox to build
+    needed = required.difference(existing)
+
+    zones = []
+    transcriptions = []
+    with transaction.atomic():
+
+        # Create transcriptions and linked zones
+        for n in needed:
+            tr = Transcription.objects.create(
+                line=n.line,
+                text=n.text,
+                score=n.score,
+            )
+            transcriptions.append(tr)
+
+            zones.append(Zone(
+                element_id=tr.id,
+                image=image,
+                polygon=n.box.to_polygon(),
+            ))
+
+        # Build zones in bulk
+        Zone.objects.bulk_create(zones)
+
+    # Create all links between transcription and page
     max_order_dl = ElementLink.objects.filter(parent=page).order_by('-order').first()
     max_order = 0 if max_order_dl is None else max_order_dl.order + 1
     ElementLink.objects.bulk_create(
-        ElementLink(parent=page, child=ts, order=i)
-        for i, ts in enumerate(all_ts, max_order)
+        ElementLink(parent=page, child=tr, order=i)
+        for i, tr in enumerate(transcriptions, max_order)
     )
 
-    return all_ts
+    return transcriptions
 
 
 class IndexImporter(object):
diff --git a/arkindex/images/migrations/0003_auto_20180516_2111.py b/arkindex/images/migrations/0003_auto_20180516_2111.py
new file mode 100644
index 0000000000..2157f7fbe5
--- /dev/null
+++ b/arkindex/images/migrations/0003_auto_20180516_2111.py
@@ -0,0 +1,37 @@
+# Generated by Django 2.0 on 2018-05-16 21:11
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+def clean_zones(apps, schema_editor):
+    '''
+    Remove zones with empty element, in a single queryset delete,
+    as the element foreign key becomes mandatory in this migration
+    '''
+    Zone = apps.get_model('images', 'Zone')
+    Zone.objects.filter(element__isnull=True).delete()
+
+
+class Migration(migrations.Migration):
+    # Reversible: unapplying skips the zones cleanup (deleted rows are lost)
+    dependencies = [
+        ('images', '0002_image_status'),
+    ]
+
+    operations = [
+        migrations.RunPython(clean_zones, migrations.RunPython.noop),
+        migrations.AlterField(
+            model_name='zone',
+            name='element',
+            field=models.ForeignKey(
+                on_delete=django.db.models.deletion.CASCADE,
+                related_name='zones',
+                to='documents.Element',
+            ),
+        ),
+        migrations.AlterUniqueTogether(
+            name='zone',
+            unique_together=set(),
+        ),
+    ]
diff --git a/arkindex/images/models.py b/arkindex/images/models.py
index 3298ccfc56..5c20a18e97 100644
--- a/arkindex/images/models.py
+++ b/arkindex/images/models.py
@@ -168,8 +168,6 @@ class Zone(IndexableModel):
         'documents.Element',
         on_delete=models.CASCADE,
         related_name='zones',
-        null=True,
-        blank=True,
     )
 
     polygon = ArrayField(
@@ -178,11 +176,6 @@ class Zone(IndexableModel):
             size=2)
     )
 
-    class Meta:
-        unique_together = (
-            ('image', 'polygon'),
-        )
-
     @cached_property
     def box(self):
         return BoundingBox(self.polygon)
diff --git a/arkindex/project/tools.py b/arkindex/project/tools.py
index cc9d8ebb9b..f7c0a61cf9 100644
--- a/arkindex/project/tools.py
+++ b/arkindex/project/tools.py
@@ -71,12 +71,22 @@ class BoundingBox(object):
         return "BoundingBox({}, {}, {}, {})".format(
             self.x, self.y, self.width, self.height)
 
+    def __eq__(self, other):
+        if not isinstance(other, BoundingBox):
+            return NotImplemented  # foreign type: let Python decide
+        return (self.x, self.y, self.width, self.height) == \
+            (other.x, other.y, other.width, other.height)
+
+    def __hash__(self):
+        return hash((self.x, self.y, self.width, self.height))
+
     def to_polygon(self):
-        points = [(self.x, self.y),
-                  (self.x, self.y + self.height),
-                  (self.x + self.width, self.y + self.height),
-                  (self.x + self.width, self.y)]
-        return tuple('({},{})'.format(i, j) for i, j in points)
+        # Four corners as (x, y) tuples, starting from the box origin
+        right, bottom = self.x + self.width, self.y + self.height
+        return [
+            (self.x, self.y), (self.x, bottom),
+            (right, bottom), (right, self.y),
+        ]
 
 
 def sslify_url(url):
-- 
GitLab