From e42911d43d4f7baa29246d9fa0bda65e868ad343 Mon Sep 17 00:00:00 2001 From: Bastien Abadie <bastien@nextcairn.com> Date: Wed, 16 May 2018 23:56:26 +0200 Subject: [PATCH] Remove bulk_zones --- arkindex/documents/importer.py | 6 +- arkindex/images/importer.py | 127 +++++++++--------- .../migrations/0003_auto_20180516_2111.py | 37 +++++ arkindex/images/models.py | 7 - arkindex/project/tools.py | 20 ++- 5 files changed, 116 insertions(+), 81 deletions(-) create mode 100644 arkindex/images/migrations/0003_auto_20180516_2111.py diff --git a/arkindex/documents/importer.py b/arkindex/documents/importer.py index c84609f617..10c62c7d8d 100644 --- a/arkindex/documents/importer.py +++ b/arkindex/documents/importer.py @@ -1,6 +1,6 @@ from arkindex.documents.models import PageType, PageDirection, Page, ElementType, Element, ElementLink from arkindex.images.models import Image, ImageServer -from arkindex.images.importer import bulk_zones, bulk_transcriptions +from arkindex.images.importer import bulk_transcriptions from abc import ABC, abstractmethod from urllib.parse import urlsplit import re @@ -295,10 +295,6 @@ class ManifestsImporter(ABC): total_zones, total_transcriptions, total_indexes = 0, 0, 0 for (image, page), data in self.images_transcription_data.items(): - new_zones = bulk_zones(image, data) - total_zones += len(new_zones) - logger.debug("Created {0} zones for image {1}".format(len(new_zones), image.path)) - new_transcriptions = bulk_transcriptions(image, page, data) total_transcriptions += len(new_transcriptions) logger.debug("Created {0} transcriptions for image {1}".format(len(new_transcriptions), image.path)) diff --git a/arkindex/images/importer.py b/arkindex/images/importer.py index eb5be45a8c..333731fd5e 100644 --- a/arkindex/images/importer.py +++ b/arkindex/images/importer.py @@ -1,6 +1,8 @@ from arkindex.images.models import Zone, Image from arkindex.documents.models import Transcription, ElementLink, ElementType, Element, Page from arkindex.project.tools import BoundingBox +from collections import namedtuple +from django.db import transaction import os import re import gzip @@ -51,10 +53,6 @@ def import_indexes(image, page, index_path, extension='jpg'): logger.info('Parsed {} lines'.format(len(lines))) - # Create zones - new_zones = bulk_zones(image, lines) - logger.info('Created {} zones'.format(len(new_zones))) - # Create transcriptions new_transcriptions = bulk_transcriptions(image, page, lines) logger.info('Created {} transcriptions '.format(len(new_transcriptions))) @@ -64,79 +62,80 @@ def import_indexes(image, page, index_path, extension='jpg'): logger.info('Added {} ES indexes'.format(nb_inserts)) -def bulk_zones(image, items): - """ - Create zones in bulk (one SQL statement) - This is WAY faster - """ - # Load existing zones in images - existing = [ - BoundingBox(p) - for p in image.zones.all().values_list('polygon', flat=True) - ] - existing = [ - (b.x, b.y, b.x + b.width, b.y + b.height) for b in existing - ] - - # Calc needed insert - needed = set([ - (item['x'], item['y'], item['x'] + item['width'], item['y'] + item['height']) - for item in items - ]).difference(existing) - - # Bulk create in db - return Zone.objects.bulk_create([ - Zone(image=image, polygon=[[z[0], z[1]], [z[2], z[1]], [z[2], z[3]], [z[0], z[3]]]) - for z in needed - ]) - - def bulk_transcriptions(image, page, items): """ - Create transcriptions in bulk + Create transcriptions and zones in bulk """ - - # Index existing zones, by unique keys - zones = { - repr(BoundingBox(z[1])): z[0] - for z in image.zones.all().values_list('id', 'polygon') - } - - # Load existing transcriptions - existing = Transcription.objects.filter( - zones__image=image - ).values_list('zones__id', 'line', 'text', 'score') - - # Calc needed insert - needed = set([ - ( - zones[repr(BoundingBox( + # Link a transcription data with a bounding box + # This is hashable (box is hashable) + TrBox = namedtuple('TrBox', 'box, line, text, score') + + # Build all TrBox from items + required = { + TrBox( + BoundingBox( [[i['x'], i['y']], [i['x'] + i['width'], i['y'] + i['height']]] - ))], - i['line'], + ), + int(i['line']), i['text'], - i['score'], + float(i['score']), ) for i in items - ]).difference(existing) - - # Create transcriptions and associate zones - all_ts = [] - for t in needed: - ts = Transcription.objects.create(line=t[1], text=t[2], score=t[3]) - all_ts.append(ts) - z = image.zones.get(id=t[0]) - z.element_id = ts.id - z.save() + } + + # List all zones in image + zones = { + z.id: z.polygon + for z in image.zones.all() + } + + # Build all TrBox from existing + existing = { + TrBox( + BoundingBox(zone.polygon), + tr.line, + tr.text, + tr.score, + ) + for tr in Transcription.objects.filter(zones__image=image).prefetch_related('zones') + for zone in tr.zones.all() + } + + # Calc needed TrBox to build + needed = required.difference(existing) + + zones = [] + transcriptions = [] + with transaction.atomic(): + + # Create transcriptions and linked zones + for n in needed: + tr = Transcription.objects.create( + line=n.line, + text=n.text, + score=n.score, + ) + transcriptions.append(tr) + + zones.append(Zone( + element_id=tr.id, + image=image, + polygon=n.box.to_polygon(), + )) + + # Build zones in bulk + Zone.objects.bulk_create(zones) + + # Create all links between transcription and page max_order_dl = ElementLink.objects.filter(parent=page).order_by('-order').first() max_order = 0 if max_order_dl is None else max_order_dl.order + 1 ElementLink.objects.bulk_create( - ElementLink(parent=page, child=ts, order=i) - for i, ts in enumerate(all_ts, max_order) + ElementLink(parent=page, child=tr, order=i) + for i, tr in enumerate(transcriptions, max_order) ) - return all_ts + return transcriptions class IndexImporter(object): diff --git a/arkindex/images/migrations/0003_auto_20180516_2111.py b/arkindex/images/migrations/0003_auto_20180516_2111.py new file mode 100644 index 0000000000..2157f7fbe5 --- /dev/null +++ b/arkindex/images/migrations/0003_auto_20180516_2111.py @@ -0,0 +1,37 @@ +# Generated by Django 2.0 on 2018-05-16 21:11 + +from django.db import migrations, models +import django.db.models.deletion + + +def clean_zones(apps, schema_editor): + ''' + Remove zones with empty element + ''' + Zone = apps.get_model('images', 'Zone') + for zone in Zone.objects.filter(element__isnull=True): + zone.delete() + + +class Migration(migrations.Migration): + + dependencies = [ + ('images', '0002_image_status'), + ] + + operations = [ + migrations.RunPython(clean_zones), + migrations.AlterField( + model_name='zone', + name='element', + field=models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name='zones', + to='documents.Element', + ), + ), + migrations.AlterUniqueTogether( + name='zone', + unique_together=set(), + ), + ] diff --git a/arkindex/images/models.py b/arkindex/images/models.py index 3298ccfc56..5c20a18e97 100644 --- a/arkindex/images/models.py +++ b/arkindex/images/models.py @@ -168,8 +168,6 @@ class Zone(IndexableModel): 'documents.Element', on_delete=models.CASCADE, related_name='zones', - null=True, - blank=True, ) polygon = ArrayField( @@ -178,11 +176,6 @@ class Zone(IndexableModel): size=2) ) - class Meta: - unique_together = ( - ('image', 'polygon'), - ) - @cached_property def box(self): return BoundingBox(self.polygon) diff --git a/arkindex/project/tools.py b/arkindex/project/tools.py index cc9d8ebb9b..f7c0a61cf9 100644 --- a/arkindex/project/tools.py +++ b/arkindex/project/tools.py @@ -71,12 +71,22 @@ class BoundingBox(object): return "BoundingBox({}, {}, {}, {})".format( self.x, self.y, self.width, self.height) + def __eq__(self, other): + return self.x == other.x \ + and self.y == other.y \ + and self.width == other.width \ + and self.height == other.height + + def __hash__(self): + return hash((self.x, self.y, self.width, self.height)) + def to_polygon(self): - points = [(self.x, self.y), - (self.x, self.y + self.height), - (self.x + self.width, self.y + self.height), - (self.x + self.width, self.y)] - return tuple('({},{})'.format(i, j) for i, j in points) + return [ + (self.x, self.y), + (self.x, self.y + self.height), + (self.x + self.width, self.y + self.height), + (self.x + self.width, self.y), + ] def sslify_url(url): -- GitLab