Skip to content
Snippets Groups Projects
Commit e42911d4 authored by Bastien Abadie's avatar Bastien Abadie Committed by Erwan Rouchet
Browse files

Remove bulk_zones

parent 2ea3ed96
No related branches found
No related tags found
1 merge request: !22 Add score to transcriptions
from arkindex.documents.models import PageType, PageDirection, Page, ElementType, Element, ElementLink
from arkindex.images.models import Image, ImageServer
from arkindex.images.importer import bulk_zones, bulk_transcriptions
from arkindex.images.importer import bulk_transcriptions
from abc import ABC, abstractmethod
from urllib.parse import urlsplit
import re
......@@ -295,10 +295,6 @@ class ManifestsImporter(ABC):
total_zones, total_transcriptions, total_indexes = 0, 0, 0
for (image, page), data in self.images_transcription_data.items():
new_zones = bulk_zones(image, data)
total_zones += len(new_zones)
logger.debug("Created {0} zones for image {1}".format(len(new_zones), image.path))
new_transcriptions = bulk_transcriptions(image, page, data)
total_transcriptions += len(new_transcriptions)
logger.debug("Created {0} transcriptions for image {1}".format(len(new_transcriptions), image.path))
......
from arkindex.images.models import Zone, Image
from arkindex.documents.models import Transcription, ElementLink, ElementType, Element, Page
from arkindex.project.tools import BoundingBox
from collections import namedtuple
from django.db import transaction
import os
import re
import gzip
......@@ -51,10 +53,6 @@ def import_indexes(image, page, index_path, extension='jpg'):
logger.info('Parsed {} lines'.format(len(lines)))
# Create zones
new_zones = bulk_zones(image, lines)
logger.info('Created {} zones'.format(len(new_zones)))
# Create transcriptions
new_transcriptions = bulk_transcriptions(image, page, lines)
logger.info('Created {} transcriptions '.format(len(new_transcriptions)))
......@@ -64,79 +62,80 @@ def import_indexes(image, page, index_path, extension='jpg'):
logger.info('Added {} ES indexes'.format(nb_inserts))
def bulk_zones(image, items):
    """
    Insert the zones missing from an image with a single bulk query.

    Each item is read through its 'x', 'y', 'width' and 'height' keys and
    turned into an (x1, y1, x2, y2) rectangle. Rectangles already stored on
    the image are left out; the others are created in one bulk_create call,
    whose result is returned.
    """
    def corners(x, y, width, height):
        # Same rectangle, expressed by its two opposite corners
        return (x, y, x + width, y + height)

    # Rectangles already stored for this image
    stored_boxes = [
        BoundingBox(polygon)
        for polygon in image.zones.all().values_list('polygon', flat=True)
    ]
    known = [
        corners(box.x, box.y, box.width, box.height)
        for box in stored_boxes
    ]

    # Rectangles requested by the caller, without duplicates
    wanted = {
        corners(item['x'], item['y'], item['width'], item['height'])
        for item in items
    }

    # Only the missing rectangles are inserted, all at once
    missing = wanted.difference(known)
    return Zone.objects.bulk_create([
        Zone(
            image=image,
            polygon=[[left, top], [right, top], [right, bottom], [left, bottom]],
        )
        for left, top, right, bottom in missing
    ])
def bulk_transcriptions(image, page, items):
    """
    Create transcriptions and zones in bulk

    Each item is read through its 'x', 'y', 'width', 'height', 'line',
    'text' and 'score' keys. A transcription and its zone on the image are
    only created when no transcription already attached to that image has
    the same bounding box, line, text and score.

    Every new transcription is then linked to the page with an ElementLink
    whose order continues after the highest order already used on the page.

    Returns the list of created transcriptions (empty when nothing is new).
    """
    # Link a transcription data with a bounding box
    # This is hashable (box is hashable)
    TrBox = namedtuple('TrBox', 'box, line, text, score')

    # Build all TrBox from items
    required = {
        TrBox(
            BoundingBox(
                [[i['x'], i['y']],
                 [i['x'] + i['width'], i['y'] + i['height']]]
            ),
            int(i['line']),
            i['text'],
            float(i['score']),
        )
        for i in items
    }

    # Build all TrBox from existing
    existing = {
        TrBox(
            BoundingBox(zone.polygon),
            tr.line,
            tr.text,
            tr.score,
        )
        for tr in Transcription.objects.filter(zones__image=image).prefetch_related('zones')
        for zone in tr.zones.all()
    }

    # Calc needed TrBox to build
    needed = required.difference(existing)
    if not needed:
        # Nothing new: skip the transaction and the ordering query
        return []

    zones = []
    transcriptions = []

    # All writes share one transaction, so a failure cannot leave a
    # transcription without its zone or its link to the page
    with transaction.atomic():

        # Create transcriptions and linked zones
        for n in needed:
            tr = Transcription.objects.create(
                line=n.line,
                text=n.text,
                score=n.score,
            )
            transcriptions.append(tr)
            zones.append(Zone(
                element_id=tr.id,
                image=image,
                polygon=n.box.to_polygon(),
            ))

        # Build zones in bulk
        Zone.objects.bulk_create(zones)

        # Create all links between transcription and page
        max_order_dl = ElementLink.objects.filter(parent=page).order_by('-order').first()
        max_order = 0 if max_order_dl is None else max_order_dl.order + 1
        ElementLink.objects.bulk_create([
            ElementLink(parent=page, child=tr, order=i)
            for i, tr in enumerate(transcriptions, max_order)
        ])

    return transcriptions
class IndexImporter(object):
......
# Generated by Django 2.0 on 2018-05-16 21:11
from django.db import migrations, models
import django.db.models.deletion
def clean_zones(apps, schema_editor):
    '''
    Remove zones with empty element

    The matching rows are deleted through one queryset call instead of
    being loaded and deleted one by one.
    '''
    Zone = apps.get_model('images', 'Zone')
    Zone.objects.filter(element__isnull=True).delete()
class Migration(migrations.Migration):
    '''
    Make the element of a zone mandatory and clear unique_together on zones
    '''

    dependencies = [
        ('images', '0002_image_status'),
    ]

    operations = [
        # Zones without element must be gone before the column stops
        # accepting NULL. Deleted rows cannot be restored, so the reverse
        # step is a no-op: it only keeps the migration reversible.
        migrations.RunPython(clean_zones, migrations.RunPython.noop),
        migrations.AlterField(
            model_name='zone',
            name='element',
            field=models.ForeignKey(
                on_delete=django.db.models.deletion.CASCADE,
                related_name='zones',
                to='documents.Element',
            ),
        ),
        migrations.AlterUniqueTogether(
            name='zone',
            unique_together=set(),
        ),
    ]
......@@ -168,8 +168,6 @@ class Zone(IndexableModel):
'documents.Element',
on_delete=models.CASCADE,
related_name='zones',
null=True,
blank=True,
)
polygon = ArrayField(
......@@ -178,11 +176,6 @@ class Zone(IndexableModel):
size=2)
)
class Meta:
    # A given polygon may only be stored once per image
    unique_together = (('image', 'polygon'),)
@cached_property
def box(self):
    """
    Bounding box built from the polygon of this zone
    """
    polygon = self.polygon
    return BoundingBox(polygon)
......
......@@ -71,12 +71,22 @@ class BoundingBox(object):
return "BoundingBox({}, {}, {}, {})".format(
self.x, self.y, self.width, self.height)
def __eq__(self, other):
    """
    Tell whether two boxes share the same position and size.

    Returns NotImplemented when the other operand is not a box of the same
    kind, so that comparing a box with an unrelated object (None, a string,
    ...) evaluates to False instead of raising AttributeError.
    """
    if not isinstance(other, type(self)):
        return NotImplemented
    return self.x == other.x \
        and self.y == other.y \
        and self.width == other.width \
        and self.height == other.height
def __hash__(self):
    """
    Hash a box from its position and size, the same fields equality uses.
    """
    identity = (self.x, self.y, self.width, self.height)
    return hash(identity)
def to_polygon(self):
    """
    Return the four corners of the box as a list of (x, y) tuples.

    Corners are listed starting from (x, y), then (x, y + height),
    (x + width, y + height) and finally (x + width, y). Coordinates are
    kept as numbers rather than formatted into strings.
    """
    left, top = self.x, self.y
    right, bottom = self.x + self.width, self.y + self.height
    return [
        (left, top),
        (left, bottom),
        (right, bottom),
        (right, top),
    ]
def sslify_url(url):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment