diff --git a/arkindex/documents/importer.py b/arkindex/documents/importer.py index 8f98272ca9004dbd45ae79278f7eec1bdef99d0e..7edc7799d4c6728e1fccfb0b64af6b1467166ee8 100644 --- a/arkindex/documents/importer.py +++ b/arkindex/documents/importer.py @@ -1,6 +1,6 @@ from arkindex.documents.models import PageType, PageDirection, Page, ElementType, Element, ElementLink -from arkindex.images.models import Image, ImageServer -from arkindex.images.importer import bulk_zones, bulk_transcriptions +from arkindex.images.models import Image, ImageServer, Zone +from arkindex.images.importer import bulk_transcriptions from abc import ABC, abstractmethod from urllib.parse import urlsplit import re @@ -49,17 +49,17 @@ def import_page(volume, image, register, folio, order): poly = [[0, 0], [image.width, 0], [image.width, image.height], [0, image.height]] zone = image.zones.filter( polygon=poly, - element__type=ElementType.Page + elements__type=ElementType.Page ) # Get the page or build zone and page if zone.exists(): - p = zone.first().element.page + p = zone.first().elements.first().page else: page_type, page_nb, page_direction = parse_folio(folio) p = Page.objects.create( folio=folio, name="Page {0} du volume {1}".format(folio, volume.name), - page_type=page_type, nb=page_nb, direction=page_direction) - p.zones.create(polygon=poly, image=image) + page_type=page_type, nb=page_nb, direction=page_direction, + zone=Zone.objects.create(polygon=poly, image=image)) ElementLink.objects.get_or_create(parent=element, child=p, defaults={'order': order}) return p @@ -109,7 +109,7 @@ class ManifestsImporter(ABC): Parses JSON manifests and annotation data to import them in the database. """ - def __init__(self, imgserv, offline=False): + def __init__(self, imgserv, offline=False, annotations=True): """Initialize a manifest importer `imgserv` can be either one ImageServer or a list of ImageServers.""" if isinstance(imgserv, ImageServer): @@ -119,6 +119,7 @@ class ManifestsImporter(ABC): self.imgserv = imgserv self.offline = offline + self.annotations = annotations # This dictionary associates canvas IDs with images and pages # Filled by parse_manifest ; used by parse_annotation_list @@ -157,7 +158,7 @@ class ManifestsImporter(ABC): stream.seek(0) self.parse_manifest(stream) break - elif value == 'sc:AnnotationList': + elif value == 'sc:AnnotationList' and self.annotations: stream.seek(0) self.parse_annotation_list(stream) break @@ -165,18 +166,23 @@ class ManifestsImporter(ABC): def parse_manifest(self, stream): """Parse a IIIF manifest loaded as a stream.""" # Get this file's volume range ID from the top-most structure - range_id = next(struct['ranges'][0] - for struct in ijson.items(stream, "structures.item") - if struct.get('viewingHint') == "top") - stream.seek(0) - - # Get our volume's structure and label - vol_struct = next(struct for struct in ijson.items(stream, "structures.item") - if struct.get('@id') == range_id) - vol_name = vol_struct['label'] - logger.debug("Creating volume {}".format(vol_name)) + try: + range_id = next(struct['ranges'][0] + for struct in ijson.items(stream, "structures.item") + if struct.get('viewingHint') == "top") + stream.seek(0) + + # Get our volume's structure and label + vol_struct = next(struct for struct in ijson.items(stream, "structures.item") + if struct.get('@id') == range_id) + vol_name = vol_struct['label'] + except StopIteration: + logger.debug("Invalid structures in manifest - using manifest label as volume name") + stream.seek(0) + vol_name = next(ijson.items(stream, 'label')) # Create a volume and a register + logger.debug("Creating volume {}".format(vol_name)) vol, _ = Element.objects.get_or_create(name=vol_name, type=ElementType.Volume) doc, _ = Element.objects.get_or_create(name=vol_name, type=ElementType.Register) ElementLink.objects.get_or_create(parent=vol, child=doc) @@ -283,13 +289,12 @@ class ManifestsImporter(ABC): def save_transcriptions(self): """To optimize transcription parsing, saving and indexing is done in bulk.""" + if len(self.images_transcription_data) < 1: + return + total_zones, total_transcriptions, total_indexes = 0, 0, 0 for (image, page), data in self.images_transcription_data.items(): - new_zones = bulk_zones(image, data) - total_zones += len(new_zones) - logger.debug("Created {0} zones for image {1}".format(len(new_zones), image.path)) - new_transcriptions = bulk_transcriptions(image, page, data) total_transcriptions += len(new_transcriptions) logger.debug("Created {0} transcriptions for image {1}".format(len(new_transcriptions), image.path)) @@ -375,3 +380,19 @@ class LocalManifestsImporter(ManifestsImporter): for path in paths: logger.info("Opening {}".format(path)) yield open(path, 'rb') + + +class URLManifestsImporter(ManifestsImporter): + """Allows importing of remote JSON files.""" + + def __init__(self, imgserv, url, **kwargs): + super().__init__(imgserv, **kwargs) + self.url = url + + def get_json_files(self): + import requests + from io import BytesIO + logger.info("Downloading from {}".format(self.url)) + r = requests.get(self.url) + r.raise_for_status() + yield BytesIO(r.content) diff --git a/arkindex/documents/management/commands/import_annotations.py b/arkindex/documents/management/commands/import_annotations.py new file mode 100644 index 0000000000000000000000000000000000000000..8b1da5430aba7357c2a54e4aac95f674ea42eee4 --- /dev/null +++ b/arkindex/documents/management/commands/import_annotations.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 +from django.core.management.base import BaseCommand, CommandError +from arkindex.images.importer import IndexImporter +from arkindex.documents.models import Element, ElementType +import logging +import re + +logging.basicConfig( + level=logging.INFO, + format='[%(levelname)s] %(message)s', +) + + +class Command(BaseCommand): + help = 'Import annotations from index files (.idx.gz)' + + def add_arguments(self, parser): + parser.add_argument( + 'index_folder', + help='Folder to recursively search for index files', + default='.', + ) + parser.add_argument( + '--volume', + help='ID or exact name of the volume to import annotations in', + required=True, + ) + parser.add_argument( + '--dry-run', + help='Only show the associated images found for all index files, without importing anything', + action='store_true', + default=False, + ) + parser.add_argument( + '--mask', + help="""A mask to identify images from the index file path. Cannot be used with --regex. + Syntax: "something<PATH>something" + <PATH> will be used as the image path.""", + ) + parser.add_argument( + '--regex', + help="""A regex to use as a mask for more complex cases. + Must have only one capturing group. Cannot be used with --mask.""", + default=IndexImporter.DEFAULT_MASK, + ) + + def handle(self, *args, **options): + # Handle verbosity level + verbosity = int(options['verbosity']) + root_logger = logging.getLogger('') + if verbosity > 1: + root_logger.setLevel(logging.DEBUG) + + # Find volume + try: + volume = Element.objects.get(pk=options['volume'], type=ElementType.Volume) + except Exception: + volume = Element.objects.get(name=options['volume'], type=ElementType.Volume) + + # Handle mask + if options['mask'] is not None and options['regex'] != IndexImporter.DEFAULT_MASK: + raise CommandError('--mask and --regex cannot be used simultaneously.') + + mask_regex = options['regex'] + + if options['mask'] is not None: + mask = options['mask'] + assert mask.count('<PATH>') == 1 + # Replace <PATH> with (.+) and escape the rest + mask_regex = '^' + r'(.+)'.join(re.escape(p) for p in mask.split('<PATH>')) + '$' + + importer = IndexImporter(options['index_folder'], volume, mask=mask_regex) + if options['dry_run']: + importer.dry_run() + else: + importer.run() diff --git a/arkindex/documents/management/commands/import_manifest.py b/arkindex/documents/management/commands/import_manifest.py index c6ba39711323a662df9fa2925f57dde5b4dcee45..d74416b4d095d2d1425f06026d5309e5ed646f61 100644 --- a/arkindex/documents/management/commands/import_manifest.py +++ b/arkindex/documents/management/commands/import_manifest.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 from django.core.management.base import BaseCommand, CommandError from arkindex.images.models import ImageServer -from arkindex.documents.importer import LocalManifestsImporter +from arkindex.documents.importer import LocalManifestsImporter, URLManifestsImporter import logging logging.basicConfig( @@ -16,7 +16,7 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument( 'manifest_folder', - help='Folder to recursively search for IIIF manifests', + help='Folder to recursively search for IIIF manifests, or a URL pointing to a manifest', default='.' ) parser.add_argument( @@ -31,9 +31,16 @@ class Command(BaseCommand): default=False, help='Allow importer to make network queries', ) + parser.add_argument( + '--no-annotations', + action='store_false', + default=True, + help='Ignore annotation files', + dest='annotations', + ) def handle(self, *args, **options): - # Handel verbosity level + # Handle verbosity level verbosity = int(options['verbosity']) root_logger = logging.getLogger('') if verbosity > 1: @@ -46,8 +53,20 @@ class Command(BaseCommand): except Exception as e: raise CommandError("Image server not found: {}".format(e)) - LocalManifestsImporter( - servers, - options['manifest_folder'], - offline=options['offline'] - ).run() + # Use the proper importer (URL or local path) + if any(options['manifest_folder'].startswith(scheme) for scheme in ('http://', 'https://',)): + importer = URLManifestsImporter( + servers, + options['manifest_folder'], + offline=options['offline'], + annotations=options['annotations'], + ) + else: + importer = LocalManifestsImporter( + servers, + options['manifest_folder'], + offline=options['offline'], + annotations=options['annotations'], + ) + + importer.run() diff --git a/arkindex/documents/migrations/0002_element_zone.py b/arkindex/documents/migrations/0002_element_zone.py new file mode 100644 index 0000000000000000000000000000000000000000..dd21f3f5f1383636120c4fba3c7cbca905a0b57c --- /dev/null +++ b/arkindex/documents/migrations/0002_element_zone.py @@ -0,0 +1,26 @@ +# Generated by Django 2.0 on 2018-05-17 09:30 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('images', '0004_auto_20180517_0930'), + ('documents', '0001_initial'), + ] + + operations = [ + migrations.AddField( + model_name='element', + name='zone', + field=models.ForeignKey( + blank=True, + null=True, + on_delete=django.db.models.deletion.CASCADE, + related_name='elements', + to='images.Zone' + ), + ), + ] diff --git a/arkindex/documents/models.py b/arkindex/documents/models.py index 2bb1c08696ee949fc7d20b1d6b9b5943a9b95eab..2834783b1a32d8a8cb7616427799451f7dd9f42d 100644 --- a/arkindex/documents/models.py +++ b/arkindex/documents/models.py @@ -61,6 +61,13 @@ class Element(IndexableModel): parents = models.ManyToManyField('self', through=ElementLink, symmetrical=False) type = EnumField(ElementType, max_length=50) name = models.CharField(max_length=250) + zone = models.ForeignKey( + 'images.Zone', + on_delete=models.CASCADE, + related_name='elements', + null=True, + blank=True, + ) objects = ElementManager() @@ -73,7 +80,7 @@ class Element(IndexableModel): """Get an Image instance corresponding to a thumbnail for this element.""" if self.type == ElementType.Volume: first_page = Page.objects.get_descending(self.id).first() - return first_page and first_page.images.first() + return first_page and first_page.zone.image elif self.type == ElementType.Page: return self.page.images.first() return None diff --git a/arkindex/documents/search.py b/arkindex/documents/search.py index 90f657216a550ba6c220f1a11c95489d0749f32c..ef8d64e17cc9e33f924464bd8bdcf10bd09556d9 100644 --- a/arkindex/documents/search.py +++ b/arkindex/documents/search.py @@ -55,4 +55,4 @@ def search_transcriptions(query): return Transcription.objects \ .filter(id__in=transcription_ids) \ .order_by('-score') \ - .prefetch_related('zones__image__server') + .prefetch_related('zone__image__server') diff --git a/arkindex/documents/serializers.py b/arkindex/documents/serializers.py index 0e8dce9a48f9d8d097ecea46d0d1a54ccd5811ca..0fa5ea89cfbe2e0f9b4ddf0e37f47b95fa803a1d 100644 --- a/arkindex/documents/serializers.py +++ b/arkindex/documents/serializers.py @@ -55,7 +55,7 @@ class PageLightSerializer(serializers.ModelSerializer): """ page_type = EnumField() direction = EnumField() - images = ImageSerializer(many=True) + image = ImageSerializer(source='zone.image') class Meta: model = Page @@ -65,7 +65,7 @@ class PageLightSerializer(serializers.ModelSerializer): 'nb', 'direction', 'display_name', - 'images', + 'image', ) @@ -88,7 +88,7 @@ class SearchResultSerializer(serializers.ModelSerializer): """ Link between objects & their search indexation """ - zones = ZoneSerializer(many=True) + zone = ZoneSerializer() parents = ElementLightSerializer(source='parent_docs', many=True) class Meta: @@ -98,7 +98,7 @@ class SearchResultSerializer(serializers.ModelSerializer): 'text', 'line', 'score', - 'zones', + 'zone', 'parents', ) @@ -111,9 +111,8 @@ class VolumeManifestSerializer(serializers.BaseSerializer): def to_representation(self, volume): assert isinstance(volume, Element) and volume.type == ElementType.Volume assert 'request' in self.context, "A request is required to generate absolute URLs" - zones = [] - for p in Page.objects.get_descending(volume.id): - zones.extend(list(p.zones.all())) + zones = [p.zone for p in Page.objects.get_descending(volume.id)] + canvases = PageZoneCanvasManifestSerializer( zones, context=self.context, @@ -182,8 +181,8 @@ class PageZoneCanvasManifestSerializer(serializers.BaseSerializer): def to_representation(self, zone): assert isinstance(zone, Zone) - assert isinstance(zone.element, Page) - page = zone.element + page = zone.elements.first().page + assert isinstance(page, Page) assert 'request' in self.context, "A request is required to generate absolute URLs" return { "@id": page.build_absolute_url(self.context['request'], 'api:canvas-manifest'), @@ -237,16 +236,14 @@ class PageAnnotationListSerializer(serializers.BaseSerializer): assert isinstance(page, Page) assert 'request' in self.context, "A request is required to generate absolute URLs" - transcriptions = [] - for zone in page.zones.all(): - transcriptions.extend(Transcription.objects.filter(zones__in=zone.image.zones.all())) - return { "@context": settings.IIIF_PRESENTATION_CONTEXT, "@id": self.context['request'].build_absolute_uri(), "@type": "sc:AnnotationList", "resources": TranscriptionAnnotationSerializer( - transcriptions, context=self.context, many=True).data + Transcription.objects.get_descending(page.id), + context=self.context, many=True + ).data } @@ -261,12 +258,11 @@ class TranscriptionAnnotationSerializer(serializers.BaseSerializer): tid = ts.build_absolute_url( self.context['request'], 'api:transcription-manifest', id_argument='page_pk', transcription_pk=ts.id) - zone = ts.zones.first() return { "@id": tid, "@type": "oa:Annotation", "motivation": "sc:painting", - "on": zone.image.get_thumbnail_url() + "#xywh={0.x},{0.y},{0.width},{0.height}".format(zone.box), + "on": ts.zone.image.get_thumbnail_url() + "#xywh={0.x},{0.y},{0.width},{0.height}".format(ts.zone.box), "resource": { "@id": tid, "@type": "cnt:ContentAsText", diff --git a/arkindex/documents/tests.py b/arkindex/documents/tests.py index ec397b83814cb5d5aeea9de276f24951545fe359..ff88cafe12ff638df4ee3d1495f0a4814141907b 100644 --- a/arkindex/documents/tests.py +++ b/arkindex/documents/tests.py @@ -74,11 +74,11 @@ class TestVolumeManifestSerializer(APITestCase): self.imgsrv = ImageServer.objects.create(name="Test Server", url="http://server") self.img1 = Image.objects.create(path='img1', width=1337, height=42, server=self.imgsrv) self.img2 = Image.objects.create(path='img2', width=255, height=420, server=self.imgsrv) + self.z1 = Zone.objects.create(polygon=[[0, 0], [1337, 0], [1337, 42], [42, 0]], image=self.img1) + self.z2 = Zone.objects.create(polygon=[[0, 0], [255, 0], [255, 420], [0, 420]], image=self.img2) self.vol = Element.objects.create(name="Volume Name", type=ElementType.Volume) - self.p1 = Page.objects.create(name="p1", folio="p1") - self.p2 = Page.objects.create(name="p2", folio="p2") - Zone.objects.create(polygon=[[0, 0], [1337, 0], [1337, 42], [42, 0]], image=self.img1, element=self.p1) - Zone.objects.create(polygon=[[0, 0], [255, 0], [255, 420], [0, 420]], image=self.img2, element=self.p2) + self.p1 = Page.objects.create(name="p1", folio="p1", zone=self.z1) + self.p2 = Page.objects.create(name="p2", folio="p2", zone=self.z2) ElementLink.objects.create(parent=self.vol, child=self.p1, order=0) ElementLink.objects.create(parent=self.vol, child=self.p2, order=1) @@ -176,15 +176,12 @@ class TestPageAnnotationListSerializer(APITestCase): # Create a page and an image with some transcriptions self.imgsrv = ImageServer.objects.create(name="Test Server", url="http://server") self.img = Image.objects.create(path='img', width=1337, height=42, server=self.imgsrv) - self.page = Page.objects.create(name="page", folio="page") - Zone.objects.create(polygon=[[0, 0], [1337, 0], [1337, 42], [42, 0]], - image=self.img, element=self.page) + pagezone = Zone.objects.create(polygon=[[0, 0], [1337, 0], [1337, 42], [42, 0]], image=self.img) + self.page = Page.objects.create(name="page", folio="page", zone=pagezone) self.z1 = Zone.objects.create(polygon=[[100, 200], [100, 300], [300, 300], [300, 200]], image=self.img) self.z2 = Zone.objects.create(polygon=[[50, 100], [50, 150], [150, 150], [150, 100]], image=self.img) - self.t1 = Transcription.objects.create(text="AAA") - self.t2 = Transcription.objects.create(text="BBB") - self.t1.zones.add(self.z1) - self.t2.zones.add(self.z2) + self.t1 = Transcription.objects.create(text="AAA", zone=self.z1) + self.t2 = Transcription.objects.create(text="BBB", zone=self.z2) ElementLink.objects.create(parent=self.page, child=self.t1, order=0) ElementLink.objects.create(parent=self.page, child=self.t2, order=1) diff --git a/arkindex/images/importer.py b/arkindex/images/importer.py index 05bc8903f12098b5482743647ade295be817dbe9..4787050c6497df7d8665cd2dff27690bdda70392 100644 --- a/arkindex/images/importer.py +++ b/arkindex/images/importer.py @@ -1,10 +1,14 @@ from arkindex.images.models import Zone, Image -from arkindex.documents.models import Transcription, ElementLink +from arkindex.documents.models import Transcription, ElementLink, ElementType, Element, Page from arkindex.project.tools import BoundingBox +from collections import namedtuple +from django.db import transaction import os import re import gzip import logging +import fnmatch +import uuid REGEX_INDEX = re.compile( b'^(?:line_(\d+) )?(.+) \d+ ([\de\-\.]+) (\d+) (\d+) (\d+) (\d+)') @@ -12,13 +16,14 @@ REGEX_INDEX = re.compile( logger = logging.getLogger(__name__) -def import_indexes(image, index_path, extension='jpg'): +def import_indexes(image, page, index_path, extension='jpg'): """ Import indexes from file One gzipped index file per image Format: #line word n/a score-[0,1] x1 y1 x2 y2 """ assert isinstance(image, Image) + assert isinstance(page, Page) assert os.path.exists(index_path), \ 'Missing index {}'.format(index_path) assert index_path.endswith('.idx.gz') @@ -31,7 +36,7 @@ def import_indexes(image, index_path, extension='jpg'): continue index = REGEX_INDEX.match(line) if index is None: - logger.warn('Index parsing failed : {}'.format(line)) + logger.warning('Index parsing failed : {}'.format(line)) continue # Build zone @@ -49,12 +54,8 @@ def import_indexes(image, index_path, extension='jpg'): logger.info('Parsed {} lines'.format(len(lines))) - # Create zones - new_zones = bulk_zones(image, lines) - logger.info('Created {} zones'.format(len(new_zones))) - # Create transcriptions - new_transcriptions = bulk_transcriptions(image, lines) + new_transcriptions = bulk_transcriptions(image, page, lines) logger.info('Created {} transcriptions '.format(len(new_transcriptions))) # Index all transcriptions into ES @@ -62,76 +63,170 @@ def import_indexes(image, index_path, extension='jpg'): logger.info('Added {} ES indexes'.format(nb_inserts)) -def bulk_zones(image, items): - """ - Create zones in bulk (one SQL statement) - This is WAY faster - """ - # Load existing zones in images - existing = [ - BoundingBox(p) - for p in image.zones.all().values_list('polygon', flat=True) - ] - existing = [ - (b.x, b.y, b.x + b.width, b.y + b.height) for b in existing - ] - - # Calc needed insert - needed = set([ - (item['x'], item['y'], item['x'] + item['width'], item['y'] + item['height']) - for item in items - ]).difference(existing) - - # Bulk create in db - return Zone.objects.bulk_create([ - Zone(image=image, polygon=[[z[0], z[1]], [z[2], z[1]], [z[2], z[3]], [z[0], z[3]]]) - for z in needed - ]) - - def bulk_transcriptions(image, page, items): """ - Create transcriptions in bulk + Create transcriptions and zones in bulk """ + # Link a transcription data with a bounding box + # This is hashable (box is hashable) + TrBox = namedtuple('TrBox', 'box, line, text, score') + + # Build all TrBox from items + required = { + TrBox( + BoundingBox( + [[i['x'], i['y']], + [i['x'] + i['width'], i['y'] + i['height']]] + ), + int(i['line']), + i['text'], + float(i['score']), + ) + for i in items + } - # Index existing zones, by unique keys + # List all zones in image zones = { - repr(BoundingBox(z[1])): z[0] - for z in image.zones.all().values_list('id', 'polygon') + z.id: z.polygon + for z in image.zones.all() } - # Load existing transcriptions - existing = Transcription.objects.filter( - zones__image=image - ).values_list('zones__id', 'line', 'text', 'score') + # Build all TrBox from existing + existing = { + TrBox( + BoundingBox(tr.zone.polygon), + tr.line, + tr.text, + tr.score, + ) + for tr in Transcription.objects.filter(zone__image=image).prefetch_related('zone') + } - # Calc needed insert - needed = set([ - ( - zones[repr(BoundingBox( - [[i['x'], i['y']], - [i['x'] + i['width'], i['y'] + i['height']]] - ))], - i['line'], - i['text'], - i['score'], + # Calc needed TrBox to build + needed = required.difference(existing) + if not needed: + return [] + + with transaction.atomic(): + + # Raw elements + elements = Element.objects.bulk_create( + Element(type=ElementType.Transcription, zone_id=uuid.uuid4()) + for _ in needed ) - for i in items - ]).difference(existing) - - # Create transcriptions and associate zones - all_ts = [] - for t in needed: - ts = Transcription.objects.create(line=t[1], text=t[2], score=t[3]) - all_ts.append(ts) - z = image.zones.get(id=t[0]) - z.element_id = ts.id - z.save() + + # Build transcriptions & zones instances at the same time + transcriptions, zones = zip(*[ + ( + Transcription( + element_ptr_id=elt.id, + line=n.line, + text=n.text, + score=n.score, + ), + Zone( + id=elt.zone_id, + image=image, + polygon=n.box.to_polygon(), + ) + ) + for elt, n in zip(elements, needed) + ]) + + # Create zones in bulk + Zone.objects.bulk_create(zones) + + # Create transcriptions using a low-level bulk_create + # as multi table is not supported yet by Django + Transcription.objects.none()._batched_insert( + transcriptions, + + # Here is the magic: we need only to insert the fields from documents_transcription + fields=set(Transcription._meta.concrete_fields).difference(Element._meta.concrete_fields), + + # Default + batch_size=None, + ) + + # Create all links between transcription and page max_order_dl = ElementLink.objects.filter(parent=page).order_by('-order').first() max_order = 0 if max_order_dl is None else max_order_dl.order + 1 ElementLink.objects.bulk_create( - ElementLink(parent=page, child=ts, order=i) - for i, ts in enumerate(all_ts, max_order) + ElementLink(parent=page, child=elt, order=i) + for i, elt in enumerate(elements, max_order) ) - return all_ts + return transcriptions + + +class IndexImporter(object): + """Import index files (.idx.gz) as transcriptions.""" + + DEFAULT_MASK = r'(?:.*/)?([^/]+)\.idx\.gz' + + def __init__(self, path, volume, mask=DEFAULT_MASK): + assert os.path.exists(path) + assert isinstance(volume, Element) + assert volume.type == ElementType.Volume + self.path = path + self.volume = volume + self.mask = re.compile(mask) + + logger.debug('Using mask {}'.format(self.mask.pattern)) + logger.debug('Fetching pages for volume {}'.format(str(self.volume))) + self.pages = list(Page.objects.get_descending(self.volume.id).prefetch_related('zone', 'zone__image')) + self.images = [p.zone.image for p in self.pages] + + def get_index_paths(self): + # Support single file & directories + if os.path.isdir(self.path): + for root, _, filenames in os.walk(self.path): + for filename in fnmatch.filter(filenames, "*.idx.gz"): + yield os.path.join(root, filename) + else: + yield os.path.realpath(self.path) + + def get_image(self, path): + try: + image_id = self.mask.findall(path)[0] + logger.debug('Matched {} for path {}'.format(image_id, path)) + except IndexError: + logger.debug('Mask did not match path {}'.format(path)) + raise Image.DoesNotExist + try: + return next(img for img in self.images if image_id in img.path) + except StopIteration: + raise Image.DoesNotExist + + def get_page(self, image): + assert isinstance(image, Image) + try: + return next(p for p in self.pages if p.zone.image == image) + except StopIteration: + raise Page.DoesNotExist + + def run(self): + for index_path in self.get_index_paths(): + logger.info("Parsing index file {}".format(index_path)) + try: + image = self.get_image(index_path) + page = self.get_page(image) + import_indexes(image, page, index_path) + except Image.DoesNotExist: + logger.warning("No associated image found for file {}".format(index_path)) + except Page.DoesNotExist: + logger.warning("No associated page found for file {}".format(index_path)) + except Image.MultipleObjectsReturned: + logger.warning("Multiple associated images found for file {}".format(index_path)) + + def dry_run(self): + for index_path in self.get_index_paths(): + image = None + try: + image = self.get_image(index_path) + except (Image.DoesNotExist, Image.MultipleObjectsReturned): + pass + if image is None: + logger.warning("{}\tFAIL".format(index_path)) + else: + logger.info("{}\t{}".format(index_path, image.path)) diff --git a/arkindex/images/migrations/0003_auto_20180516_2111.py b/arkindex/images/migrations/0003_auto_20180516_2111.py new file mode 100644 index 0000000000000000000000000000000000000000..2157f7fbe57b87db76ebd829f9a5d649a08db369 --- /dev/null +++ b/arkindex/images/migrations/0003_auto_20180516_2111.py @@ -0,0 +1,37 @@ +# Generated by Django 2.0 on 2018-05-16 21:11 + +from django.db import migrations, models +import django.db.models.deletion + + +def clean_zones(apps, schema_editor): + ''' + Remove zones with empty element + ''' + Zone = apps.get_model('images', 'Zone') + for zone in Zone.objects.filter(element__isnull=True): + zone.delete() + + +class Migration(migrations.Migration): + + dependencies = [ + ('images', '0002_image_status'), + ] + + operations = [ + migrations.RunPython(clean_zones), + migrations.AlterField( + model_name='zone', + name='element', + field=models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name='zones', + to='documents.Element', + ), + ), + migrations.AlterUniqueTogether( + name='zone', + unique_together=set(), + ), + ] diff --git a/arkindex/images/migrations/0004_auto_20180517_0930.py b/arkindex/images/migrations/0004_auto_20180517_0930.py new file mode 100644 index 0000000000000000000000000000000000000000..36237dea4b0c91152ff66126689293d47f58e1a7 --- /dev/null +++ b/arkindex/images/migrations/0004_auto_20180517_0930.py @@ -0,0 +1,32 @@ +# Generated by Django 2.0 on 2018-05-17 09:30 + +import arkindex.images.models +from django.db import migrations +import enumfields.fields + + +class Migration(migrations.Migration): + + dependencies = [ + ('images', '0003_auto_20180516_2111'), + ] + + operations = [ + migrations.RemoveField( + model_name='image', + name='elements', + ), + migrations.RemoveField( + model_name='zone', + name='element', + ), + migrations.AlterField( + model_name='image', + name='status', + field=enumfields.fields.EnumField( + default='unchecked', + enum=arkindex.images.models.ImageStatus, + max_length=50 + ), + ), + ] diff --git a/arkindex/images/models.py b/arkindex/images/models.py index 3298ccfc56cb0a3c3ecf588901560912bda8149e..2700fede22d57f164863c35770e5eb024d367a5b 100644 --- a/arkindex/images/models.py +++ b/arkindex/images/models.py @@ -110,7 +110,6 @@ class Image(IndexableModel): path = models.URLField() width = models.PositiveIntegerField(default=0) height = models.PositiveIntegerField(default=0) - elements = models.ManyToManyField('documents.Element', related_name='images', through='Zone') status = EnumField(ImageStatus, default=ImageStatus.Unchecked, max_length=50) class Meta: @@ -140,9 +139,7 @@ class Image(IndexableModel): 'No zones to index' # Load directly all transcriptions - transcriptions = Transcription.objects.filter( - zones__image_id=self.id, - ).prefetch_related('zones') + transcriptions = Transcription.objects.filter(zone__image_id=self.id) # Build raw ElasticSearch insert actions = [{ @@ -164,13 +161,6 @@ class Zone(IndexableModel): A zone on an image """ image = models.ForeignKey(Image, on_delete=models.CASCADE, related_name='zones') - element = models.ForeignKey( - 'documents.Element', - on_delete=models.CASCADE, - related_name='zones', - null=True, - blank=True, - ) polygon = ArrayField( ArrayField( @@ -178,11 +168,6 @@ class Zone(IndexableModel): size=2) ) - class Meta: - unique_together = ( - ('image', 'polygon'), - ) - @cached_property def box(self): return BoundingBox(self.polygon) diff --git a/arkindex/images/tests.py b/arkindex/images/tests.py new file mode 100644 index 0000000000000000000000000000000000000000..bc856c8d60380c16f40f1c433da90279fba6df3f --- /dev/null +++ b/arkindex/images/tests.py @@ -0,0 +1,84 @@ +from django.test import TestCase +from arkindex.documents.models import Page +from arkindex.images.models import ImageServer, Image, Zone, Transcription +from arkindex.images.importer import bulk_transcriptions + + +class TestBulkTranscriptions(TestCase): + """Tests for bulk transcription and zone importing""" + + def setUp(self): + # Create a page and an image + self.imgsrv = ImageServer.objects.create(name="Test Server", url="http://server") + self.img = Image.objects.create(path='img', width=1337, height=42, server=self.imgsrv) + pagezone = Zone.objects.create(polygon=[[0, 0], [1337, 0], [1337, 42], [42, 0]], image=self.img) + self.page = Page.objects.create(name="page", folio="page", zone=pagezone) + + def test_bulk_transcriptions(self): + items = [ + { + 'x': 0, + 'y': 0, + 'width': 100, + 'height': 100, + 'line': '1', + 'text': 'test 1', + 'score': 0.1, + }, + { + 'x': 20, + 'y': 20, + 'width': 100, + 'height': 100, + 'line': '2', + 'text': 'test 2', + 'score': 0.2, + }, + ] + bulk_transcriptions(self.img, self.page, items) + out = Transcription.objects.all().order_by('line') + + self.assertEqual(len(out), 2) + self.assertIsInstance(out[0], Transcription) + self.assertIsInstance(out[1], Transcription) + + self.assertEqual(out[0].line, 1) + self.assertEqual(out[0].text, 'test 1') + self.assertEqual(out[0].score, 0.1) + + self.assertEqual(out[1].line, 2) + self.assertEqual(out[1].text, 'test 2') + self.assertEqual(out[1].score, 0.2) + + self.assertIsNotNone(out[0].zone) + self.assertIsNotNone(out[1].zone) + + self.assertListEqual(out[0].zone.polygon, [[0, 0], [0, 100], [100, 100], [100, 0]]) + self.assertListEqual(out[1].zone.polygon, [[20, 20], [20, 120], [120, 120], [120, 20]]) + + def test_bulk_transcriptions_unique(self): + """Check bulk_transcriptions does not import the same transcriptions twice""" + items = [ + { + 'x': 0, + 'y': 0, + 'width': 100, + 'height': 100, + 'line': '1', + 'text': 'test 1', + 'score': 0.1, + }, + { + 'x': 20, + 'y': 20, + 'width': 100, + 'height': 100, + 'line': '2', + 'text': 'test 2', + 'score': 0.2, + }, + ] + bulk_transcriptions(self.img, self.page, items) + out = bulk_transcriptions(self.img, self.page, items) + + self.assertEqual(len(out), 0) diff --git a/arkindex/project/tools.py b/arkindex/project/tools.py index cc9d8ebb9b85dda0f8779b87a679d3d595d3b591..f7c0a61cf9af00836732e058825031f30d623a9a 100644 --- a/arkindex/project/tools.py +++ b/arkindex/project/tools.py @@ -71,12 +71,22 @@ class BoundingBox(object): return "BoundingBox({}, {}, {}, {})".format( self.x, self.y, self.width, self.height) + def __eq__(self, other): + return self.x == other.x \ + and self.y == other.y \ + and self.width == other.width \ + and self.height == other.height + + def __hash__(self): + return hash((self.x, self.y, self.width, self.height)) + def to_polygon(self): - points = [(self.x, self.y), - (self.x, self.y + self.height), - (self.x + self.width, self.y + self.height), - (self.x + self.width, self.y)] - return tuple('({},{})'.format(i, j) for i, j in points) + return [ + (self.x, self.y), + (self.x, self.y + self.height), + (self.x + self.width, self.y + self.height), + (self.x + self.width, self.y), + ] def sslify_url(url):