diff --git a/arkindex/documents/importer.py b/arkindex/documents/importer.py index 7edc7799d4c6728e1fccfb0b64af6b1467166ee8..ae83bd675e56dc06d81ad5d91b40e4dc5ba814f4 100644 --- a/arkindex/documents/importer.py +++ b/arkindex/documents/importer.py @@ -109,9 +109,10 @@ class ManifestsImporter(ABC): Parses JSON manifests and annotation data to import them in the database. """ - def __init__(self, imgserv, offline=False, annotations=True): + def __init__(self, imgserv, offline=False, annotations=True, volume_name=None): """Initialize a manifest importer - `imgserv` can be either one ImageServer or a list of ImageServers.""" + `imgserv` can be either one ImageServer or a list of ImageServers. + When `volume_name` is set, it overrides the manifest volume name.""" if isinstance(imgserv, ImageServer): self.imgserv = [imgserv] else: @@ -120,6 +121,7 @@ class ManifestsImporter(ABC): self.offline = offline self.annotations = annotations + self.volume_name = volume_name # This dictionary associates canvas IDs with images and pages # Filled by parse_manifest ; used by parse_annotation_list @@ -163,8 +165,9 @@ class ManifestsImporter(ABC): self.parse_annotation_list(stream) break - def parse_manifest(self, stream): - """Parse a IIIF manifest loaded as a stream.""" + def _extract_volume_name(self, stream): + if self.volume_name is not None: + return self.volume_name # Get this file's volume range ID from the top-most structure try: range_id = next(struct['ranges'][0] @@ -175,11 +178,15 @@ class ManifestsImporter(ABC): # Get our volume's structure and label vol_struct = next(struct for struct in ijson.items(stream, "structures.item") if struct.get('@id') == range_id) - vol_name = vol_struct['label'] + return vol_struct['label'] except StopIteration: logger.debug("Invalid structures in manifest - using manifest label as volume name") stream.seek(0) - vol_name = next(ijson.items(stream, 'label')) + return next(ijson.items(stream, 'label')) + + def parse_manifest(self, stream): + 
"""Parse a IIIF manifest loaded as a stream.""" + vol_name = self._extract_volume_name(stream) # Create a volume and a register logger.debug("Creating volume {}".format(vol_name)) diff --git a/arkindex/documents/management/commands/from_csv.py b/arkindex/documents/management/commands/from_csv.py new file mode 100755 index 0000000000000000000000000000000000000000..fc4de1bf665f01b43f8b0569bd3c03a00473e1e2 --- /dev/null +++ b/arkindex/documents/management/commands/from_csv.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 +""" +Import Himanis volumes from CSV file +""" +from django.core.management.base import BaseCommand +from multiprocessing.pool import Pool +from arkindex.documents.models import Element, ElementType +from arkindex.documents.importer import URLManifestsImporter +from arkindex.images.importer import IndexImporter, GallicaIndexImporter +import os +import django +import csv +import logging + +logging.basicConfig( + format='[%(levelname)s] %(processName)-9s: %(message)s', + level=logging.INFO +) +logger = logging.getLogger(__name__) + + +def import_manifest(url, name): + "Import a manifest from a given URL with a given volume name." + + logger.info('Importing volume {} from {}'.format(name, url)) + URLManifestsImporter([], url, offline=True, volume_name=name).run() + + +def import_annotations(source, raw_path, name, index_root): + """ + Import annotations. 
+ source: 'bvmm' or 'gallica' + raw_path: Raw path to index file + name: Volume name + index_root: Root folder for index files + """ + + if raw_path.startswith('/home/data/indexes'): + raw_path = raw_path[18:] + raw_path = raw_path.lstrip('/') + index_path = os.path.join(index_root, raw_path) + assert os.path.exists(index_path) + + volume = Element.objects.get(name=name, type=ElementType.Volume) + + if source == 'bvmm': + IndexImporter(index_path, volume, mask=r'(?:.*/)?([^/]+)_[A-Z]\.idx\.gz').run() + elif source == 'gallica': + GallicaIndexImporter(index_path, volume).run() + else: # Try anyway + IndexImporter(index_path, volume).run() + + +class Command(BaseCommand): + help = "Import Himanis volumes from CSV file. May require the Arkindex backend in PATH." + + def add_arguments(self, parser): + parser.add_argument('csv') + parser.add_argument( + '-p', '--processes', + type=int, + help='Maximum number of worker processes for multiprocessing, defaults to 4', + default=4, + ) + parser.add_argument( + '-i', '--index-root', + help='Root folder for indexes (/home/data/indexes)', + default='.', + ) + + def handle(self, *args, **options): + # Handle verbosity level + verbosity = int(options['verbosity']) + if verbosity > 1: + logger.setLevel(logging.DEBUG) + + logger.debug('Opening CSV') + with open(options['csv']) as csvfile: + csvreader = csv.reader(csvfile) + next(csvreader) + data = list(csvreader) + + logger.debug('Setting up Django') + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "arkindex.project.settings") + django.setup() + + with Pool(options['processes']) as pool: + logger.info('Importing all manifests') + pool.starmap(import_manifest, [(row[1], row[0]) for row in data]) + logger.info('Importing all annotations') + pool.starmap(import_annotations, [(row[3], row[2], row[0], options['index_root']) for row in data]) diff --git a/arkindex/documents/management/commands/import_manifest.py b/arkindex/documents/management/commands/import_manifest.py index 
d74416b4d095d2d1425f06026d5309e5ed646f61..1cb0016eaab96786210ed6f71bc95964a1b6e529 100644 --- a/arkindex/documents/management/commands/import_manifest.py +++ b/arkindex/documents/management/commands/import_manifest.py @@ -38,6 +38,10 @@ class Command(BaseCommand): help='Ignore annotation files', dest='annotations', ) + parser.add_argument( + '--volume-name', + help='Override the manifest volume name with a custom name.', + ) def handle(self, *args, **options): # Handle verbosity level @@ -67,6 +71,7 @@ class Command(BaseCommand): options['manifest_folder'], offline=options['offline'], annotations=options['annotations'], + volume_name=options['volume_name'], ) importer.run() diff --git a/arkindex/images/importer.py b/arkindex/images/importer.py index 4787050c6497df7d8665cd2dff27690bdda70392..54b4b8ea64a0bdc329e465aa07486f764f0e98ea 100644 --- a/arkindex/images/importer.py +++ b/arkindex/images/importer.py @@ -2,6 +2,7 @@ from arkindex.images.models import Zone, Image from arkindex.documents.models import Transcription, ElementLink, ElementType, Element, Page from arkindex.project.tools import BoundingBox from collections import namedtuple +from abc import ABC, abstractmethod from django.db import transaction import os import re @@ -111,8 +112,8 @@ def bulk_transcriptions(image, page, items): # Raw elements elements = Element.objects.bulk_create( - Element(type=ElementType.Transcription, zone_id=uuid.uuid4()) - for _ in needed + Element(type=ElementType.Transcription, name=n.text, zone_id=uuid.uuid4()) + for n in needed ) # Build transcriptions & zones instances at the same time @@ -159,23 +160,18 @@ def bulk_transcriptions(image, page, items): return transcriptions -class IndexImporter(object): +class BaseIndexImporter(ABC): """Import index files (.idx.gz) as transcriptions.""" - DEFAULT_MASK = r'(?:.*/)?([^/]+)\.idx\.gz' - - def __init__(self, path, volume, mask=DEFAULT_MASK): + def __init__(self, path, volume): assert os.path.exists(path) assert isinstance(volume, 
class IndexImporter(BaseIndexImporter):
    """Basic index importer with image matching based on a regular expression.

    The mask is applied to each index file path; its first match is used as
    an identifier that must appear in the path of one of the volume's images.
    """

    # The capture is non-greedy so that the optional `_letters` suffix is
    # left out of the identifier. A greedy `[^/]+` would swallow the suffix
    # and the optional group would always match the empty string.
    DEFAULT_MASK = r'(?:.*/)?([^/]+?)(?:_[a-zA-Z]*)?\.idx\.gz'

    def __init__(self, path, volume, mask=DEFAULT_MASK):
        """
        path: Index file or folder, checked by the base class
        volume: Volume element, checked by the base class
        mask: Regular expression used to extract an image identifier
        """
        self.mask = re.compile(mask)
        logger.debug('Using mask {}'.format(self.mask.pattern))
        super().__init__(path, volume)
        # self.pages is filled by the base class constructor
        self.images = [p.zone.image for p in self.pages]

    def get_image(self, path):
        """
        Return the first image whose path contains the identifier
        extracted from `path`.

        Raises Image.DoesNotExist when the mask does not match or when
        no image path contains the identifier.
        """
        matches = self.mask.findall(path)
        if not matches:
            logger.debug('Mask did not match path {}'.format(path))
            raise Image.DoesNotExist
        image_id = matches[0]
        logger.debug('Matched {} for path {}'.format(image_id, path))

        for img in self.images:
            if image_id in img.path:
                return img
        raise Image.DoesNotExist
class GallicaIndexImporter(BaseIndexImporter):
    """Index importer matching images by the number found in the index file name.

    Images are indexed by the last segment of their path; an index file
    whose name carries the number N is matched to the image keyed 'fN'.
    """

    REGEX = re.compile(r'.*_0*([0-9]+)(?:_[a-z]+)?\.idx\.gz')

    def __init__(self, path, volume):
        """Build the lookup table from last path segment to image."""
        super().__init__(path, volume)
        by_last_segment = {}
        for page in self.pages:
            image = page.zone.image
            # Keep only what follows the last slash of the image path
            _, _, last_segment = image.path.rpartition('/')
            by_last_segment[last_segment] = image
        self.images = by_last_segment

    def get_image(self, path):
        """
        Return the image matching the number extracted from `path`.

        Raises Image.DoesNotExist when the pattern does not match or when
        no image is registered under the resulting key.
        """
        numbers = GallicaIndexImporter.REGEX.findall(path)
        if not numbers:
            logger.debug('Mask did not match path {}'.format(path))
            raise Image.DoesNotExist

        key = 'f' + numbers[0]
        if key not in self.images:
            raise Image.DoesNotExist
        return self.images[key]