Skip to content
Snippets Groups Projects
Commit e8583341 authored by Bastien Abadie
Browse files

Merge branch 'from-csv' into 'master'

Import volumes from CSV

See merge request !24
parents 764dffda 61c1b6dc
No related branches found
No related tags found
1 merge request: !24 Import volumes from CSV
......@@ -109,9 +109,10 @@ class ManifestsImporter(ABC):
Parses JSON manifests and annotation data to import them in the database.
"""
def __init__(self, imgserv, offline=False, annotations=True):
def __init__(self, imgserv, offline=False, annotations=True, volume_name=None):
"""Initialize a manifest importer
`imgserv` can be either one ImageServer or a list of ImageServers."""
`imgserv` can be either one ImageServer or a list of ImageServers.
When `volume_name` is set, it overrides the manifest volume name."""
if isinstance(imgserv, ImageServer):
self.imgserv = [imgserv]
else:
......@@ -120,6 +121,7 @@ class ManifestsImporter(ABC):
self.offline = offline
self.annotations = annotations
self.volume_name = volume_name
# This dictionary associates canvas IDs with images and pages
# Filled by parse_manifest ; used by parse_annotation_list
......@@ -163,8 +165,9 @@ class ManifestsImporter(ABC):
self.parse_annotation_list(stream)
break
def parse_manifest(self, stream):
"""Parse a IIIF manifest loaded as a stream."""
def _extract_volume_name(self, stream):
if self.volume_name is not None:
return self.volume_name
# Get this file's volume range ID from the top-most structure
try:
range_id = next(struct['ranges'][0]
......@@ -175,11 +178,15 @@ class ManifestsImporter(ABC):
# Get our volume's structure and label
vol_struct = next(struct for struct in ijson.items(stream, "structures.item")
if struct.get('@id') == range_id)
vol_name = vol_struct['label']
return vol_struct['label']
except StopIteration:
logger.debug("Invalid structures in manifest - using manifest label as volume name")
stream.seek(0)
vol_name = next(ijson.items(stream, 'label'))
return next(ijson.items(stream, 'label'))
def parse_manifest(self, stream):
"""Parse a IIIF manifest loaded as a stream."""
vol_name = self._extract_volume_name(stream)
# Create a volume and a register
logger.debug("Creating volume {}".format(vol_name))
......
#!/usr/bin/env python3
"""
Import Himanis volumes from CSV file
"""
from django.core.management.base import BaseCommand
from multiprocessing.pool import Pool
from arkindex.documents.models import Element, ElementType
from arkindex.documents.importer import URLManifestsImporter
from arkindex.images.importer import IndexImporter, GallicaIndexImporter
import os
import django
import csv
import logging
# Configure the root logger when this module is imported.
# The format prints the level and the process name (left-aligned, padded to
# 9 characters); this file runs its imports in a multiprocessing Pool, so the
# process name presumably serves to tell worker output apart.
logging.basicConfig(
format='[%(levelname)s] %(processName)-9s: %(message)s',
level=logging.INFO
)
# Module-level logger used by the helper functions and the command below.
logger = logging.getLogger(__name__)
def import_manifest(url, name):
    """
    Import one manifest from a URL under a given volume name.

    url: URL of the manifest to import
    name: Volume name, handed to the importer as `volume_name`
    """
    logger.info('Importing volume {} from {}'.format(name, url))
    # No image servers are supplied and the importer runs in offline mode.
    importer = URLManifestsImporter([], url, offline=True, volume_name=name)
    importer.run()
def import_annotations(source, raw_path, name, index_root):
    """
    Import the annotations of one volume from its index file(s).

    source: 'bvmm' or 'gallica'; any other value uses the default importer
    raw_path: Raw path to index file
    name: Volume name
    index_root: Root folder for index files

    Raises FileNotFoundError when the resolved index path does not exist.
    """
    # Raw paths may be given relative to the original indexes folder; drop
    # that prefix (and any leading slash) so the path can be re-rooted under
    # index_root. os.path.join would otherwise discard index_root when given
    # an absolute second component.
    legacy_root = '/home/data/indexes'
    if raw_path.startswith(legacy_root):
        raw_path = raw_path[len(legacy_root):]
    index_path = os.path.join(index_root, raw_path.lstrip('/'))

    # Explicit check instead of `assert`, which is removed under `python -O`.
    if not os.path.exists(index_path):
        raise FileNotFoundError(
            'Index path does not exist: {}'.format(index_path)
        )

    volume = Element.objects.get(name=name, type=ElementType.Volume)

    if source == 'bvmm':
        # These index files carry a single uppercase letter suffix before
        # the extension; the mask leaves it out of the captured identifier.
        importer = IndexImporter(
            index_path, volume, mask=r'(?:.*/)?([^/]+)_[A-Z]\.idx\.gz')
    elif source == 'gallica':
        importer = GallicaIndexImporter(index_path, volume)
    else:  # Try anyway
        importer = IndexImporter(index_path, volume)
    importer.run()
class Command(BaseCommand):
    """
    Management command importing volumes listed in a CSV file.

    The first CSV row is treated as a header and skipped. For every other
    row, the columns are read as:
      0: volume name
      1: manifest URL
      2: raw path to the index file(s)
      3: annotation source (passed as `source` to import_annotations)
    """
    help = "Import Himanis volumes from CSV file. May require the Arkindex backend in PATH."

    def add_arguments(self, parser):
        """Declare the CSV path and the optional tuning arguments."""
        parser.add_argument('csv')
        parser.add_argument(
            '-p', '--processes',
            type=int,
            help='Maximum number of worker processes for multiprocessing, defaults to 4',
            default=4,
        )
        parser.add_argument(
            '-i', '--index-root',
            help='Root folder for indexes (/home/data/indexes)',
            default='.',
        )

    def handle(self, *args, **options):
        """Read the CSV, then import all manifests followed by all annotations."""
        # Handle verbosity level
        verbosity = int(options['verbosity'])
        if verbosity > 1:
            logger.setLevel(logging.DEBUG)

        logger.debug('Opening CSV')
        # newline='' is what the csv module requires to parse quoted fields
        # containing line breaks correctly.
        with open(options['csv'], newline='') as csvfile:
            csvreader = csv.reader(csvfile)
            # Skip the header; the default avoids StopIteration on an empty file
            next(csvreader, None)
            # Blank lines come out as empty lists and would make the
            # row[...] lookups below fail, so leave them out.
            data = [row for row in csvreader if row]

        logger.debug('Setting up Django')
        os.environ.setdefault("DJANGO_SETTINGS_MODULE", "arkindex.project.settings")
        django.setup()

        with Pool(options['processes']) as pool:
            # starmap blocks until every task is done, so all manifests are
            # imported before any annotation import starts;
            # import_annotations looks the volume up by name.
            logger.info('Importing all manifests')
            pool.starmap(import_manifest, [(row[1], row[0]) for row in data])
            logger.info('Importing all annotations')
            pool.starmap(
                import_annotations,
                [(row[3], row[2], row[0], options['index_root']) for row in data],
            )
......@@ -38,6 +38,10 @@ class Command(BaseCommand):
help='Ignore annotation files',
dest='annotations',
)
parser.add_argument(
'--volume-name',
help='Override the manifest volume name with a custom name.',
)
def handle(self, *args, **options):
# Handle verbosity level
......@@ -67,6 +71,7 @@ class Command(BaseCommand):
options['manifest_folder'],
offline=options['offline'],
annotations=options['annotations'],
volume_name=options['volume_name'],
)
importer.run()
......@@ -2,6 +2,7 @@ from arkindex.images.models import Zone, Image
from arkindex.documents.models import Transcription, ElementLink, ElementType, Element, Page
from arkindex.project.tools import BoundingBox
from collections import namedtuple
from abc import ABC, abstractmethod
from django.db import transaction
import os
import re
......@@ -111,8 +112,8 @@ def bulk_transcriptions(image, page, items):
# Raw elements
elements = Element.objects.bulk_create(
Element(type=ElementType.Transcription, zone_id=uuid.uuid4())
for _ in needed
Element(type=ElementType.Transcription, name=n.text, zone_id=uuid.uuid4())
for n in needed
)
# Build transcriptions & zones instances at the same time
......@@ -159,23 +160,18 @@ def bulk_transcriptions(image, page, items):
return transcriptions
class IndexImporter(object):
class BaseIndexImporter(ABC):
"""Import index files (.idx.gz) as transcriptions."""
DEFAULT_MASK = r'(?:.*/)?([^/]+)\.idx\.gz'
def __init__(self, path, volume, mask=DEFAULT_MASK):
def __init__(self, path, volume):
assert os.path.exists(path)
assert isinstance(volume, Element)
assert volume.type == ElementType.Volume
self.path = path
self.volume = volume
self.mask = re.compile(mask)
logger.debug('Using mask {}'.format(self.mask.pattern))
logger.debug('Fetching pages for volume {}'.format(str(self.volume)))
self.pages = list(Page.objects.get_descending(self.volume.id).prefetch_related('zone', 'zone__image'))
self.images = [p.zone.image for p in self.pages]
def get_index_paths(self):
# Support single file & directories
......@@ -186,17 +182,9 @@ class IndexImporter(object):
else:
yield os.path.realpath(self.path)
@abstractmethod
def get_image(self, path):
try:
image_id = self.mask.findall(path)[0]
logger.debug('Matched {} for path {}'.format(image_id, path))
except IndexError:
logger.debug('Mask did not match path {}'.format(path))
raise Image.DoesNotExist
try:
return next(img for img in self.images if image_id in img.path)
except StopIteration:
raise Image.DoesNotExist
"Return an Image instance for a given index file path"
def get_page(self, image):
assert isinstance(image, Image)
......@@ -230,3 +218,49 @@ class IndexImporter(object):
logger.warning("{}\tFAIL".format(index_path))
else:
logger.info("{}\t{}".format(index_path, image.path))
class IndexImporter(BaseIndexImporter):
    """Basic index importer with image matching based on a regular expression."""

    # The single capture group is the image identifier: the file name without
    # an optional trailing "_<letters>" suffix and without ".idx.gz".
    # The capture is lazy on purpose: a greedy `[^/]+` swallows the suffix
    # itself, so the optional suffix group would never strip anything.
    DEFAULT_MASK = r'(?:.*/)?([^/]+?)(?:_[a-zA-Z]*)?\.idx\.gz'

    def __init__(self, path, volume, mask=DEFAULT_MASK):
        """
        path: index file or folder, passed to the base importer
        volume: volume element, passed to the base importer
        mask: regular expression whose first group captures the image
              identifier from an index file path
        """
        self.mask = re.compile(mask)
        logger.debug('Using mask {}'.format(self.mask.pattern))
        super().__init__(path, volume)
        # Images of the volume's pages, searched linearly by get_image
        self.images = [p.zone.image for p in self.pages]

    def get_image(self, path):
        """
        Return the first image whose path contains the identifier captured
        by the mask from the given index file path.

        Raises Image.DoesNotExist when the mask does not match the path or
        when no image path contains the identifier.
        """
        try:
            image_id = self.mask.findall(path)[0]
            logger.debug('Matched {} for path {}'.format(image_id, path))
        except IndexError:
            logger.debug('Mask did not match path {}'.format(path))
            raise Image.DoesNotExist

        # Substring match: the identifier only has to appear in the image path
        for img in self.images:
            if image_id in img.path:
                return img
        raise Image.DoesNotExist
class GallicaIndexImporter(BaseIndexImporter):
    """Special importer due to Gallica's complicated URLs"""

    # Captures the number found after the last matching underscore of the
    # index file name, without its leading zeros; an optional lowercase
    # "_suffix" may sit between the number and ".idx.gz".
    REGEX = re.compile(r'.*_0*([0-9]+)(?:_[a-z]+)?\.idx\.gz')

    def __init__(self, path, volume):
        """Build a lookup of the volume's images keyed by the last segment of their path."""
        super().__init__(path, volume)
        by_last_segment = {}
        for page in self.pages:
            image = page.zone.image
            # rpartition('/')[2] is everything after the final slash
            by_last_segment[image.path.rpartition('/')[2]] = image
        self.images = by_last_segment

    def get_image(self, path):
        """
        Return the image stored under 'f<number>', where <number> is
        extracted from the index file path by REGEX.

        Raises Image.DoesNotExist when the path does not match REGEX or when
        no image is registered under the resulting key.
        """
        found = GallicaIndexImporter.REGEX.findall(path)
        if not found:
            logger.debug('Mask did not match path {}'.format(path))
            raise Image.DoesNotExist
        key = 'f' + found[0]
        if key not in self.images:
            raise Image.DoesNotExist
        return self.images[key]
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment