From ca239e78fc183eea6b1876a0ad11b0f00698ceec Mon Sep 17 00:00:00 2001 From: Erwan Rouchet <rouchet@teklia.com> Date: Thu, 17 May 2018 17:11:38 +0200 Subject: [PATCH] Move Gallica importer to backend --- arkindex/images/importer.py | 23 +++++++++++++++++++++++ from_csv.py | 21 --------------------- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/arkindex/images/importer.py b/arkindex/images/importer.py index 377f7a49ea..e28d51e485 100644 --- a/arkindex/images/importer.py +++ b/arkindex/images/importer.py @@ -221,6 +221,7 @@ class BaseIndexImporter(ABC): class IndexImporter(BaseIndexImporter): + """Basic index importer with image matching based on a regular expression.""" DEFAULT_MASK = r'(?:.*/)?([^/]+)\.idx\.gz' @@ -241,3 +242,25 @@ class IndexImporter(BaseIndexImporter): return next(img for img in self.images if image_id in img.path) except StopIteration: raise Image.DoesNotExist + + +class GallicaIndexImporter(BaseIndexImporter): + """Special importer due to Gallica's complicated URLs""" + + REGEX = re.compile(r'.*_0*([0-9]+)(?:_[a-z]+)?\.idx\.gz') + + def __init__(self, path, volume): + super().__init__(path, volume) + self.images = { + p.zone.image.path.rpartition('/')[2]: p.zone.image + for p in self.pages + } + + def get_image(self, path): + try: + return self.images['f' + GallicaIndexImporter.REGEX.findall(path)[0]] + except IndexError: + logger.debug('Mask did not match path {}'.format(path)) + raise Image.DoesNotExist + except KeyError: + raise Image.DoesNotExist diff --git a/from_csv.py b/from_csv.py index 4805a4e517..545474963c 100755 --- a/from_csv.py +++ b/from_csv.py @@ -34,27 +34,6 @@ def import_annotations(source, raw_path, name, index_root): from arkindex.images.models import Image from arkindex.images.importer import BaseIndexImporter, IndexImporter - class GallicaIndexImporter(BaseIndexImporter): - """Special importer due to Gallica's complicated URLs""" - - REGEX = re.compile(r'.*_0*([0-9]+)(?:_[a-z]+)?\.idx\.gz') - - def __init__(self, path, volume): - super().__init__(path, volume) - self.images = { - p.zone.image.path.rpartition('/')[2]: p.zone.image - for p in self.pages - } - - def get_image(self, path): - try: - return self.images['f' + GallicaIndexImporter.REGEX.findall(path)[0]] - except IndexError: - logger.debug('Mask did not match path {}'.format(path)) - raise Image.DoesNotExist - except KeyError: - raise Image.DoesNotExist - if raw_path.startswith('/home/data/indexes'): raw_path = raw_path[18:] raw_path = raw_path.lstrip('/') -- GitLab