#!/usr/bin/env python3
"""Import local images in a directory to Arkindex.

Example script to upload files to Arkindex through S3: a volume element is
created in the target corpus, then every supported image found directly in
the given directory is uploaded and attached to that volume as a page.
"""
# Provenance: commands/import_local_files.py, added by commit
# 7b38b7e23190631c15ccce17a6c11ea64af4d1e0
# ("example script to upload files to arkindex through S3"),
# kermorvant <kermorvant@gmail.com>, Fri, 23 Aug 2019 09:54:33 +0200.

import argparse
import glob
import hashlib
import logging
import os
import uuid

import requests
from apistar.exceptions import ErrorResponse
from arkindex import ArkindexClient, options_from_env

try:
    import imghdr
except ImportError:
    # imghdr was removed from the standard library in Python 3.13 (PEP 594);
    # _image_type() falls back to a minimal signature check in that case.
    imghdr = None

SUPPORTED_IMG = ['jpeg', 'png']

# File signatures used only when imghdr is unavailable.
_PNG_SIGNATURE = b'\x89PNG\r\n\x1a\n'
_JPEG_SIGNATURE = b'\xff\xd8\xff'

logging.basicConfig(
    format='[%(levelname)s] %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

ark_client = ArkindexClient()


def _image_type(path):
    """Return 'jpeg', 'png', another imghdr type name, or None for *path*."""
    if imghdr is not None:
        return imghdr.what(path)
    with open(path, 'rb') as stream:
        header = stream.read(len(_PNG_SIGNATURE))
    if header.startswith(_PNG_SIGNATURE):
        return 'png'
    if header.startswith(_JPEG_SIGNATURE):
        return 'jpeg'
    return None


class LocalPage(object):
    """A local image to be uploaded."""

    def __init__(self, page_path, corpus_id, volume_id, index):
        """Init a local image parameters.

        page_path: path of the image file; it is opened for binary reading.
        corpus_id: ID of the corpus the page is created in.
        volume_id: ID of the parent volume element.
        index: position of the page, stored as the 'folio' metadata.
        """
        self.page_path = page_path
        self.page_name = os.path.basename(self.page_path)
        self.page_file = open(self.page_path, 'rb')
        self.corpus_id = corpus_id
        self.volume_id = volume_id
        self.index = index
        logger.info('import %s', self.page_path)

    def close(self):
        """Close the underlying image file; safe to call more than once."""
        if not self.page_file.closed:
            self.page_file.close()

    def hash_image(self):
        """Create the hash for image validations.

        Returns the hexadecimal MD5 digest of the whole file.
        """
        # Always hash from the start, whatever was read before.
        self.page_file.seek(0)
        md5 = hashlib.md5()
        for chunk in iter(lambda: self.page_file.read(4096), b""):
            md5.update(chunk)
        return md5.hexdigest()

    def upload_image(self):
        """Upload the image on S3 and validate it.

        Raises ErrorResponse when the API rejects a request, and
        requests.RequestException when the S3 upload fails.
        """
        logger.info('Creating image for page %s', self.page_path)
        try:
            self.image = ark_client.request('CreateImage', body={'hash': self.hash_image()})
        except ErrorResponse as e:
            # NOTE(review): a 400 carrying an 'id' presumably means an image
            # with this hash already exists; confirm against the API docs.
            if e.status_code == 400 and 'id' in e.content:
                self.image = ark_client.request('RetrieveImage', id=e.content['id'])
            else:
                # Any other API error leaves no image to work with: propagate
                # it instead of failing later on a missing attribute.
                raise
        else:
            logger.info('Uploading image from page %s as %s', self.page_path, self.image['id'])
            self.page_file.seek(0)
            response = requests.put(self.image['s3_put_url'], data=self.page_file)
            # Do not mark the image as checked when the upload was rejected.
            response.raise_for_status()

        logger.info('Validating image %s', self.image['id'])
        ark_client.request('PartialUpdateImage', id=self.image['id'], body={'status': 'checked'})

    def create_page(self):
        """Create a page in Arkindex with S3 images."""
        logger.info('Creating page %s', self.page_path)
        self.page = ark_client.request('CreateElement', body={
            'corpus': self.corpus_id,
            'type': 'page',
            'parent': self.volume_id,
            'name': self.page_name,
            'image': self.image['id'],
            'metadata': {
                'folio': str(self.index),
            },
        })

    def run(self):
        """Run the import of a local page.

        Failures are logged rather than raised so that one bad page does not
        stop the import of the remaining ones. The image file is closed when
        the import ends, whether it succeeded or not.
        """
        try:
            self.upload_image()
            self.create_page()
        except AssertionError as e:
            logger.error('Failed importing page %s: %s', self.page_path, e)
        except ErrorResponse as e:
            logger.error('Failed importing page %s: %s - %s',
                         self.page_path, e.status_code, e.content)
        except requests.RequestException as e:
            logger.error('Failed importing page %s: %s', self.page_path, e)
        finally:
            self.close()


class LocalVolume():
    """Volume for a local directory."""

    def __init__(self, directory=None, corpus=None, volume_name=None):
        """Init a volume with a local directory.

        directory: path of the local directory holding the images.
        corpus: ID of the target corpus; it is stored as a string.
        volume_name: name given to the created volume.
        """
        self.local_dir = directory
        self.volume_name = volume_name
        self.corpus_id = str(corpus)

    def create_volume(self):
        """Create the corresponding volume in Arkindex."""
        logger.info('Creating volume %s', self.volume_name)
        self.volume = ark_client.request('CreateElement', body={
            'corpus': self.corpus_id,
            'type': 'volume',
            'name': self.volume_name,
        })

    def run(self):
        """Run the local directory import.

        Pages are imported in sorted path order so that folio indices are
        reproducible; entries that are not supported image files are skipped.
        """
        try:
            self.create_volume()
            idx = 1
            # glob.glob() gives no ordering guarantee: sort for stable folios.
            for page_path in sorted(glob.glob(os.path.join(self.local_dir, '*'))):
                # Check that the entry is a regular file holding an image;
                # sub-directories cannot be opened for type detection.
                if not os.path.isfile(page_path) or _image_type(page_path) not in SUPPORTED_IMG:
                    logger.info('Skip non image file %s', page_path)
                    continue
                LocalPage(
                    page_path,
                    corpus_id=self.corpus_id,
                    volume_id=self.volume['id'],
                    index=idx,
                ).run()
                idx += 1
        except AssertionError as e:
            logger.error('Failed importing volume %s: %s', self.volume_name, e)
        except ErrorResponse as e:
            logger.error('Failed importing volume %s: %s - %s',
                         self.volume_name, e.status_code, e.content)


def main():
    """Collect arguments and run."""
    parser = argparse.ArgumentParser(
        description='Import local files from directory',
    )
    parser.add_argument(
        'directory',
        help='path to local directory to import',
    )
    parser.add_argument(
        '--volume-name',
        help='name of the volume to be created',
        required=True,
    )
    parser.add_argument(
        '--corpus',
        help='UUID of an existing corpus to import into',
        type=uuid.UUID,
        required=True,
    )
    parser.add_argument(
        '--sleep',
        help='Throttle API requests by waiting for a given number of seconds',
        type=float,
        default=0,
    )
    args = vars(parser.parse_args())

    # 'sleep' configures the API client; the remaining arguments
    # (directory, volume_name, corpus) are the LocalVolume parameters.
    ark_client.configure(sleep=args.pop('sleep'), **options_from_env())

    LocalVolume(**args).run()


if __name__ == '__main__':
    main()