Skip to content
Snippets Groups Projects
Commit 7b38b7e2 authored by kermorvant's avatar kermorvant
Browse files

example script to upload files to arkindex through S3

parent 08da09ac
Branches import_files
No related tags found
1 merge request: !46 "example script to upload files to arkindex through S3"
Pipeline #28215 failed
#!/usr/bin/env python3
"""Import local images in a directory to Arkindex."""
from apistar.exceptions import ErrorResponse
from arkindex import ArkindexClient, options_from_env
import argparse
import glob
import hashlib
import imghdr
import logging
import os
import requests
import uuid
# Image types, as reported by ``imghdr.what``, that the import accepts.
SUPPORTED_IMG = ['jpeg', 'png']

# Script-wide logging: INFO and above, prefixed by the level name in brackets.
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
logger = logging.getLogger(__name__)

# Single API client shared by every class below; main() configures it
# before any request is sent.
ark_client = ArkindexClient()
class LocalPage(object):
    """A local image file to be imported as a page element.

    The image file is opened in binary mode when the instance is created and
    stays open until :meth:`close` is called. :meth:`run` closes it once the
    import attempt is over, whether it succeeded or not.
    """

    def __init__(self, page_path, corpus_id, volume_id, index):
        """Open the image file and store the parameters of the future page.

        :param page_path: Path to the local image file.
        :param corpus_id: Value sent as ``corpus`` when the page is created.
        :param volume_id: Value sent as ``parent`` when the page is created.
        :param index: Position of the page; sent as its ``folio`` metadata.
        :raises OSError: If the image file cannot be opened.
        """
        self.page_path = page_path
        self.page_name = os.path.basename(self.page_path)
        self.page_file = open(self.page_path, 'rb')
        self.corpus_id = corpus_id
        self.volume_id = volume_id
        self.index = index
        logger.info('import %s', self.page_path)

    def close(self):
        """Close the image file. Calling it several times is harmless."""
        self.page_file.close()

    def hash_image(self):
        """Return the hexadecimal MD5 digest of the whole image file."""
        # Rewind first so the digest always covers the full file, whatever
        # has been read from the handle before.
        self.page_file.seek(0)
        md5 = hashlib.md5()
        for chunk in iter(lambda: self.page_file.read(4096), b""):
            md5.update(chunk)
        return md5.hexdigest()

    def upload_image(self):
        """Create the image through the API, then upload and validate it.

        On success ``self.image`` holds the API representation of the image.

        :raises ErrorResponse: If an API request fails for a reason other
            than the recoverable "400 with an id" answer to ``CreateImage``.
        :raises requests.RequestException: If the upload itself fails.
        """
        logger.info('Creating image for page %s', self.page_path)
        try:
            self.image = ark_client.request('CreateImage', body={'hash': self.hash_image()})
        except ErrorResponse as e:
            if e.status_code != 400 or 'id' not in e.content:
                # Not recoverable: let run() report it instead of carrying on
                # without self.image and failing later with AttributeError.
                raise
            # NOTE(review): a 400 carrying an id presumably means an image
            # with this hash already exists; it is reused as-is, without a
            # new upload or validation - confirm against the API behaviour.
            self.image = ark_client.request('RetrieveImage', id=e.content['id'])
            return

        logger.info('Uploading image from page %s as %s', self.page_path, self.image['id'])
        # hash_image() consumed the handle; send the file from its first byte.
        self.page_file.seek(0)
        response = requests.put(self.image['s3_put_url'], data=self.page_file)
        # Do not mark the image as checked when the upload was rejected.
        response.raise_for_status()

        logger.info('Validating image %s', self.image['id'])
        ark_client.request('PartialUpdateImage', id=self.image['id'], body={'status': 'checked'})

    def create_page(self):
        """Create the page element that references the uploaded image.

        Requires ``self.image`` to be set, i.e. :meth:`upload_image` must
        have succeeded first. The API answer is stored in ``self.page``.
        """
        logger.info('Creating page %s', self.page_path)
        self.page = ark_client.request('CreateElement', body={
            'corpus': self.corpus_id,
            'type': 'page',
            'parent': self.volume_id,
            'name': self.page_name,
            'image': self.image['id'],
            'metadata': {
                'folio': str(self.index),
            },
        })

    def run(self):
        """Import the page: upload its image, then create the element.

        Assertion, API and upload errors are logged rather than raised, so
        that a failing page does not stop the caller. The image file is
        always closed when this method returns.
        """
        try:
            self.upload_image()
            self.create_page()
        except AssertionError as e:
            logger.error('Failed importing page %s: %s', self.page_path, e)
        except ErrorResponse as e:
            logger.error('Failed importing page %s: %s - %s',
                         self.page_path, e.status_code, e.content)
        except requests.RequestException as e:
            logger.error('Failed importing page %s: %s', self.page_path, e)
        finally:
            self.close()
class LocalVolume():
    """A local directory imported as a volume element with one page per image."""

    def __init__(self, directory=None, corpus=None, volume_name=None):
        """Store the import parameters.

        :param directory: Path to the local directory holding the images.
        :param corpus: Identifier of the target corpus; stored as a string.
        :param volume_name: Name given to the created volume.
        """
        self.local_dir = directory
        self.volume_name = volume_name
        self.corpus_id = str(corpus)

    def create_volume(self):
        """Create the volume element; the API answer is kept in ``self.volume``."""
        logger.info('Creating volume %s', self.volume_name)
        self.volume = ark_client.request('CreateElement', body={
            'corpus': self.corpus_id,
            'type': 'volume',
            'name': self.volume_name,
        })

    def _sorted_entries(self):
        """Return the paths directly inside the local directory, sorted."""
        # glob gives no ordering guarantee; sorting keeps page indices stable
        # from one run to the next.
        return sorted(glob.glob(os.path.join(self.local_dir, '*')))

    def run(self):
        """Create the volume, then import every supported image as a page.

        Pages are numbered from 1 in sorted path order; entries that are not
        supported image files are skipped without consuming an index.
        Assertion and API errors are logged rather than raised.
        """
        # Check the directory first so that no volume is created remotely
        # for a path that cannot be read.
        if not self.local_dir or not os.path.isdir(self.local_dir):
            logger.error('Failed importing volume %s: %s is not a directory',
                         self.volume_name, self.local_dir)
            return

        try:
            self.create_volume()
            idx = 1
            for page_path in self._sorted_entries():
                # imghdr.what opens the path, so only probe regular files;
                # sub-directories are skipped like any non-image entry.
                if os.path.isfile(page_path) and imghdr.what(page_path) in SUPPORTED_IMG:
                    LocalPage(
                        page_path,
                        corpus_id=self.corpus_id,
                        volume_id=self.volume['id'],
                        index=idx,
                    ).run()
                    idx += 1
                else:
                    logger.info('Skip non image file %s', page_path)
        except AssertionError as e:
            logger.error('Failed importing volume %s: %s', self.volume_name, e)
        except ErrorResponse as e:
            logger.error('Failed importing volume %s: %s - %s',
                         self.volume_name, e.status_code, e.content)
def main():
    """Parse the command line, configure the API client and run the import."""
    cli = argparse.ArgumentParser(description='Import local files from directory')
    cli.add_argument('directory', help='path to local directory to import')
    cli.add_argument(
        '--volume-name',
        required=True,
        help='name of the volume to be created',
    )
    cli.add_argument(
        '--corpus',
        required=True,
        type=uuid.UUID,
        help='UUID of an existing corpus to import into',
    )
    cli.add_argument(
        '--sleep',
        default=0,
        type=float,
        help='Throttle API requests by waiting for a given number of seconds',
    )

    options = vars(cli.parse_args())
    # --sleep is a client setting; everything left in options is passed on
    # to LocalVolume as keyword arguments.
    throttle = options.pop('sleep')
    ark_client.configure(sleep=throttle, **options_from_env())
    LocalVolume(**options).run()


if __name__ == '__main__':
    main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment