Skip to content
Snippets Groups Projects
Commit 7b38b7e2 authored by kermorvant
Browse files

example script to upload files to arkindex through S3

parent 08da09ac
No related branches found
No related tags found
1 merge request: !46 example script to upload files to arkindex through S3
Pipeline #28215 failed
#!/usr/bin/env python3
"""Import local images in a directory to Arkindex."""
from apistar.exceptions import ErrorResponse
from arkindex import ArkindexClient, options_from_env
import argparse
import glob
import hashlib
import imghdr
import logging
import os
import requests
import uuid
# Image type names, as returned by imghdr.what(), that are accepted for import.
SUPPORTED_IMG = ['jpeg', 'png']

# Emit INFO and above, each message prefixed with its level name.
logging.basicConfig(
    level=logging.INFO,
    format='[%(levelname)s] %(message)s',
)
logger = logging.getLogger(__name__)

# Module-wide API client shared by every class below; main() configures it.
ark_client = ArkindexClient()
class LocalPage(object):
    """A local image to be uploaded and attached to a volume as a page."""

    def __init__(self, page_path, corpus_id, volume_id, index):
        """Init a local image parameters.

        :param page_path: path of the image file on disk.
        :param corpus_id: value sent as 'corpus' when the page is created.
        :param volume_id: value sent as 'parent' when the page is created.
        :param index: value stored (as a string) in the 'folio' metadata.
        :raises OSError: if the image file cannot be opened.
        """
        self.page_path = page_path
        self.page_name = os.path.basename(self.page_path)
        self.corpus_id = corpus_id
        self.volume_id = volume_id
        self.index = index
        # Kept open for hashing and uploading; released by close() / run().
        self.page_file = open(self.page_path, 'rb')
        logger.info('import {}'.format(self.page_path))

    def close(self):
        """Close the underlying image file; calling it again is harmless."""
        self.page_file.close()

    def hash_image(self):
        """Create the hash for image validations.

        :return: hexadecimal MD5 digest of the whole file content.
        """
        md5 = hashlib.md5()
        # Always hash from the start, whatever was read from the file before.
        self.page_file.seek(0)
        for chunk in iter(lambda: self.page_file.read(4096), b""):
            md5.update(chunk)
        return md5.hexdigest()

    def upload_image(self):
        """Upload the image on S3 and validate it.

        The image is first declared with its hash. When the API answers with
        a 400 carrying an existing image 'id', that image is retrieved and
        reused. Any other API error is re-raised.

        :raises ErrorResponse: on an API error other than the 400 above.
        :raises requests.RequestException: if the S3 upload fails.
        """
        logger.info('Creating image for page {}'.format(self.page_path))
        try:
            self.image = ark_client.request('CreateImage', body={'hash': self.hash_image()})
        except ErrorResponse as e:
            if e.status_code != 400 or 'id' not in e.content:
                # Previously swallowed, leaving self.image unset and causing an
                # AttributeError later; let run() report the real API error.
                raise
            self.image = ark_client.request('RetrieveImage', id=e.content['id'])
        else:
            # NOTE(review): the original indentation was lost; upload and
            # validation are assumed to apply to newly created images only.
            # Confirm against the upstream script.
            logger.info('Uploading image from page {} as {}'.format(self.page_path, self.image['id']))
            self.page_file.seek(0)
            response = requests.put(self.image['s3_put_url'], data=self.page_file)
            # Do not mark the image as checked when the upload was rejected.
            response.raise_for_status()
            logger.info('Validating image {}'.format(self.image['id']))
            ark_client.request('PartialUpdateImage', id=self.image['id'], body={'status': 'checked'})

    def create_page(self):
        """Create a page in Arkindex with S3 images.

        Requires upload_image() to have set self.image beforehand.
        """
        logger.info('Creating page {}'.format(self.page_path))
        self.page = ark_client.request('CreateElement', body={
            'corpus': self.corpus_id,
            'type': 'page',
            'parent': self.volume_id,
            'name': self.page_name,
            'image': self.image['id'],
            'metadata': {
                'folio': str(self.index),
            },
        })

    def run(self):
        """Run the import of a local page.

        Failures are logged rather than raised, and the image file is always
        closed afterwards.
        """
        try:
            self.upload_image()
            self.create_page()
        except AssertionError as e:
            logger.error('Failed importing page {}: {}'.format(self.page_path, e))
        except ErrorResponse as e:
            logger.error('Failed importing page {}: {} - {}'.format(
                self.page_path, e.status_code, e.content))
        except requests.RequestException as e:
            logger.error('Failed importing page {}: {}'.format(self.page_path, e))
        finally:
            # One file handle per page would otherwise leak for the whole run.
            self.close()
class LocalVolume():
    """Volume for a local directory."""

    def __init__(self, directory=None, corpus=None, volume_name=None):
        """Init a volume with a local directory.

        :param directory: path of the local directory holding the images.
        :param corpus: corpus identifier; stored as its string form.
        :param volume_name: name given to the created volume.
        """
        self.local_dir = directory
        self.volume_name = volume_name
        self.corpus_id = str(corpus)

    def create_volume(self):
        """Create the corresponding volume in Arkindex."""
        logger.info('Creating volume {}'.format(self.volume_name))
        self.volume = ark_client.request('CreateElement', body={
            'corpus': self.corpus_id,
            'type': 'volume',
            'name': self.volume_name,
        })

    def _list_files(self):
        """Return the regular files directly inside the directory, sorted by path."""
        pattern = os.path.join(self.local_dir, '*')
        # glob order is arbitrary: sort so page indexes are reproducible.
        # Directories are dropped because imghdr.what() cannot open them.
        return sorted(path for path in glob.glob(pattern) if os.path.isfile(path))

    def run(self):
        """Run the local directory import.

        Creates the volume, then imports every supported image as a page.
        Pages are numbered from 1 in sorted path order, and only imported
        images consume an index. API and assertion errors are logged.
        """
        try:
            self.create_volume()
            idx = 1
            for page_path in self._list_files():
                # Check that the file is an image
                if imghdr.what(page_path) in SUPPORTED_IMG:
                    LocalPage(
                        page_path,
                        corpus_id=self.corpus_id,
                        volume_id=self.volume['id'],
                        index=idx,
                    ).run()
                    idx += 1
                else:
                    logger.info('Skip non image file {}'.format(page_path))
        except AssertionError as e:
            logger.error('Failed importing volume {}: {}'.format(self.volume_name, e))
        except ErrorResponse as e:
            logger.error('Failed importing volume {}: {} - {}'.format(self.volume_name, e.status_code, e.content))
def main():
    """Parse the command line, configure the API client and import the volume."""
    cli = argparse.ArgumentParser(description='Import local files from directory')
    cli.add_argument('directory', help='path to local directory to import')
    cli.add_argument(
        '--volume-name',
        required=True,
        help='name of the volume to be created',
    )
    cli.add_argument(
        '--corpus',
        required=True,
        type=uuid.UUID,
        help='UUID of an existing corpus to import into',
    )
    cli.add_argument(
        '--sleep',
        default=0,
        type=float,
        help='Throttle API requests by waiting for a given number of seconds',
    )

    options = vars(cli.parse_args())
    # 'sleep' is a client setting; the remaining keys are LocalVolume arguments.
    throttle = options.pop('sleep')
    ark_client.configure(sleep=throttle, **options_from_env())

    volume = LocalVolume(**options)
    volume.run()


if __name__ == '__main__':
    main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment