diff --git a/arkindex/dataimport/management/commands/import_s3.py b/arkindex/dataimport/management/commands/import_s3.py
deleted file mode 100644
index 036e23db1f8dedb4a062d2da5f226e6e822d5ada..0000000000000000000000000000000000000000
--- a/arkindex/dataimport/management/commands/import_s3.py
+++ /dev/null
@@ -1,154 +0,0 @@
-#!/usr/bin/env python3
-import yaml
-from django.conf import settings
-from django.core.management.base import BaseCommand, CommandError
-
-from arkindex.dataimport.models import DataImport, DataImportMode
-from arkindex.dataimport.utils import get_default_farm_id
-from arkindex.project.argparse import CorpusArgument
-from arkindex.users.models import User
-from ponos.models import Workflow
-
-IMPORT_NAME = 'import-s3'
-MAX_DEPTH_WARN = 3
-
-
-class Command(BaseCommand):
-    help = 'Start a S3 import distributed among multiple tasks'
-
-    def add_arguments(self, parser):
-        parser.add_argument(
-            '--source-bucket',
-            required=True,
-            help='AWS S3 import source bucket name',
-        )
-        parser.add_argument(
-            '--corpus',
-            required=True,
-            help='Corpus ID or name to import Volumes to',
-            type=CorpusArgument()
-        )
-        parser.add_argument(
-            '--bucket-prefix',
-            required=True,
-            help='Bucket subfolder to limit volumes import',
-        )
-        parser.add_argument(
-            '--max-folder-depth',
-            type=int,
-            required=True,
-            help='Recursion level for subfolders exploration',
-        )
-        parser.add_argument(
-            '--nb-chunks',
-            type=int,
-            required=True,
-            help='''
-            Number of tasks used for volumes import. A configuration file is written for each
-            task at /data/<IMAGE_NAME>/chunk_<WORKER_NUMBER>.yml, starting with chunk_1.yml
-            ''',
-        )
-        parser.add_argument(
-            '--dest-bucket',
-            help='Arkindex S3 bucket to copy images to',
-        )
-        parser.add_argument(
-            '--image-server',
-            help='Use image server as dest server. No copy is needed',
-        )
-        parser.add_argument(
-            '--iiif-cache-bucket',
-            help='Image server cache bucket to populate with pre-computed images',
-        )
-
-    def handle(self, *args, **options):
-        # Parser exclusive
-        dest_bucket_mode = bool(options.get('dest_bucket'))
-        image_server_mode = bool(options.get('image_server'))
-        if not dest_bucket_mode ^ image_server_mode:
-            raise CommandError('Exactly one of the arguments --dest-bucket, --image-server is required')
-        env_vars = {
-            'AWS_ACCESS_KEY': settings.AWS_ACCESS_KEY,
-            'AWS_SECRET_KEY': settings.AWS_SECRET_KEY,
-            'AWS_ENDPOINT': settings.AWS_ENDPOINT,
-            'AWS_REGION': settings.AWS_REGION
-        }
-        # Assert s3 information are passed to tasks
-        assert env_vars['AWS_ACCESS_KEY'] and env_vars['AWS_SECRET_KEY'], (
-            'S3 environment variables could not be found\n'
-            'Please define AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY before starting import'
-        )
-
-        # Warn for high recursion level
-        depth = options['max_folder_depth']
-        if depth > MAX_DEPTH_WARN:
-            self.stderr.write(self.style.WARNING(
-                f'Maximum folder depth set to {depth}. A high value can considerably slow tasks import distribution'
-            ))
-
-        iiif_cache_bucket = options.get('iiif_cache_bucket')
-
-        import_command = (
-            "python3 -m arkindex_tasks.import_s3 "
-            "--source-bucket {source_bucket} "
-            "--corpus-id {corpus.id} "
-            "--bucket-prefix '{bucket_prefix}' "
-            "--max-folder-depth {max_folder_depth} "
-            "--nb-chunks {nb_chunks} "
-        ).format(**options)
-        if iiif_cache_bucket:
-            import_command += "--iiif-cache-bucket {} ".format(iiif_cache_bucket)
-        if dest_bucket_mode:
-            import_command += "--dest-bucket {}".format(options['dest_bucket'])
-        else:
-            import_command += "--image-server {}".format(options['image_server'])
-
-        tasks_config = {
-            IMPORT_NAME: {
-                'image': settings.ARKINDEX_TASKS_IMAGE,
-                'command': import_command
-            }
-        }
-        for n in range(1, options['nb_chunks'] + 1):
-            chunk_name = 'import_chunk_{}'.format(n)
-            tasks_config[chunk_name] = {
-                'parents': [IMPORT_NAME],
-                'image': settings.ARKINDEX_TASKS_IMAGE,
-                'command': 'python3 -m arkindex_tasks.import_s3.volumes_import /data/{}/chunk_{}.yml'
-                           .format(IMPORT_NAME, n)
-            }
-            # Add automatic thumbnails generation
-            tasks_config['thumbnails_chunk_{}'.format(n)] = {
-                'parents': [chunk_name],
-                'image': settings.ARKINDEX_TASKS_IMAGE,
-                'command': 'python3 -m arkindex_tasks.generate_thumbnails /data/{}/elements.json'.format(chunk_name),
-            }
-            if not iiif_cache_bucket:
-                continue
-            tasks_config['cache_builder_{}'.format(n)] = {
-                'parents': [chunk_name],
-                'image': settings.ARKINDEX_TASKS_IMAGE,
-                'command': 'python3 -m arkindex_tasks.import_s3.build_cantaloupe_cache /data/{}/s3_elements.json'
-                           .format(chunk_name)
-            }
-
-        recipe = settings.PONOS_RECIPE.copy()
-        recipe['tasks'] = tasks_config
-        recipe.setdefault('env', {}).update(env_vars)
-
-        workflow = Workflow.objects.create(farm_id=get_default_farm_id(), recipe=yaml.dump(recipe))
-        self.stdout.write('Created Workflow with id {}'.format(workflow.id))
-
-        admin = User.objects.filter(is_admin=True).first()
-        assert admin is not None, 'No admin user has been found to create a Dataimport'
-
-        dataimport = DataImport.objects.create(
-            workflow_id=workflow.id,
-            mode=DataImportMode.Images,
-            corpus_id=options['corpus'].id,
-            creator_id=admin.id
-        )
-        self.stdout.write(self.style.SUCCESS(
-            'Linked Workflow to DataImport {0} using user {1.email} ({1.id})'
-            .format(dataimport.id, admin)
-        ))
diff --git a/arkindex/dataimport/tests/commands/test_import_s3.py b/arkindex/dataimport/tests/commands/test_import_s3.py
deleted file mode 100644
index caddec219426901cfafcada9fca92a371600358b..0000000000000000000000000000000000000000
--- a/arkindex/dataimport/tests/commands/test_import_s3.py
+++ /dev/null
@@ -1,152 +0,0 @@
-import yaml
-from django.core.management import call_command
-from django.core.management.base import CommandError
-from django.test import override_settings
-
-from arkindex.dataimport.models import DataImport, DataImportMode
-from arkindex.project.tests import FixtureTestCase
-
-
-class TestImportS3(FixtureTestCase):
-
-    @override_settings(
-        AWS_ACCESS_KEY='username',
-        AWS_SECRET_KEY='s3kr3t',
-        AWS_ENDPOINT='http://nowhere/',
-        AWS_REGION='middle-earth',
-        ARKINDEX_TASKS_IMAGE='tasks:latest',
-        PONOS_RECIPE={
-            'env': {
-                'SOME_VAR': 'somevalue',
-            }
-        }
-    )
-    def test_dest_bucket(self):
-        self.assertFalse(DataImport.objects.exists())
-
-        call_command(
-            'import_s3',
-            source_bucket='mybucket',
-            dest_bucket='somewhere',
-            corpus=self.corpus,
-            bucket_prefix='A',
-            max_folder_depth=3,
-            nb_chunks=2,
-        )
-
-        di = DataImport.objects.get()
-        self.assertEqual(di.mode, DataImportMode.Images)
-        self.assertEqual(di.corpus, self.corpus)
-        self.assertEqual(di.creator, self.superuser)
-        self.assertIsNone(di.element)
-
-        recipe = yaml.safe_load(di.workflow.recipe)
-        self.assertDictEqual(recipe, {
-            'env': {
-                'AWS_ACCESS_KEY': 'username',
-                'AWS_SECRET_KEY': 's3kr3t',
-                'AWS_ENDPOINT': 'http://nowhere/',
-                'AWS_REGION': 'middle-earth',
-                'SOME_VAR': 'somevalue',
-            },
-            'tasks': {
-                'import-s3': {
-                    'image': 'tasks:latest',
-                    'command': 'python3 -m arkindex_tasks.import_s3 '
-                               '--source-bucket mybucket '
-                               '--corpus-id {} '
-                               "--bucket-prefix 'A' "
-                               '--max-folder-depth 3 '
-                               '--nb-chunks 2 '
-                               '--dest-bucket somewhere'.format(self.corpus.id),
-                },
-                'import_chunk_1': {
-                    'image': 'tasks:latest',
-                    'command': 'python3 -m arkindex_tasks.import_s3.volumes_import /data/import-s3/chunk_1.yml',
-                    'parents': ['import-s3'],
-                },
-                'import_chunk_2': {
-                    'image': 'tasks:latest',
-                    'command': 'python3 -m arkindex_tasks.import_s3.volumes_import /data/import-s3/chunk_2.yml',
-                    'parents': ['import-s3'],
-                },
-                'thumbnails_chunk_1': {
-                    'image': 'tasks:latest',
-                    'command': 'python3 -m arkindex_tasks.generate_thumbnails /data/import_chunk_1/elements.json',
-                    'parents': ['import_chunk_1'],
-                },
-                'thumbnails_chunk_2': {
-                    'image': 'tasks:latest',
-                    'command': 'python3 -m arkindex_tasks.generate_thumbnails /data/import_chunk_2/elements.json',
-                    'parents': ['import_chunk_2'],
-                }
-            }
-        })
-
-    @override_settings(
-        AWS_ACCESS_KEY='username',
-        AWS_SECRET_KEY='s3kr3t',
-        AWS_ENDPOINT='http://nowhere/',
-        AWS_REGION='middle-earth',
-        ARKINDEX_TASKS_IMAGE='tasks:latest',
-    )
-    def test_iiif_cache_bucket(self):
-        self.assertFalse(DataImport.objects.exists())
-
-        call_command(
-            'import_s3',
-            source_bucket='mybucket',
-            image_server=42,
-            corpus=self.corpus,
-            bucket_prefix='A',
-            max_folder_depth=2,
-            nb_chunks=1,
-            iiif_cache_bucket='melon_cache',
-        )
-
-        di = DataImport.objects.get()
-        recipe = yaml.safe_load(di.workflow.recipe)
-        self.assertDictEqual(recipe.get('tasks'), {
-            'import-s3': {
-                'image': 'tasks:latest',
-                'command': 'python3 -m arkindex_tasks.import_s3 '
-                           '--source-bucket mybucket '
-                           '--corpus-id {} '
-                           "--bucket-prefix 'A' "
-                           '--max-folder-depth 2 '
-                           '--nb-chunks 1 '
-                           '--iiif-cache-bucket melon_cache '
-                           '--image-server 42'.format(self.corpus.id),
-            },
-            'import_chunk_1': {
-                'image': 'tasks:latest',
-                'command': 'python3 -m arkindex_tasks.import_s3.volumes_import /data/import-s3/chunk_1.yml',
-                'parents': ['import-s3'],
-            },
-            'thumbnails_chunk_1': {
-                'image': 'tasks:latest',
-                'command': 'python3 -m arkindex_tasks.generate_thumbnails /data/import_chunk_1/elements.json',
-                'parents': ['import_chunk_1'],
-            },
-            'cache_builder_1': {
-                'image': 'tasks:latest',
-                'command': (
-                    'python3 -m arkindex_tasks.import_s3.build_cantaloupe_cache '
-                    '/data/import_chunk_1/s3_elements.json'
-                ),
-                'parents': ['import_chunk_1'],
-            }
-        })
-
-    def test_xor_args(self):
-        with self.assertRaises(CommandError):
-            call_command(
-                'import_s3',
-                source_bucket='mybucket',
-                dest_bucket='somewhere',
-                image_server='some_server',
-                corpus=self.corpus,
-                bucket_prefix='A',
-                max_folder_depth=3,
-                nb_chunks=2,
-            )