Commit f668efdc authored by Bastien Abadie

Merge branch 'nuke-import-s3' into 'master'

Remove manage.py import_s3

See merge request !1691
parents 4c8d3d9c d4265597
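Deleted file: the import_s3 management command (by Django convention, arkindex/dataimport/management/commands/import_s3.py).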
#!/usr/bin/env python3
import yaml
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from arkindex.dataimport.models import DataImport, DataImportMode
from arkindex.dataimport.utils import get_default_farm_id
from arkindex.project.argparse import CorpusArgument
from arkindex.users.models import User
from ponos.models import Workflow
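# Name of the initial import task, which is also the /data/ subfolder holding the
# per-chunk configuration files, and the folder depth above which a warning is emitted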
IMPORT_NAME = 'import-s3'
MAX_DEPTH_WARN = 3
class Command(BaseCommand):
help = 'Start an S3 import distributed among multiple tasks'
def add_arguments(self, parser):
parser.add_argument(
'--source-bucket',
required=True,
help='AWS S3 import source bucket name',
)
parser.add_argument(
'--corpus',
required=True,
help='Corpus ID or name to import volumes into',
type=CorpusArgument()
)
parser.add_argument(
'--bucket-prefix',
required=True,
help='Bucket subfolder to restrict the volumes import to',
)
parser.add_argument(
'--max-folder-depth',
type=int,
required=True,
help='Maximum recursion depth when exploring subfolders',
)
parser.add_argument(
'--nb-chunks',
type=int,
required=True,
help='''
Number of tasks used for volumes import. A configuration file is written for each
task at /data/<IMPORT_NAME>/chunk_<WORKER_NUMBER>.yml, starting with chunk_1.yml
''',
)
parser.add_argument(
'--dest-bucket',
help='Arkindex S3 bucket to copy images to',
)
parser.add_argument(
'--image-server',
help='Use an existing image server as the destination; no image copy is needed',
)
parser.add_argument(
'--iiif-cache-bucket',
help='Image server cache bucket to populate with pre-computed images',
)
def handle(self, *args, **options):
# --dest-bucket and --image-server are mutually exclusive
dest_bucket_mode = bool(options.get('dest_bucket'))
image_server_mode = bool(options.get('image_server'))
if not dest_bucket_mode ^ image_server_mode:
raise CommandError('Exactly one of the arguments --dest-bucket, --image-server is required')
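# Forward the S3 credentials to every task through the workflow environment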
env_vars = {
'AWS_ACCESS_KEY': settings.AWS_ACCESS_KEY,
'AWS_SECRET_KEY': settings.AWS_SECRET_KEY,
'AWS_ENDPOINT': settings.AWS_ENDPOINT,
'AWS_REGION': settings.AWS_REGION
}
# Ensure the S3 credentials are set before passing them to the tasks
assert env_vars['AWS_ACCESS_KEY'] and env_vars['AWS_SECRET_KEY'], (
'S3 environment variables could not be found\n'
'Please define AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY before starting import'
)
# Warn for high recursion level
depth = options['max_folder_depth']
if depth > MAX_DEPTH_WARN:
self.stderr.write(self.style.WARNING(
f'Maximum folder depth set to {depth}. A high value can considerably slow down the distribution of import tasks'
))
iiif_cache_bucket = options.get('iiif_cache_bucket')
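# Build the command line of the initial task, which lists the source bucket
# and distributes the discovered volumes into per-chunk configuration files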
import_command = (
"python3 -m arkindex_tasks.import_s3 "
"--source-bucket {source_bucket} "
"--corpus-id {corpus.id} "
"--bucket-prefix '{bucket_prefix}' "
"--max-folder-depth {max_folder_depth} "
"--nb-chunks {nb_chunks} "
).format(**options)
if iiif_cache_bucket:
import_command += "--iiif-cache-bucket {} ".format(iiif_cache_bucket)
if dest_bucket_mode:
import_command += "--dest-bucket {}".format(options['dest_bucket'])
else:
import_command += "--image-server {}".format(options['image_server'])
tasks_config = {
IMPORT_NAME: {
'image': settings.ARKINDEX_TASKS_IMAGE,
'command': import_command
}
}
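# Fan out: one volume import task per chunk, each consuming its own chunk_<n>.yml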
for n in range(1, options['nb_chunks'] + 1):
chunk_name = 'import_chunk_{}'.format(n)
tasks_config[chunk_name] = {
'parents': [IMPORT_NAME],
'image': settings.ARKINDEX_TASKS_IMAGE,
'command': 'python3 -m arkindex_tasks.import_s3.volumes_import /data/{}/chunk_{}.yml'
.format(IMPORT_NAME, n)
}
# Add automatic thumbnail generation
tasks_config['thumbnails_chunk_{}'.format(n)] = {
'parents': [chunk_name],
'image': settings.ARKINDEX_TASKS_IMAGE,
'command': 'python3 -m arkindex_tasks.generate_thumbnails /data/{}/elements.json'.format(chunk_name),
}
if not iiif_cache_bucket:
continue
tasks_config['cache_builder_{}'.format(n)] = {
'parents': [chunk_name],
'image': settings.ARKINDEX_TASKS_IMAGE,
'command': 'python3 -m arkindex_tasks.import_s3.build_cantaloupe_cache /data/{}/s3_elements.json'
.format(chunk_name)
}
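# Merge the generated tasks and AWS credentials into the default Ponos recipe,
# then persist it as a Workflow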
recipe = settings.PONOS_RECIPE.copy()
recipe['tasks'] = tasks_config
recipe.setdefault('env', {}).update(env_vars)
workflow = Workflow.objects.create(farm_id=get_default_farm_id(), recipe=yaml.dump(recipe))
self.stdout.write('Created Workflow with id {}'.format(workflow.id))
admin = User.objects.filter(is_admin=True).first()
assert admin is not None, 'No admin user found to create the DataImport'
dataimport = DataImport.objects.create(
workflow_id=workflow.id,
mode=DataImportMode.Images,
corpus_id=options['corpus'].id,
creator_id=admin.id
)
self.stdout.write(self.style.SUCCESS(
'Linked Workflow to DataImport {0} using user {1.email} ({1.id})'
.format(dataimport.id, admin)
))
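For reference, a minimal sketch of how this command could be started from Django code, mirroring the tests below; the bucket, prefix and corpus values are illustrative only:

from django.core.management import call_command

call_command(
    'import_s3',
    source_bucket='my-source-bucket',  # illustrative bucket name
    corpus='My Corpus',                # corpus ID or name, resolved by CorpusArgument
    bucket_prefix='volumes/',          # illustrative subfolder
    max_folder_depth=2,
    nb_chunks=4,
    dest_bucket='arkindex-images',     # mutually exclusive with image_server
)

Deleted file: the unit tests covering the command.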
import yaml
from django.core.management import call_command
from django.core.management.base import CommandError
from django.test import override_settings
from arkindex.dataimport.models import DataImport, DataImportMode
from arkindex.project.tests import FixtureTestCase
class TestImportS3(FixtureTestCase):
@override_settings(
AWS_ACCESS_KEY='username',
AWS_SECRET_KEY='s3kr3t',
AWS_ENDPOINT='http://nowhere/',
AWS_REGION='middle-earth',
ARKINDEX_TASKS_IMAGE='tasks:latest',
PONOS_RECIPE={
'env': {
'SOME_VAR': 'somevalue',
}
}
)
def test_dest_bucket(self):
self.assertFalse(DataImport.objects.exists())
call_command(
'import_s3',
source_bucket='mybucket',
dest_bucket='somewhere',
corpus=self.corpus,
bucket_prefix='A',
max_folder_depth=3,
nb_chunks=2,
)
di = DataImport.objects.get()
self.assertEqual(di.mode, DataImportMode.Images)
self.assertEqual(di.corpus, self.corpus)
self.assertEqual(di.creator, self.superuser)
self.assertIsNone(di.element)
recipe = yaml.safe_load(di.workflow.recipe)
self.assertDictEqual(recipe, {
'env': {
'AWS_ACCESS_KEY': 'username',
'AWS_SECRET_KEY': 's3kr3t',
'AWS_ENDPOINT': 'http://nowhere/',
'AWS_REGION': 'middle-earth',
'SOME_VAR': 'somevalue',
},
'tasks': {
'import-s3': {
'image': 'tasks:latest',
'command': 'python3 -m arkindex_tasks.import_s3 '
'--source-bucket mybucket '
'--corpus-id {} '
"--bucket-prefix 'A' "
'--max-folder-depth 3 '
'--nb-chunks 2 '
'--dest-bucket somewhere'.format(self.corpus.id),
},
'import_chunk_1': {
'image': 'tasks:latest',
'command': 'python3 -m arkindex_tasks.import_s3.volumes_import /data/import-s3/chunk_1.yml',
'parents': ['import-s3'],
},
'import_chunk_2': {
'image': 'tasks:latest',
'command': 'python3 -m arkindex_tasks.import_s3.volumes_import /data/import-s3/chunk_2.yml',
'parents': ['import-s3'],
},
'thumbnails_chunk_1': {
'image': 'tasks:latest',
'command': 'python3 -m arkindex_tasks.generate_thumbnails /data/import_chunk_1/elements.json',
'parents': ['import_chunk_1'],
},
'thumbnails_chunk_2': {
'image': 'tasks:latest',
'command': 'python3 -m arkindex_tasks.generate_thumbnails /data/import_chunk_2/elements.json',
'parents': ['import_chunk_2'],
}
}
})
@override_settings(
AWS_ACCESS_KEY='username',
AWS_SECRET_KEY='s3kr3t',
AWS_ENDPOINT='http://nowhere/',
AWS_REGION='middle-earth',
ARKINDEX_TASKS_IMAGE='tasks:latest',
)
def test_iiif_cache_bucket(self):
self.assertFalse(DataImport.objects.exists())
call_command(
'import_s3',
source_bucket='mybucket',
image_server=42,
corpus=self.corpus,
bucket_prefix='A',
max_folder_depth=2,
nb_chunks=1,
iiif_cache_bucket='melon_cache',
)
di = DataImport.objects.get()
recipe = yaml.safe_load(di.workflow.recipe)
self.assertDictEqual(recipe.get('tasks'), {
'import-s3': {
'image': 'tasks:latest',
'command': 'python3 -m arkindex_tasks.import_s3 '
'--source-bucket mybucket '
'--corpus-id {} '
"--bucket-prefix 'A' "
'--max-folder-depth 2 '
'--nb-chunks 1 '
'--iiif-cache-bucket melon_cache '
'--image-server 42'.format(self.corpus.id),
},
'import_chunk_1': {
'image': 'tasks:latest',
'command': 'python3 -m arkindex_tasks.import_s3.volumes_import /data/import-s3/chunk_1.yml',
'parents': ['import-s3'],
},
'thumbnails_chunk_1': {
'image': 'tasks:latest',
'command': 'python3 -m arkindex_tasks.generate_thumbnails /data/import_chunk_1/elements.json',
'parents': ['import_chunk_1'],
},
'cache_builder_1': {
'image': 'tasks:latest',
'command': (
'python3 -m arkindex_tasks.import_s3.build_cantaloupe_cache '
'/data/import_chunk_1/s3_elements.json'
),
'parents': ['import_chunk_1'],
}
})
def test_xor_args(self):
with self.assertRaises(CommandError):
call_command(
'import_s3',
source_bucket='mybucket',
dest_bucket='somewhere',
image_server='some_server',
corpus=self.corpus,
bucket_prefix='A',
max_folder_depth=3,
nb_chunks=2,
)