Arkindex / Backend / Commits

Commit f668efdc, authored 2 years ago by Bastien Abadie
Merge branch 'nuke-import-s3' into 'master'

Remove manage.py import_s3

See merge request !1691

Parents: 4c8d3d9c, d4265597
Branches containing commit: no related branches found
Tags containing commit: 1.2.3-rc1
Merge request: !1691 (Remove manage.py import_s3)
Showing 2 changed files with 0 additions and 306 deletions:
- arkindex/dataimport/management/commands/import_s3.py: 0 additions, 154 deletions
- arkindex/dataimport/tests/commands/test_import_s3.py: 0 additions, 152 deletions
arkindex/dataimport/management/commands/import_s3.py (deleted, file mode 100644 → 0): 0 additions, 154 deletions
#!/usr/bin/env python3
import yaml
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError

from arkindex.dataimport.models import DataImport, DataImportMode
from arkindex.dataimport.utils import get_default_farm_id
from arkindex.project.argparse import CorpusArgument
from arkindex.users.models import User
from ponos.models import Workflow

IMPORT_NAME = 'import-s3'
MAX_DEPTH_WARN = 3


class Command(BaseCommand):
    help = 'Start a S3 import distributed among multiple tasks'

    def add_arguments(self, parser):
        parser.add_argument(
            '--source-bucket',
            required=True,
            help='AWS S3 import source bucket name',
        )
        parser.add_argument(
            '--corpus',
            required=True,
            help='Corpus ID or name to import Volumes to',
            type=CorpusArgument()
        )
        parser.add_argument(
            '--bucket-prefix',
            required=True,
            help='Bucket subfolder to limit volumes import',
        )
        parser.add_argument(
            '--max-folder-depth',
            type=int,
            required=True,
            help='Recursion level for subfolders exploration',
        )
        parser.add_argument(
            '--nb-chunks',
            type=int,
            required=True,
            help='''
                Number of tasks used for volumes import. A configuration file is written for each
                task at /data/<IMAGE_NAME>/chunk_<WORKER_NUMBER>.yml, starting with chunk_1.yml
            ''',
        )
        parser.add_argument(
            '--dest-bucket',
            help='Arkindex S3 bucket to copy images to',
        )
        parser.add_argument(
            '--image-server',
            help='Use image server as dest server. No copy is needed',
        )
        parser.add_argument(
            '--iiif-cache-bucket',
            help='Image server cache bucket to populate with pre-computed images',
        )

    def handle(self, *args, **options):
        # Parser exclusive
        dest_bucket_mode = bool(options.get('dest_bucket'))
        image_server_mode = bool(options.get('image_server'))
        if not dest_bucket_mode ^ image_server_mode:
            raise CommandError('Exactly one of the arguments --dest-bucket, --image-server is required')

        env_vars = {
            'AWS_ACCESS_KEY': settings.AWS_ACCESS_KEY,
            'AWS_SECRET_KEY': settings.AWS_SECRET_KEY,
            'AWS_ENDPOINT': settings.AWS_ENDPOINT,
            'AWS_REGION': settings.AWS_REGION
        }

        # Assert s3 information are passed to tasks
        assert env_vars['AWS_ACCESS_KEY'] and env_vars['AWS_SECRET_KEY'], (
            'S3 environment variables could not be found\n'
            'Please define AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY before starting import'
        )

        # Warn for high recursion level
        depth = options['max_folder_depth']
        if depth > MAX_DEPTH_WARN:
            self.stderr.write(self.style.WARNING(
                f'Maximum folder depth set to {depth}. A high value can considerably slow tasks import distribution'
            ))

        iiif_cache_bucket = options.get('iiif_cache_bucket')

        import_command = (
            "python3 -m arkindex_tasks.import_s3 "
            "--source-bucket {source_bucket} "
            "--corpus-id {corpus.id} "
            "--bucket-prefix '{bucket_prefix}' "
            "--max-folder-depth {max_folder_depth} "
            "--nb-chunks {nb_chunks}"
        ).format(**options)

        if iiif_cache_bucket:
            import_command += " --iiif-cache-bucket {}".format(iiif_cache_bucket)

        if dest_bucket_mode:
            import_command += " --dest-bucket {}".format(options['dest_bucket'])
        else:
            import_command += " --image-server {}".format(options['image_server'])

        tasks_config = {
            IMPORT_NAME: {
                'image': settings.ARKINDEX_TASKS_IMAGE,
                'command': import_command
            }
        }

        for n in range(1, options['nb_chunks'] + 1):
            chunk_name = 'import_chunk_{}'.format(n)
            tasks_config[chunk_name] = {
                'parents': [IMPORT_NAME],
                'image': settings.ARKINDEX_TASKS_IMAGE,
                'command': 'python3 -m arkindex_tasks.import_s3.volumes_import /data/{}/chunk_{}.yml'.format(IMPORT_NAME, n)
            }
            # Add automatic thumbnails generation
            tasks_config['thumbnails_chunk_{}'.format(n)] = {
                'parents': [chunk_name],
                'image': settings.ARKINDEX_TASKS_IMAGE,
                'command': 'python3 -m arkindex_tasks.generate_thumbnails /data/{}/elements.json'.format(chunk_name),
            }
            if not iiif_cache_bucket:
                continue
            tasks_config['cache_builder_{}'.format(n)] = {
                'parents': [chunk_name],
                'image': settings.ARKINDEX_TASKS_IMAGE,
                'command': 'python3 -m arkindex_tasks.import_s3.build_cantaloupe_cache /data/{}/s3_elements.json'.format(chunk_name)
            }

        recipe = settings.PONOS_RECIPE.copy()
        recipe['tasks'] = tasks_config
        recipe.setdefault('env', {}).update(env_vars)

        workflow = Workflow.objects.create(farm_id=get_default_farm_id(), recipe=yaml.dump(recipe))
        self.stdout.write('Created Workflow with id {}'.format(workflow.id))

        admin = User.objects.filter(is_admin=True).first()
        assert admin is not None, 'No admin user has been found to create a Dataimport'

        dataimport = DataImport.objects.create(
            workflow_id=workflow.id,
            mode=DataImportMode.Images,
            corpus_id=options['corpus'].id,
            creator_id=admin.id
        )
        self.stdout.write(self.style.SUCCESS(
            'Linked Workflow to DataImport {0} using user {1.email} ({1.id})'.format(dataimport.id, admin)
        ))
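For context, a minimal sketch of how this management command could be started before its removal, equivalent to running python manage.py import_s3 with the arguments declared above. Every value below is a hypothetical placeholder (bucket names, prefix, chunk count), not taken from the repository; the corpus argument mirrors the tests, which pass a Corpus instance directly.

# Hypothetical invocation sketch of the removed command; all values are placeholders.
# From a shell this was the equivalent of:
#   python manage.py import_s3 --source-bucket my-source-bucket --corpus "My corpus" \
#       --bucket-prefix volumes/ --max-folder-depth 2 --nb-chunks 4 --dest-bucket arkindex-images
from django.core.management import call_command


def start_s3_import(corpus):
    """Start the (now removed) S3 import for the given Corpus instance."""
    call_command(
        'import_s3',
        source_bucket='my-source-bucket',   # hypothetical source bucket
        corpus=corpus,                      # a Corpus instance, as the tests below pass
        bucket_prefix='volumes/',           # hypothetical subfolder restriction
        max_folder_depth=2,
        nb_chunks=4,
        dest_bucket='arkindex-images',      # hypothetical; exclusive with --image-server
    )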
arkindex/dataimport/tests/commands/test_import_s3.py (deleted, file mode 100644 → 0): 0 additions, 152 deletions
import yaml
from django.core.management import call_command
from django.core.management.base import CommandError
from django.test import override_settings

from arkindex.dataimport.models import DataImport, DataImportMode
from arkindex.project.tests import FixtureTestCase


class TestImportS3(FixtureTestCase):

    @override_settings(
        AWS_ACCESS_KEY='username',
        AWS_SECRET_KEY='s3kr3t',
        AWS_ENDPOINT='http://nowhere/',
        AWS_REGION='middle-earth',
        ARKINDEX_TASKS_IMAGE='tasks:latest',
        PONOS_RECIPE={
            'env': {
                'SOME_VAR': 'somevalue',
            }
        }
    )
    def test_dest_bucket(self):
        self.assertFalse(DataImport.objects.exists())
        call_command(
            'import_s3',
            source_bucket='mybucket',
            dest_bucket='somewhere',
            corpus=self.corpus,
            bucket_prefix='A',
            max_folder_depth=3,
            nb_chunks=2,
        )
        di = DataImport.objects.get()
        self.assertEqual(di.mode, DataImportMode.Images)
        self.assertEqual(di.corpus, self.corpus)
        self.assertEqual(di.creator, self.superuser)
        self.assertIsNone(di.element)
        recipe = yaml.safe_load(di.workflow.recipe)
        self.assertDictEqual(recipe, {
            'env': {
                'AWS_ACCESS_KEY': 'username',
                'AWS_SECRET_KEY': 's3kr3t',
                'AWS_ENDPOINT': 'http://nowhere/',
                'AWS_REGION': 'middle-earth',
                'SOME_VAR': 'somevalue',
            },
            'tasks': {
                'import-s3': {
                    'image': 'tasks:latest',
                    'command': 'python3 -m arkindex_tasks.import_s3 '
                               '--source-bucket mybucket '
                               '--corpus-id {} '
                               "--bucket-prefix 'A' "
                               '--max-folder-depth 3 '
                               '--nb-chunks 2 '
                               '--dest-bucket somewhere'.format(self.corpus.id),
                },
                'import_chunk_1': {
                    'image': 'tasks:latest',
                    'command': 'python3 -m arkindex_tasks.import_s3.volumes_import /data/import-s3/chunk_1.yml',
                    'parents': ['import-s3'],
                },
                'import_chunk_2': {
                    'image': 'tasks:latest',
                    'command': 'python3 -m arkindex_tasks.import_s3.volumes_import /data/import-s3/chunk_2.yml',
                    'parents': ['import-s3'],
                },
                'thumbnails_chunk_1': {
                    'image': 'tasks:latest',
                    'command': 'python3 -m arkindex_tasks.generate_thumbnails /data/import_chunk_1/elements.json',
                    'parents': ['import_chunk_1'],
                },
                'thumbnails_chunk_2': {
                    'image': 'tasks:latest',
                    'command': 'python3 -m arkindex_tasks.generate_thumbnails /data/import_chunk_2/elements.json',
                    'parents': ['import_chunk_2'],
                }
            }
        })

    @override_settings(
        AWS_ACCESS_KEY='username',
        AWS_SECRET_KEY='s3kr3t',
        AWS_ENDPOINT='http://nowhere/',
        AWS_REGION='middle-earth',
        ARKINDEX_TASKS_IMAGE='tasks:latest',
    )
    def test_iiif_cache_bucket(self):
        self.assertFalse(DataImport.objects.exists())
        call_command(
            'import_s3',
            source_bucket='mybucket',
            image_server=42,
            corpus=self.corpus,
            bucket_prefix='A',
            max_folder_depth=2,
            nb_chunks=1,
            iiif_cache_bucket='melon_cache',
        )
        di = DataImport.objects.get()
        recipe = yaml.safe_load(di.workflow.recipe)
        self.assertDictEqual(recipe.get('tasks'), {
            'import-s3': {
                'image': 'tasks:latest',
                'command': 'python3 -m arkindex_tasks.import_s3 '
                           '--source-bucket mybucket '
                           '--corpus-id {} '
                           "--bucket-prefix 'A' "
                           '--max-folder-depth 2 '
                           '--nb-chunks 1 '
                           '--iiif-cache-bucket melon_cache '
                           '--image-server 42'.format(self.corpus.id),
            },
            'import_chunk_1': {
                'image': 'tasks:latest',
                'command': 'python3 -m arkindex_tasks.import_s3.volumes_import /data/import-s3/chunk_1.yml',
                'parents': ['import-s3'],
            },
            'thumbnails_chunk_1': {
                'image': 'tasks:latest',
                'command': 'python3 -m arkindex_tasks.generate_thumbnails /data/import_chunk_1/elements.json',
                'parents': ['import_chunk_1'],
            },
            'cache_builder_1': {
                'image': 'tasks:latest',
                'command': (
                    'python3 -m arkindex_tasks.import_s3.build_cantaloupe_cache '
                    '/data/import_chunk_1/s3_elements.json'
                ),
                'parents': ['import_chunk_1'],
            }
        })

    def test_xor_args(self):
        with self.assertRaises(CommandError):
            call_command(
                'import_s3',
                source_bucket='mybucket',
                dest_bucket='somewhere',
                image_server='some_server',
                corpus=self.corpus,
                bucket_prefix='A',
                max_folder_depth=3,
                nb_chunks=2,
            )
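The two recipes asserted above follow a simple fan-out: one import-s3 task, plus one volume import task and one thumbnail task per chunk, and one cache builder task per chunk when an IIIF cache bucket is given. A small illustrative helper, not part of the Arkindex codebase, that enumerates the resulting task names for a given chunk count:

# Illustrative only, not part of the Arkindex codebase: lists the task names
# that the removed command's recipe contains for a given --nb-chunks value.
def expected_task_names(nb_chunks, with_iiif_cache=False):
    names = ['import-s3']
    for n in range(1, nb_chunks + 1):
        names.append('import_chunk_{}'.format(n))
        names.append('thumbnails_chunk_{}'.format(n))
        if with_iiif_cache:
            names.append('cache_builder_{}'.format(n))
    return names


# expected_task_names(2) yields the same task keys (order aside) as asserted in
# test_dest_bucket; expected_task_names(1, with_iiif_cache=True) matches
# test_iiif_cache_bucket.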