Commit 5f5887cd authored by Bastien Abadie

Merge branch 'download-workflow' into 'master'

Download files from S3 before workflows

See merge request !257
parents 5d13e427 7ced172f
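For orientation, here is a minimal sketch of the workflow order this merge request produces, using only names that appear in the diff below. data_import is assumed to be an already-resolved DataImport; the Ponos task wiring, the PDF branch and error handling are simplified.

    import shutil
    import tempfile

    from arkindex.dataimport.tasks import check_images, download_files, populate_volume

    temp_dir = tempfile.mkdtemp(suffix='-ponostmp')

    # 1. Fetch the import's files from S3 into a local temp dir;
    #    returns (DataFile, local_path) pairs for the downloads that succeeded.
    files = download_files(data_import, temp_dir)

    # 2. Validate the local copies: image checks now read from disk instead of S3.
    files = check_images(files)

    # 3. Import the validated files into the volume, then drop the temp dir.
    volume = data_import.get_volume()
    populate_volume(volume, files)
    shutil.rmtree(temp_dir)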
@@ -3,13 +3,20 @@ from django.core.management.base import BaseCommand
 from arkindex_common.ml_tool import MLToolType
 from arkindex.project.argparse import DataImportArgument
 from arkindex.dataimport.models import DataImportMode
-from arkindex.dataimport.tasks import extract_pdf_images, populate_volume, setup_ml_analysis, check_images
+from arkindex.dataimport.tasks import (
+    download_files,
+    check_images,
+    extract_pdf_images,
+    populate_volume,
+    setup_ml_analysis,
+)
 from arkindex.dataimport.git import GitFlow
 from django.conf import settings
 import tempfile
 import json
 import os
 import logging
+import shutil
 
 logging.basicConfig(
     level=logging.INFO,
@@ -39,26 +46,33 @@ class Command(BaseCommand):
         # Fallback to a temp directory while developing
         task_dir = os.environ.get('PONOS_DATA', tempfile.mkdtemp(suffix='-ponos'))
         assert os.path.isdir(task_dir), 'Invalid task dir {}'.format(task_dir)
-        logging.info('Using task dir: {}'.format(task_dir))
+        logger.info('Using task dir: {}'.format(task_dir))
+
+        # Use temp folder for anything that does not need to be shared
+        temp_dir = tempfile.mkdtemp(suffix='-ponostmp')
+        logger.debug('Using temp dir: {}'.format(temp_dir))
 
         pages, files = None, None
+        if data_import.files.exists():
+            files = download_files(data_import, temp_dir)
+
         if data_import.mode == DataImportMode.Images:
             # Validate images from data import
-            files = check_images(data_import)
+            files = check_images(files)
 
         elif data_import.mode == DataImportMode.PDF:
-            assert data_import.files.count() == 1, 'Only one file in PDF mode'
-            pdf_file = data_import.files.first()
+            assert len(files) == 1, 'Only one file in PDF mode'
+            pdf_file, pdf_path = files[0]
             assert pdf_file.content_type == 'application/pdf', 'File is not a PDF'
 
-            # Get images from pdf into the task working dir
-            images = extract_pdf_images(pdf_file, task_dir)
+            # Extract images from the PDF into the task working dir and get their paths
+            images = extract_pdf_images(pdf_file, pdf_path, task_dir)
 
             # Add those images to the volume
             # all linked to the original pdf file
             files = [
-                (pdf_file, img)
-                for img in images
+                (pdf_file, img_path)
+                for img_path in images
             ]
 
         elif data_import.mode == DataImportMode.Repository:
@@ -81,10 +95,12 @@ class Command(BaseCommand):
         # Load or create volume from dataimport
         volume = data_import.get_volume()
-        logging.info('Using volume: {}'.format(volume))
+        logger.info('Using volume: {}'.format(volume))
 
         pages = populate_volume(volume, files)
 
+        shutil.rmtree(temp_dir)
+
         # Setup the analysis process through a json config
         ml_analysis = os.path.join(task_dir, 'ml_analysis.json')
         if os.path.exists(ml_analysis) or not ml_tools:
@@ -202,6 +202,10 @@ class DataFile(models.Model):
         self.s3_object.download_fileobj(b)
         return b
 
+    def download_to(self, path):
+        logger.debug('Downloading file {} from S3'.format(self.staging_path))
+        self.s3_object.download_file(path)
+
 
 class Repository(models.Model):
     id = models.UUIDField(primary_key=True, default=uuid.uuid4)
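For context, DataFile now offers two download helpers: the existing download() returns an in-memory buffer through boto3's download_fileobj, while the new download_to(path) writes straight to a filesystem path through download_file, which is what the download_files task relies on. A small usage sketch; datafile and temp_dir are assumed to come from the calling task and are not part of this diff.

    import os.path

    # In-memory buffer, handy for quick checks on small files
    buf = datafile.download()            # wraps s3_object.download_fileobj()

    # On-disk copy, so later steps (image checks, PDF extraction) work on a real path
    path = os.path.join(temp_dir, datafile.staging_path)
    datafile.download_to(path)           # wraps s3_object.download_file(path)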
 # flake8: noqa
-from arkindex.dataimport.tasks.base import populate_volume, setup_ml_analysis
+from arkindex.dataimport.tasks.base import download_files, populate_volume, setup_ml_analysis
 from arkindex.dataimport.tasks.image import check_images, build_iiif_image
 from arkindex.dataimport.tasks.pdf import extract_pdf_images

 from django.conf import settings
 from arkindex.documents.models import Element, ElementType
 from arkindex.documents.importer import import_page
 from arkindex.dataimport.tasks.image import build_iiif_image
 from arkindex.images.models import ImageServer
-from arkindex.dataimport.models import EventType
+from arkindex.dataimport.models import EventType, DataImport
+from botocore.exceptions import ClientError
 import logging
 import os.path
 
 logger = logging.getLogger(__name__)
 
 
-def populate_volume(volume, files, server_id=settings.LOCAL_IMAGESERVER_ID):
+def download_files(dataimport, dest_dir):
+    assert isinstance(dataimport, DataImport)
+    assert os.access(dest_dir, os.W_OK | os.X_OK), 'Destination directory is read-only'
+
+    datafiles = dataimport.files.all()
+    valid_files = []
+    filecount = len(datafiles)
+    if filecount < 1:
+        logger.info('No files to download - skipping')
+        return []
+
+    for i, datafile in enumerate(datafiles):
+        logger.info('Downloading file {} of {}'.format(i + 1, filecount))
+        path = os.path.join(dest_dir, datafile.staging_path)
+        try:
+            datafile.download_to(path)
+        except (IOError, ClientError) as e:
+            logger.warning('Failed downloading file {} ({}): {}'.format(datafile.name, str(datafile.id), str(e)))
+            continue
+        valid_files.append((datafile, path))
+
+    assert len(valid_files) > 0, 'No files were successfully downloaded'
+    return valid_files
+
+
+def populate_volume(volume, files):
     '''
     Import files into the volume, and post on IIIF server
     TODO: this could use an API endpoint to ingest a new page
@@ -19,17 +47,12 @@ def populate_volume(volume, files, server_id=settings.LOCAL_IMAGESERVER_ID):
     assert isinstance(volume, Element)
     assert volume.type == ElementType.Volume
 
-    try:
-        server = ImageServer.objects.get(id=server_id)
-    except ImageServer.DoesNotExist:
-        raise ValueError('Image server {} does not exist'.format(server_id))
-
     pages, count = [], len(files)
     for i, (data_file, staging_path) in enumerate(files):
         logger.info('Adding page {} of {}'.format(i + 1, count))
 
         # Build local IIIF image
-        img = build_iiif_image(server, volume, staging_path, data_file, suffix=str(i))
+        img = build_iiif_image(volume, staging_path, data_file, suffix=str(i))
 
         # Build page with image
         page = import_page(volume, img, volume.name)
 from PIL import Image
-from arkindex.dataimport.models import DataImport, DataFile
+from arkindex.dataimport.models import DataFile
 from arkindex.images.models import ImageServer, ImageStatus
 from arkindex.documents.models import Element, ElementType
-from django.conf import settings
 import logging
 import os
+from django.conf import settings
 
 logger = logging.getLogger(__name__)
 
 
-def check_images(dataimport):
-    assert isinstance(dataimport, DataImport)
+def check_images(files):
+    assert len(files), 'No files to check'
 
-    datafiles = dataimport.files.all()
     valid_files = []
-    filecount = len(datafiles)
+    filecount = len(files)
 
-    for i, datafile in enumerate(datafiles):
+    for i, (datafile, path) in enumerate(files):
         logger.info("Checking image {} of {}".format(i + 1, filecount))
         try:
-            img = Image.open(datafile.download())
+            img = Image.open(path)
             assert max(img.size) >= 500, "Image {} is too small".format(datafile.name)
         except IOError:
             logger.warn("File {} is not a valid image".format(datafile.name))
@@ -29,17 +28,16 @@ def check_images(dataimport):
             logger.warn(str(e))
             continue
 
-        valid_files.append((datafile, datafile.staging_path))
+        valid_files.append((datafile, path))
 
     assert len(valid_files) > 0, "No valid images in selected files"
     return valid_files
 
 
-def build_iiif_image(server, volume, path, data_file, suffix=None):
+def build_iiif_image(volume, path, data_file, suffix=None):
     '''
-    Import a staging image into the IIIF server local path
+    Import a staging image into the local IIIF server
     '''
-    assert isinstance(server, ImageServer)
     assert isinstance(volume, Element)
     assert volume.type == ElementType.Volume
     assert isinstance(path, str)
@@ -79,7 +77,7 @@ def build_iiif_image(server, volume, path, data_file, suffix=None):
     # Build image
     width, height = pillow_img.size
-    img, _ = server.images.get_or_create(
+    img, _ = ImageServer.objects.local.images.get_or_create(
         path=iiif_path,
         defaults={
             'width': width,
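Note that build_iiif_image (and populate_volume above) no longer receives an explicit ImageServer: the local server is now resolved internally through ImageServer.objects.local, which the updated tests configure via the LOCAL_IMAGESERVER_ID setting. A before/after sketch of the call site, with variable names borrowed from populate_volume:

    # Before: the caller resolved and passed the server itself
    server = ImageServer.objects.get(id=settings.LOCAL_IMAGESERVER_ID)
    img = build_iiif_image(server, volume, staging_path, data_file, suffix=str(i))

    # After: only the volume, the local path and the DataFile are needed
    img = build_iiif_image(volume, staging_path, data_file, suffix=str(i))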
 from unittest.mock import patch, call
 from arkindex.project.tests import FixtureTestCase
+from arkindex.documents.models import ElementType
 from arkindex.dataimport.tasks import check_images, build_iiif_image
-from arkindex.dataimport.models import DataImportMode
 from arkindex.images.models import ImageStatus
 import os.path
@@ -11,34 +11,24 @@ class TestImageTasks(FixtureTestCase):
     @classmethod
     def setUpTestData(cls):
         super().setUpTestData()
-        cls.di = cls.corpus.imports.create(
-            mode=DataImportMode.Images,
-            creator=cls.user,
-            payload={
-                'folder_name': 'abc',
-                'volume_name': 'Something',
-            },
-        )
-        cls.df = cls.di.files.create(
-            corpus=cls.corpus,
+        cls.vol = cls.corpus.elements.create(name='Test volume', type=ElementType.Volume)
+        cls.df = cls.corpus.files.create(
             name='file.jpg',
             size=1234,
             hash='cafe',
             content_type='image/jpeg',
         )
 
-    @patch('arkindex.dataimport.models.DataFile.download')
     @patch('arkindex.dataimport.tasks.image.Image')
-    def test_check_images(self, image_mock, dl_mock):
+    def test_check_images(self, image_mock):
         image_mock.open.return_value.size = (1000, 1000)
-        result = check_images(self.di)
+        result = check_images([(self.df, '/some/path'), ])
-        self.assertEqual(dl_mock.call_count, 1)
         self.assertEqual(image_mock.open.call_count, 1)
-        self.assertEqual(image_mock.open.call_args, call(dl_mock.return_value))
+        self.assertEqual(image_mock.open.call_args, call('/some/path'))
         self.assertListEqual(result, [
-            (self.df, str(self.df.id)),
+            (self.df, '/some/path'),
         ])
 
     @patch('arkindex.dataimport.tasks.image.os')
@@ -50,11 +40,9 @@ class TestImageTasks(FixtureTestCase):
         os_mock.path.dirname = os.path.dirname
         os_mock.path.join = os.path.join
-        vol = self.di.get_volume()
-        with self.settings(LOCAL_IMAGESERVER_ROOT='/iiif'):
+        with self.settings(LOCAL_IMAGESERVER_ROOT='/iiif', LOCAL_IMAGESERVER_ID=self.imgsrv.id):
             img = build_iiif_image(
-                self.imgsrv,
-                vol,
+                self.vol,
                 '/somewhere/Untitled.bmp',
                 self.df,
                 suffix='42',
@@ -64,14 +52,14 @@
         self.assertEqual(image_mock.open.call_args, call('/somewhere/Untitled.bmp'))
         self.assertTrue(os_mock.path.exists.called)
         self.assertEqual(os_mock.makedirs.call_count, 1)
-        self.assertEqual(os_mock.makedirs.call_args, call(os.path.join('/iiif', str(vol.id))))
+        self.assertEqual(os_mock.makedirs.call_args, call(os.path.join('/iiif', str(self.vol.id))))
         self.assertEqual(image_mock.open().save.call_count, 1)
         self.assertEqual(
             image_mock.open().save.call_args,
-            call(os.path.join('/iiif', str(vol.id), '{}-42.jpg'.format(str(self.df.id)))),
+            call(os.path.join('/iiif', str(self.vol.id), '{}-42.jpg'.format(str(self.df.id)))),
         )
         self.assertEqual(img.server, self.imgsrv)
-        self.assertEqual(img.path, '{}/{}-42.jpg'.format(str(vol.id), str(self.df.id)))
+        self.assertEqual(img.path, '{}/{}-42.jpg'.format(str(self.vol.id), str(self.df.id)))
         self.assertEqual(img.width, 400)
         self.assertEqual(img.height, 900)
         self.assertEqual(img.status, ImageStatus.Checked)
@@ -86,19 +74,17 @@
         image_mock.open.return_value.format = 'JPEG2000'
         image_mock.open.return_value.size = (400, 900)
         os_mock.path.exists.return_value = True
-        vol = self.di.get_volume()
         original_img = self.imgsrv.images.create(
-            path='{}/{}.jp2'.format(str(vol.id), str(self.df.id)),
+            path='{}/{}.jp2'.format(str(self.vol.id), str(self.df.id)),
            datafile=self.df,
            width=900,
            height=400,
            status=ImageStatus.Checked,
        )
 
-        with self.settings(LOCAL_IMAGESERVER_ROOT='/iiif'):
+        with self.settings(LOCAL_IMAGESERVER_ROOT='/iiif', LOCAL_IMAGESERVER_ID=self.imgsrv.id):
             new_img = build_iiif_image(
-                self.imgsrv,
-                vol,
+                self.vol,
                 '/somewhere/Untitled.bmp',
                 self.df,
             )
+from arkindex.project.tests import FixtureTestCase
+from arkindex.dataimport.models import DataImportMode
+from arkindex.dataimport.tasks import download_files
+from unittest.mock import patch, call
+
+
+class TestTasks(FixtureTestCase):
+
+    @classmethod
+    def setUpTestData(cls):
+        super().setUpTestData()
+        cls.df1 = cls.corpus.files.create(
+            name='file1.jpg',
+            size=1234,
+            hash='cafe',
+            content_type='image/jpeg',
+        )
+        cls.df2 = cls.corpus.files.create(
+            name='file2.png',
+            size=5678,
+            hash='beef',
+            content_type='image/png',
+        )
+        cls.di = cls.corpus.imports.create(
+            mode=DataImportMode.Images,
+            creator=cls.user,
+        )
+
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        cls.s3obj_patch = patch('arkindex.dataimport.models.s3.Object')
+        cls.s3obj_mock = cls.s3obj_patch.start()
+        cls.access_patch = patch('arkindex.dataimport.tasks.base.os.access')
+        cls.access_mock = cls.access_patch.start()
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.s3obj_patch.stop()
+        cls.access_patch.stop()
+        super().tearDownClass()
+
+    def setUp(self):
+        super().setUp()
+        self.s3obj_mock.reset_mock()
+        self.access_mock.reset_mock()
+        self.access_mock.return_value = True
+        self.s3obj_mock.return_value.download_file.side_effect = None
+
+    def test_download_files(self):
+        self.di.files.set([self.df1])
+        expected_path = '/somewhere/{}'.format(str(self.df1.id))
+        self.assertListEqual(
+            download_files(self.di, '/somewhere'),
+            [(self.df1, expected_path)],
+        )
+        self.assertEqual(self.s3obj_mock().download_file.call_count, 1)
+        self.assertEqual(self.s3obj_mock().download_file.call_args, call(expected_path))
+
+    def test_download_files_fail(self):
+        self.di.files.set([self.df1, self.df2])
+        # Fail only once
+        self.s3obj_mock().download_file.side_effect = [IOError, None]
+        expected_path1 = '/somewhere/{}'.format(str(self.df1.id))
+        expected_path2 = '/somewhere/{}'.format(str(self.df2.id))
+        self.assertListEqual(
+            download_files(self.di, '/somewhere'),
+            [(self.df2, expected_path2)],
+        )
+        self.assertEqual(self.s3obj_mock().download_file.call_count, 2)
+        self.assertEqual(self.s3obj_mock().download_file.call_args_list, [
+            call(expected_path1),
+            call(expected_path2),
+        ])
+
+    def test_download_files_epic_fail(self):
+        self.di.files.set([self.df1, self.df2])
+        # Fail all the time
+        self.s3obj_mock().download_file.side_effect = IOError
+        expected_path1 = '/somewhere/{}'.format(str(self.df1.id))
+        expected_path2 = '/somewhere/{}'.format(str(self.df2.id))
+        with self.assertRaisesRegex(AssertionError, 'No files'):
+            download_files(self.di, '/somewhere')
+        self.assertEqual(self.s3obj_mock().download_file.call_count, 2)
+        self.assertEqual(self.s3obj_mock().download_file.call_args_list, [
+            call(expected_path1),
+            call(expected_path2),
+        ])
+
+    def test_download_files_empty(self):
+        self.di.files.set([])
+        self.assertListEqual(download_files(self.di, '/somewhere'), [])
+        self.assertFalse(self.s3obj_mock.called)
+
+    def test_download_files_read_only(self):
+        self.access_mock.return_value = False
+        self.di.files.set([self.df1])
+        with self.assertRaisesRegex(AssertionError, 'read-only'):
+            download_files(self.di, '/somewhere')