diff --git a/arkindex/dataimport/tasks/image.py b/arkindex/dataimport/tasks/image.py index fced7edb2f5f98884fedd62c29cfde3adcc4e8b3..84ca73831f4f01b4e69b0f32baed2c51a0d0d8e6 100644 --- a/arkindex/dataimport/tasks/image.py +++ b/arkindex/dataimport/tasks/image.py @@ -1,10 +1,9 @@ -from PIL import Image +from PIL import Image as PillowImage from arkindex.dataimport.models import DataFile -from arkindex.images.models import ImageServer, ImageStatus +from arkindex.images.models import ImageServer, Image, ImageStatus from arkindex.documents.models import Element, ElementType -from django.conf import settings +from urllib.parse import quote import logging -import os logger = logging.getLogger(__name__) @@ -19,7 +18,7 @@ def check_images(files): logger.info("Checking image {} of {}".format(i + 1, filecount)) try: - img = Image.open(path) + img = PillowImage.open(path) assert max(img.size) >= 500, "Image {} is too small".format(datafile.name) except IOError: logger.warn("File {} is not a valid image".format(datafile.name)) @@ -43,7 +42,8 @@ def build_iiif_image(volume, path, data_file, suffix=None): assert isinstance(path, str) assert isinstance(data_file, DataFile) - pillow_img = Image.open(path) + pillow_img = PillowImage.open(path) + width, height = pillow_img.size # Non-JPEG image formats that should not be converted # Will default to .jpg if the image format is not in there @@ -54,37 +54,36 @@ def build_iiif_image(volume, path, data_file, suffix=None): "PNG": ".png", "TIFF": ".tif", } - ext = known_exts.get(pillow_img.format, ".jpg") + img_format = pillow_img.format + if img_format not in known_exts: + img_format = 'JPEG' + ext = known_exts.get(img_format, '.jpg') - # Use data file id + # Build image path filename = str(data_file.id) if suffix is not None: filename += '-{}'.format(suffix) filename += ext - - # Build paths - iiif_path = '{}/{}'.format(volume.id, filename) - iiif_full_path = os.path.join(settings.LOCAL_IMAGESERVER_ROOT, iiif_path) - if os.path.exists(iiif_full_path): + iiif_path = quote('{}/{}'.format(volume.id, filename), safe='') + + # Get Image instance + try: + img = ImageServer.objects.local.images.get(path=iiif_path) + except Image.DoesNotExist: + img = Image( + server=ImageServer.objects.local, + path=iiif_path, + width=width, + height=height, + datafile=data_file, + ) + + if img.exists(): logger.warning('Image already exists on the IIIF server') else: - iiif_directory = os.path.dirname(iiif_full_path) - if not os.path.exists(iiif_directory): - os.makedirs(iiif_directory) - - # Save using optional image type conversion - pillow_img.save(iiif_full_path) - - # Build image - width, height = pillow_img.size - img, _ = ImageServer.objects.local.images.get_or_create( - path=iiif_path, - defaults={ - 'width': width, - 'height': height, - 'status': ImageStatus.Checked, - 'datafile': data_file, - } - ) + # Save to S3 using optional image type conversion + img.pillow_save(pillow_img, format=img_format) + img.status = ImageStatus.Checked + img.save() return img diff --git a/arkindex/dataimport/tests/test_image.py b/arkindex/dataimport/tests/test_image.py index 5b06bb5f9a0f5334f432b78d223e7ff140b2b380..941a66ba186b9b93b6e3df89d0dfc850d678fb3d 100644 --- a/arkindex/dataimport/tests/test_image.py +++ b/arkindex/dataimport/tests/test_image.py @@ -3,7 +3,7 @@ from arkindex.project.tests import FixtureTestCase from arkindex.documents.models import ElementType from arkindex.dataimport.tasks import check_images, build_iiif_image from arkindex.images.models import ImageStatus -import os.path +from botocore.exceptions import ClientError class TestImageTasks(FixtureTestCase): @@ -19,7 +19,7 @@ class TestImageTasks(FixtureTestCase): content_type='image/jpeg', ) - @patch('arkindex.dataimport.tasks.image.Image') + @patch('arkindex.dataimport.tasks.image.PillowImage') def test_check_images(self, image_mock): image_mock.open.return_value.size = (1000, 1000) @@ -31,16 +31,14 @@ class TestImageTasks(FixtureTestCase): (self.df, '/some/path'), ]) - @patch('arkindex.dataimport.tasks.image.os') - @patch('arkindex.dataimport.tasks.image.Image') - def test_build_iiif_image(self, image_mock, os_mock): + @patch('arkindex.images.models.s3.Object') + @patch('arkindex.dataimport.tasks.image.PillowImage') + def test_build_iiif_image(self, image_mock, s3obj_mock): image_mock.open.return_value.format = 'BMP' image_mock.open.return_value.size = (400, 900) - os_mock.path.exists.return_value = False - os_mock.path.dirname = os.path.dirname - os_mock.path.join = os.path.join + s3obj_mock.return_value.load.side_effect = ClientError({'Error': {'Code': '404'}}, 'head_object') - with self.settings(LOCAL_IMAGESERVER_ROOT='/iiif', LOCAL_IMAGESERVER_ID=self.imgsrv.id): + with self.settings(LOCAL_IMAGESERVER_ID=self.imgsrv.id, AWS_IIIF_BUCKET='iiif'): img = build_iiif_image( self.vol, '/somewhere/Untitled.bmp', @@ -48,41 +46,38 @@ class TestImageTasks(FixtureTestCase): suffix='42', ) + expected_path = '{}/{}-42.jpg'.format(str(self.vol.id), str(self.df.id)) self.assertEqual(image_mock.open.call_count, 1) self.assertEqual(image_mock.open.call_args, call('/somewhere/Untitled.bmp')) - self.assertTrue(os_mock.path.exists.called) - self.assertEqual(os_mock.makedirs.call_count, 1) - self.assertEqual(os_mock.makedirs.call_args, call(os.path.join('/iiif', str(self.vol.id)))) + self.assertEqual(s3obj_mock.call_count, 1) + self.assertEqual(s3obj_mock.call_args, call('iiif', expected_path)) + self.assertEqual(s3obj_mock().load.call_count, 1) self.assertEqual(image_mock.open().save.call_count, 1) - self.assertEqual( - image_mock.open().save.call_args, - call(os.path.join('/iiif', str(self.vol.id), '{}-42.jpg'.format(str(self.df.id)))), - ) + self.assertEqual(s3obj_mock().upload_fileobj.call_count, 1) self.assertEqual(img.server, self.imgsrv) - self.assertEqual(img.path, '{}/{}-42.jpg'.format(str(self.vol.id), str(self.df.id))) + self.assertEqual(img.path, expected_path.replace('/', '%2F')) self.assertEqual(img.width, 400) self.assertEqual(img.height, 900) self.assertEqual(img.status, ImageStatus.Checked) self.assertEqual(img.datafile, self.df) - @patch('arkindex.dataimport.tasks.image.os') - @patch('arkindex.dataimport.tasks.image.Image') - def test_build_iiif_image_retry(self, image_mock, os_mock): + @patch('arkindex.images.models.s3.Object') + @patch('arkindex.dataimport.tasks.image.PillowImage') + def test_build_iiif_image_retry(self, image_mock, s3obj_mock): """ Test build_iiif_image just returns existing images if they already exist """ image_mock.open.return_value.format = 'JPEG2000' image_mock.open.return_value.size = (400, 900) - os_mock.path.exists.return_value = True original_img = self.imgsrv.images.create( - path='{}/{}.jp2'.format(str(self.vol.id), str(self.df.id)), + path='{}%2F{}.jp2'.format(str(self.vol.id), str(self.df.id)), datafile=self.df, width=900, height=400, status=ImageStatus.Checked, ) - with self.settings(LOCAL_IMAGESERVER_ROOT='/iiif', LOCAL_IMAGESERVER_ID=self.imgsrv.id): + with self.settings(LOCAL_IMAGESERVER_ID=self.imgsrv.id, AWS_IIIF_BUCKET='iiif'): new_img = build_iiif_image( self.vol, '/somewhere/Untitled.bmp', @@ -92,5 +87,5 @@ class TestImageTasks(FixtureTestCase): self.assertEqual(original_img.id, new_img.id) self.assertEqual(image_mock.open.call_count, 1) self.assertEqual(image_mock.open.call_args, call('/somewhere/Untitled.bmp')) - self.assertTrue(os_mock.path.exists.called) + self.assertEqual(s3obj_mock().load.call_count, 1) self.assertFalse(image_mock.open().save.called) diff --git a/arkindex/images/models.py b/arkindex/images/models.py index 096d2b04b061b1ae21eda2d59db285399225c9cb..03f56af67b82dc7b4d06ddca4db019da59204bca 100644 --- a/arkindex/images/models.py +++ b/arkindex/images/models.py @@ -206,6 +206,24 @@ class Image(IndexableModel): def url(self): return self.server.build_url(self.path) + @cached_property + def s3_object(self): + assert self.server.is_local, 'Cannot load images on remote image servers via S3' + return s3.Object(settings.AWS_IIIF_BUCKET, urllib.parse.unquote(self.path)) + + def exists(self): + """ + Returns whether the Image exists on the IIIF S3 bucket by performing a HEAD request to S3. + See https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Object.load + """ + try: + self.s3_object.load() + return True + except ClientError as e: + if e.response['Error']['Code'] != '404': + raise + return False + def get_thumbnail_url(self, max_width=200, max_height=None): if max_width is None and max_height is None: resize = "full" @@ -215,7 +233,9 @@ class Image(IndexableModel): def pillow_open(self, max_width=500): if self.server.is_local: - return PillowImage.open(os.path.join(settings.LOCAL_IMAGESERVER_ROOT, self.path)) + b = BytesIO() + self.s3_object.download_fileobj(b) + return PillowImage.open(b) # PIL.Image.open explicitly requires seek(int) method, that the urllib responses do not provide. # We therefore have to get the whole content and put it back in a file-like object @@ -263,6 +283,17 @@ class Image(IndexableModel): if save: self.save() + def pillow_save(self, pillow_img, format=None): + """ + Save a PIL.Image instance to S3 + """ + assert self.server.is_local, 'Only images on the local server can be saved' + logger.debug('Saving {} to S3'.format(self.id)) + b = BytesIO() + pillow_img.save(b, format=format) + b.seek(0) + self.s3_object.upload_fileobj(b) + def __str__(self): return '{} - {}'.format(self.id, self.url) diff --git a/arkindex/project/checks.py b/arkindex/project/checks.py index cacea08ccc16941dcde0b0624362a2873471a044..996db3feb8aed6eab06be18cf263fd79b125c4da 100644 --- a/arkindex/project/checks.py +++ b/arkindex/project/checks.py @@ -291,6 +291,7 @@ def s3_check(*args, **kwargs): 'AWS_ENDPOINT': 'AWS endpoint', 'AWS_THUMBNAIL_BUCKET': 'S3 thumbnails bucket name', 'AWS_STAGING_BUCKET': 'S3 staging bucket name', + 'AWS_IIIF_BUCKET': 'S3 IIIF bucket name', } errors = [] for name, display_name in aws_settings.items(): diff --git a/arkindex/project/settings.py b/arkindex/project/settings.py index dd4fb56decfee39c31ef9a1c8c9c1acc0fd818c9..be3594a23bf52a15da205020dece2ec4638afb85 100644 --- a/arkindex/project/settings.py +++ b/arkindex/project/settings.py @@ -73,6 +73,7 @@ AWS_ENDPOINT = os.environ.get('S3SOURCE_ENDPOINT') AWS_REGION = os.environ.get('AWS_REGION') AWS_THUMBNAIL_BUCKET = os.environ.get('AWS_THUMBNAIL_BUCKET', 'thumbnails') AWS_STAGING_BUCKET = os.environ.get('AWS_STAGING_BUCKET', 'staging') +AWS_IIIF_BUCKET = os.environ.get('S3SOURCE_BASICLOOKUPSTRATEGY_BUCKET_NAME', 'iiif') if 'test' in sys.argv: # Overrides for unit tests diff --git a/arkindex/project/tests/test_checks.py b/arkindex/project/tests/test_checks.py index 7d99f34b71b8ad6e825cb2da4a12fcf87cc70651..8bb63df7692618752e39a0e89f55f193dbc26db2 100644 --- a/arkindex/project/tests/test_checks.py +++ b/arkindex/project/tests/test_checks.py @@ -335,6 +335,7 @@ class ChecksTestCase(TestCase): del settings.AWS_ENDPOINT del settings.AWS_THUMBNAIL_BUCKET del settings.AWS_STAGING_BUCKET + del settings.AWS_IIIF_BUCKET self.maxDiff = None self.assertCountEqual(s3_check(), [ Error( @@ -362,6 +363,11 @@ class ChecksTestCase(TestCase): hint='settings.AWS_STAGING_BUCKET = None', id='arkindex.E011', ), + Error( + 'S3 IIIF bucket name is missing; all S3-related features will fail.', + hint='settings.AWS_IIIF_BUCKET = None', + id='arkindex.E011', + ), ]) settings.AWS_ACCESS_KEY = 'key' @@ -369,4 +375,5 @@ class ChecksTestCase(TestCase): settings.AWS_ENDPOINT = 'http://somewhere' settings.AWS_THUMBNAIL_BUCKET = 'Thumbs.db' settings.AWS_STAGING_BUCKET = 'buckette' + settings.AWS_IIIF_BUCKET = 'bucks' self.assertListEqual(s3_check(), [])