diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 19c761121b51637c5503ce01b8df1ac4199a578a..304a500602a67245c80c4be6ae58359eb9eaedf7 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,4 +1,4 @@ -image: registry.gitlab.com/arkindex/backend:base-0.8.7 +image: registry.gitlab.com/arkindex/backend:base-0.8.8 stages: - test diff --git a/VERSION b/VERSION index 2003b639c40025a4216b7b765e800b872a9052cd..e72b7b4721fc34ed6aebacef4250266dbbe54210 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.9.2 +0.9.3-dev diff --git a/arkindex/dataimport/tests/test_iiif.py b/arkindex/dataimport/tests/test_iiif.py index 7d718b38d96ccb63bd180cff89efff963542ef9f..6138aff86bb4af20aa5207917cbd34dbd39d3203 100644 --- a/arkindex/dataimport/tests/test_iiif.py +++ b/arkindex/dataimport/tests/test_iiif.py @@ -218,9 +218,10 @@ class TestManifestParser(FixtureTestCase): parser.run() self._assert_second_import(self.rev, new_rev) + @patch('arkindex.images.models.Thumbnail') @patch('arkindex.dataimport.providers.Gitlab') @patch('arkindex.dataimport.providers.git.Repo.clone_from') - def test_git_import(self, clone_mock, gl_mock): + def test_git_import(self, clone_mock, gl_mock, thumb_mock): """ Import manifest files from a Git repo """ @@ -284,10 +285,13 @@ class TestManifestParser(FixtureTestCase): # Run first import first_rev = run_import(first_commit) + self.assertEqual(thumb_mock.call_count, 1) self._assert_first_import(first_rev) # Run second import + thumb_mock.reset_mock() second_rev = run_import(second_commit) + self.assertEqual(thumb_mock.call_count, 1) self._assert_second_import(first_rev, second_rev) def test_manifest_parser_find_image_server(self): diff --git a/arkindex/documents/models.py b/arkindex/documents/models.py index 156e95bb62611885cf8883bc8e5cf831bc82fbfa..1b3e3327a7b389145200b3e48d589b1d7b327af6 100644 --- a/arkindex/documents/models.py +++ b/arkindex/documents/models.py @@ -10,7 +10,6 @@ import uuid import enum import logging import itertools -import os logger = logging.getLogger(__name__) @@ -247,19 +246,12 @@ class Element(IndexableModel): from arkindex.images.models import Thumbnail # Prevent circular imports return Thumbnail(self) - def generate_thumbnail(self, force=False): + def generate_thumbnail(self, **kwargs): ''' Build a thumbnail for this element ''' - from arkindex.images.models import Thumbnail # Prevent circular imports - if os.path.exists(self.thumbnail.path): - # Thumbnail already exists - if not force: - return - os.remove(self.thumbnail.path) - logger.info("Creating thumbnail for element {}".format(self)) - Thumbnail.create(self) + self.thumbnail.create(**kwargs) def __str__(self): return '{} : {}'.format(self.type, self.name) diff --git a/arkindex/documents/tests/commands/test_generate_thumbnails.py b/arkindex/documents/tests/commands/test_generate_thumbnails.py index a9d2f754a9ad24d16cc8f6c1328429b256cd5c27..0bc1f0e848ef8786e0b21aceadd2981ee242336f 100644 --- a/arkindex/documents/tests/commands/test_generate_thumbnails.py +++ b/arkindex/documents/tests/commands/test_generate_thumbnails.py @@ -18,7 +18,7 @@ class TestGenerateThumbnailsCommand(FixtureTestCase): cls.vol2 = cls.corpus.elements.get(type=ElementType.Volume, name="Volume 2") corpus2 = Corpus.objects.create(name='Other corpus') cls.vol3 = corpus2.elements.create(type=ElementType.Volume, name='Volume 3') - cls.thumb_patch = patch('arkindex.images.models.Thumbnail.create') + cls.thumb_patch = patch('arkindex.images.models.Thumbnail') def setUp(self): super().setUp() @@ -40,6 +40,7 @@ class TestGenerateThumbnailsCommand(FixtureTestCase): call(self.vol1), call(self.vol2), ]) + self.assertEqual(self.thumb_mock().create.call_count, 2) def test_start_element(self): """ @@ -53,6 +54,7 @@ class TestGenerateThumbnailsCommand(FixtureTestCase): self.assertCountEqual(self.thumb_mock.call_args_list, [ call(self.reg), ]) + self.assertEqual(self.thumb_mock().create.call_count, 1) @patch('arkindex.documents.models.Element.generate_thumbnail') def test_force(self, gen_mock): @@ -82,6 +84,7 @@ class TestGenerateThumbnailsCommand(FixtureTestCase): call(self.vol2), call(self.vol3), ]) + self.assertEqual(self.thumb_mock().create.call_count, 3) def test_all_xor_corpus(self): """ diff --git a/arkindex/images/models.py b/arkindex/images/models.py index 650f9e592050be1563fd387ca3e8dacd3edc96e6..096d2b04b061b1ae21eda2d59db285399225c9cb 100644 --- a/arkindex/images/models.py +++ b/arkindex/images/models.py @@ -1,12 +1,15 @@ from django.db import models from django.db.models.functions import Concat, Substr from django.conf import settings +from django.utils.functional import cached_property from django.utils.text import slugify from arkindex.documents.models import Page from arkindex.images.managers import ImageServerManager from arkindex.project.models import IndexableModel from arkindex.project.fields import StripSlashURLField, LStripTextField from arkindex.project.polygon import PolygonField +from arkindex.project.aws import s3 +from botocore.client import ClientError from enumfields import EnumField, Enum from io import BytesIO from PIL import Image as PillowImage @@ -216,7 +219,10 @@ class Image(IndexableModel): # PIL.Image.open explicitly requires seek(int) method, that the urllib responses do not provide. # We therefore have to get the whole content and put it back in a file-like object - resp = requests.get(self.get_thumbnail_url(max_width=max_width)) + resp = requests.get( + self.get_thumbnail_url(max_width=max_width), + timeout=settings.IIIF_DOWNLOAD_TIMEOUT, + ) resp.raise_for_status() return PillowImage.open(BytesIO(resp.content)) @@ -271,24 +277,37 @@ class Thumbnail(object): @property def name(self): - return 'thumbnail_{}.jpg'.format(str(self.element.id.hex)) + return '{}.jpg'.format(str(self.element.id.hex)) - @property - def url(self): - return ImageServer.objects.local.build_url(self.name) + @cached_property + def s3_object(self): + return s3.Object(settings.AWS_THUMBNAIL_BUCKET, self.name) @property - def path(self): - return os.path.join(settings.LOCAL_IMAGESERVER_ROOT, self.name) + def url(self): + return s3.meta.client.generate_presigned_url('get_object', Params={ + 'Bucket': self.s3_object.bucket_name, + 'Key': self.s3_object.key, + }) - @staticmethod - def create(elt, width=900, height=400, max_element_count=3): + def create(self, width=900, height=400, max_element_count=3, force=False): """ Generate a thumbnail for an Element and store it in the IIIF server """ + if not force: + try: + self.s3_object.load() + # This did not raise anything so the thumbnail exists + return + except ClientError as e: + if e.response['Error']['Code'] != '404': + raise + # Get at most 'max_element_count' first pages - pages = Page.objects.get_descending(elt.id).prefetch_related('zone__image__server')[:max_element_count] + pages = Page.objects \ + .get_descending(self.element.id) \ + .prefetch_related('zone__image__server')[:max_element_count] if not pages: raise Page.DoesNotExist("No pages found for thumbnail generation") @@ -320,7 +339,10 @@ class Thumbnail(object): thumbnail.paste(img, (offset, 0)) offset += single_width - thumbnail.save(elt.thumbnail.path) + b = BytesIO() + thumbnail.save(b, format='jpeg') + b.seek(0) + self.s3_object.upload_fileobj(b) class Zone(IndexableModel): diff --git a/arkindex/images/tests/test_image.py b/arkindex/images/tests/test_image.py index 77e064c3a42a62c1e2c04756cf518c7f49123bf9..9d3f318a63689fc4c82b0455c8804347235d925c 100644 --- a/arkindex/images/tests/test_image.py +++ b/arkindex/images/tests/test_image.py @@ -16,7 +16,7 @@ class TestImage(FixtureTestCase): self.assertEqual(img.path, '!#%:+/*') self.assertEqual(img.url, 'http://server/!#%:+/*') - @override_settings(LOCAL_IMAGESERVER_ID=-1) + @override_settings(LOCAL_IMAGESERVER_ID=-1, IIIF_DOWNLOAD_TIMEOUT=(13, 37)) @patch('arkindex.images.models.PillowImage.open') @patch('arkindex.images.models.requests.get') def test_pillow_open_url(self, get_mock, pillow_open_mock): @@ -25,7 +25,7 @@ class TestImage(FixtureTestCase): get_mock.return_value.content = b'imageblob' self.assertEqual(img.pillow_open(), pillow_open_mock.return_value) self.assertEqual(get_mock.call_count, 1) - self.assertEqual(get_mock.call_args, call('http://server/image/full/500,/0/default.jpg')) + self.assertEqual(get_mock.call_args, call('http://server/image/full/500,/0/default.jpg', timeout=(13, 37))) self.assertEqual(get_mock().raise_for_status.call_count, 1) self.assertEqual(pillow_open_mock.call_count, 1) args, kwargs = pillow_open_mock.call_args diff --git a/arkindex/images/tests/test_thumbnail.py b/arkindex/images/tests/test_thumbnail.py new file mode 100644 index 0000000000000000000000000000000000000000..cbb4389db0dbe711c6323b2b1ecdd58386561cda --- /dev/null +++ b/arkindex/images/tests/test_thumbnail.py @@ -0,0 +1,156 @@ +from arkindex.project.tests import FixtureTestCase +from arkindex.documents.models import ElementType, Page +from unittest.mock import patch, call +from django.test import override_settings +from botocore.exceptions import ClientError + + +@override_settings(LOCAL_IMAGESERVER_ID=999, IIIF_DOWNLOAD_TIMEOUT=(13, 37)) +class TestThumbnail(FixtureTestCase): + + @classmethod + def setUpClass(cls): + super().setUpClass() + cls.s3_mock = patch('arkindex.images.models.s3').start() + + @classmethod + def setUpTestData(cls): + super().setUpTestData() + cls.vol1 = cls.corpus.elements.get(name='Volume 1', type=ElementType.Volume) + + def tearDown(self): + super().tearDown() + self.s3_mock.reset_mock() + try: + # Remove the cached S3 object to force a reinstanciation + del self.vol1.thumbnail.s3_object + except AttributeError: # Sometimes, it isn't already cached + pass + + def test_name(self): + self.assertEqual(self.vol1.thumbnail.name, str(self.vol1.id.hex) + '.jpg') + + def test_s3_object(self): + with self.settings(AWS_THUMBNAIL_BUCKET="derp"): + # Test twice to ensure s3_object is only created once + self.assertEqual(self.vol1.thumbnail.s3_object, self.s3_mock.Object.return_value) + self.assertEqual(self.vol1.thumbnail.s3_object, self.s3_mock.Object.return_value) + self.assertEqual(self.s3_mock.Object.call_count, 1) + self.assertEqual(self.s3_mock.Object.call_args, call('derp', self.vol1.thumbnail.name)) + + def test_url(self): + self.s3_mock.meta.client.generate_presigned_url.return_value = 'http://nowhere' + self.s3_mock.Object.return_value.bucket_name = 'derp' + self.s3_mock.Object.return_value.key = 'meme.jpg' + with self.settings(AWS_THUMBNAIL_BUCKET="derp"): + self.assertEqual(self.vol1.thumbnail.url, 'http://nowhere') + self.assertEqual(self.s3_mock.Object.call_count, 1) + self.assertEqual(self.s3_mock.meta.client.generate_presigned_url.call_count, 1) + self.assertEqual( + self.s3_mock.meta.client.generate_presigned_url.call_args, + call('get_object', Params={'Bucket': 'derp', 'Key': 'meme.jpg'}), + ) + + @patch('arkindex.images.models.BytesIO') + @patch('arkindex.images.models.PillowImage') + @patch('arkindex.images.models.requests') + def test_create(self, requests_mock, pil_mock, bytes_mock): + """ + Test Thumbnail.create creates a thumbnail if it does not exist + """ + self.s3_mock.Object.return_value.load.side_effect = ClientError({'Error': {'Code': '404'}}, 'get_object') + requests_mock.get.return_value.content = b'something' + pil_mock.open.return_value.size = (1000, 1000) + pil_mock.open.return_value.resize.return_value.size = (400, 400) + + self.vol1.thumbnail.create() + + self.assertEqual(pil_mock.new.call_count, 1) + self.assertEqual(pil_mock.new.call_args, call('RGB', (900, 400))) + self.assertEqual(pil_mock.open.call_count, 3) + self.assertEqual(requests_mock.get.call_count, 3) + self.assertEqual(requests_mock.get.call_args_list, [ + call('http://server/img1/full/300,/0/default.jpg', timeout=(13, 37)), + call('http://server/img2/full/300,/0/default.jpg', timeout=(13, 37)), + call('http://server/img3/full/300,/0/default.jpg', timeout=(13, 37)), + ]) + self.assertEqual(pil_mock.open().resize.call_count, 3) + self.assertListEqual( + pil_mock.open().resize.call_args_list, + [call((400, 400), pil_mock.BICUBIC)] * 3, + ) + self.assertEqual(pil_mock.open().resize().crop.call_count, 3) + self.assertEqual( + pil_mock.open().resize().crop.call_args_list, + [call((50, 0, 350, 400))] * 3, + ) + self.assertEqual(pil_mock.new().paste.call_count, 3) + self.assertEqual(pil_mock.new().paste.call_args_list, [ + call(pil_mock.open().resize().crop(), (0, 0)), + call(pil_mock.open().resize().crop(), (300, 0)), + call(pil_mock.open().resize().crop(), (600, 0)), + ]) + + # BytesIO is used three times for opening images, then once for the thumbnail save + self.assertEqual(bytes_mock.call_count, 4) + self.assertEqual(bytes_mock.call_args_list, [call(b'something')] * 3 + [call()]) + self.assertEqual(pil_mock.new().save.call_count, 1) + self.assertEqual( + pil_mock.new().save.call_args, + call(bytes_mock(), format='jpeg'), + ) + self.assertEqual(bytes_mock().seek.call_count, 1) + self.assertEqual(bytes_mock().seek.call_args, call(0)) + self.assertEqual(self.s3_mock.Object.call_count, 1) + self.assertEqual(self.s3_mock.Object().upload_fileobj.call_count, 1) + self.assertEqual(self.s3_mock.Object().upload_fileobj.call_args, call(bytes_mock())) + + @patch('arkindex.images.models.PillowImage') + def test_create_exists(self, pil_mock): + """ + Test Thumbnail.create ignores creation if the thumbnail exists (if S3 does not raise 404) + """ + self.s3_mock.Object.return_value.load.side_effect = None + self.vol1.thumbnail.create() + self.assertEqual(self.s3_mock.Object.call_count, 1) + self.assertEqual(self.s3_mock.Object().load.call_count, 1) + self.assertEqual(self.s3_mock.Object().upload_fileobj.call_count, 0) + self.assertEqual(pil_mock.new.call_count, 0) + self.assertEqual(pil_mock.open.call_count, 0) + + def test_create_exception(self): + """ + Test Thumbnail.create raises any error that is not 404 + """ + self.s3_mock.Object.return_value.load.side_effect = ClientError({'Error': {'Code': '999'}}, 'get_object') + with self.assertRaises(ClientError): + self.vol1.thumbnail.create() + + @patch('arkindex.images.models.BytesIO') + @patch('arkindex.images.models.PillowImage') + @patch('arkindex.images.models.requests') + def test_create_force(self, requests_mock, pil_mock, bytes_mock): + """ + Test Thumbnail.create ignores existing thumbnails with force=True + """ + requests_mock.get.return_value.content = b'something' + pil_mock.open.return_value.size = (1000, 1000) + pil_mock.open.return_value.resize.return_value.size = (400, 400) + + self.vol1.thumbnail.create(force=True) + + self.assertEqual(self.s3_mock.Object.call_count, 1) + self.assertEqual(self.s3_mock.Object().load.call_count, 0) + self.assertEqual(pil_mock.new.call_count, 1) + self.assertEqual(pil_mock.open.call_count, 3) + self.assertEqual(self.s3_mock.Object().upload_fileobj.call_count, 1) + self.assertEqual(self.s3_mock.Object().upload_fileobj.call_args, call(bytes_mock())) + + def test_create_empty(self): + """ + Test Thumbnail.create fails if there are no pages + """ + self.s3_mock.Object.return_value.load.side_effect = ClientError({'Error': {'Code': '404'}}, 'get_object') + Page.objects.get_descending(self.vol1.id).delete() + with self.assertRaises(Page.DoesNotExist): + self.vol1.thumbnail.create() diff --git a/arkindex/project/aws.py b/arkindex/project/aws.py new file mode 100644 index 0000000000000000000000000000000000000000..041ed1e2c3682fc460d00244f37ac465228acf4e --- /dev/null +++ b/arkindex/project/aws.py @@ -0,0 +1,19 @@ +from django.conf import settings +from botocore.config import Config +import boto3.session + +session = boto3.session.Session( + aws_access_key_id=settings.AWS_ACCESS_KEY, + aws_secret_access_key=settings.AWS_SECRET_KEY, +) + +config = Config( + region_name=settings.AWS_REGION, + signature_version='s3v4', +) + +s3 = session.resource( + 's3', + endpoint_url=settings.AWS_ENDPOINT, + config=config, +) diff --git a/arkindex/project/checks.py b/arkindex/project/checks.py index 6dc3203d265ee69e681605e1adb54875a98899c1..3debbe64d71ddd4334c80337a58a37b577d38078 100644 --- a/arkindex/project/checks.py +++ b/arkindex/project/checks.py @@ -279,3 +279,25 @@ def gitlab_oauth_check(*args, **kwargs): id='arkindex.W004', )) return warnings + + +@register() +@only_runserver +def s3_check(*args, **kwargs): + from django.conf import settings + aws_settings = { + 'AWS_ACCESS_KEY': 'AWS access key ID', + 'AWS_SECRET_KEY': 'AWS secret key', + 'AWS_ENDPOINT': 'AWS endpoint', + 'AWS_THUMBNAIL_BUCKET': 'S3 thumbnails bucket name', + } + errors = [] + for name, display_name in aws_settings.items(): + value = getattr(settings, name, None) + if not value: + errors.append(Error( + '{} is missing; all S3-related features will fail.'.format(display_name), + hint='settings.{} = {}'.format(name, repr(value)), + id='arkindex.E011', + )) + return errors diff --git a/arkindex/project/settings.py b/arkindex/project/settings.py index 6116c3b07b870cdeed2bb351ef3873ada9bd6582..d9bb6d4b735867ef3a88836b8a74d29002d63c24 100644 --- a/arkindex/project/settings.py +++ b/arkindex/project/settings.py @@ -64,6 +64,20 @@ HOST_LOCAL_IMAGESERVER_ROOT = os.environ.get('HOST_LOCAL_IMAGESERVER_ROOT', LOCA # Extra initial data for scripts HOST_INITIAL_DATA_ROOT = os.environ.get('HOST_INITIAL_DATA', os.path.join(BASE_DIR, '../../data')) +# Amazon S3 +# S3SOURCE_ variables are from Cantaloupe +AWS_ACCESS_KEY = os.environ.get('S3SOURCE_ACCESS_KEY_ID') +AWS_SECRET_KEY = os.environ.get('S3SOURCE_SECRET_KEY') +AWS_ENDPOINT = os.environ.get('S3SOURCE_ENDPOINT') +AWS_REGION = os.environ.get('AWS_REGION') +AWS_THUMBNAIL_BUCKET = os.environ.get('AWS_THUMBNAIL_BUCKET', 'thumbnails') + +if 'test' in sys.argv: + # Overrides for unit tests + AWS_ACCESS_KEY = 'test' + AWS_SECRET_KEY = 'test' + AWS_ENDPOINT = 'http://s3' + # Ponos integration if os.environ.get('PONOS_TASK'): # In a ponos docker task @@ -95,6 +109,10 @@ else: 'ARKINDEX_API_INTERNAL_URL': 'http://ark-backend/api/v1/', 'ARKINDEX_API_TOKEN': os.environ.get('ARKINDEX_API_TOKEN', 'deadbeefTestToken'), 'LOCAL_IMAGESERVER_ID': LOCAL_IMAGESERVER_ID, + 'S3SOURCE_ACCESS_KEY_ID': AWS_ACCESS_KEY, + 'S3SOURCE_SECRET_KEY': AWS_SECRET_KEY, + 'S3SOURCE_ENDPOINT': AWS_ENDPOINT, + 'AWS_THUMBNAIL_BUCKET': AWS_THUMBNAIL_BUCKET, }, 'network': 'arkindex_default', 'links': { diff --git a/arkindex/project/tests/test_checks.py b/arkindex/project/tests/test_checks.py index 236e5e5f6c8c9353949a6afbf275f97952533d50..c878513851d974f29e99a6d8fad21d92cc826d6b 100644 --- a/arkindex/project/tests/test_checks.py +++ b/arkindex/project/tests/test_checks.py @@ -325,3 +325,41 @@ class ChecksTestCase(TestCase): settings.GITLAB_APP_SECRET = 's3kr3t' self.assertListEqual(gitlab_oauth_check(), []) + + @override_settings() + def test_s3_check(self): + from arkindex.project.checks import s3_check + + del settings.AWS_ACCESS_KEY + del settings.AWS_SECRET_KEY + del settings.AWS_ENDPOINT + del settings.AWS_THUMBNAIL_BUCKET + self.maxDiff = None + self.assertCountEqual(s3_check(), [ + Error( + 'AWS access key ID is missing; all S3-related features will fail.', + hint='settings.AWS_ACCESS_KEY = None', + id='arkindex.E011', + ), + Error( + 'AWS secret key is missing; all S3-related features will fail.', + hint='settings.AWS_SECRET_KEY = None', + id='arkindex.E011', + ), + Error( + 'AWS endpoint is missing; all S3-related features will fail.', + hint='settings.AWS_ENDPOINT = None', + id='arkindex.E011', + ), + Error( + 'S3 thumbnails bucket name is missing; all S3-related features will fail.', + hint='settings.AWS_THUMBNAIL_BUCKET = None', + id='arkindex.E011', + ), + ]) + + settings.AWS_ACCESS_KEY = 'key' + settings.AWS_SECRET_KEY = 's3kr3t' + settings.AWS_ENDPOINT = 'http://somewhere' + settings.AWS_THUMBNAIL_BUCKET = 'Thumbs.db' + self.assertListEqual(s3_check(), []) diff --git a/requirements.txt b/requirements.txt index 091b58bcb468a3ae4b3ca6493b4ae4879be83fe6..155f512b7a5e3898261520d54ead85baa6412537 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ # -r ./base/requirements.txt arkindex-common==0.1.0 +boto3==1.9 certifi==2017.7.27.1 chardet==3.0.4 django-cors-headers==2.4.0