From 6f3c6400c470a867b0d83a4aa6f3f805685f5e7e Mon Sep 17 00:00:00 2001
From: Valentin Rigal <rigal@teklia.com>
Date: Thu, 9 May 2019 07:56:46 +0000
Subject: [PATCH] Datafile upload via S3

---
 README.md | 4 +-
 arkindex/dataimport/api.py | 18 ++-
 .../migrations/0005_s3_file_mixin_fields.py | 29 ++
 arkindex/dataimport/models.py | 52 +++---
 arkindex/dataimport/serializers.py | 64 +++++++-
 arkindex/dataimport/tasks/base.py | 2 +-
 arkindex/dataimport/tasks/image.py | 5 +-
 .../dataimport/tests/test_datafile_api.py | 151 ++++++++++++++++++
 arkindex/dataimport/tests/test_files.py | 6 +-
 arkindex/dataimport/tests/test_iiif.py | 7 +-
 arkindex/dataimport/tests/test_image.py | 10 +-
 arkindex/dataimport/tests/test_pdf.py | 6 +-
 arkindex/dataimport/tests/test_tasks.py | 2 +-
 .../documents/management/commands/telegraf.py | 7 +-
 arkindex/documents/serializers/elements.py | 7 +-
 arkindex/documents/tests/test_elements_api.py | 7 +-
 .../management/commands/check_images.py | 11 +-
 arkindex/images/migrations/0001_initial.py | 2 +-
 arkindex/images/models.py | 87 +++-------
 arkindex/images/serializers.py | 13 +-
 arkindex/images/tests/test_check_images.py | 6 +-
 arkindex/images/tests/test_image.py | 41 ++---
 arkindex/images/tests/test_image_api.py | 11 +-
 arkindex/images/tests/test_thumbnail.py | 16 +-
 arkindex/project/api_v1.py | 3 +-
 arkindex/project/aws.py | 86 ++++++++++
 openapi/patch.yml | 62 ++++++-
 27 files changed, 529 insertions(+), 186 deletions(-)
 create mode 100644 arkindex/dataimport/migrations/0005_s3_file_mixin_fields.py
 create mode 100644 arkindex/dataimport/tests/test_datafile_api.py

diff --git a/README.md b/README.md
index fb740b018c..6cd01cf4bb 100644
--- a/README.md
+++ b/README.md
@@ -154,8 +154,8 @@ SHELL_PLUS_POST_IMPORTS = [
         'DataImportMode',
         'EventType',
     )),
-    ('arkindex.images.models', (
-        'ImageStatus',
+    ('arkindex.project.aws', (
+        'S3FileStatus',
     )),
     ('arkindex.users.models', (
         'OAuthStatus',
diff --git a/arkindex/dataimport/api.py b/arkindex/dataimport/api.py
index 6f6c0297bb..7ece805c10 100644
--- a/arkindex/dataimport/api.py
+++ b/arkindex/dataimport/api.py
@@ -15,11 +15,13 @@ from rest_framework.exceptions import ValidationError
 from arkindex.project.mixins import CorpusACLMixin
 from arkindex.project.permissions import IsVerified, IsAuthenticated, IsAdminUser
 from arkindex.documents.models import Corpus, Right, Element, ElementType
-from arkindex.dataimport.models import \
-    DataImport, DataFile, DataImportMode, DataImportFailure, Repository, Event, EventType
+from arkindex.dataimport.models import (
+    DataImport, DataFile, DataImportMode,
+    DataImportFailure, Repository, Event, EventType
+)
 from arkindex.dataimport.serializers import (
     DataImportLightSerializer, DataImportSerializer, DataImportFromFilesSerializer,
-    DataImportFailureSerializer, DataFileSerializer,
+    DataImportFailureSerializer, DataFileSerializer, DataFileCreateSerializer,
     RepositorySerializer, RepositoryStartImportSerializer, ExternalRepositorySerializer,
     EventSerializer, MLToolSerializer,
 )
@@ -326,6 +328,16 @@ class DataFileUpload(CorpusACLMixin, APIView):
     )


+class DataFileCreate(CreateAPIView):
+    """
+    Create a DataFile from a user-provided file hash.
+    An S3 PUT URL is returned to upload the corresponding file.
+    """
+
+    permission_classes = (IsVerified, )
+    serializer_class = DataFileCreateSerializer
+
+
 class GitRepositoryImportHook(APIView):
     """
     Handle Git push events
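
Note: the two views above imply a three-step upload flow for API clients. A minimal sketch (not part of the patch), assuming a local Arkindex instance, an authenticated `requests` session, and a hypothetical corpus UUID:

    import hashlib
    import requests

    API = 'http://localhost:8000/api/v1'  # assumed base URL
    session = requests.Session()  # assumed to carry valid login credentials

    with open('scan.pdf', 'rb') as f:
        content = f.read()

    # 1. Declare the file; the response carries a presigned S3 PUT URL.
    payload = {
        'name': 'scan.pdf',
        'hash': hashlib.md5(content).hexdigest(),
        'size': len(content),
        'corpus': '55cd009d-cd4b-4ec2-a475-b060f98f9138',  # hypothetical corpus UUID
    }
    datafile = session.post(API + '/imports/files/create/', json=payload).json()

    # 2. Upload the bytes straight to S3, without going through the backend.
    requests.put(datafile['s3_put_url'], data=content)

    # 3. Ask the backend to verify the upload; this triggers DataFile.perform_check(),
    #    which compares the S3 ETag against the announced MD5 hash.
    session.put(API + '/imports/file/{}/'.format(datafile['id']),
                json={'status': 'checked'})
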
diff --git a/arkindex/dataimport/migrations/0005_s3_file_mixin_fields.py b/arkindex/dataimport/migrations/0005_s3_file_mixin_fields.py
new file mode 100644
index 0000000000..7ac870539c
--- /dev/null
+++ b/arkindex/dataimport/migrations/0005_s3_file_mixin_fields.py
@@ -0,0 +1,29 @@
+# Generated by Django 2.2 on 2019-05-07 08:17
+
+import arkindex.project.aws
+from django.db import migrations, models
+import enumfields.fields
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('dataimport', '0004_images_foreign_key_to_datafile'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='datafile',
+            name='status',
+            field=enumfields.fields.EnumField(
+                default='unchecked',
+                enum=arkindex.project.aws.S3FileStatus,
+                max_length=50
+            ),
+        ),
+        migrations.AlterField(
+            model_name='datafile',
+            name='size',
+            field=models.PositiveIntegerField(help_text='file size in bytes'),
+        ),
+    ]
diff --git a/arkindex/dataimport/models.py b/arkindex/dataimport/models.py
index b5b4a2dd58..09609b4f2c 100644
--- a/arkindex/dataimport/models.py
+++ b/arkindex/dataimport/models.py
@@ -6,19 +6,14 @@ from django.utils.functional import cached_property
 from rest_framework.exceptions import ValidationError
 from enumfields import EnumField, Enum
 from arkindex_common.ml_tool import MLTool, MLToolType
-from arkindex.project.aws import s3
+from arkindex.project.aws import S3FileModelMixin
 from arkindex.project.models import IndexableModel
 from arkindex.documents.models import Element, ElementType
 from arkindex.dataimport.providers import git_providers, get_provider
 from ponos.models import Workflow, State
-from botocore.exceptions import ClientError
-from io import BytesIO
-import logging
 import yaml
 import uuid

-logger = logging.getLogger(__name__)
-

 class DataImportMode(Enum):
     Images = 'images'
@@ -189,44 +184,33 @@ class DataImportFailure(models.Model):
     )


-class DataFile(models.Model):
+class DataFile(S3FileModelMixin):
     id = models.UUIDField(primary_key=True, default=uuid.uuid4)
     name = models.CharField(max_length=100)
-    size = models.PositiveIntegerField()
-    hash = models.CharField(max_length=32)
+    size = models.PositiveIntegerField(help_text='file size in bytes')
     content_type = models.CharField(max_length=50)
     corpus = models.ForeignKey('documents.Corpus', on_delete=models.CASCADE, related_name='files')

+    s3_bucket = settings.AWS_STAGING_BUCKET
+
+    @cached_property
+    def s3_key(self):
+        return str(self.id)
+
     class Meta:
         unique_together = (('corpus', 'hash'), )
         ordering = ['corpus', 'name']

-    @property
-    def staging_path(self):
-        return str(self.id)
+    def perform_check(self, save=True, raise_exc=False):
+        """
+        Check the DataFile's existence on S3 and update its size and content_type
+        """
+        self.check_hash(save=save, raise_exc=raise_exc)

-    @cached_property
-    def s3_object(self):
-        return s3.Object(settings.AWS_STAGING_BUCKET, self.staging_path)
-
-    def exists(self):
-        try:
-            self.s3_object.load()
-            return True
-        except ClientError as e:
-            if e.response['Error']['Code'] != '404':
-                raise
-            return False
-
-    def download(self):
-        b = BytesIO()
-        logger.debug('Downloading file {} from S3'.format(self.staging_path))
-        self.s3_object.download_fileobj(b)
-        return b
-
-    def download_to(self, path):
-        logger.debug('Downloading file {} from S3'.format(self.staging_path))
-        self.s3_object.download_file(path)
+        self.size = self.s3_object.content_length
+        self.content_type = self.s3_object.content_type
+        if save:
+            self.save()


 class Repository(models.Model):
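
Note: a minimal sketch (not part of the patch) of what the new perform_check() amounts to, from a Django shell, assuming the file bytes have already been PUT to the staging bucket under the key str(datafile.id):

    from arkindex.dataimport.models import DataFile
    from arkindex.project.aws import S3FileStatus

    datafile = DataFile.objects.get(hash='552e21cd4cd9918678e3c1a0df491bc3')
    # HEADs the object, compares the S3 ETag to the MD5 hash, then copies
    # size and content_type from S3 onto the model.
    datafile.perform_check(raise_exc=True)
    # The Checked status itself is set by DataFileSerializer, not by perform_check().
    datafile.status = S3FileStatus.Checked
    datafile.save()
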
diff --git a/arkindex/dataimport/serializers.py b/arkindex/dataimport/serializers.py
index 89f5acd3f5..add5df4a03 100644
--- a/arkindex/dataimport/serializers.py
+++ b/arkindex/dataimport/serializers.py
@@ -1,6 +1,7 @@
 from rest_framework import serializers
 from rest_framework.utils import model_meta
 from arkindex.project.serializer_fields import EnumField, MLToolField
+from arkindex.project.aws import S3FileStatus
 from arkindex.dataimport.models import (
     DataImport, DataImportMode, DataImportFailure, DataFile, Repository,
     Revision, Event, EventType, DataImportPDFEngine
@@ -11,6 +12,7 @@ from arkindex.images.serializers import ImageSerializer
 from arkindex_common.ml_tool import MLToolType
 from ponos.models import State
 import gitlab.v4.objects
+import re


 class DataImportLightSerializer(serializers.ModelSerializer):
@@ -184,18 +186,76 @@ class DataFileSerializer(serializers.ModelSerializer):
     """
     Serialize a single uploaded file
     """
-    images = ImageSerializer(many=True)
+
+    images = ImageSerializer(many=True, read_only=True)
+    status = EnumField(S3FileStatus)

     class Meta:
         model = DataFile
         fields = (
             'id',
             'name',
+            'hash',
             'content_type',
             'size',
             'images',
+            'status',
+        )
+        read_only_fields = ('id', 'name', 'hash', 'size', 'content_type', 'images', )
+
+    def validate_status(self, value):
+        if value == S3FileStatus.Checked:
+            # Status has been requested to be checked, perform validation
+            try:
+                self.instance.perform_check(raise_exc=True)
+            except (AssertionError, ValueError) as e:
+                raise serializers.ValidationError(str(e))
+        return value
+
+
+class DataFileCreateSerializer(serializers.ModelSerializer):
+    """
+    Serialize a DataFile creation with an Amazon S3 PUT URL
+    """
+
+    status = EnumField(S3FileStatus, read_only=True)
+    hash = serializers.RegexField(re.compile(r'[0-9A-Fa-f]{32}'), min_length=32, max_length=32)
+    s3_put_url = serializers.SerializerMethodField()
+
+    class Meta:
+        model = DataFile
+        fields = (
+            'id',
+            'name',
+            'hash',
+            'size',
+            'corpus',
+            'status',
+            's3_url',
+            's3_put_url',
         )
-        read_only_fields = ('id', 'size', 'content_type', 'images', )
+        read_only_fields = ('id', 'status', 's3_url', 's3_put_url')
+
+    def get_s3_put_url(self, obj):
+        if obj.status == S3FileStatus.Checked:
+            return None
+        return obj.s3_put_url
+
+    def run_validation(self, data):
+        existing_datafile = DataFile.objects.filter(hash=data['hash']).first()
+        if existing_datafile:
+            message = {
+                'hash': ['DataFile with this hash already exists'],
+                'id': str(existing_datafile.id),
+                'status': existing_datafile.status.value,
+            }
+            if existing_datafile.status != S3FileStatus.Checked:
+                message['s3_put_url'] = existing_datafile.s3_put_url
+            else:
+                message['s3_url'] = existing_datafile.s3_url
+            self._errors = message
+            raise serializers.ValidationError(message)
+        return super().run_validation(data)


 class DataImportFailureSerializer(serializers.ModelSerializer):
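
Note: run_validation() above turns a duplicate hash into a 400 response that still carries enough data to resume an interrupted upload. A hedged sketch of client-side handling, reusing `session`, `API`, `payload` and `content` from the sketch after the api.py changes:

    import requests

    resp = session.post(API + '/imports/files/create/', json=payload)
    if resp.status_code == 400 and 'hash' in resp.json():
        error = resp.json()
        if 's3_put_url' in error:
            # The DataFile exists but was never verified: (re)try the upload.
            requests.put(error['s3_put_url'], data=content)
            session.put(API + '/imports/file/{}/'.format(error['id']),
                        json={'status': 'checked'})
        else:
            # Already uploaded and checked: reuse the file at error['s3_url'].
            existing_url = error['s3_url']
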
diff --git a/arkindex/dataimport/tasks/base.py b/arkindex/dataimport/tasks/base.py
index adc78b12e5..ab4dfb354a 100644
--- a/arkindex/dataimport/tasks/base.py
+++ b/arkindex/dataimport/tasks/base.py
@@ -23,7 +23,7 @@ def download_files(dataimport, dest_dir):

     for i, datafile in enumerate(datafiles):
         logger.info('Downloading file {} of {}'.format(i + 1, filecount))
-        path = os.path.join(dest_dir, datafile.staging_path)
+        path = os.path.join(dest_dir, datafile.s3_key)

         try:
             datafile.download_to(path)
diff --git a/arkindex/dataimport/tasks/image.py b/arkindex/dataimport/tasks/image.py
index 84ca73831f..9e71c99185 100644
--- a/arkindex/dataimport/tasks/image.py
+++ b/arkindex/dataimport/tasks/image.py
@@ -1,7 +1,8 @@
 from PIL import Image as PillowImage
 from arkindex.dataimport.models import DataFile
-from arkindex.images.models import ImageServer, Image, ImageStatus
+from arkindex.images.models import ImageServer, Image
 from arkindex.documents.models import Element, ElementType
+from arkindex.project.aws import S3FileStatus
 from urllib.parse import quote
 import logging

@@ -83,7 +84,7 @@ def build_iiif_image(volume, path, data_file, suffix=None):
     else:
         # Save to S3 using optional image type conversion
         img.pillow_save(pillow_img, format=img_format)
-        img.status = ImageStatus.Checked
+        img.status = S3FileStatus.Checked
         img.save()

     return img
diff --git a/arkindex/dataimport/tests/test_datafile_api.py b/arkindex/dataimport/tests/test_datafile_api.py
new file mode 100644
index 0000000000..ad16213a73
--- /dev/null
+++ b/arkindex/dataimport/tests/test_datafile_api.py
@@ -0,0 +1,151 @@
+from arkindex.project.tests import FixtureAPITestCase
+from arkindex.dataimport.models import DataFile
+from arkindex.project.aws import S3FileStatus
+from rest_framework import status
+from django.urls import reverse
+from unittest.mock import patch
+
+
+class TestDataFileApi(FixtureAPITestCase):
+    """
+    Test datafile creation and upload to S3
+    """
+
+    def setUp(self):
+        super().setUp()
+        self.df = DataFile.objects.create(
+            name='test.pdf',
+            size=42,
+            hash='11111111111111111111111111111112',
+            content_type='application/pdf',
+            corpus=self.corpus
+        )
+
+    def build_file_create_request(self, name=None, hash=None, size=None, corpus=None):
+        return {
+            'name': name or 'some text',
+            'hash': hash or '552e21cd4cd9918678e3c1a0df491bc3',
+            'size': size or 1,
+            'corpus': corpus or str(self.corpus.id),
+        }
+
+    def test_create_df_requires_login(self):
+        request = self.build_file_create_request()
+        response = self.client.post(reverse('api:file-create'), request)
+        self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
+
+    @patch('arkindex.project.aws.s3.meta.client.generate_presigned_url')
+    def test_create_datafile(self, s3_presigned_url_mock):
+        self.client.force_login(self.user)
+        s3_presigned_url_mock.return_value = 'http://s3/upload_put_url'
+        request = self.build_file_create_request()
+        response = self.client.post(reverse('api:file-create'), request)
+        self.assertEqual(response.status_code, status.HTTP_201_CREATED)
+        data = response.json()
+        self.assertIn('id', data)
+        df = DataFile.objects.get(id=data['id'])
+        self.assertDictEqual(
+            data,
+            {
+                'id': str(df.id),
+                'name': df.name,
+                'hash': str(df.hash),
+                'size': df.size,
+                'status': df.status.value,
+                's3_url': df.s3_url,
+                's3_put_url': df.s3_put_url,
+                'corpus': str(df.corpus.id),
+            }
+        )
+        self.assertListEqual(
+            [df.name, df.hash, df.size, df.s3_put_url],
+            ['some text', '552e21cd4cd9918678e3c1a0df491bc3', 1, 'http://s3/upload_put_url']
+        )
+
+    def test_create_existing_hash(self):
+        self.client.force_login(self.user)
+        request = self.build_file_create_request(hash='11111111111111111111111111111112')
+        response = self.client.post(reverse('api:file-create'), request)
+        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
+        informations = response.json()
+        self.assertIn('hash', informations)
+        self.assertIn('id', informations)
+        self.assertIn('s3_put_url', informations)
+
+    def test_create_checked_existing_hash(self):
+        self.client.force_login(self.user)
+        self.df.status = S3FileStatus.Checked
+        self.df.save()
+        request = self.build_file_create_request(hash='11111111111111111111111111111112')
+        response = self.client.post(reverse('api:file-create'), request)
+        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
+        informations = response.json().keys()
+        self.assertIn('hash', informations)
+        self.assertIn('id', informations)
+        self.assertNotIn('s3_put_url', informations)
+        self.assertIn('s3_url', informations)
+
+    @patch('arkindex.project.aws.s3.Object')
+    def test_check_uploaded_datafile(self, s3_object):
+        s3_object().e_tag = '11111111111111111111111111111112'
+        s3_object().content_length = 99942
+        s3_object().content_type = 'test/testfile'
+
+        self.client.force_login(self.user)
+        response = self.client.put(
+            reverse('api:file-retrieve', kwargs={'pk': self.df.id}),
+            {'status': 'checked'},
+        )
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        self.df.refresh_from_db()
+        self.assertDictEqual(
+            response.json(),
+            {
+                'id': str(self.df.id),
+                'name': self.df.name,
+                'hash': self.df.hash,
+                'content_type': self.df.content_type,
+                'size': self.df.size,
+                'status': self.df.status.value,
+                'images': list(self.df.images.all()),
+            }
+        )
+        self.assertEqual(self.df.status, S3FileStatus.Checked)
+        self.assertEqual(self.df.content_type, 'test/testfile')
+        self.assertEqual(self.df.size, 99942)
+
+    @patch('arkindex.project.aws.s3.Object')
+    def test_check_wrong_md5(self, s3_object):
+        s3_object().e_tag = 'wrong md5'
+        self.client.force_login(self.user)
+        response = self.client.put(
+            reverse('api:file-retrieve', kwargs={'pk': self.df.id}),
+            {'status': 'checked'},
+        )
+        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
+
+    @patch('arkindex.project.aws.s3.Object')
+    def test_set_error_status_on_failure(self, s3_object):
+        s3_object().e_tag = 'wrong md5'
+        self.client.force_login(self.user)
+        response = self.client.put(
+            reverse('api:file-retrieve', kwargs={'pk': self.df.id}),
+            {'status': 'checked'},
+        )
+        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
+        self.df.refresh_from_db()
+        self.assertEqual(self.df.status, S3FileStatus.Error)
+
+    @patch('arkindex.project.aws.s3.Object')
+    def test_check_even_if_already_checked(self, s3_object):
+        self.df.status = S3FileStatus.Checked
+        self.df.save()
+        s3_object().e_tag = 'corrupted md5'
+        self.client.force_login(self.user)
+        response = self.client.put(
+            reverse('api:file-retrieve', kwargs={'pk': self.df.id}),
+            {'status': 'checked'},
+        )
+        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
+        self.df.refresh_from_db()
+        self.assertEqual(self.df.status, S3FileStatus.Error)
diff --git a/arkindex/dataimport/tests/test_files.py b/arkindex/dataimport/tests/test_files.py
index 82825d8638..bec834eb69 100644
--- a/arkindex/dataimport/tests/test_files.py
+++ b/arkindex/dataimport/tests/test_files.py
@@ -85,7 +85,7 @@ class TestFiles(FixtureAPITestCase):
             response = self.client.post(reverse('api:file-upload', kwargs={'pk': public.id}), data={'file': f})
         self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)

-    @patch('arkindex.dataimport.models.s3')
+    @patch('arkindex.project.aws.s3')
     def test_file_upload(self, s3_mock):
         """
         Assert a file upload creates a database instance and saves the file
@@ -123,7 +123,7 @@ class TestFiles(FixtureAPITestCase):
             response = self.client.post(reverse('api:file-upload', kwargs={'pk': str(uuid.uuid4())}), data={'file': f})
         self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)

-    @patch('arkindex.dataimport.models.s3')
+    @patch('arkindex.project.aws.s3')
     def test_file_upload_wrong_content_type(self, s3_mock):
         """
         Assert file upload does not trust the client defined content type
         """
@@ -142,7 +142,7 @@ class TestFiles(FixtureAPITestCase):
         self.assertEqual(s3_mock.Object.call_count, 1)
         self.assertEqual(s3_mock.Object().upload_fileobj.call_count, 1)

-    @patch('arkindex.dataimport.models.s3')
+    @patch('arkindex.project.aws.s3')
     def test_file_upload_unique(self, s3_mock):
         """
         Assert uploading the same file twice fails
diff --git a/arkindex/dataimport/tests/test_iiif.py b/arkindex/dataimport/tests/test_iiif.py
index d060212ad7..a4e70d8d99 100644
--- a/arkindex/dataimport/tests/test_iiif.py
+++ b/arkindex/dataimport/tests/test_iiif.py
@@ -3,10 +3,11 @@ from django.test import override_settings
 from arkindex_common.enums import MetaType
 from arkindex.project.tests import FixtureTestCase
 from arkindex.documents.models import Element, ElementType, Page
-from arkindex.images.models import ImageStatus, ImageServer
+from arkindex.images.models import ImageServer
 from arkindex.dataimport.models import EventType, DataImportMode
 from arkindex.dataimport.git import GitFlow
 from arkindex.dataimport.iiif import IIIFParser, ManifestParser, CollectionParser
+from arkindex.project.aws import S3FileStatus
 from requests.exceptions import Timeout
 import os.path
 import git
@@ -78,7 +79,7 @@ class TestManifestParser(FixtureTestCase):
         self.assertEqual(p.zone.polygon.y, 0)
         self.assertEqual(p.zone.polygon.width, 2000)
         self.assertEqual(p.zone.polygon.height, 1000)
-        self.assertEqual(p.zone.image.status, ImageStatus.Unchecked)
+        self.assertEqual(p.zone.image.status, S3FileStatus.Unchecked)
         self.assertEqual(p.zone.image.server, self.imgsrv)
         self.assertEqual(p.zone.image.width, 2000)
         self.assertEqual(p.zone.image.height, 1000)
@@ -124,7 +125,7 @@ class TestManifestParser(FixtureTestCase):
         self.assertEqual(p.zone.polygon.y, 0)
         self.assertEqual(p.zone.polygon.width, 2000)
         self.assertEqual(p.zone.polygon.height, 1000)
-        self.assertEqual(p.zone.image.status, ImageStatus.Unchecked)
+        self.assertEqual(p.zone.image.status, S3FileStatus.Unchecked)
         self.assertEqual(p.zone.image.server, self.imgsrv)
         self.assertEqual(p.zone.image.width, 2000)
         self.assertEqual(p.zone.image.height, 1000)
diff --git a/arkindex/dataimport/tests/test_image.py b/arkindex/dataimport/tests/test_image.py
index 941a66ba18..059142eaed 100644
--- a/arkindex/dataimport/tests/test_image.py
+++ b/arkindex/dataimport/tests/test_image.py
@@ -2,7 +2,7 @@ from unittest.mock import patch, call
 from arkindex.project.tests import FixtureTestCase
 from arkindex.documents.models import ElementType
 from arkindex.dataimport.tasks import check_images, build_iiif_image
-from arkindex.images.models import ImageStatus
+from arkindex.project.aws import S3FileStatus
 from botocore.exceptions import ClientError


@@ -31,7 +31,7 @@ class TestImageTasks(FixtureTestCase):
             (self.df, '/some/path'),
         ])

-    @patch('arkindex.images.models.s3.Object')
+    @patch('arkindex.project.aws.s3.Object')
     @patch('arkindex.dataimport.tasks.image.PillowImage')
     def test_build_iiif_image(self, image_mock, s3obj_mock):
         image_mock.open.return_value.format = 'BMP'
@@ -58,10 +58,10 @@ class TestImageTasks(FixtureTestCase):
         self.assertEqual(img.path, expected_path.replace('/', '%2F'))
         self.assertEqual(img.width, 400)
         self.assertEqual(img.height, 900)
-        self.assertEqual(img.status, ImageStatus.Checked)
+        self.assertEqual(img.status, S3FileStatus.Checked)
         self.assertEqual(img.datafile, self.df)

-    @patch('arkindex.images.models.s3.Object')
+    @patch('arkindex.project.aws.s3.Object')
     @patch('arkindex.dataimport.tasks.image.PillowImage')
     def test_build_iiif_image_retry(self, image_mock, s3obj_mock):
         """
@@ -74,7 +74,7 @@ class TestImageTasks(FixtureTestCase):
             datafile=self.df,
             width=900,
             height=400,
-            status=ImageStatus.Checked,
+            status=S3FileStatus.Checked,
         )

         with self.settings(LOCAL_IMAGESERVER_ID=self.imgsrv.id, AWS_IIIF_BUCKET='iiif'):
diff --git a/arkindex/dataimport/tests/test_pdf.py b/arkindex/dataimport/tests/test_pdf.py
index da87d98f1f..0c6af8c6a1 100644
--- a/arkindex/dataimport/tests/test_pdf.py
+++ b/arkindex/dataimport/tests/test_pdf.py
@@ -49,7 +49,7 @@ class TestPdf(FixtureTestCase):
         with self.assertRaises(AssertionError):
             extract_pdf_images(file_mock, self.pdf_path, self.working_dir)

-    @patch('arkindex.dataimport.models.s3')
+    @patch('arkindex.project.aws.s3')
     def test_extract_pdf_images_s3_error(self, s3_mock):
         """
         Test extract_pdf_images task lets S3 errors through
@@ -62,7 +62,7 @@ class TestPdf(FixtureTestCase):
         with self.assertRaises(ClientError):
             extract_pdf_images(file_mock, self.pdf_path, self.working_dir)

-    @patch('arkindex.dataimport.models.s3')
+    @patch('arkindex.project.aws.s3')
     def test_extract_pdf_images_with_convert(self, s3_mock):
         """
         Test extract_pdf_images runs ImageMagick and returns proper info
@@ -74,7 +74,7 @@ class TestPdf(FixtureTestCase):
             os.path.join(self.working_dir, 'pdf-0001.jpg'),
         ])

-    @patch('arkindex.dataimport.models.s3')
+    @patch('arkindex.project.aws.s3')
     def test_extract_pdf_images_with_poppler(self, s3_mock):
         """
         Test extract_pdf_images runs Poppler and returns proper info
diff --git a/arkindex/dataimport/tests/test_tasks.py b/arkindex/dataimport/tests/test_tasks.py
index 4bf38b1fce..9fead8f1d3 100644
--- a/arkindex/dataimport/tests/test_tasks.py
+++ b/arkindex/dataimport/tests/test_tasks.py
@@ -29,7 +29,7 @@ class TestTasks(FixtureTestCase):
     @classmethod
     def setUpClass(cls):
         super().setUpClass()
-        cls.s3obj_patch = patch('arkindex.dataimport.models.s3.Object')
+        cls.s3obj_patch = patch('arkindex.project.aws.s3.Object')
         cls.s3obj_mock = cls.s3obj_patch.start()
         cls.access_patch = patch('arkindex.dataimport.tasks.base.os.access')
         cls.access_mock = cls.access_patch.start()
diff --git a/arkindex/documents/management/commands/telegraf.py b/arkindex/documents/management/commands/telegraf.py
index 5c91c41558..721602a79b 100644
--- a/arkindex/documents/management/commands/telegraf.py
+++ b/arkindex/documents/management/commands/telegraf.py
@@ -4,7 +4,8 @@ from django.conf import settings
 from django.db.models import Count
 from django.utils.text import slugify
 from arkindex.documents.models import Element, ElementType, Transcription, Corpus
-from arkindex.images.models import ImageServer, ImageStatus, Image
+from arkindex.images.models import ImageServer, Image
+from arkindex.project.aws import S3FileStatus
 from urllib.parse import urljoin
 import time
 import requests
@@ -67,13 +68,13 @@ class Command(BaseCommand):
         # Image server statistics, in three SQL queries
         checked_counts = dict(
             Image.objects
-            .filter(status=ImageStatus.Checked)
+            .filter(status=S3FileStatus.Checked)
             .values_list('server')
             .annotate(Count('server'))
         )
         error_counts = dict(
             Image.objects
-            .filter(status=ImageStatus.Error)
+            .filter(status=S3FileStatus.Error)
             .values_list('server')
             .annotate(Count('server'))
         )
diff --git a/arkindex/documents/serializers/elements.py b/arkindex/documents/serializers/elements.py
index 3d7b4fe92b..044fceac82 100644
--- a/arkindex/documents/serializers/elements.py
+++ b/arkindex/documents/serializers/elements.py
@@ -5,13 +5,14 @@ from arkindex.documents.models import \
     Element, ElementType, Page, PageType, PageDirection, Act, Corpus, \
     MetaData, InterpretedDate, DateType
 from arkindex.images.serializers import ZoneSerializer, ImageSerializer
-from arkindex.images.models import Image, ImageStatus
+from arkindex.images.models import Image
 from arkindex.documents.serializers.light import CorpusLightSerializer, ElementLightSerializer
 from arkindex.documents.serializers.ml import ClassificationSerializer, TranscriptionSerializer
 from arkindex.dataimport.serializers import RevisionSerializer
 from arkindex.dataimport.models import EventType
 from arkindex.project.serializer_fields import EnumField
 from arkindex.project.polygon import Polygon
+from arkindex.project.aws import S3FileStatus
 from collections import defaultdict


@@ -86,7 +87,7 @@ class ElementSlimSerializer(serializers.ModelSerializer):
     """
     type = EnumField(ElementType, read_only=True)
     corpus = CorpusLightSerializer(read_only=True)
-    thumbnail_url = serializers.URLField(source='thumbnail.url', read_only=True)
+    thumbnail_url = serializers.URLField(source='thumbnail.s3_url', read_only=True)

     class Meta:
         model = Element
@@ -261,7 +262,7 @@ class ElementCreateSerializer(ElementLightSerializer):
                 'Parent and child must be in the same corpus'
             )
         image = data.get('image')
-        if image and image.status != ImageStatus.Checked:
+        if image and image.status != S3FileStatus.Checked:
             errors['image'].append(
                 'Image is not checked. Try to upload a valid IIIF image.'
             )
diff --git a/arkindex/documents/tests/test_elements_api.py b/arkindex/documents/tests/test_elements_api.py
index 8aac24ff70..68ca0ddab2 100644
--- a/arkindex/documents/tests/test_elements_api.py
+++ b/arkindex/documents/tests/test_elements_api.py
@@ -2,8 +2,9 @@ from django.urls import reverse
 from rest_framework import status
 from arkindex.documents.models import Element, ElementType, DataSource, \
     TranscriptionType, Page, Act, Corpus
-from arkindex.images.models import ImageServer, ImageStatus
+from arkindex.images.models import ImageServer
 from arkindex.project.tests import FixtureAPITestCase
+from arkindex.project.aws import S3FileStatus


 class TestElementsAPI(FixtureAPITestCase):
@@ -16,7 +17,7 @@ class TestElementsAPI(FixtureAPITestCase):
         cls.src = DataSource.objects.get(slug='test')
         cls.image = ImageServer.objects.local.images.create(
             path="kingdom/far/away",
-            status=ImageStatus.Checked
+            status=S3FileStatus.Checked
         )

     def test_get_element(self):
@@ -170,7 +171,7 @@ class TestElementsAPI(FixtureAPITestCase):
             image=str(self.image.id),
             metadata={'folio': 'new'},
         )
-        self.image.status = ImageStatus.Error
+        self.image.status = S3FileStatus.Error
         self.image.save()
         response = self.client.post(**request)
         self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
diff --git a/arkindex/images/management/commands/check_images.py b/arkindex/images/management/commands/check_images.py
index 975093b52c..1d1e8923d2 100644
--- a/arkindex/images/management/commands/check_images.py
+++ b/arkindex/images/management/commands/check_images.py
@@ -2,7 +2,8 @@ from django.core.management.base import CommandError
 from django.conf import settings
 from ponos.management.base import PonosCommand
 from arkindex.project.argparse import CorpusArgument, ElementArgument
-from arkindex.images.models import ImageServer, Image, ImageStatus
+from arkindex.project.aws import S3FileStatus
+from arkindex.images.models import ImageServer, Image
 import logging

 logging.basicConfig(
@@ -59,7 +60,7 @@ class Command(PonosCommand):
             images = Image.objects.all()

         if not force:
-            images = images.exclude(status=ImageStatus.Checked)
+            images = images.exclude(status=S3FileStatus.Checked)

         return {'images': images, 'sample': sample}

@@ -68,7 +69,7 @@ class Command(PonosCommand):
         # Re-check a few images from each server
         for server in ImageServer.objects.all():
             server_sample = server.images \
-                .filter(status=ImageStatus.Checked) \
+                .filter(status=S3FileStatus.Checked) \
                 .order_by('?')[:sample]
             logger.info('Re-checking {} images in server {}'.format(len(server_sample), server.display_name))
             self.check(server_sample)
@@ -80,9 +81,9 @@ class Command(PonosCommand):
         for image in images:
             logger.info('Checking image {} at {}'.format(str(image.id), image.url))
             image.perform_check(save=True)
-            if image.status == ImageStatus.Checked:
+            if image.status == S3FileStatus.Checked:
                 successful += 1
-            elif image.status == ImageStatus.Error:
+            elif image.status == S3FileStatus.Error:
                 failed += 1

         return successful, failed
diff --git a/arkindex/images/migrations/0001_initial.py b/arkindex/images/migrations/0001_initial.py
index f653595a95..76fbfd2cc6 100644
--- a/arkindex/images/migrations/0001_initial.py
+++ b/arkindex/images/migrations/0001_initial.py
@@ -25,7 +25,7 @@ class Migration(migrations.Migration):
                 ('path', models.TextField()),
                 ('width', models.PositiveIntegerField(default=0)),
                 ('height', models.PositiveIntegerField(default=0)),
-                ('status', enumfields.fields.EnumField(default='unchecked', enum=arkindex.images.models.ImageStatus, max_length=50)),
+                ('status', enumfields.fields.EnumField(default='unchecked', enum=arkindex.project.aws.S3FileStatus, max_length=50)),
             ],
         ),
         migrations.CreateModel(
diff --git a/arkindex/images/models.py b/arkindex/images/models.py
index cb133dcd62..06952fb73a 100644
--- a/arkindex/images/models.py
+++ b/arkindex/images/models.py
@@ -9,9 +9,8 @@ from arkindex.images.managers import ImageServerManager
 from arkindex.project.models import IndexableModel
 from arkindex.project.fields import StripSlashURLField, LStripTextField
 from arkindex.project.polygon import PolygonField
-from arkindex.project.aws import s3
+from arkindex.project.aws import S3FileMixin, S3FileModelMixin, S3FileStatus
 from botocore.client import ClientError
-from enumfields import EnumField, Enum
 from io import BytesIO
 from PIL import Image as PillowImage
 import logging
@@ -179,16 +178,7 @@ class ImageServer(models.Model):
         return '{}_{}'.format(self.display_name, self.id)


-class ImageStatus(Enum):
-    """
-    Image validation status
-    """
-    Checked = "checked"
-    Unchecked = "unchecked"
-    Error = "error"
-
-
-class Image(IndexableModel):
+class Image(S3FileModelMixin, IndexableModel):
     """
     A document image
     """
@@ -196,9 +186,15 @@ class Image(IndexableModel):
     path = LStripTextField(chars='/')
     width = models.PositiveIntegerField(default=0)
     height = models.PositiveIntegerField(default=0)
-    status = EnumField(ImageStatus, default=ImageStatus.Unchecked, max_length=50)
-    hash = models.CharField(max_length=32, blank=True, null=True)
     datafile = models.ForeignKey(DataFile, related_name='images', null=True, on_delete=models.SET_NULL)
+    # Override the mixin's hash field to allow null values for external servers
+    hash = models.CharField(max_length=32, blank=True, null=True)
+
+    s3_bucket = settings.AWS_IIIF_BUCKET
+
+    @cached_property
+    def s3_key(self):
+        return urllib.parse.unquote(self.path)

     class Meta:
         unique_together = (
@@ -212,34 +208,7 @@ class Image(IndexableModel):
     @cached_property
     def s3_object(self):
         assert self.server.is_local, 'Cannot load images on remote image servers via S3'
-        return s3.Object(settings.AWS_IIIF_BUCKET, urllib.parse.unquote(self.path))
-
-    @property
-    def s3_url(self):
-        return s3.meta.client.generate_presigned_url('get_object', Params={
-            'Bucket': self.s3_object.bucket_name,
-            'Key': self.s3_object.key,
-        })
-
-    @property
-    def s3_put_url(self):
-        return s3.meta.client.generate_presigned_url('put_object', Params={
-            'Bucket': self.s3_object.bucket_name,
-            'Key': self.s3_object.key,
-        })
-
-    def exists(self):
-        """
-        Returns whether the Image exists on the IIIF S3 bucket by performing a HEAD request to S3.
-        See https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Object.load
-        """
-        try:
-            self.s3_object.load()
-            return True
-        except ClientError as e:
-            if e.response['Error']['Code'] != '404':
-                raise
-            return False
+        return super(S3FileModelMixin, self).get_s3_object()

     def get_thumbnail_url(self, max_width=200, max_height=None):
         if max_width is None and max_height is None:
@@ -272,7 +241,7 @@ class Image(IndexableModel):
                 'Image id does not start with server url ({} vs. {})'.format(image_id, self.server.url)
         except Exception as e:
             logger.warn('Image check failed: {}'.format(str(e)))
-            self.status = ImageStatus.Error
+            self.status = S3FileStatus.Error
             if save:
                 self.save()
             if raise_exc:
@@ -283,25 +252,13 @@ class Image(IndexableModel):
         self.path = image_id[len(self.server.url) + 1:]

         self.width, self.height = int(data['width']), int(data['height'])
-        self.status = ImageStatus.Checked
+        self.status = S3FileStatus.Checked
         if save:
             self.save()

-    def check_hash(self, save=True, raise_exc=False):
-        """
-        Checks the MD5 hash against the hash from Amazon S3
-        """
+    def check_hash(self, *args, **kwargs):
         assert self.server.is_local, 'Image hash checks are not supported outside the local server'
-        assert self.hash, 'Image has no hash'
-        assert self.exists(), 'Image does not exist'
-        # The hash given by Boto seems to be surrounded by double quotes
-        if self.s3_object.e_tag.strip('"') == self.hash:
-            return
-        self.status = ImageStatus.Error
-        if save:
-            self.save()
-        if raise_exc:
-            raise ValueError('MD5 hashes do not match')
+        S3FileMixin.check_hash(self, *args, **kwargs)

     def pillow_open(self, max_width=500):
         if self.server.is_local:
@@ -333,10 +290,11 @@ class Image(IndexableModel):
         return '{} - {}'.format(self.id, self.url)


-class Thumbnail(object):
+class Thumbnail(S3FileMixin):
     """
     Describes an element thumbnail
     """
+    s3_bucket = settings.AWS_THUMBNAIL_BUCKET

     def __init__(self, element):
         self.element = element
@@ -346,15 +304,8 @@ class Thumbnail(object):
         return '{}.jpg'.format(str(self.element.id.hex))

     @cached_property
-    def s3_object(self):
-        return s3.Object(settings.AWS_THUMBNAIL_BUCKET, self.name)
-
-    @property
-    def url(self):
-        return s3.meta.client.generate_presigned_url('get_object', Params={
-            'Bucket': self.s3_object.bucket_name,
-            'Key': self.s3_object.key,
-        })
+    def s3_key(self):
+        return self.name

     def create(self, width=900, height=400, max_element_count=3, force=False):
         """
diff --git a/arkindex/images/serializers.py b/arkindex/images/serializers.py
index bf6fa6ce12..09db7992b9 100644
--- a/arkindex/images/serializers.py
+++ b/arkindex/images/serializers.py
@@ -1,8 +1,9 @@
 from rest_framework import serializers
 from arkindex.project.serializer_fields import PolygonField, EnumField
 from arkindex.documents.models import Corpus
-from arkindex.images.models import Image, Zone, ImageServer, ImageStatus
+from arkindex.images.models import Image, Zone, ImageServer
 from arkindex.dataimport.models import DataFile
+from arkindex.project.aws import S3FileStatus
 import re
 import uuid

@@ -25,7 +26,7 @@ class ImageSerializer(serializers.ModelSerializer):
     """
     Serialises an image
     """
-    status = EnumField(ImageStatus)
+    status = EnumField(S3FileStatus)

     class Meta:
         model = Image
@@ -42,8 +43,8 @@ class ImageSerializer(serializers.ModelSerializer):
     def validate_status(self, value):
         if not self.instance:
             # Force the Unchecked status when creating a new image
-            return ImageStatus.Unchecked
-        elif value == ImageStatus.Checked:
+            return S3FileStatus.Unchecked
+        elif value == S3FileStatus.Checked:
             # Perform image validation if we are updating an existing image to Checked
             try:
                 self.instance.check_hash(raise_exc=True)
@@ -57,7 +58,7 @@ class ImageUploadSerializer(ImageSerializer):
     """
     Serialize an image to upload in s3
     """
-    status = EnumField(ImageStatus, read_only=True)
+    status = EnumField(S3FileStatus, read_only=True)
     hash = serializers.RegexField(re.compile(r'[0-9A-Fa-f]{32}'), min_length=32, max_length=32)
     datafile = serializers.PrimaryKeyRelatedField(
         queryset=DataFile.objects.none(),
@@ -80,7 +81,7 @@ class ImageUploadSerializer(ImageSerializer):
     )

     def get_s3_put_url(self, obj):
-        if obj.status == ImageStatus.Checked or not obj.server.is_local:
+        if obj.status == S3FileStatus.Checked or not obj.server.is_local:
             # No PUT for existing images or external servers
             return None
         return obj.s3_put_url
diff --git a/arkindex/images/tests/test_check_images.py b/arkindex/images/tests/test_check_images.py
index 914437d8ef..e1de93ae05 100644
--- a/arkindex/images/tests/test_check_images.py
+++ b/arkindex/images/tests/test_check_images.py
@@ -1,5 +1,5 @@
 from arkindex.project.tests import FixtureTestCase
-from arkindex.images.models import ImageStatus
+from arkindex.project.aws import S3FileStatus
 from django.core.management import call_command
 from django.core.management.base import CommandError
 from unittest.mock import patch
@@ -19,8 +19,8 @@ class TestCheckImages(FixtureTestCase):
         cls.p2 = cls.corpus.elements.get(name='Volume 1, page 1v')

         # Set two images to Checked
-        cls.imgsrv.images.filter(path='img5').update(status=ImageStatus.Checked)
-        cls.imgsrv.images.filter(path='img6').update(status=ImageStatus.Checked)
+        cls.imgsrv.images.filter(path='img5').update(status=S3FileStatus.Checked)
+        cls.imgsrv.images.filter(path='img6').update(status=S3FileStatus.Checked)

         # Create an image linked to zero elements
         cls.imgsrv.images.create(path='am-outside')
diff --git a/arkindex/images/tests/test_image.py b/arkindex/images/tests/test_image.py
index d53d8df340..d5842e421a 100644
--- a/arkindex/images/tests/test_image.py
+++ b/arkindex/images/tests/test_image.py
@@ -3,7 +3,8 @@ from unittest.mock import patch, call
 from django.test import override_settings
 from botocore.exceptions import ClientError
 from arkindex.project.tests import FixtureTestCase
-from arkindex.images.models import ImageServer, ImageStatus
+from arkindex.project.aws import S3FileStatus
+from arkindex.images.models import ImageServer


 @override_settings(LOCAL_IMAGESERVER_ID=1, AWS_IIIF_BUCKET='iiif')
@@ -43,7 +44,7 @@ class TestImage(FixtureTestCase):
         # Should handle paths that look like an absolute URL but aren't
         self.assertEqual(self.imgsrv.build_url('ark:/some/path'), 'http://server/ark:/some/path')

-    @patch('arkindex.images.models.s3.meta.client.generate_presigned_url')
+    @patch('arkindex.project.aws.s3.meta.client.generate_presigned_url')
     def test_s3_url(self, presigned_url_mock):
         presigned_url_mock.return_value = 'http://somewhere'
         img = ImageServer.objects.local.images.create(path='abcd')
@@ -54,7 +55,7 @@ class TestImage(FixtureTestCase):
             Params={'Bucket': 'iiif', 'Key': 'abcd'},
         ))

-    @patch('arkindex.images.models.s3.meta.client.generate_presigned_url')
+    @patch('arkindex.project.aws.s3.meta.client.generate_presigned_url')
     def test_s3_put_url(self, presigned_url_mock):
         presigned_url_mock.return_value = 'http://somewhere'
         img = ImageServer.objects.local.images.create(path='abcd')
@@ -65,53 +66,53 @@ class TestImage(FixtureTestCase):
             Params={'Bucket': 'iiif', 'Key': 'abcd'},
         ))

-    @patch('arkindex.images.models.s3')
+    @patch('arkindex.project.aws.s3')
     def test_check_hash(self, s3_mock):
         s3_mock.Object().e_tag = '"beef"'
         img = ImageServer.objects.local.images.create(path='abcd', hash='beef')
-        self.assertEqual(img.status, ImageStatus.Unchecked)
+        self.assertEqual(img.status, S3FileStatus.Unchecked)
         img.check_hash()
-        self.assertEqual(img.status, ImageStatus.Unchecked)
+        self.assertEqual(img.status, S3FileStatus.Unchecked)

     @override_settings(LOCAL_IMAGESERVER_ID=-1)
-    @patch('arkindex.images.models.s3')
+    @patch('arkindex.project.aws.s3')
     def test_check_hash_external(self, s3_mock):
         self.assertFalse(self.imgsrv.is_local)
         img = self.imgsrv.images.create(path='abcd', hash='beef')
-        self.assertEqual(img.status, ImageStatus.Unchecked)
+        self.assertEqual(img.status, S3FileStatus.Unchecked)
         with self.assertRaisesRegex(AssertionError, 'not supported'):
             img.check_hash()

-    @patch('arkindex.images.models.s3')
+    @patch('arkindex.project.aws.s3')
     def test_check_hash_missing(self, s3_mock):
         img = ImageServer.objects.local.images.create(path='abcd')
-        self.assertEqual(img.status, ImageStatus.Unchecked)
+        self.assertEqual(img.status, S3FileStatus.Unchecked)
         with self.assertRaisesRegex(AssertionError, 'no hash'):
             img.check_hash()

-    @patch('arkindex.images.models.s3')
+    @patch('arkindex.project.aws.s3')
     def test_check_hash_not_found(self, s3_mock):
         s3_mock.Object().load.side_effect = ClientError({'Error': {'Code': '404'}}, 'head_object')
         img = ImageServer.objects.local.images.create(path='abcd', hash='beef')
-        self.assertEqual(img.status, ImageStatus.Unchecked)
-        with self.assertRaisesRegex(AssertionError, 'does not exist'):
+        self.assertEqual(img.status, S3FileStatus.Unchecked)
+        with self.assertRaisesRegex(AssertionError, 'No file content, check that the file was correctly uploaded'):
             img.check_hash()

-    @patch('arkindex.images.models.s3')
+    @patch('arkindex.project.aws.s3')
     def test_check_hash_error(self, s3_mock):
         s3_mock.Object().e_tag = '"wrong"'
         img = ImageServer.objects.local.images.create(path='abcd', hash='beef')
-        self.assertEqual(img.status, ImageStatus.Unchecked)
+        self.assertEqual(img.status, S3FileStatus.Unchecked)
         img.check_hash()
-        self.assertEqual(img.status, ImageStatus.Error)
+        self.assertEqual(img.status, S3FileStatus.Error)

-    @patch('arkindex.images.models.s3')
+    @patch('arkindex.project.aws.s3')
     def test_check_hash_exc(self, s3_mock):
         s3_mock.Object().e_tag = '"wrong"'
         img = ImageServer.objects.local.images.create(path='abcd', hash='beef')
-        self.assertEqual(img.status, ImageStatus.Unchecked)
+        self.assertEqual(img.status, S3FileStatus.Unchecked)
         with self.assertRaisesRegex(ValueError, 'hashes do not match'):
             img.check_hash(raise_exc=True, save=False)
-        self.assertEqual(img.status, ImageStatus.Error)
+        self.assertEqual(img.status, S3FileStatus.Error)
         img.refresh_from_db()
-        self.assertEqual(img.status, ImageStatus.Unchecked)
+        self.assertEqual(img.status, S3FileStatus.Unchecked)
diff --git a/arkindex/images/tests/test_image_api.py b/arkindex/images/tests/test_image_api.py
index c9d80e2293..84b8967d19 100644
--- a/arkindex/images/tests/test_image_api.py
+++ b/arkindex/images/tests/test_image_api.py
@@ -1,5 +1,6 @@
 from arkindex.project.tests import FixtureAPITestCase
-from arkindex.images.models import Image, ImageServer, ImageStatus
+from arkindex.images.models import Image, ImageServer
+from arkindex.project.aws import S3FileStatus
 from rest_framework import status
 from django.test import override_settings
 from django.urls import reverse
@@ -20,7 +21,7 @@ class TestImageApi(FixtureAPITestCase):
         response = self.client.post(reverse('api:image-create'), {'hash': self.image_hash})
         self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)

-    @patch('arkindex.images.models.s3.meta.client.generate_presigned_url')
+    @patch('arkindex.project.aws.s3.meta.client.generate_presigned_url')
     def test_create_image(self, s3_presigned_url_mock):
         """
         Use serializer to create a new Image
@@ -86,7 +87,7 @@ class TestImageApi(FixtureAPITestCase):
     def test_update_image_requires_login(self):
         self.client.logout()
         img = ImageServer.objects.local.images.create(
-            status=ImageStatus.Unchecked,
+            status=S3FileStatus.Unchecked,
             path='test_update',
         )
         response = self.client.put(
@@ -102,7 +103,7 @@ class TestImageApi(FixtureAPITestCase):
         Test setting an image's status to Checked runs the image checks
         """
         img = ImageServer.objects.local.images.create(
-            status=ImageStatus.Unchecked,
+            status=S3FileStatus.Unchecked,
             path='test_update',
         )
         response = self.client.put(
@@ -111,7 +112,7 @@ class TestImageApi(FixtureAPITestCase):
         )
         self.assertEqual(response.status_code, status.HTTP_200_OK)
         img.refresh_from_db()
-        self.assertEqual(img.status, ImageStatus.Checked)
+        self.assertEqual(img.status, S3FileStatus.Checked)
         self.assertEqual(check_hash_mock.call_count, 1)
         self.assertEqual(perform_check_mock.call_count, 1)
         self.assertEqual(check_hash_mock.call_args, call(raise_exc=True))
diff --git a/arkindex/images/tests/test_thumbnail.py b/arkindex/images/tests/test_thumbnail.py
index cbb4389db0..83ba23bf9a 100644
--- a/arkindex/images/tests/test_thumbnail.py
+++ b/arkindex/images/tests/test_thumbnail.py
@@ -11,7 +11,7 @@ class TestThumbnail(FixtureTestCase):
     @classmethod
     def setUpClass(cls):
         super().setUpClass()
-        cls.s3_mock = patch('arkindex.images.models.s3').start()
+        cls.s3_mock = patch('arkindex.project.aws.s3').start()

     @classmethod
     def setUpTestData(cls):
@@ -31,10 +31,11 @@ class TestThumbnail(FixtureTestCase):
         self.assertEqual(self.vol1.thumbnail.name, str(self.vol1.id.hex) + '.jpg')

     def test_s3_object(self):
-        with self.settings(AWS_THUMBNAIL_BUCKET="derp"):
-            # Test twice to ensure s3_object is only created once
-            self.assertEqual(self.vol1.thumbnail.s3_object, self.s3_mock.Object.return_value)
-            self.assertEqual(self.vol1.thumbnail.s3_object, self.s3_mock.Object.return_value)
+        thumbnail = self.vol1.thumbnail
+        thumbnail.s3_bucket = 'derp'
+        # Test twice to ensure s3_object is only created once
+        self.assertEqual(thumbnail.s3_object, self.s3_mock.Object.return_value)
+        self.assertEqual(thumbnail.s3_object, self.s3_mock.Object.return_value)
         self.assertEqual(self.s3_mock.Object.call_count, 1)
         self.assertEqual(self.s3_mock.Object.call_args, call('derp', self.vol1.thumbnail.name))

@@ -42,8 +43,9 @@ class TestThumbnail(FixtureTestCase):
         self.s3_mock.meta.client.generate_presigned_url.return_value = 'http://nowhere'
         self.s3_mock.Object.return_value.bucket_name = 'derp'
         self.s3_mock.Object.return_value.key = 'meme.jpg'
-        with self.settings(AWS_THUMBNAIL_BUCKET="derp"):
-            self.assertEqual(self.vol1.thumbnail.url, 'http://nowhere')
+        thumbnail = self.vol1.thumbnail
+        thumbnail.s3_bucket = 'derp'
+        self.assertEqual(thumbnail.s3_url, 'http://nowhere')
         self.assertEqual(self.s3_mock.Object.call_count, 1)
         self.assertEqual(self.s3_mock.meta.client.generate_presigned_url.call_count, 1)
         self.assertEqual(
diff --git a/arkindex/project/api_v1.py b/arkindex/project/api_v1.py
index 6f34541cfa..37b0141b87 100644
--- a/arkindex/project/api_v1.py
+++ b/arkindex/project/api_v1.py
@@ -16,7 +16,7 @@ from arkindex.documents.api.iiif import (
 from arkindex.dataimport.api import (
     DataImportsList, DataImportDetails, DataImportRetry, DataImportFailures, DataImportDemo,
     DataFileList, DataFileRetrieve, DataFileUpload, DataImportFromFiles,
-    RepositoryList, RepositoryRetrieve, RepositoryStartImport,
+    RepositoryList, RepositoryRetrieve, RepositoryStartImport, DataFileCreate,
     GitRepositoryImportHook, AvailableRepositoriesList, ElementHistory, MLToolList,
 )
 from arkindex.images.api import ImageCreate, ImageRetrieve
@@ -105,6 +105,7 @@ api = [
     path('imports/<uuid:pk>/failures/', DataImportFailures.as_view(), name='import-failures'),
     path('imports/demo/<uuid:pk>/', DataImportDemo.as_view(), name='import-demo'),
     path('imports/files/<uuid:pk>/', DataFileList.as_view(), name='file-list'),
+    path('imports/files/create/', DataFileCreate.as_view(), name='file-create'),
     path('imports/file/<uuid:pk>/', DataFileRetrieve.as_view(), name='file-retrieve'),
     path('imports/upload/<uuid:pk>/', DataFileUpload.as_view(), name='file-upload'),

diff --git a/arkindex/project/aws.py b/arkindex/project/aws.py
index 041ed1e2c3..7cb31f2eeb 100644
--- a/arkindex/project/aws.py
+++ b/arkindex/project/aws.py
@@ -1,6 +1,15 @@
 from django.conf import settings
+from django.db import models
 from botocore.config import Config
+from botocore.exceptions import ClientError
+from django.utils.functional import cached_property
+from enumfields import EnumField, Enum
+from io import BytesIO
 import boto3.session
+import logging
+
+logger = logging.getLogger(__name__)
+

 session = boto3.session.Session(
     aws_access_key_id=settings.AWS_ACCESS_KEY,
@@ -17,3 +26,80 @@ s3 = session.resource(
     endpoint_url=settings.AWS_ENDPOINT,
     config=config,
 )
+
+
+class S3FileStatus(Enum):
+    """
+    S3 file validation status
+    """
+    Checked = "checked"
+    Unchecked = "unchecked"
+    Error = "error"
+
+
+class S3FileMixin(object):
+
+    def get_s3_object(self):
+        return s3.Object(self.s3_bucket, self.s3_key)
+
+    s3_object = cached_property(get_s3_object, name='s3_object')
+
+    @property
+    def s3_put_url(self):
+        return s3.meta.client.generate_presigned_url('put_object', Params={
+            'Bucket': self.s3_object.bucket_name,
+            'Key': self.s3_object.key,
+        })
+
+    @property
+    def s3_url(self):
+        return s3.meta.client.generate_presigned_url('get_object', Params={
+            'Bucket': self.s3_object.bucket_name,
+            'Key': self.s3_object.key,
+        })
+
+    def exists(self):
+        """
+        Returns whether the file exists on the S3 bucket, by performing a HEAD request to S3.
+        See https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Object.load
+        """
+        try:
+            self.s3_object.load()
+            return True
+        except ClientError as e:
+            if e.response['Error']['Code'] != '404':
+                raise
+            return False
+
+    def download(self):
+        b = BytesIO()
+        logger.debug('Downloading file {} from S3'.format(self.s3_key))
+        self.s3_object.download_fileobj(b)
+        return b
+
+    def download_to(self, path):
+        logger.debug('Downloading file {} from S3'.format(self.s3_key))
+        self.s3_object.download_file(path)
+
+    def check_hash(self, save=True, raise_exc=False):
+        """
+        Checks the MD5 hash against the hash from Amazon S3
+        """
+        assert self.hash, 'File has no hash'
+        assert self.exists(), 'No file content, check that the file was correctly uploaded'
+        # The hash given by Boto seems to be surrounded by double quotes
+        if self.s3_object.e_tag.strip('"') == self.hash:
+            return
+        self.status = S3FileStatus.Error
+        if save:
+            self.save()
+        if raise_exc:
+            raise ValueError('MD5 hashes do not match')
+
+
+class S3FileModelMixin(S3FileMixin, models.Model):
+    hash = models.CharField(max_length=32)
+    status = EnumField(S3FileStatus, default=S3FileStatus.Unchecked, max_length=50)
+
+    class Meta:
+        abstract = True
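
Note: S3FileMixin concentrates the S3 plumbing shared by DataFile, Image and Thumbnail; a concrete class only has to provide s3_bucket and s3_key. A minimal sketch with a hypothetical model (not part of the patch):

    from django.conf import settings
    from django.db import models
    from django.utils.functional import cached_property
    from arkindex.project.aws import S3FileModelMixin

    class ExportArchive(S3FileModelMixin):  # hypothetical model
        name = models.CharField(max_length=100)

        s3_bucket = settings.AWS_STAGING_BUCKET  # reuses an existing bucket setting

        @cached_property
        def s3_key(self):
            return 'exports/{}'.format(self.name)

One caveat: check_hash() relies on the S3 ETag being the MD5 of the content, which holds for single-part uploads such as the presigned PUT used here, but not for multipart uploads, whose composite ETag would fail the comparison.
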
diff --git a/openapi/patch.yml b/openapi/patch.yml
index 8538e9c5da..33c1cb6e02 100644
--- a/openapi/patch.yml
+++ b/openapi/patch.yml
@@ -229,11 +229,11 @@ paths:
       tags:
         - files
     patch:
-      description: Rename an uploaded file
+      description: Update a datafile's status
      tags:
         - files
     put:
-      description: Rename an uploaded file
+      description: Update a datafile's status
       tags:
         - files
     delete:
@@ -337,6 +337,64 @@ paths:
           file:
             - File already exists
           id: 3cc2e9e0-4172-44b1-8d65-bc3fffd076dc
+  /api/v1/imports/files/create/:
+    post:
+      operationId: CreateDataFile
+      description: Create a DataFile. On success, a signed URL is returned to upload the file content directly to the remote server.
+      tags:
+        - files
+      responses:
+        '400':
+          description: An error occurred while creating the data file.
+          content:
+            application/json:
+              schema:
+                properties:
+                  detail:
+                    type: string
+                    description: A generic error message when an error occurs outside of a specific field.
+                    readOnly: true
+                  hash:
+                    type: array
+                    description: Errors that occurred during hash field validation.
+                    readOnly: true
+                  corpus:
+                    type: array
+                    description: Errors that occurred during corpus ID field validation.
+                    readOnly: true
+                  name:
+                    type: array
+                    description: Errors that occurred during name field validation.
+                    readOnly: true
+                  size:
+                    type: array
+                    description: Errors that occurred during size field validation.
+                    readOnly: true
+                  id:
+                    type: string
+                    description: UUID of the existing DataFile, if the error comes from a duplicated creation.
+                    readOnly: true
+                  status:
+                    type: string
+                    description: Status of the existing DataFile, if the error comes from a duplicated creation.
+                    readOnly: true
+                  s3_put_url:
+                    type: string
+                    description: Signed URL used to upload file content to the remote server, if the error comes from a duplicated creation and the file status is not checked.
+                    readOnly: true
+                  s3_url:
+                    type: string
+                    description: Remote file URL, if the error comes from a duplicated creation and the file status is checked.
+                    readOnly: true
+              examples:
+                file-exists:
+                  summary: The data file already exists. The response includes the existing file's UUID, status and a presigned PUT URL to upload the file content.
+                  value:
+                    hash:
+                      - DataFile with this hash already exists
+                    id: 55cd009d-cd4b-4ec2-a475-b060f98f9138
+                    status: unchecked
+                    s3_put_url: https://remote-server.net/staging/55cd009d-cd4b-4ec2-a475-b060f98f9138?Credential=mycredential&Signature=mysignature
   /api/v1/imports/{id}/:
     get:
       description: Retrieve a data import
--
GitLab