From 6f3c6400c470a867b0d83a4aa6f3f805685f5e7e Mon Sep 17 00:00:00 2001
From: Valentin Rigal <rigal@teklia.com>
Date: Thu, 9 May 2019 07:56:46 +0000
Subject: [PATCH] Datafile upload via S3

---
 README.md                                     |   4 +-
 arkindex/dataimport/api.py                    |  18 ++-
 .../migrations/0005_s3_file_mixin_fields.py   |  29 ++++
 arkindex/dataimport/models.py                 |  52 +++---
 arkindex/dataimport/serializers.py            |  64 +++++++-
 arkindex/dataimport/tasks/base.py             |   2 +-
 arkindex/dataimport/tasks/image.py            |   5 +-
 .../dataimport/tests/test_datafile_api.py     | 151 ++++++++++++++++++
 arkindex/dataimport/tests/test_files.py       |   6 +-
 arkindex/dataimport/tests/test_iiif.py        |   7 +-
 arkindex/dataimport/tests/test_image.py       |  10 +-
 arkindex/dataimport/tests/test_pdf.py         |   6 +-
 arkindex/dataimport/tests/test_tasks.py       |   2 +-
 .../documents/management/commands/telegraf.py |   7 +-
 arkindex/documents/serializers/elements.py    |   7 +-
 arkindex/documents/tests/test_elements_api.py |   7 +-
 .../management/commands/check_images.py       |  11 +-
 arkindex/images/migrations/0001_initial.py    |   2 +-
 arkindex/images/models.py                     |  87 +++-------
 arkindex/images/serializers.py                |  13 +-
 arkindex/images/tests/test_check_images.py    |   6 +-
 arkindex/images/tests/test_image.py           |  41 ++---
 arkindex/images/tests/test_image_api.py       |  11 +-
 arkindex/images/tests/test_thumbnail.py       |  16 +-
 arkindex/project/api_v1.py                    |   3 +-
 arkindex/project/aws.py                       |  86 ++++++++++
 openapi/patch.yml                             |  62 ++++++-
 27 files changed, 529 insertions(+), 186 deletions(-)
 create mode 100644 arkindex/dataimport/migrations/0005_s3_file_mixin_fields.py
 create mode 100644 arkindex/dataimport/tests/test_datafile_api.py

diff --git a/README.md b/README.md
index fb740b018c..6cd01cf4bb 100644
--- a/README.md
+++ b/README.md
@@ -154,8 +154,8 @@ SHELL_PLUS_POST_IMPORTS = [
         'DataImportMode',
         'EventType',
     )),
-    ('arkindex.images.models', (
-        'ImageStatus',
+    ('arkindex.project.aws', (
+        'S3FileStatus',
     )),
     ('arkindex.users.models', (
         'OAuthStatus',
diff --git a/arkindex/dataimport/api.py b/arkindex/dataimport/api.py
index 6f6c0297bb..7ece805c10 100644
--- a/arkindex/dataimport/api.py
+++ b/arkindex/dataimport/api.py
@@ -15,11 +15,13 @@ from rest_framework.exceptions import ValidationError
 from arkindex.project.mixins import CorpusACLMixin
 from arkindex.project.permissions import IsVerified, IsAuthenticated, IsAdminUser
 from arkindex.documents.models import Corpus, Right, Element, ElementType
-from arkindex.dataimport.models import \
-    DataImport, DataFile, DataImportMode, DataImportFailure, Repository, Event, EventType
+from arkindex.dataimport.models import (
+    DataImport, DataFile, DataImportMode,
+    DataImportFailure, Repository, Event, EventType
+)
 from arkindex.dataimport.serializers import (
     DataImportLightSerializer, DataImportSerializer, DataImportFromFilesSerializer,
-    DataImportFailureSerializer, DataFileSerializer,
+    DataImportFailureSerializer, DataFileSerializer, DataFileCreateSerializer,
     RepositorySerializer, RepositoryStartImportSerializer,
     ExternalRepositorySerializer, EventSerializer, MLToolSerializer,
 )
@@ -326,6 +328,16 @@ class DataFileUpload(CorpusACLMixin, APIView):
         )
 
 
+class DataFileCreate(CreateAPIView):
+    """
+    Create a DataFile from a user-provided file hash.
+    An S3 PUT URL is returned to upload the corresponding file.
+    """
+
+    permission_classes = (IsVerified, )
+    serializer_class = DataFileCreateSerializer
+
+
 class GitRepositoryImportHook(APIView):
     """
     Handle Git push events
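
The new DataFileCreate view, together with the `imports/files/create/` route added in arkindex/project/api_v1.py below, gives clients a three-step upload flow: declare the file with its MD5 hash, PUT the bytes to the returned presigned URL, then set the status to `checked` so the server verifies the hash. A minimal client-side sketch, assuming the `requests` library, a hypothetical base URL and corpus UUID, and a session that already carries authentication:

    import hashlib
    import requests

    ARKINDEX_URL = 'https://arkindex.example.com'  # assumption: instance base URL
    CORPUS_ID = 'your-corpus-uuid'                 # hypothetical corpus UUID
    session = requests.Session()                   # assumed to be authenticated already

    with open('volume.pdf', 'rb') as f:
        content = f.read()

    # 1. Declare the file; the server answers with a presigned S3 PUT URL
    resp = session.post(ARKINDEX_URL + '/api/v1/imports/files/create/', data={
        'name': 'volume.pdf',
        'hash': hashlib.md5(content).hexdigest(),
        'size': len(content),
        'corpus': CORPUS_ID,
    })
    data = resp.json()
    if 's3_put_url' not in data:
        # Anything but a new file or a resumable (unchecked) duplicate is an error
        resp.raise_for_status()

    # 2. Upload the file content straight to S3, bypassing the API server
    requests.put(data['s3_put_url'], data=content).raise_for_status()

    # 3. Ask the server to verify the S3 ETag against the declared MD5 hash
    session.put(
        ARKINDEX_URL + '/api/v1/imports/file/{}/'.format(data['id']),
        json={'status': 'checked'},
    ).raise_for_status()

On a duplicate hash the endpoint returns HTTP 400 with the existing file's `id` and, if that file is still unchecked, its `s3_put_url`, so an interrupted upload can be resumed through the same code path.
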
diff --git a/arkindex/dataimport/migrations/0005_s3_file_mixin_fields.py b/arkindex/dataimport/migrations/0005_s3_file_mixin_fields.py
new file mode 100644
index 0000000000..7ac870539c
--- /dev/null
+++ b/arkindex/dataimport/migrations/0005_s3_file_mixin_fields.py
@@ -0,0 +1,29 @@
+# Generated by Django 2.2 on 2019-05-07 08:17
+
+import arkindex.project.aws
+from django.db import migrations, models
+import enumfields.fields
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('dataimport', '0004_images_foreign_key_to_datafile'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='datafile',
+            name='status',
+            field=enumfields.fields.EnumField(
+                default='unchecked',
+                enum=arkindex.project.aws.S3FileStatus,
+                max_length=50
+            ),
+        ),
+        migrations.AlterField(
+            model_name='datafile',
+            name='size',
+            field=models.PositiveIntegerField(help_text='file size in bytes'),
+        ),
+    ]
diff --git a/arkindex/dataimport/models.py b/arkindex/dataimport/models.py
index b5b4a2dd58..09609b4f2c 100644
--- a/arkindex/dataimport/models.py
+++ b/arkindex/dataimport/models.py
@@ -6,19 +6,14 @@ from django.utils.functional import cached_property
 from rest_framework.exceptions import ValidationError
 from enumfields import EnumField, Enum
 from arkindex_common.ml_tool import MLTool, MLToolType
-from arkindex.project.aws import s3
+from arkindex.project.aws import S3FileModelMixin
 from arkindex.project.models import IndexableModel
 from arkindex.documents.models import Element, ElementType
 from arkindex.dataimport.providers import git_providers, get_provider
 from ponos.models import Workflow, State
-from botocore.exceptions import ClientError
-from io import BytesIO
-import logging
 import yaml
 import uuid
 
-logger = logging.getLogger(__name__)
-
 
 class DataImportMode(Enum):
     Images = 'images'
@@ -189,44 +184,33 @@ class DataImportFailure(models.Model):
         )
 
 
-class DataFile(models.Model):
+class DataFile(S3FileModelMixin):
     id = models.UUIDField(primary_key=True, default=uuid.uuid4)
     name = models.CharField(max_length=100)
-    size = models.PositiveIntegerField()
-    hash = models.CharField(max_length=32)
+    size = models.PositiveIntegerField(help_text='file size in bytes')
     content_type = models.CharField(max_length=50)
     corpus = models.ForeignKey('documents.Corpus', on_delete=models.CASCADE, related_name='files')
 
+    s3_bucket = settings.AWS_STAGING_BUCKET
+
+    @cached_property
+    def s3_key(self):
+        return str(self.id)
+
     class Meta:
         unique_together = (('corpus', 'hash'), )
         ordering = ['corpus', 'name']
 
-    @property
-    def staging_path(self):
-        return str(self.id)
+    def perform_check(self, save=True, raise_exc=False):
+        """
+        Check the DataFile's hash and existence on S3, then update its size and content_type
+        """
+        self.check_hash(save=save, raise_exc=raise_exc)
 
-    @cached_property
-    def s3_object(self):
-        return s3.Object(settings.AWS_STAGING_BUCKET, self.staging_path)
-
-    def exists(self):
-        try:
-            self.s3_object.load()
-            return True
-        except ClientError as e:
-            if e.response['Error']['Code'] != '404':
-                raise
-            return False
-
-    def download(self):
-        b = BytesIO()
-        logger.debug('Downloading file {} from S3'.format(self.staging_path))
-        self.s3_object.download_fileobj(b)
-        return b
-
-    def download_to(self, path):
-        logger.debug('Downloading file {} from S3'.format(self.staging_path))
-        self.s3_object.download_file(path)
+        self.size = self.s3_object.content_length
+        self.content_type = self.s3_object.content_type
+        if save:
+            self.save()
 
 
 class Repository(models.Model):
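
Note that DataFile.perform_check() above delegates the MD5 comparison to S3FileModelMixin.check_hash() (added in arkindex/project/aws.py below) and then refreshes size and content_type from the S3 object's metadata; it never sets the Checked status itself, which is left to the serializer. A server-side usage sketch, with a hypothetical `datafile_id`:

    from arkindex.dataimport.models import DataFile

    datafile = DataFile.objects.get(id=datafile_id)  # datafile_id is hypothetical
    try:
        # check_hash() compares the stored MD5 hash with the S3 ETag;
        # on success, size and content_type are refreshed from the object
        datafile.perform_check(raise_exc=True)
    except (AssertionError, ValueError) as exc:
        # On a hash mismatch, check_hash() has already saved the file
        # with S3FileStatus.Error
        print('Upload verification failed: {}'.format(exc))
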
diff --git a/arkindex/dataimport/serializers.py b/arkindex/dataimport/serializers.py
index 89f5acd3f5..add5df4a03 100644
--- a/arkindex/dataimport/serializers.py
+++ b/arkindex/dataimport/serializers.py
@@ -1,6 +1,7 @@
 from rest_framework import serializers
 from rest_framework.utils import model_meta
 from arkindex.project.serializer_fields import EnumField, MLToolField
+from arkindex.project.aws import S3FileStatus
 from arkindex.dataimport.models import (
     DataImport, DataImportMode, DataImportFailure, DataFile,
     Repository, Revision, Event, EventType, DataImportPDFEngine
@@ -11,6 +12,7 @@ from arkindex.images.serializers import ImageSerializer
 from arkindex_common.ml_tool import MLToolType
 from ponos.models import State
 import gitlab.v4.objects
+import re
 
 
 class DataImportLightSerializer(serializers.ModelSerializer):
@@ -184,18 +186,76 @@ class DataFileSerializer(serializers.ModelSerializer):
     """
     Serialize a single uploaded file
     """
-    images = ImageSerializer(many=True)
+
+    images = ImageSerializer(many=True, read_only=True)
+    status = EnumField(S3FileStatus)
 
     class Meta:
         model = DataFile
         fields = (
             'id',
             'name',
+            'hash',
             'content_type',
             'size',
             'images',
+            'status',
+        )
+        read_only_fields = ('id', 'name', 'hash', 'size', 'content_type', 'images', )
+
+    def validate_status(self, value):
+        if value == S3FileStatus.Checked:
+            # Status was requested to be set to Checked; perform validation
+            try:
+                self.instance.perform_check(raise_exc=True)
+            except (AssertionError, ValueError) as e:
+                raise serializers.ValidationError(str(e))
+        return value
+
+
+class DataFileCreateSerializer(serializers.ModelSerializer):
+    """
+    Serialize a DataFile creation, returning an Amazon S3 PUT URL
+    """
+
+    status = EnumField(S3FileStatus, read_only=True)
+    hash = serializers.RegexField(re.compile(r'[0-9A-Fa-f]{32}'), min_length=32, max_length=32)
+    s3_put_url = serializers.SerializerMethodField()
+
+    class Meta:
+        model = DataFile
+        fields = (
+            'id',
+            'name',
+            'hash',
+            'size',
+            'corpus',
+            'status',
+            's3_url',
+            's3_put_url',
         )
-        read_only_fields = ('id', 'size', 'content_type', 'images', )
+        read_only_fields = ('id', 'status', 's3_url', 's3_put_url')
+
+    def get_s3_put_url(self, obj):
+        if obj.status == S3FileStatus.Checked:
+            return None
+        return obj.s3_put_url
+
+    def run_validation(self, data):
+        existing_datafile = DataFile.objects.filter(hash=data['hash']).first()
+        if existing_datafile:
+            message = {
+                'hash': ['DataFile with this hash already exists'],
+                'id': str(existing_datafile.id),
+                'status': existing_datafile.status.value,
+            }
+            if existing_datafile.status != S3FileStatus.Checked:
+                message['s3_put_url'] = existing_datafile.s3_put_url
+            else:
+                message['s3_url'] = existing_datafile.s3_url
+            self._errors = message
+            raise serializers.ValidationError(message)
+        return super().run_validation(data)
 
 
 class DataImportFailureSerializer(serializers.ModelSerializer):
diff --git a/arkindex/dataimport/tasks/base.py b/arkindex/dataimport/tasks/base.py
index adc78b12e5..ab4dfb354a 100644
--- a/arkindex/dataimport/tasks/base.py
+++ b/arkindex/dataimport/tasks/base.py
@@ -23,7 +23,7 @@ def download_files(dataimport, dest_dir):
 
     for i, datafile in enumerate(datafiles):
         logger.info('Downloading file {} of {}'.format(i + 1, filecount))
-        path = os.path.join(dest_dir, datafile.staging_path)
+        path = os.path.join(dest_dir, datafile.s3_key)
 
         try:
             datafile.download_to(path)
diff --git a/arkindex/dataimport/tasks/image.py b/arkindex/dataimport/tasks/image.py
index 84ca73831f..9e71c99185 100644
--- a/arkindex/dataimport/tasks/image.py
+++ b/arkindex/dataimport/tasks/image.py
@@ -1,7 +1,8 @@
 from PIL import Image as PillowImage
 from arkindex.dataimport.models import DataFile
-from arkindex.images.models import ImageServer, Image, ImageStatus
+from arkindex.images.models import ImageServer, Image
 from arkindex.documents.models import Element, ElementType
+from arkindex.project.aws import S3FileStatus
 from urllib.parse import quote
 import logging
 
@@ -83,7 +84,7 @@ def build_iiif_image(volume, path, data_file, suffix=None):
     else:
         # Save to S3 using optional image type conversion
         img.pillow_save(pillow_img, format=img_format)
-        img.status = ImageStatus.Checked
+        img.status = S3FileStatus.Checked
         img.save()
 
     return img
diff --git a/arkindex/dataimport/tests/test_datafile_api.py b/arkindex/dataimport/tests/test_datafile_api.py
new file mode 100644
index 0000000000..ad16213a73
--- /dev/null
+++ b/arkindex/dataimport/tests/test_datafile_api.py
@@ -0,0 +1,151 @@
+from arkindex.project.tests import FixtureAPITestCase
+from arkindex.dataimport.models import DataFile
+from arkindex.project.aws import S3FileStatus
+from rest_framework import status
+from django.urls import reverse
+from unittest.mock import patch
+
+
+class TestDataFileApi(FixtureAPITestCase):
+    """
+    Test datafile creation and upload to S3
+    """
+
+    def setUp(self):
+        super().setUp()
+        self.df = DataFile.objects.create(
+            name='test.pdf',
+            size=42,
+            hash='11111111111111111111111111111112',
+            content_type='application/pdf',
+            corpus=self.corpus
+        )
+
+    def build_file_create_request(self, name=None, hash=None, size=None, corpus=None):
+        return {
+            'name': name or 'some text',
+            'hash': hash or '552e21cd4cd9918678e3c1a0df491bc3',
+            'size': size or 1,
+            'corpus': corpus or str(self.corpus.id),
+        }
+
+    def test_create_df_requires_login(self):
+        request = self.build_file_create_request()
+        response = self.client.post(reverse('api:file-create'), request)
+        self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
+
+    @patch('arkindex.project.aws.s3.meta.client.generate_presigned_url')
+    def test_create_datafile(self, s3_presigned_url_mock):
+        self.client.force_login(self.user)
+        s3_presigned_url_mock.return_value = 'http://s3/upload_put_url'
+        request = self.build_file_create_request()
+        response = self.client.post(reverse('api:file-create'), request)
+        self.assertEqual(response.status_code, status.HTTP_201_CREATED)
+        data = response.json()
+        self.assertIn('id', data)
+        df = DataFile.objects.get(id=data['id'])
+        self.assertDictEqual(
+            data,
+            {
+                'id': str(df.id),
+                'name': df.name,
+                'hash': str(df.hash),
+                'size': df.size,
+                'status': df.status.value,
+                's3_url': df.s3_url,
+                's3_put_url': df.s3_put_url,
+                'corpus': str(df.corpus.id),
+            }
+        )
+        self.assertListEqual(
+            [df.name, df.hash, df.size, df.s3_put_url],
+            ['some text', '552e21cd4cd9918678e3c1a0df491bc3', 1, 'http://s3/upload_put_url']
+        )
+
+    def test_create_existing_hash(self):
+        self.client.force_login(self.user)
+        request = self.build_file_create_request(hash='11111111111111111111111111111112')
+        response = self.client.post(reverse('api:file-create'), request)
+        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
+        data = response.json()
+        self.assertIn('hash', data)
+        self.assertIn('id', data)
+        self.assertIn('s3_put_url', data)
+
+    def test_create_checked_existing_hash(self):
+        self.client.force_login(self.user)
+        self.df.status = S3FileStatus.Checked
+        self.df.save()
+        request = self.build_file_create_request(hash='11111111111111111111111111111112')
+        response = self.client.post(reverse('api:file-create'), request)
+        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
+        data = response.json().keys()
+        self.assertIn('hash', data)
+        self.assertIn('id', data)
+        self.assertNotIn('s3_put_url', data)
+        self.assertIn('s3_url', data)
+
+    @patch('arkindex.project.aws.s3.Object')
+    def test_check_uploaded_datafile(self, s3_object):
+        s3_object().e_tag = '11111111111111111111111111111112'
+        s3_object().content_length = '99942'
+        s3_object().content_type = 'test/testfile'
+
+        self.client.force_login(self.user)
+        response = self.client.put(
+            reverse('api:file-retrieve', kwargs={'pk': self.df.id}),
+            {'status': 'checked'},
+        )
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        self.df.refresh_from_db()
+        self.assertDictEqual(
+            response.json(),
+            {
+                'id': str(self.df.id),
+                'name': self.df.name,
+                'hash': self.df.hash,
+                'content_type': self.df.content_type,
+                'size': self.df.size,
+                'status': self.df.status.value,
+                'images': list(self.df.images.all()),
+            }
+        )
+        self.assertEqual(self.df.status, S3FileStatus.Checked)
+        self.assertEqual(self.df.content_type, 'test/testfile')
+        self.assertEqual(self.df.size, 99942)
+
+    @patch('arkindex.project.aws.s3.Object')
+    def test_check_wrong_md5(self, s3_object):
+        s3_object().e_tag = 'wrong md5'
+        self.client.force_login(self.user)
+        response = self.client.put(
+            reverse('api:file-retrieve', kwargs={'pk': self.df.id}),
+            {'status': 'checked'},
+        )
+        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
+
+    @patch('arkindex.project.aws.s3.Object')
+    def test_set_error_status_on_failure(self, s3_object):
+        s3_object().e_tag = 'wrong md5'
+        self.client.force_login(self.user)
+        response = self.client.put(
+            reverse('api:file-retrieve', kwargs={'pk': self.df.id}),
+            {'status': 'checked'},
+        )
+        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
+        self.df.refresh_from_db()
+        self.assertEqual(self.df.status, S3FileStatus.Error)
+
+    @patch('arkindex.project.aws.s3.Object')
+    def test_check_even_if_already_checked(self, s3_object):
+        self.df.status = S3FileStatus.Checked
+        self.df.save()
+        s3_object().e_tag = 'corrupted md5'
+        self.client.force_login(self.user)
+        response = self.client.put(
+            reverse('api:file-retrieve', kwargs={'pk': self.df.id}),
+            {'status': 'checked'},
+        )
+        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
+        self.df.refresh_from_db()
+        self.assertEqual(self.df.status, S3FileStatus.Error)
diff --git a/arkindex/dataimport/tests/test_files.py b/arkindex/dataimport/tests/test_files.py
index 82825d8638..bec834eb69 100644
--- a/arkindex/dataimport/tests/test_files.py
+++ b/arkindex/dataimport/tests/test_files.py
@@ -85,7 +85,7 @@ class TestFiles(FixtureAPITestCase):
         response = self.client.post(reverse('api:file-upload', kwargs={'pk': public.id}), data={'file': f})
         self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
 
-    @patch('arkindex.dataimport.models.s3')
+    @patch('arkindex.project.aws.s3')
     def test_file_upload(self, s3_mock):
         """
         Assert a file upload creates a database instance and saves the file
@@ -123,7 +123,7 @@ class TestFiles(FixtureAPITestCase):
         response = self.client.post(reverse('api:file-upload', kwargs={'pk': str(uuid.uuid4())}), data={'file': f})
         self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
 
-    @patch('arkindex.dataimport.models.s3')
+    @patch('arkindex.project.aws.s3')
     def test_file_upload_wrong_content_type(self, s3_mock):
         """
         Assert file upload does not trust the client defined content type
@@ -142,7 +142,7 @@ class TestFiles(FixtureAPITestCase):
         self.assertEqual(s3_mock.Object.call_count, 1)
         self.assertEqual(s3_mock.Object().upload_fileobj.call_count, 1)
 
-    @patch('arkindex.dataimport.models.s3')
+    @patch('arkindex.project.aws.s3')
     def test_file_upload_unique(self, s3_mock):
         """
         Assert uploading the same file twice fails
diff --git a/arkindex/dataimport/tests/test_iiif.py b/arkindex/dataimport/tests/test_iiif.py
index d060212ad7..a4e70d8d99 100644
--- a/arkindex/dataimport/tests/test_iiif.py
+++ b/arkindex/dataimport/tests/test_iiif.py
@@ -3,10 +3,11 @@ from django.test import override_settings
 from arkindex_common.enums import MetaType
 from arkindex.project.tests import FixtureTestCase
 from arkindex.documents.models import Element, ElementType, Page
-from arkindex.images.models import ImageStatus, ImageServer
+from arkindex.images.models import ImageServer
 from arkindex.dataimport.models import EventType, DataImportMode
 from arkindex.dataimport.git import GitFlow
 from arkindex.dataimport.iiif import IIIFParser, ManifestParser, CollectionParser
+from arkindex.project.aws import S3FileStatus
 from requests.exceptions import Timeout
 import os.path
 import git
@@ -78,7 +79,7 @@ class TestManifestParser(FixtureTestCase):
             self.assertEqual(p.zone.polygon.y, 0)
             self.assertEqual(p.zone.polygon.width, 2000)
             self.assertEqual(p.zone.polygon.height, 1000)
-            self.assertEqual(p.zone.image.status, ImageStatus.Unchecked)
+            self.assertEqual(p.zone.image.status, S3FileStatus.Unchecked)
             self.assertEqual(p.zone.image.server, self.imgsrv)
             self.assertEqual(p.zone.image.width, 2000)
             self.assertEqual(p.zone.image.height, 1000)
@@ -124,7 +125,7 @@ class TestManifestParser(FixtureTestCase):
             self.assertEqual(p.zone.polygon.y, 0)
             self.assertEqual(p.zone.polygon.width, 2000)
             self.assertEqual(p.zone.polygon.height, 1000)
-            self.assertEqual(p.zone.image.status, ImageStatus.Unchecked)
+            self.assertEqual(p.zone.image.status, S3FileStatus.Unchecked)
             self.assertEqual(p.zone.image.server, self.imgsrv)
             self.assertEqual(p.zone.image.width, 2000)
             self.assertEqual(p.zone.image.height, 1000)
diff --git a/arkindex/dataimport/tests/test_image.py b/arkindex/dataimport/tests/test_image.py
index 941a66ba18..059142eaed 100644
--- a/arkindex/dataimport/tests/test_image.py
+++ b/arkindex/dataimport/tests/test_image.py
@@ -2,7 +2,7 @@ from unittest.mock import patch, call
 from arkindex.project.tests import FixtureTestCase
 from arkindex.documents.models import ElementType
 from arkindex.dataimport.tasks import check_images, build_iiif_image
-from arkindex.images.models import ImageStatus
+from arkindex.project.aws import S3FileStatus
 from botocore.exceptions import ClientError
 
 
@@ -31,7 +31,7 @@ class TestImageTasks(FixtureTestCase):
             (self.df, '/some/path'),
         ])
 
-    @patch('arkindex.images.models.s3.Object')
+    @patch('arkindex.project.aws.s3.Object')
     @patch('arkindex.dataimport.tasks.image.PillowImage')
     def test_build_iiif_image(self, image_mock, s3obj_mock):
         image_mock.open.return_value.format = 'BMP'
@@ -58,10 +58,10 @@ class TestImageTasks(FixtureTestCase):
         self.assertEqual(img.path, expected_path.replace('/', '%2F'))
         self.assertEqual(img.width, 400)
         self.assertEqual(img.height, 900)
-        self.assertEqual(img.status, ImageStatus.Checked)
+        self.assertEqual(img.status, S3FileStatus.Checked)
         self.assertEqual(img.datafile, self.df)
 
-    @patch('arkindex.images.models.s3.Object')
+    @patch('arkindex.project.aws.s3.Object')
     @patch('arkindex.dataimport.tasks.image.PillowImage')
     def test_build_iiif_image_retry(self, image_mock, s3obj_mock):
         """
@@ -74,7 +74,7 @@ class TestImageTasks(FixtureTestCase):
             datafile=self.df,
             width=900,
             height=400,
-            status=ImageStatus.Checked,
+            status=S3FileStatus.Checked,
         )
 
         with self.settings(LOCAL_IMAGESERVER_ID=self.imgsrv.id, AWS_IIIF_BUCKET='iiif'):
diff --git a/arkindex/dataimport/tests/test_pdf.py b/arkindex/dataimport/tests/test_pdf.py
index da87d98f1f..0c6af8c6a1 100644
--- a/arkindex/dataimport/tests/test_pdf.py
+++ b/arkindex/dataimport/tests/test_pdf.py
@@ -49,7 +49,7 @@ class TestPdf(FixtureTestCase):
         with self.assertRaises(AssertionError):
             extract_pdf_images(file_mock, self.pdf_path, self.working_dir)
 
-    @patch('arkindex.dataimport.models.s3')
+    @patch('arkindex.project.aws.s3')
     def test_extract_pdf_images_s3_error(self, s3_mock):
         """
         Test extract_pdf_images task lets S3 errors through
@@ -62,7 +62,7 @@ class TestPdf(FixtureTestCase):
         with self.assertRaises(ClientError):
             extract_pdf_images(file_mock, self.pdf_path, self.working_dir)
 
-    @patch('arkindex.dataimport.models.s3')
+    @patch('arkindex.project.aws.s3')
     def test_extract_pdf_images_with_convert(self, s3_mock):
         """
         Test extract_pdf_images runs ImageMagick and returns proper info
@@ -74,7 +74,7 @@ class TestPdf(FixtureTestCase):
             os.path.join(self.working_dir, 'pdf-0001.jpg'),
         ])
 
-    @patch('arkindex.dataimport.models.s3')
+    @patch('arkindex.project.aws.s3')
     def test_extract_pdf_images_with_poppler(self, s3_mock):
         """
         Test extract_pdf_images runs ImageMagick and returns proper info
diff --git a/arkindex/dataimport/tests/test_tasks.py b/arkindex/dataimport/tests/test_tasks.py
index 4bf38b1fce..9fead8f1d3 100644
--- a/arkindex/dataimport/tests/test_tasks.py
+++ b/arkindex/dataimport/tests/test_tasks.py
@@ -29,7 +29,7 @@ class TestTasks(FixtureTestCase):
     @classmethod
     def setUpClass(cls):
         super().setUpClass()
-        cls.s3obj_patch = patch('arkindex.dataimport.models.s3.Object')
+        cls.s3obj_patch = patch('arkindex.project.aws.s3.Object')
         cls.s3obj_mock = cls.s3obj_patch.start()
         cls.access_patch = patch('arkindex.dataimport.tasks.base.os.access')
         cls.access_mock = cls.access_patch.start()
diff --git a/arkindex/documents/management/commands/telegraf.py b/arkindex/documents/management/commands/telegraf.py
index 5c91c41558..721602a79b 100644
--- a/arkindex/documents/management/commands/telegraf.py
+++ b/arkindex/documents/management/commands/telegraf.py
@@ -4,7 +4,8 @@ from django.conf import settings
 from django.db.models import Count
 from django.utils.text import slugify
 from arkindex.documents.models import Element, ElementType, Transcription, Corpus
-from arkindex.images.models import ImageServer, ImageStatus, Image
+from arkindex.images.models import ImageServer, Image
+from arkindex.project.aws import S3FileStatus
 from urllib.parse import urljoin
 import time
 import requests
@@ -67,13 +68,13 @@ class Command(BaseCommand):
         # Image server statistics, in three SQL queries
         checked_counts = dict(
             Image.objects
-                 .filter(status=ImageStatus.Checked)
+                 .filter(status=S3FileStatus.Checked)
                  .values_list('server')
                  .annotate(Count('server'))
         )
         error_counts = dict(
             Image.objects
-                 .filter(status=ImageStatus.Error)
+                 .filter(status=S3FileStatus.Error)
                  .values_list('server')
                  .annotate(Count('server'))
         )
diff --git a/arkindex/documents/serializers/elements.py b/arkindex/documents/serializers/elements.py
index 3d7b4fe92b..044fceac82 100644
--- a/arkindex/documents/serializers/elements.py
+++ b/arkindex/documents/serializers/elements.py
@@ -5,13 +5,14 @@ from arkindex.documents.models import \
     Element, ElementType, Page, PageType, PageDirection, Act, Corpus, \
     MetaData, InterpretedDate, DateType
 from arkindex.images.serializers import ZoneSerializer, ImageSerializer
-from arkindex.images.models import Image, ImageStatus
+from arkindex.images.models import Image
 from arkindex.documents.serializers.light import CorpusLightSerializer, ElementLightSerializer
 from arkindex.documents.serializers.ml import ClassificationSerializer, TranscriptionSerializer
 from arkindex.dataimport.serializers import RevisionSerializer
 from arkindex.dataimport.models import EventType
 from arkindex.project.serializer_fields import EnumField
 from arkindex.project.polygon import Polygon
+from arkindex.project.aws import S3FileStatus
 from collections import defaultdict
 
 
@@ -86,7 +87,7 @@ class ElementSlimSerializer(serializers.ModelSerializer):
     """
     type = EnumField(ElementType, read_only=True)
     corpus = CorpusLightSerializer(read_only=True)
-    thumbnail_url = serializers.URLField(source='thumbnail.url', read_only=True)
+    thumbnail_url = serializers.URLField(source='thumbnail.s3_url', read_only=True)
 
     class Meta:
         model = Element
@@ -261,7 +262,7 @@ class ElementCreateSerializer(ElementLightSerializer):
                 'Parent and child must be in the same corpus'
             )
         image = data.get('image')
-        if image and image.status != ImageStatus.Checked:
+        if image and image.status != S3FileStatus.Checked:
             errors['image'].append(
                 'Image is not checked. Try to upload a valid IIIF image.'
             )
diff --git a/arkindex/documents/tests/test_elements_api.py b/arkindex/documents/tests/test_elements_api.py
index 8aac24ff70..68ca0ddab2 100644
--- a/arkindex/documents/tests/test_elements_api.py
+++ b/arkindex/documents/tests/test_elements_api.py
@@ -2,8 +2,9 @@ from django.urls import reverse
 from rest_framework import status
 from arkindex.documents.models import Element, ElementType, DataSource, \
     TranscriptionType, Page, Act, Corpus
-from arkindex.images.models import ImageServer, ImageStatus
+from arkindex.images.models import ImageServer
 from arkindex.project.tests import FixtureAPITestCase
+from arkindex.project.aws import S3FileStatus
 
 
 class TestElementsAPI(FixtureAPITestCase):
@@ -16,7 +17,7 @@ class TestElementsAPI(FixtureAPITestCase):
         cls.src = DataSource.objects.get(slug='test')
         cls.image = ImageServer.objects.local.images.create(
             path="kingdom/far/away",
-            status=ImageStatus.Checked
+            status=S3FileStatus.Checked
         )
 
     def test_get_element(self):
@@ -170,7 +171,7 @@ class TestElementsAPI(FixtureAPITestCase):
             image=str(self.image.id),
             metadata={'folio': 'new'},
         )
-        self.image.status = ImageStatus.Error
+        self.image.status = S3FileStatus.Error
         self.image.save()
         response = self.client.post(**request)
         self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
diff --git a/arkindex/images/management/commands/check_images.py b/arkindex/images/management/commands/check_images.py
index 975093b52c..1d1e8923d2 100644
--- a/arkindex/images/management/commands/check_images.py
+++ b/arkindex/images/management/commands/check_images.py
@@ -2,7 +2,8 @@ from django.core.management.base import CommandError
 from django.conf import settings
 from ponos.management.base import PonosCommand
 from arkindex.project.argparse import CorpusArgument, ElementArgument
-from arkindex.images.models import ImageServer, Image, ImageStatus
+from arkindex.project.aws import S3FileStatus
+from arkindex.images.models import ImageServer, Image
 import logging
 
 logging.basicConfig(
@@ -59,7 +60,7 @@ class Command(PonosCommand):
             images = Image.objects.all()
 
         if not force:
-            images = images.exclude(status=ImageStatus.Checked)
+            images = images.exclude(status=S3FileStatus.Checked)
 
         return {'images': images, 'sample': sample}
 
@@ -68,7 +69,7 @@ class Command(PonosCommand):
             # Re-check a few images from each server
             for server in ImageServer.objects.all():
                 server_sample = server.images \
-                                      .filter(status=ImageStatus.Checked) \
+                                      .filter(status=S3FileStatus.Checked) \
                                       .order_by('?')[:sample]
                 logger.info('Re-checking {} images in server {}'.format(len(server_sample), server.display_name))
                 self.check(server_sample)
@@ -80,9 +81,9 @@ class Command(PonosCommand):
         for image in images:
             logger.info('Checking image {} at {}'.format(str(image.id), image.url))
             image.perform_check(save=True)
-            if image.status == ImageStatus.Checked:
+            if image.status == S3FileStatus.Checked:
                 successful += 1
-            elif image.status == ImageStatus.Error:
+            elif image.status == S3FileStatus.Error:
                 failed += 1
 
         return successful, failed
diff --git a/arkindex/images/migrations/0001_initial.py b/arkindex/images/migrations/0001_initial.py
index f653595a95..76fbfd2cc6 100644
--- a/arkindex/images/migrations/0001_initial.py
+++ b/arkindex/images/migrations/0001_initial.py
@@ -25,7 +25,7 @@ class Migration(migrations.Migration):
                 ('path', models.TextField()),
                 ('width', models.PositiveIntegerField(default=0)),
                 ('height', models.PositiveIntegerField(default=0)),
-                ('status', enumfields.fields.EnumField(default='unchecked', enum=arkindex.images.models.ImageStatus, max_length=50)),
+                ('status', enumfields.fields.EnumField(default='unchecked', enum=arkindex.project.aws.S3FileStatus, max_length=50)),
             ],
         ),
         migrations.CreateModel(
diff --git a/arkindex/images/models.py b/arkindex/images/models.py
index cb133dcd62..06952fb73a 100644
--- a/arkindex/images/models.py
+++ b/arkindex/images/models.py
@@ -9,9 +9,8 @@ from arkindex.images.managers import ImageServerManager
 from arkindex.project.models import IndexableModel
 from arkindex.project.fields import StripSlashURLField, LStripTextField
 from arkindex.project.polygon import PolygonField
-from arkindex.project.aws import s3
+from arkindex.project.aws import S3FileMixin, S3FileModelMixin, S3FileStatus
 from botocore.client import ClientError
-from enumfields import EnumField, Enum
 from io import BytesIO
 from PIL import Image as PillowImage
 import logging
@@ -179,16 +178,7 @@ class ImageServer(models.Model):
         return '{}_{}'.format(self.display_name, self.id)
 
 
-class ImageStatus(Enum):
-    """
-    Image validation status
-    """
-    Checked = "checked"
-    Unchecked = "unchecked"
-    Error = "error"
-
-
-class Image(IndexableModel):
+class Image(S3FileModelMixin, IndexableModel):
     """
     A document image
     """
@@ -196,9 +186,15 @@ class Image(IndexableModel):
     path = LStripTextField(chars='/')
     width = models.PositiveIntegerField(default=0)
     height = models.PositiveIntegerField(default=0)
-    status = EnumField(ImageStatus, default=ImageStatus.Unchecked, max_length=50)
-    hash = models.CharField(max_length=32, blank=True, null=True)
     datafile = models.ForeignKey(DataFile, related_name='images', null=True, on_delete=models.SET_NULL)
+    # Override the S3 hash field to allow null values for external servers
+    hash = models.CharField(max_length=32, blank=True, null=True)
+
+    s3_bucket = settings.AWS_IIIF_BUCKET
+
+    @cached_property
+    def s3_key(self):
+        return urllib.parse.unquote(self.path)
 
     class Meta:
         unique_together = (
@@ -212,34 +208,7 @@ class Image(IndexableModel):
     @cached_property
     def s3_object(self):
         assert self.server.is_local, 'Cannot load images on remote image servers via S3'
-        return s3.Object(settings.AWS_IIIF_BUCKET, urllib.parse.unquote(self.path))
-
-    @property
-    def s3_url(self):
-        return s3.meta.client.generate_presigned_url('get_object', Params={
-            'Bucket': self.s3_object.bucket_name,
-            'Key': self.s3_object.key,
-        })
-
-    @property
-    def s3_put_url(self):
-        return s3.meta.client.generate_presigned_url('put_object', Params={
-            'Bucket': self.s3_object.bucket_name,
-            'Key': self.s3_object.key,
-        })
-
-    def exists(self):
-        """
-        Returns whether the Image exists on the IIIF S3 bucket by performing a HEAD request to S3.
-        See https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Object.load
-        """
-        try:
-            self.s3_object.load()
-            return True
-        except ClientError as e:
-            if e.response['Error']['Code'] != '404':
-                raise
-            return False
+        return super(S3FileModelMixin, self).get_s3_object()
 
     def get_thumbnail_url(self, max_width=200, max_height=None):
         if max_width is None and max_height is None:
@@ -272,7 +241,7 @@ class Image(IndexableModel):
                 'Image id does not start with server url ({} vs. {})'.format(image_id, self.server.url)
         except Exception as e:
             logger.warn('Image check failed: {}'.format(str(e)))
-            self.status = ImageStatus.Error
+            self.status = S3FileStatus.Error
             if save:
                 self.save()
             if raise_exc:
@@ -283,25 +252,13 @@ class Image(IndexableModel):
         self.path = image_id[len(self.server.url) + 1:]
         self.width, self.height = int(data['width']), int(data['height'])
 
-        self.status = ImageStatus.Checked
+        self.status = S3FileStatus.Checked
         if save:
             self.save()
 
-    def check_hash(self, save=True, raise_exc=False):
-        """
-        Checks the MD5 hash against the hash from Amazon S3
-        """
+    def check_hash(self, *args, **kwargs):
         assert self.server.is_local, 'Image hash checks are not supported outside the local server'
-        assert self.hash, 'Image has no hash'
-        assert self.exists(), 'Image does not exist'
-        # The hash given by Boto seems to be surrounded by double quotes
-        if self.s3_object.e_tag.strip('"') == self.hash:
-            return
-        self.status = ImageStatus.Error
-        if save:
-            self.save()
-        if raise_exc:
-            raise ValueError('MD5 hashes do not match')
+        S3FileMixin.check_hash(self, *args, **kwargs)
 
     def pillow_open(self, max_width=500):
         if self.server.is_local:
@@ -333,10 +290,11 @@ class Image(IndexableModel):
         return '{} - {}'.format(self.id, self.url)
 
 
-class Thumbnail(object):
+class Thumbnail(S3FileMixin):
     """
     Describes an element thumbnail
     """
+    s3_bucket = settings.AWS_THUMBNAIL_BUCKET
 
     def __init__(self, element):
         self.element = element
@@ -346,15 +304,8 @@ class Thumbnail(object):
         return '{}.jpg'.format(str(self.element.id.hex))
 
     @cached_property
-    def s3_object(self):
-        return s3.Object(settings.AWS_THUMBNAIL_BUCKET, self.name)
-
-    @property
-    def url(self):
-        return s3.meta.client.generate_presigned_url('get_object', Params={
-            'Bucket': self.s3_object.bucket_name,
-            'Key': self.s3_object.key,
-        })
+    def s3_key(self):
+        return self.name
 
     def create(self, width=900, height=400, max_element_count=3, force=False):
         """
diff --git a/arkindex/images/serializers.py b/arkindex/images/serializers.py
index bf6fa6ce12..09db7992b9 100644
--- a/arkindex/images/serializers.py
+++ b/arkindex/images/serializers.py
@@ -1,8 +1,9 @@
 from rest_framework import serializers
 from arkindex.project.serializer_fields import PolygonField, EnumField
 from arkindex.documents.models import Corpus
-from arkindex.images.models import Image, Zone, ImageServer, ImageStatus
+from arkindex.images.models import Image, Zone, ImageServer
 from arkindex.dataimport.models import DataFile
+from arkindex.project.aws import S3FileStatus
 import re
 import uuid
 
@@ -25,7 +26,7 @@ class ImageSerializer(serializers.ModelSerializer):
     """
     Serialises an image
     """
-    status = EnumField(ImageStatus)
+    status = EnumField(S3FileStatus)
 
     class Meta:
         model = Image
@@ -42,8 +43,8 @@ class ImageSerializer(serializers.ModelSerializer):
     def validate_status(self, value):
         if not self.instance:
             # Force the Unchecked status when creating a new image
-            return ImageStatus.Unchecked
-        elif value == ImageStatus.Checked:
+            return S3FileStatus.Unchecked
+        elif value == S3FileStatus.Checked:
             # Perform image validation if we are updating an existing image to Checked
             try:
                 self.instance.check_hash(raise_exc=True)
@@ -57,7 +58,7 @@ class ImageUploadSerializer(ImageSerializer):
     """
     Serialize an image to upload in s3
     """
-    status = EnumField(ImageStatus, read_only=True)
+    status = EnumField(S3FileStatus, read_only=True)
     hash = serializers.RegexField(re.compile(r'[0-9A-Fa-f]{32}'), min_length=32, max_length=32)
     datafile = serializers.PrimaryKeyRelatedField(
         queryset=DataFile.objects.none(),
@@ -80,7 +81,7 @@ class ImageUploadSerializer(ImageSerializer):
         )
 
     def get_s3_put_url(self, obj):
-        if obj.status == ImageStatus.Checked or not obj.server.is_local:
+        if obj.status == S3FileStatus.Checked or not obj.server.is_local:
             # No PUT for existing images or external servers
             return None
         return obj.s3_put_url
diff --git a/arkindex/images/tests/test_check_images.py b/arkindex/images/tests/test_check_images.py
index 914437d8ef..e1de93ae05 100644
--- a/arkindex/images/tests/test_check_images.py
+++ b/arkindex/images/tests/test_check_images.py
@@ -1,5 +1,5 @@
 from arkindex.project.tests import FixtureTestCase
-from arkindex.images.models import ImageStatus
+from arkindex.project.aws import S3FileStatus
 from django.core.management import call_command
 from django.core.management.base import CommandError
 from unittest.mock import patch
@@ -19,8 +19,8 @@ class TestCheckImages(FixtureTestCase):
         cls.p2 = cls.corpus.elements.get(name='Volume 1, page 1v')
 
         # Set two images to Checked
-        cls.imgsrv.images.filter(path='img5').update(status=ImageStatus.Checked)
-        cls.imgsrv.images.filter(path='img6').update(status=ImageStatus.Checked)
+        cls.imgsrv.images.filter(path='img5').update(status=S3FileStatus.Checked)
+        cls.imgsrv.images.filter(path='img6').update(status=S3FileStatus.Checked)
 
         # Create an image linked to zero elements
         cls.imgsrv.images.create(path='am-outside')
diff --git a/arkindex/images/tests/test_image.py b/arkindex/images/tests/test_image.py
index d53d8df340..d5842e421a 100644
--- a/arkindex/images/tests/test_image.py
+++ b/arkindex/images/tests/test_image.py
@@ -3,7 +3,8 @@ from unittest.mock import patch, call
 from django.test import override_settings
 from botocore.exceptions import ClientError
 from arkindex.project.tests import FixtureTestCase
-from arkindex.images.models import ImageServer, ImageStatus
+from arkindex.project.aws import S3FileStatus
+from arkindex.images.models import ImageServer
 
 
 @override_settings(LOCAL_IMAGESERVER_ID=1, AWS_IIIF_BUCKET='iiif')
@@ -43,7 +44,7 @@ class TestImage(FixtureTestCase):
         # Should handle paths that look like an absolute URL but aren't
         self.assertEqual(self.imgsrv.build_url('ark:/some/path'), 'http://server/ark:/some/path')
 
-    @patch('arkindex.images.models.s3.meta.client.generate_presigned_url')
+    @patch('arkindex.project.aws.s3.meta.client.generate_presigned_url')
     def test_s3_url(self, presigned_url_mock):
         presigned_url_mock.return_value = 'http://somewhere'
         img = ImageServer.objects.local.images.create(path='abcd')
@@ -54,7 +55,7 @@ class TestImage(FixtureTestCase):
             Params={'Bucket': 'iiif', 'Key': 'abcd'},
         ))
 
-    @patch('arkindex.images.models.s3.meta.client.generate_presigned_url')
+    @patch('arkindex.project.aws.s3.meta.client.generate_presigned_url')
     def test_s3_put_url(self, presigned_url_mock):
         presigned_url_mock.return_value = 'http://somewhere'
         img = ImageServer.objects.local.images.create(path='abcd')
@@ -65,53 +66,53 @@ class TestImage(FixtureTestCase):
             Params={'Bucket': 'iiif', 'Key': 'abcd'},
         ))
 
-    @patch('arkindex.images.models.s3')
+    @patch('arkindex.project.aws.s3')
     def test_check_hash(self, s3_mock):
         s3_mock.Object().e_tag = '"beef"'
         img = ImageServer.objects.local.images.create(path='abcd', hash='beef')
-        self.assertEqual(img.status, ImageStatus.Unchecked)
+        self.assertEqual(img.status, S3FileStatus.Unchecked)
         img.check_hash()
-        self.assertEqual(img.status, ImageStatus.Unchecked)
+        self.assertEqual(img.status, S3FileStatus.Unchecked)
 
     @override_settings(LOCAL_IMAGESERVER_ID=-1)
-    @patch('arkindex.images.models.s3')
+    @patch('arkindex.project.aws.s3')
     def test_check_hash_external(self, s3_mock):
         self.assertFalse(self.imgsrv.is_local)
         img = self.imgsrv.images.create(path='abcd', hash='beef')
-        self.assertEqual(img.status, ImageStatus.Unchecked)
+        self.assertEqual(img.status, S3FileStatus.Unchecked)
         with self.assertRaisesRegex(AssertionError, 'not supported'):
             img.check_hash()
 
-    @patch('arkindex.images.models.s3')
+    @patch('arkindex.project.aws.s3')
     def test_check_hash_missing(self, s3_mock):
         img = ImageServer.objects.local.images.create(path='abcd')
-        self.assertEqual(img.status, ImageStatus.Unchecked)
+        self.assertEqual(img.status, S3FileStatus.Unchecked)
         with self.assertRaisesRegex(AssertionError, 'no hash'):
             img.check_hash()
 
-    @patch('arkindex.images.models.s3')
+    @patch('arkindex.project.aws.s3')
     def test_check_hash_not_found(self, s3_mock):
         s3_mock.Object().load.side_effect = ClientError({'Error': {'Code': '404'}}, 'head_object')
         img = ImageServer.objects.local.images.create(path='abcd', hash='beef')
-        self.assertEqual(img.status, ImageStatus.Unchecked)
-        with self.assertRaisesRegex(AssertionError, 'does not exist'):
+        self.assertEqual(img.status, S3FileStatus.Unchecked)
+        with self.assertRaisesRegex(AssertionError, 'No file content; check that the file has been uploaded correctly'):
             img.check_hash()
 
-    @patch('arkindex.images.models.s3')
+    @patch('arkindex.project.aws.s3')
     def test_check_hash_error(self, s3_mock):
         s3_mock.Object().e_tag = '"wrong"'
         img = ImageServer.objects.local.images.create(path='abcd', hash='beef')
-        self.assertEqual(img.status, ImageStatus.Unchecked)
+        self.assertEqual(img.status, S3FileStatus.Unchecked)
         img.check_hash()
-        self.assertEqual(img.status, ImageStatus.Error)
+        self.assertEqual(img.status, S3FileStatus.Error)
 
-    @patch('arkindex.images.models.s3')
+    @patch('arkindex.project.aws.s3')
     def test_check_hash_exc(self, s3_mock):
         s3_mock.Object().e_tag = '"wrong"'
         img = ImageServer.objects.local.images.create(path='abcd', hash='beef')
-        self.assertEqual(img.status, ImageStatus.Unchecked)
+        self.assertEqual(img.status, S3FileStatus.Unchecked)
         with self.assertRaisesRegex(ValueError, 'hashes do not match'):
             img.check_hash(raise_exc=True, save=False)
-        self.assertEqual(img.status, ImageStatus.Error)
+        self.assertEqual(img.status, S3FileStatus.Error)
         img.refresh_from_db()
-        self.assertEqual(img.status, ImageStatus.Unchecked)
+        self.assertEqual(img.status, S3FileStatus.Unchecked)
diff --git a/arkindex/images/tests/test_image_api.py b/arkindex/images/tests/test_image_api.py
index c9d80e2293..84b8967d19 100644
--- a/arkindex/images/tests/test_image_api.py
+++ b/arkindex/images/tests/test_image_api.py
@@ -1,5 +1,6 @@
 from arkindex.project.tests import FixtureAPITestCase
-from arkindex.images.models import Image, ImageServer, ImageStatus
+from arkindex.images.models import Image, ImageServer
+from arkindex.project.aws import S3FileStatus
 from rest_framework import status
 from django.test import override_settings
 from django.urls import reverse
@@ -20,7 +21,7 @@ class TestImageApi(FixtureAPITestCase):
         response = self.client.post(reverse('api:image-create'), {'hash': self.image_hash})
         self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
 
-    @patch('arkindex.images.models.s3.meta.client.generate_presigned_url')
+    @patch('arkindex.project.aws.s3.meta.client.generate_presigned_url')
     def test_create_image(self, s3_presigned_url_mock):
         """
         Use serializer to create a new Image
@@ -86,7 +87,7 @@ class TestImageApi(FixtureAPITestCase):
     def test_update_image_requires_login(self):
         self.client.logout()
         img = ImageServer.objects.local.images.create(
-            status=ImageStatus.Unchecked,
+            status=S3FileStatus.Unchecked,
             path='test_update',
         )
         response = self.client.put(
@@ -102,7 +103,7 @@ class TestImageApi(FixtureAPITestCase):
         Test setting an image's status to Checked runs the image checks
         """
         img = ImageServer.objects.local.images.create(
-            status=ImageStatus.Unchecked,
+            status=S3FileStatus.Unchecked,
             path='test_update',
         )
         response = self.client.put(
@@ -111,7 +112,7 @@ class TestImageApi(FixtureAPITestCase):
         )
         self.assertEqual(response.status_code, status.HTTP_200_OK)
         img.refresh_from_db()
-        self.assertEqual(img.status, ImageStatus.Checked)
+        self.assertEqual(img.status, S3FileStatus.Checked)
         self.assertEqual(check_hash_mock.call_count, 1)
         self.assertEqual(perform_check_mock.call_count, 1)
         self.assertEqual(check_hash_mock.call_args, call(raise_exc=True))
diff --git a/arkindex/images/tests/test_thumbnail.py b/arkindex/images/tests/test_thumbnail.py
index cbb4389db0..83ba23bf9a 100644
--- a/arkindex/images/tests/test_thumbnail.py
+++ b/arkindex/images/tests/test_thumbnail.py
@@ -11,7 +11,7 @@ class TestThumbnail(FixtureTestCase):
     @classmethod
     def setUpClass(cls):
         super().setUpClass()
-        cls.s3_mock = patch('arkindex.images.models.s3').start()
+        cls.s3_mock = patch('arkindex.project.aws.s3').start()
 
     @classmethod
     def setUpTestData(cls):
@@ -31,10 +31,11 @@ class TestThumbnail(FixtureTestCase):
         self.assertEqual(self.vol1.thumbnail.name, str(self.vol1.id.hex) + '.jpg')
 
     def test_s3_object(self):
-        with self.settings(AWS_THUMBNAIL_BUCKET="derp"):
-            # Test twice to ensure s3_object is only created once
-            self.assertEqual(self.vol1.thumbnail.s3_object, self.s3_mock.Object.return_value)
-            self.assertEqual(self.vol1.thumbnail.s3_object, self.s3_mock.Object.return_value)
+        thumbnail = self.vol1.thumbnail
+        thumbnail.s3_bucket = 'derp'
+        # Test twice to ensure s3_object is only created once
+        self.assertEqual(thumbnail.s3_object, self.s3_mock.Object.return_value)
+        self.assertEqual(thumbnail.s3_object, self.s3_mock.Object.return_value)
         self.assertEqual(self.s3_mock.Object.call_count, 1)
         self.assertEqual(self.s3_mock.Object.call_args, call('derp', self.vol1.thumbnail.name))
 
@@ -42,8 +43,9 @@ class TestThumbnail(FixtureTestCase):
         self.s3_mock.meta.client.generate_presigned_url.return_value = 'http://nowhere'
         self.s3_mock.Object.return_value.bucket_name = 'derp'
         self.s3_mock.Object.return_value.key = 'meme.jpg'
-        with self.settings(AWS_THUMBNAIL_BUCKET="derp"):
-            self.assertEqual(self.vol1.thumbnail.url, 'http://nowhere')
+        thumbnail = self.vol1.thumbnail
+        thumbnail.s3_bucket = 'derp'
+        self.assertEqual(thumbnail.s3_url, 'http://nowhere')
         self.assertEqual(self.s3_mock.Object.call_count, 1)
         self.assertEqual(self.s3_mock.meta.client.generate_presigned_url.call_count, 1)
         self.assertEqual(
diff --git a/arkindex/project/api_v1.py b/arkindex/project/api_v1.py
index 6f34541cfa..37b0141b87 100644
--- a/arkindex/project/api_v1.py
+++ b/arkindex/project/api_v1.py
@@ -16,7 +16,7 @@ from arkindex.documents.api.iiif import (
 from arkindex.dataimport.api import (
     DataImportsList, DataImportDetails, DataImportRetry, DataImportFailures, DataImportDemo,
     DataFileList, DataFileRetrieve, DataFileUpload, DataImportFromFiles,
-    RepositoryList, RepositoryRetrieve, RepositoryStartImport,
+    RepositoryList, RepositoryRetrieve, RepositoryStartImport, DataFileCreate,
     GitRepositoryImportHook, AvailableRepositoriesList, ElementHistory, MLToolList,
 )
 from arkindex.images.api import ImageCreate, ImageRetrieve
@@ -105,6 +105,7 @@ api = [
     path('imports/<uuid:pk>/failures/', DataImportFailures.as_view(), name='import-failures'),
     path('imports/demo/<uuid:pk>/', DataImportDemo.as_view(), name='import-demo'),
     path('imports/files/<uuid:pk>/', DataFileList.as_view(), name='file-list'),
+    path('imports/files/create/', DataFileCreate.as_view(), name='file-create'),
     path('imports/file/<uuid:pk>/', DataFileRetrieve.as_view(), name='file-retrieve'),
     path('imports/upload/<uuid:pk>/', DataFileUpload.as_view(), name='file-upload'),
 
diff --git a/arkindex/project/aws.py b/arkindex/project/aws.py
index 041ed1e2c3..7cb31f2eeb 100644
--- a/arkindex/project/aws.py
+++ b/arkindex/project/aws.py
@@ -1,6 +1,15 @@
 from django.conf import settings
+from django.db import models
 from botocore.config import Config
+from botocore.exceptions import ClientError
+from django.utils.functional import cached_property
+from enumfields import EnumField, Enum
+from io import BytesIO
 import boto3.session
+import logging
+
+logger = logging.getLogger(__name__)
+
 
 session = boto3.session.Session(
     aws_access_key_id=settings.AWS_ACCESS_KEY,
@@ -17,3 +26,80 @@ s3 = session.resource(
     endpoint_url=settings.AWS_ENDPOINT,
     config=config,
 )
+
+
+class S3FileStatus(Enum):
+    """
+    S3 file validation status
+    """
+    Checked = "checked"
+    Unchecked = "unchecked"
+    Error = "error"
+
+
+class S3FileMixin(object):
+
+    def get_s3_object(self):
+        return s3.Object(self.s3_bucket, self.s3_key)
+
+    s3_object = cached_property(get_s3_object, name='s3_object')
+
+    @property
+    def s3_put_url(self):
+        return s3.meta.client.generate_presigned_url('put_object', Params={
+            'Bucket': self.s3_object.bucket_name,
+            'Key': self.s3_object.key,
+        })
+
+    @property
+    def s3_url(self):
+        return s3.meta.client.generate_presigned_url('get_object', Params={
+            'Bucket': self.s3_object.bucket_name,
+            'Key': self.s3_object.key,
+        })
+
+    def exists(self):
+        """
+        Returns whether the file exists in the S3 bucket by performing a HEAD request.
+        See https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Object.load
+        """
+        try:
+            self.s3_object.load()
+            return True
+        except ClientError as e:
+            if e.response['Error']['Code'] != '404':
+                raise
+            return False
+
+    def download(self):
+        b = BytesIO()
+        logger.debug('Downloading file {} from S3'.format(self.s3_key))
+        self.s3_object.download_fileobj(b)
+        return b
+
+    def download_to(self, path):
+        logger.debug('Downloading file {} from S3'.format(self.s3_key))
+        self.s3_object.download_file(path)
+
+    def check_hash(self, save=True, raise_exc=False):
+        """
+        Checks the MD5 hash against the hash from Amazon S3
+        """
+        assert self.hash, 'File has no hash'
+        assert self.exists(), 'No file content; check that the file has been uploaded correctly'
+        # The hash given by Boto seems to be surrounded by double quotes
+        if self.s3_object.e_tag.strip('"') == self.hash:
+            return
+        self.status = S3FileStatus.Error
+        if save:
+            self.save()
+        if raise_exc:
+            raise ValueError('MD5 hashes do not match')
+
+
+class S3FileModelMixin(S3FileMixin, models.Model):
+    hash = models.CharField(max_length=32)
+    status = EnumField(S3FileStatus, default=S3FileStatus.Unchecked, max_length=50)
+
+    class Meta:
+        abstract = True
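
Any S3-backed object can now reuse this plumbing: models subclass S3FileModelMixin (as DataFile and Image do above), while plain Python classes like Thumbnail use S3FileMixin directly. A minimal sketch of a hypothetical model built on the mixin, assuming the staging bucket:

    import uuid
    from django.conf import settings
    from django.db import models
    from django.utils.functional import cached_property
    from arkindex.project.aws import S3FileModelMixin

    class ExportArchive(S3FileModelMixin):  # hypothetical example model
        id = models.UUIDField(primary_key=True, default=uuid.uuid4)

        # Each subclass provides the bucket and key used by get_s3_object()
        s3_bucket = settings.AWS_STAGING_BUCKET  # assumption: reusing the staging bucket

        @cached_property
        def s3_key(self):
            return str(self.id)

    # archive.s3_put_url   -> presigned PUT URL for direct client uploads
    # archive.exists()     -> HEAD request on the S3 object
    # archive.check_hash() -> compares archive.hash with the S3 ETag
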
diff --git a/openapi/patch.yml b/openapi/patch.yml
index 8538e9c5da..33c1cb6e02 100644
--- a/openapi/patch.yml
+++ b/openapi/patch.yml
@@ -229,11 +229,11 @@ paths:
       tags:
         - files
     patch:
-      description: Rename an uploaded file
+      description: Update a datafile's status
       tags:
         - files
     put:
-      description: Rename an uploaded file
+      description: Update a datafile's status
       tags:
         - files
     delete:
@@ -337,6 +337,64 @@ paths:
                     file:
                       - File already exists
                     id: 3cc2e9e0-4172-44b1-8d65-bc3fffd076dc
+  /api/v1/imports/files/create/:
+    post:
+      operationId: CreateDataFile
+      description: Create a DataFile. On success, a signed URL is returned to upload the file content directly to the remote server.
+      tags:
+        - files
+      responses:
+        '400':
+          description: An error occurred while creating the data file.
+          content:
+            application/json:
+              schema:
+                properties:
+                  detail:
+                    type: string
+                    description: A generic error message when an error occurs outside of a specific field.
+                    readOnly: true
+                  hash:
+                    type: array
+                    description: Errors that occurred during hash field validation.
+                    readOnly: true
+                  corpus:
+                    type: array
+                    description: Errors that occurred during corpus ID field validation.
+                    readOnly: true
+                  name:
+                    type: array
+                    description: Errors that occurred during name field validation.
+                    readOnly: true
+                  size:
+                    type: array
+                    description: Errors that occurred during size field validation.
+                    readOnly: true
+                  id:
+                    type: string
+                    description: UUID of the existing DataFile, if the error comes from a duplicate creation.
+                    readOnly: true
+                  status:
+                    type: string
+                    description: Status of the existing DataFile, if the error comes from a duplicate creation.
+                    readOnly: true
+                  s3_put_url:
+                    type: string
+                    description: Signed URL used to upload the file content to the remote server, if the error comes from a duplicate creation and the file status is not checked.
+                    readOnly: true
+                  s3_url:
+                    type: string
+                    description: Remote file URL, if the error comes from a duplicate creation and the file status is checked.
+                    readOnly: true
+              examples:
+                file-exists:
+                  summary: Data file already exists. The response includes the existing file's UUID, status and a remote server PUT URL to upload the file content.
+                  value:
+                    hash:
+                      - DataFile with this hash already exists
+                    id: 55cd009d-cd4b-4ec2-a475-b060f98f9138
+                    status: unchecked
+                    s3_put_url: https://remote-server.net/staging/55cd009d-cd4b-4ec2-a475-b060f98f9138?Credential=mycredential&Signature=mysignature
   /api/v1/imports/{id}/:
     get:
       description: Retrieve a data import
-- 
GitLab