Commit 784e9a65 authored by Bastien Abadie

Merge branch 'remove-datafile-hash' into 'master'

Remove DataFile hash

See merge request !777
parents cebcff64 99c10ad6
Showing 87 additions and 224 deletions
@@ -31,8 +31,8 @@ class DataImportAdmin(admin.ModelAdmin):
class DataFileAdmin(admin.ModelAdmin):
list_display = ('id', 'name', 'corpus')
list_filter = ('corpus', )
fields = ('id', 'name', 'size', 'hash', 'content_type', 'corpus')
readonly_fields = ('id', 'size', 'hash', )
fields = ('id', 'name', 'size', 'content_type', 'corpus')
readonly_fields = ('id', 'size', )
inlines = [DataFileInline, ImageInline]
@@ -33,7 +33,6 @@ from arkindex_common.enums import DataImportMode
from ponos.models import State
from datetime import datetime
from uuid import UUID
import hashlib
import ijson
import magic
import os.path
@@ -369,20 +368,6 @@ class DataFileUpload(CorpusACLMixin, APIView):
'file': [f'File name cannot exceed {MAX_FILENAME_LENGTH} characters'],
})
md5hash = hashlib.md5()
for chunk in file_obj.chunks():
md5hash.update(chunk)
file_hash = md5hash.hexdigest()
existing_file = corpus.files.filter(hash=file_hash).first()
if existing_file:
raise ValidationError({
'file': ['File already exists'],
'id': str(existing_file.id),
})
# Reopen file to reread from beginning
file_obj.open()
file_type = magic.from_buffer(file_obj.read(1024), mime=True)
# libmagic 5.35 recognizes JSON, but older versions detect it as text/plain.
@@ -390,6 +375,7 @@ class DataFileUpload(CorpusACLMixin, APIView):
# JSON-LD files with an expected IIIF context will use application/ld+json.
# JSON and JSON-LD files without the IIIF context will use application/json.
if file_type in ('text/plain', 'application/json') and file_obj.size < 5e6:
# Reopen file to reread from beginning
file_obj.open()
try:
jsonld_context = next(ijson.items(file_obj, '@context'))
@@ -407,7 +393,6 @@ class DataFileUpload(CorpusACLMixin, APIView):
file_type = 'application/json'
df = DataFile(
hash=file_hash,
corpus=corpus,
name=file_obj.name,
size=file_obj.size,
@@ -420,10 +405,10 @@
ExtraArgs={'ContentType': df.content_type}
)
try:
# If hash check is passed datafile instance will be saved
df.check_hash(save=True, raise_exc=True)
except ValueError:
raise ValidationError({'file': ['Remote server hash do not match given file hash']})
# If the S3 check passes, the DataFile instance will be saved
df.perform_check(save=True, raise_exc=True)
except (AssertionError, ValueError) as e:
raise ValidationError({'status': [str(e)]})
return Response(
data=DataFileSerializer(df).data,
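For reference, the content-type detection that remains in `DataFileUpload` can be sketched as follows. This is a simplified sketch, not the exact view code: `IIIF_CONTEXTS` is a hypothetical stand-in, since the accepted context list is elided from this diff (the tests below accept the presentation v2 context).

```python
import ijson
import magic

# Hypothetical constant; the real accepted-context check is elided here.
IIIF_CONTEXTS = {'http://iiif.io/api/presentation/2/context.json'}

def sniff_content_type(file_obj):
    """Guess a Django UploadedFile's MIME type, refining JSON/JSON-LD."""
    file_type = magic.from_buffer(file_obj.read(1024), mime=True)
    # Older libmagic reports JSON as text/plain, so probe small files (< 5 MB)
    # for a JSON-LD @context.
    if file_type in ('text/plain', 'application/json') and file_obj.size < 5e6:
        file_obj.open()  # reopen to reread from the beginning
        try:
            jsonld_context = next(ijson.items(file_obj, '@context'))
        except ijson.JSONError:
            return file_type  # not JSON after all
        except StopIteration:
            return 'application/json'  # valid JSON without a @context
        if jsonld_context in IIIF_CONTEXTS:
            return 'application/ld+json'
        return 'application/json'
    return file_type
```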
# Generated by Django 2.2.13 on 2020-06-08 10:25
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('dataimport', '0008_add_gitref'),
]
operations = [
migrations.AlterUniqueTogether(
name='datafile',
unique_together=set(),
),
migrations.RemoveField(
model_name='datafile',
name='hash',
),
]
@@ -6,7 +6,7 @@ from rest_framework.exceptions import ValidationError
from enumfields import EnumField, Enum
from arkindex_common.enums import DataImportMode
from arkindex_common.ml_tool import MLToolType
from arkindex.project.aws import S3FileModelMixin
from arkindex.project.aws import S3FileMixin, S3FileStatus
from arkindex.project.models import IndexableModel
from arkindex.dataimport.providers import git_providers, get_provider
from arkindex.dataimport.serializers.ml_tool import MLToolTaskSerializer
@@ -214,12 +214,13 @@ inside a Django model is unnecessarily complex.
"""
class DataFile(S3FileModelMixin):
class DataFile(S3FileMixin, models.Model):
id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
name = models.CharField(max_length=MAX_FILENAME_LENGTH)
size = models.PositiveIntegerField(help_text='file size in bytes')
content_type = models.CharField(max_length=50)
corpus = models.ForeignKey('documents.Corpus', on_delete=models.CASCADE, related_name='files')
status = EnumField(S3FileStatus, default=S3FileStatus.Unchecked, max_length=50)
s3_bucket = settings.AWS_STAGING_BUCKET
@@ -228,19 +229,26 @@ class DataFile(S3FileModelMixin):
return str(self.id)
class Meta:
unique_together = (('corpus', 'hash'), )
ordering = ['corpus', 'name']
def perform_check(self, save=True, raise_exc=False):
"""
Check Datafile's existence and update size, content_type from S3
Check Datafile's existence, content type and size from S3
"""
self.check_hash(save=save, raise_exc=raise_exc)
self.size = self.s3_object.content_length
self.content_type = self.s3_object.content_type
if save:
self.save()
try:
assert self.exists(), 'File has not been uploaded'
assert self.s3_object.content_length == self.size, \
f'Uploaded file size is {self.s3_object.content_length} bytes, expected {self.size} bytes'
assert self.s3_object.content_type == self.content_type, \
f'Uploaded file content type is {self.s3_object.content_type}, expected {self.content_type}'
self.status = S3FileStatus.Checked
except AssertionError:
self.status = S3FileStatus.Error
if raise_exc:
raise
finally:
if save:
self.save()
class Repository(models.Model):
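The call site in `DataFileUpload` above drives this new check roughly as follows. A hedged sketch condensed from the diff: `corpus`, `file_obj` and `file_type` come from the surrounding view code, and `handle_failure` is hypothetical.

```python
df = DataFile(
    corpus=corpus,
    name=file_obj.name,
    size=file_obj.size,
    content_type=file_type,
)
df.s3_object.upload_fileobj(
    file_obj,
    ExtraArgs={'ContentType': df.content_type},
)
try:
    # Compares the S3 object's content_length and content_type against the
    # locally stored values: on success the status becomes Checked, on
    # mismatch it becomes Error; save=True persists the instance either way.
    df.perform_check(save=True, raise_exc=True)
except (AssertionError, ValueError) as e:
    handle_failure(e)  # the view wraps this into a ValidationError
```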
@@ -20,14 +20,13 @@ class DataFileSerializer(serializers.ModelSerializer):
fields = (
'id',
'name',
'hash',
'content_type',
'size',
'images',
'status',
's3_url',
)
read_only_fields = ('id', 'hash', 'size', 'content_type', 'images', 's3_url', )
read_only_fields = ('id', 'size', 'content_type', 'images', 's3_url', )
def validate_status(self, value):
if value == S3FileStatus.Checked:
@@ -61,7 +60,7 @@ class DataFileCreateSerializer(serializers.ModelSerializer):
fields = (
'id',
'name',
'hash',
'content_type',
'size',
'corpus',
'status',
@@ -69,8 +68,6 @@ class DataFileCreateSerializer(serializers.ModelSerializer):
's3_put_url',
)
read_only_fields = ('id', 'status', 's3_url', 's3_put_url')
# Remove the default UniqueTogetherValidator
validators = []
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@@ -83,20 +80,3 @@ class DataFileCreateSerializer(serializers.ModelSerializer):
if obj.status == S3FileStatus.Checked:
return None
return obj.s3_put_url
def validate(self, data):
existing_datafile = DataFile.objects.filter(hash=data['hash'], corpus_id=data['corpus']).first()
if existing_datafile:
message = {
'hash': ['DataFile with this hash already exists in corpus'],
'id': str(existing_datafile.id),
'status': existing_datafile.status.value,
}
if existing_datafile.status != S3FileStatus.Checked:
message['s3_put_url'] = existing_datafile.s3_put_url
else:
message['s3_url'] = existing_datafile.s3_url
self._errors = message
raise serializers.ValidationError(message)
return super().validate(data)
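End to end, the client-side flow these serializers now support looks roughly like this. A hedged sketch: `API_ROOT`, `auth` and `corpus_id` are placeholders, and the retrieve/update URL is an assumption (the create path matches the OpenAPI section below).

```python
import requests

API_ROOT = 'https://arkindex.example/api/v1'  # placeholder host

# 1. Declare the file; a hash is no longer required (or accepted).
data = requests.post(f'{API_ROOT}/imports/files/create/', json={
    'name': 'some text',
    'size': 1,
    'content_type': 'text/plain',
    'corpus': corpus_id,
}, auth=auth).json()

# 2. PUT the content to S3 using the signed URL. get_s3_put_url returns
# null once a file reaches the Checked status, so this only works on
# Unchecked or Error files.
requests.put(data['s3_put_url'], data=b'x',
             headers={'Content-Type': 'text/plain'})

# 3. Ask the server to verify size and content type against S3.
requests.patch(f"{API_ROOT}/imports/files/{data['id']}/",
               json={'status': 'checked'}, auth=auth)
```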
@@ -18,16 +18,15 @@ class TestDataFileApi(FixtureAPITestCase):
self.df = DataFile.objects.create(
name='test.pdf',
size=42,
hash='11111111111111111111111111111112',
content_type='application/pdf',
corpus=self.corpus
)
def build_file_create_request(self, name=None, hash=None, size=None, corpus=None):
def build_file_create_request(self, name=None, size=None, corpus=None):
return {
'name': name or 'some text',
'hash': hash or '552e21cd4cd9918678e3c1a0df491bc3',
'size': size or 1,
'content_type': 'text/plain',
'corpus': corpus or str(self.corpus.id),
}
@@ -58,8 +57,8 @@ class TestDataFileApi(FixtureAPITestCase):
{
'id': str(df.id),
'name': df.name,
'hash': str(df.hash),
'size': df.size,
'content_type': df.content_type,
'status': df.status.value,
's3_url': df.s3_url,
's3_put_url': df.s3_put_url,
@@ -67,38 +66,14 @@ class TestDataFileApi(FixtureAPITestCase):
}
)
self.assertListEqual(
[df.name, df.hash, df.size, df.s3_put_url],
['some text', '552e21cd4cd9918678e3c1a0df491bc3', 1, 'http://s3/upload_put_url']
[df.name, df.size, df.s3_put_url],
['some text', 1, 'http://s3/upload_put_url']
)
def test_create_existing_hash(self):
self.client.force_login(self.user)
request = self.build_file_create_request(hash='11111111111111111111111111111112')
response = self.client.post(reverse('api:file-create'), request)
self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
informations = response.json()
self.assertIn('hash', informations)
self.assertIn('id', informations)
self.assertIn('s3_put_url', informations)
def test_create_checked_existing_hash(self):
self.client.force_login(self.user)
self.df.status = S3FileStatus.Checked
self.df.save()
request = self.build_file_create_request(hash='11111111111111111111111111111112')
response = self.client.post(reverse('api:file-create'), request)
self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
informations = response.json().keys()
self.assertIn('hash', informations)
self.assertIn('id', informations)
self.assertNotIn('s3_put_url', informations)
self.assertIn('s3_url', informations)
@patch('arkindex.project.aws.s3.Object')
def test_check_uploaded_datafile(self, s3_object):
s3_object().e_tag = '11111111111111111111111111111112'
s3_object().content_length = '99942'
s3_object().content_type = 'test/testfile'
s3_object().content_length = 42
s3_object().content_type = 'application/pdf'
self.client.force_login(self.user)
response = self.client.patch(
@@ -112,7 +87,6 @@ class TestDataFileApi(FixtureAPITestCase):
{
'id': str(self.df.id),
'name': self.df.name,
'hash': self.df.hash,
'content_type': self.df.content_type,
'size': self.df.size,
'status': self.df.status.value,
@@ -121,22 +95,13 @@ class TestDataFileApi(FixtureAPITestCase):
}
)
self.assertEqual(self.df.status, S3FileStatus.Checked)
self.assertEqual(self.df.content_type, 'test/testfile')
self.assertEqual(self.df.size, 99942)
@patch('arkindex.project.aws.s3.Object')
def test_check_wrong_md5(self, s3_object):
s3_object().e_tag = 'wrong md5'
self.client.force_login(self.user)
response = self.client.patch(
reverse('api:file-retrieve', kwargs={'pk': self.df.id}),
{'status': 'checked'},
)
self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
self.assertEqual(self.df.content_type, 'application/pdf')
self.assertEqual(self.df.size, 42)
@patch('arkindex.project.aws.s3.Object')
def test_set_error_status_on_failure(self, s3_object):
s3_object().e_tag = 'wrong md5'
s3_object().content_type = 'wrong/type'
s3_object().content_length = -1
self.client.force_login(self.user)
response = self.client.patch(
reverse('api:file-retrieve', kwargs={'pk': self.df.id}),
@@ -150,7 +115,8 @@ class TestDataFileApi(FixtureAPITestCase):
def test_check_even_if_already_checked(self, s3_object):
self.df.status = S3FileStatus.Checked
self.df.save()
s3_object().e_tag = 'corrupted md5'
s3_object().content_type = 'wrong/type'
s3_object().content_length = -1
self.client.force_login(self.user)
response = self.client.patch(
reverse('api:file-retrieve', kwargs={'pk': self.df.id}),
@@ -24,7 +24,6 @@ class TestFiles(FixtureAPITestCase):
cls.df = DataFile.objects.create(
name='test.txt',
size=42,
hash='aaaa',
content_type='text/plain',
corpus=cls.corpus
)
@@ -97,7 +96,8 @@ class TestFiles(FixtureAPITestCase):
Assert a file upload creates a database instance and saves the file
"""
f = SimpleUploadedFile('test.txt', b'This is a text file')
s3_mock.Object.return_value.e_tag = "bdc70a6c91dbb9b87a3bde0436bfd3f0"
s3_mock.Object.return_value.content_length = 19
s3_mock.Object.return_value.content_type = 'text/plain'
response = self.client.post(reverse('api:file-upload', kwargs={'pk': self.corpus.id}), data={'file': f})
self.assertEqual(response.status_code, status.HTTP_201_CREATED)
@@ -109,7 +109,6 @@ class TestFiles(FixtureAPITestCase):
self.assertEqual(df.size, 19)
self.assertEqual(df.content_type, 'text/plain')
self.assertEqual(df.corpus, self.corpus)
self.assertEqual(df.hash, s3_mock.Object.return_value.e_tag)
self.assertEqual(df.status, S3FileStatus.Checked)
self.assertEqual(s3_mock.Object.call_count, 1)
self.assertEqual(s3_mock.Object.call_args, call('staging', str(df.id)))
@@ -148,7 +147,8 @@ class TestFiles(FixtureAPITestCase):
Assert file upload does not trust the client defined content type
"""
f = SimpleUploadedFile('test.mp3', b'This actually is a text file', content_type='audio/mpeg')
s3_mock.Object.return_value.e_tag = "a92f087e06876600af579a472ab4db96"
s3_mock.Object.return_value.content_length = 28
s3_mock.Object.return_value.content_type = 'text/plain'
response = self.client.post(reverse('api:file-upload', kwargs={'pk': self.corpus.id}), data={'file': f})
self.assertEqual(response.status_code, status.HTTP_201_CREATED)
@@ -164,25 +164,24 @@ class TestFiles(FixtureAPITestCase):
self.assertEqual(s3_mock.Object().upload_fileobj.call_count, 1)
@patch('arkindex.project.aws.s3')
def test_file_upload_unique(self, s3_mock):
def test_file_upload_allows_duplicates(self, s3_mock):
"""
Assert uploading the same file twice fails
Assert uploading the same file twice is allowed
"""
s3_mock.Object.return_value.content_length = 19
s3_mock.Object.return_value.content_type = 'text/plain'
f = SimpleUploadedFile('test.txt', b'This is a text file')
s3_mock.Object.return_value.e_tag = "bdc70a6c91dbb9b87a3bde0436bfd3f0"
response = self.client.post(reverse('api:file-upload', kwargs={'pk': self.corpus.id}), data={'file': f})
self.assertEqual(response.status_code, status.HTTP_201_CREATED)
data = response.json()
file_id = data['id']
first_file_id = data['id']
f = SimpleUploadedFile('test2.txt', b'This is a text file')
response = self.client.post(reverse('api:file-upload', kwargs={'pk': self.corpus.id}), data={'file': f})
self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
self.assertEqual(response.status_code, status.HTTP_201_CREATED)
data = response.json()
self.assertEqual(data['id'], file_id)
self.assertEqual(s3_mock.Object.call_count, 1)
self.assertEqual(s3_mock.Object().upload_fileobj.call_count, 1)
self.assertNotEqual(data['id'], first_file_id)
@patch('arkindex.project.aws.s3')
def test_file_upload_json(self, s3_mock):
@@ -192,7 +191,8 @@ class TestFiles(FixtureAPITestCase):
f = SimpleUploadedFile('manifest', json.dumps({
'a': 'b',
}).encode('utf-8'))
s3_mock.Object.return_value.e_tag = "bd722b96a0bfdc0ef6115a2ee60b63f0"
s3_mock.Object.return_value.content_length = 10
s3_mock.Object.return_value.content_type = 'application/json'
response = self.client.post(reverse('api:file-upload', kwargs={'pk': self.corpus.id}), data={'file': f})
self.assertEqual(response.status_code, status.HTTP_201_CREATED)
@@ -211,7 +211,8 @@ class TestFiles(FixtureAPITestCase):
f = SimpleUploadedFile('manifest', json.dumps({
'@context': 'http://iiif.io/api/presentation/2/context.json',
}).encode('utf-8'))
s3_mock.Object.return_value.e_tag = "e177a825d986b1bb00d933daa50ccb69"
s3_mock.Object.return_value.content_length = 62
s3_mock.Object.return_value.content_type = 'application/ld+json'
response = self.client.post(reverse('api:file-upload', kwargs={'pk': self.corpus.id}), data={'file': f})
self.assertEqual(response.status_code, status.HTTP_201_CREATED)
@@ -230,7 +231,8 @@ class TestFiles(FixtureAPITestCase):
f = SimpleUploadedFile('manifest', json.dumps({
'@context': 'http://iiif.io/api/presentation/42/context.json',
}).encode('utf-8'))
s3_mock.Object.return_value.e_tag = "445d0c41d2f6188aaa1724c52f79dadd"
s3_mock.Object.return_value.content_length = 63
s3_mock.Object.return_value.content_type = 'application/json'
response = self.client.post(reverse('api:file-upload', kwargs={'pk': self.corpus.id}), data={'file': f})
self.assertEqual(response.status_code, status.HTTP_201_CREATED)
@@ -24,19 +24,16 @@ class TestImports(FixtureAPITestCase):
cls.img_df = cls.corpus.files.create(
name='test.jpg',
size=42,
hash='aaaa',
content_type='image/jpg',
)
cls.pdf_df = cls.corpus.files.create(
name='test.pdf',
size=42,
hash='bbbb',
content_type='application/pdf',
)
cls.iiif_df = cls.corpus.files.create(
name='test.json',
size=42,
hash='cccc',
content_type='application/ld+json',
)
@@ -318,7 +315,7 @@ class TestImports(FixtureAPITestCase):
def test_from_files_pdf_single(self):
self.client.force_login(self.user)
pdf2 = DataFile.objects.create(
name='test2.pdf', size=1337, hash='0', content_type='application/pdf', corpus=self.corpus)
name='test2.pdf', size=1337, content_type='application/pdf', corpus=self.corpus)
response = self.client.post(reverse('api:import-from-files'), {
'files': [str(self.pdf_df.id), str(pdf2.id)],
'folder_type': 'volume',
@@ -32,7 +32,6 @@ class TestDeleteCorpus(FixtureTestCase):
cls.df = cls.di.files.create(
name='a.txt',
size=1234,
hash='c0ffee',
content_type='text/plain',
corpus=cls.corpus2,
)
@@ -5,9 +5,10 @@ from django.db import models
from django.db.models.functions import Concat, Substr
from django.utils.functional import cached_property
from django.utils.text import slugify
from enumfields import EnumField
from arkindex.dataimport.models import DataFile
from arkindex.images.managers import ImageServerManager
from arkindex.project.aws import S3FileMixin, S3FileModelMixin, S3FileStatus
from arkindex.project.aws import S3FileMixin, S3FileStatus
from arkindex.project.fields import StripSlashURLField, LStripTextField, MD5HashField
from arkindex.project.models import IndexableModel
from arkindex.project.polygon import PolygonField
@@ -156,7 +157,7 @@ class ImageServer(models.Model):
return '{}_{}'.format(self.display_name, self.id)
class Image(S3FileModelMixin, IndexableModel):
class Image(S3FileMixin, IndexableModel):
"""
An element image
"""
@@ -165,8 +166,9 @@ class Image(S3FileModelMixin, IndexableModel):
width = models.PositiveIntegerField(default=0)
height = models.PositiveIntegerField(default=0)
datafile = models.ForeignKey(DataFile, related_name='images', null=True, on_delete=models.SET_NULL)
# Overwrite s3 hash to allow null value for external servers
# Allow null values for external servers
hash = MD5HashField(blank=True, null=True)
status = EnumField(S3FileStatus, default=S3FileStatus.Unchecked, max_length=50)
@cached_property
def s3_bucket(self):
@@ -207,7 +209,7 @@ class Image(S3FileModelMixin, IndexableModel):
@property
def s3_put_url(self):
assert self.server.is_local, 'Cannot PUT images on remote servers'
return super(S3FileModelMixin, self).s3_put_url
return super().s3_put_url
def perform_check(self, save=True, raise_exc=False):
"""
@@ -130,7 +130,6 @@ class TestImageApi(FixtureAPITestCase):
df = self.corpus.files.create(
name='test.jpg',
size=42,
hash='aaaa',
content_type='image/jpeg',
)
from django.conf import settings
from django.db import models
from botocore.config import Config
from botocore.exceptions import ClientError
from django.utils.functional import cached_property
from arkindex.project.fields import MD5HashField
from enumfields import EnumField, Enum
from enumfields import Enum
from io import BytesIO
import boto3.session
import logging
@@ -134,11 +132,3 @@ class S3FileMixin(object):
self.save()
if self.status == S3FileStatus.Error and raise_exc:
raise ValueError('MD5 hashes do not match')
class S3FileModelMixin(S3FileMixin, models.Model):
hash = MD5HashField()
status = EnumField(S3FileStatus, default=S3FileStatus.Unchecked, max_length=50)
class Meta:
abstract = True
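After this removal, `S3FileMixin` carries only behaviour (S3 object access and `check_hash`), and each concrete model declares the fields it actually needs. A minimal sketch of the resulting pattern, condensed from the model changes above:

```python
from django.db import models
from enumfields import EnumField
from arkindex.project.aws import S3FileMixin, S3FileStatus
from arkindex.project.fields import MD5HashField
from arkindex.project.models import IndexableModel

class DataFile(S3FileMixin, models.Model):
    # No hash field: integrity is now checked via size and content type.
    status = EnumField(S3FileStatus, default=S3FileStatus.Unchecked, max_length=50)
    # ... other fields as in the diff above

class Image(S3FileMixin, IndexableModel):
    # Keeps a hash for check_hash, but nullable for external image servers.
    hash = MD5HashField(blank=True, null=True)
    status = EnumField(S3FileStatus, default=S3FileStatus.Unchecked, max_length=50)
    # ... other fields as in the diff above
```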
@@ -218,92 +218,6 @@ paths:
/api/v1/imports/repos/{id}/:
delete:
description: Delete a repository
/api/v1/imports/upload/{id}/:
post:
responses:
'400':
description: An error occured while validating the file.
content:
application/json:
schema:
properties:
detail:
type: string
description: A generic error message when an error occurs outside of a specific field.
readOnly: true
id:
type: string
description: UUID of an existing DataFile, if the error comes from a duplicated upload.
readOnly: true
file:
type: array
description: One or more error messages for errors when validating the file itself.
readOnly: true
corpus:
type: array
description: One or more error messages for errors when validating the destination corpus ID.
readOnly: true
examples:
file-exists:
summary: An error where the data file already exists, including the existing file's UUID.
value:
file:
- File already exists
id: 3cc2e9e0-4172-44b1-8d65-bc3fffd076dc
/api/v1/imports/files/create/:
post:
responses:
'400':
description: An error occured while creating the data file.
content:
application/json:
schema:
properties:
detail:
type: string
description: A generic error message when an error occurs outside of a specific field.
readOnly: true
hash:
type: array
description: Errors that occured during hash field validation.
readOnly: true
corpus:
type: array
description: Errors that occured during corpus ID field validation.
readOnly: true
name:
type: array
description: Errors that occured during name field validation.
readOnly: true
size:
type: array
description: Errors that occured during size field validation.
readOnly: true
id:
type: string
description: UUID of existing DataFile, if the error comes from a duplicated creation.
readOnly: true
status:
type: string
description: Status of existing DataFile, if the error comes from a duplicated creation.
readOnly: true
s3_put_url:
type: string
description: Signed url used to upload file content to remote server, if the error comes from a duplicated creation and file status is not checked.
readOnly: true
s3_url:
type: string
description: Remote file url, if the error comes from a duplicated creation and file status is checked.
readOnly: true
examples:
file-exists:
summary: Data file already exists. Response include existing file's UUID, status and remote server PUT url to upload file content.
value:
hash:
- DataFile with this hash already exists
id: 55cd009d-cd4b-4ec2-a475-b060f98f9138
status: unchecked
s3_put_url: https://remote-server.net/staging/55cd009d-cd4b-4ec2-a475-b060f98f9138?Credential=mycredential&Signature=mysignature
/api/v1/imports/{id}/:
delete:
description: Delete a data import. Cannot be used on currently running data imports.