Skip to content
Snippets Groups Projects
Commit 2b431726 authored by Bastien Abadie
Browse files

Merge branch 'detect-iiif-datafile' into 'master'

Detect MIME types for JSON and JSON-LD

See merge request !646
parents 52cdeda3 6e00ca9a
No related branches found
No related tags found
1 merge request: !646 Detect MIME types for JSON and JSON-LD
......@@ -33,6 +33,7 @@ from ponos.models import State
from datetime import datetime
from uuid import UUID
import hashlib
import ijson
import magic
import os.path
......@@ -371,10 +372,6 @@ class DataFileUpload(CorpusACLMixin, APIView):
md5hash.update(chunk)
file_hash = md5hash.hexdigest()
# Reopen file to reread from beginning
file_obj.open()
file_type = magic.from_buffer(file_obj.read(1024), mime=True)
existing_file = corpus.files.filter(hash=file_hash).first()
if existing_file:
raise ValidationError({
......@@ -382,6 +379,31 @@ class DataFileUpload(CorpusACLMixin, APIView):
'id': str(existing_file.id),
})
# Reopen file to reread from beginning
file_obj.open()
file_type = magic.from_buffer(file_obj.read(1024), mime=True)
# libmagic 5.35 recognizes JSON, but older versions detect it as text/plain.
# To allow for IIIF imports, if the file is small enough, try to read as JSON.
# JSON-LD files with an expected IIIF context will use application/ld+json.
# JSON and JSON-LD files without the IIIF context will use application/json.
if file_type in ('text/plain', 'application/json') and file_obj.size < 5e6:
file_obj.open()
try:
jsonld_context = next(ijson.items(file_obj, '@context'))
except ijson.JSONError:
pass
except StopIteration:
file_type = 'application/json'
else:
# The JSON-LD @context attribute can be a string or an array of strings
if isinstance(jsonld_context, str):
jsonld_context = [jsonld_context]
if isinstance(jsonld_context, list) and settings.IIIF_PRESENTATION_CONTEXT in jsonld_context:
file_type = 'application/ld+json'
else:
file_type = 'application/json'
df = DataFile(
hash=file_hash,
corpus=corpus,
......
......@@ -8,6 +8,7 @@ from arkindex.dataimport.models import DataFile
from arkindex.images.models import ImageServer
from arkindex.project.tests import FixtureAPITestCase
from arkindex.project.aws import S3FileStatus
import json
import uuid
......@@ -182,3 +183,60 @@ class TestFiles(FixtureAPITestCase):
self.assertEqual(s3_mock.Object.call_count, 1)
self.assertEqual(s3_mock.Object().upload_fileobj.call_count, 1)
@patch('arkindex.project.aws.s3')
def test_file_upload_json(self, s3_mock):
    """
    A JSON upload that carries no @context key must be stored as application/json
    """
    # ETag reported by the mocked S3 object
    # (presumably the MD5 of the payload below - verify against the upload view)
    s3_mock.Object.return_value.e_tag = "bd722b96a0bfdc0ef6115a2ee60b63f0"

    payload = json.dumps({'a': 'b'}).encode('utf-8')
    upload = SimpleUploadedFile('manifest', payload)
    upload_url = reverse('api:file-upload', kwargs={'pk': self.corpus.id})

    response = self.client.post(upload_url, data={'file': upload})
    self.assertEqual(response.status_code, status.HTTP_201_CREATED)

    body = response.json()
    self.assertIn('id', body)

    # Check what was persisted, not only what the API echoed back
    datafile = DataFile.objects.get(id=body['id'])
    self.assertEqual(datafile.name, 'manifest')
    self.assertEqual(datafile.content_type, 'application/json')
@patch('arkindex.project.aws.s3')
def test_file_upload_iiif(self, s3_mock):
    """
    A JSON-LD upload whose @context is the IIIF context must be stored as application/ld+json
    """
    # ETag reported by the mocked S3 object
    # (presumably the MD5 of the payload below - verify against the upload view)
    s3_mock.Object.return_value.e_tag = "e177a825d986b1bb00d933daa50ccb69"

    manifest = {'@context': 'http://iiif.io/api/presentation/2/context.json'}
    upload = SimpleUploadedFile('manifest', json.dumps(manifest).encode('utf-8'))
    upload_url = reverse('api:file-upload', kwargs={'pk': self.corpus.id})

    response = self.client.post(upload_url, data={'file': upload})
    self.assertEqual(response.status_code, status.HTTP_201_CREATED)

    body = response.json()
    self.assertIn('id', body)

    # Check what was persisted, not only what the API echoed back
    datafile = DataFile.objects.get(id=body['id'])
    self.assertEqual(datafile.name, 'manifest')
    self.assertEqual(datafile.content_type, 'application/ld+json')
@patch('arkindex.project.aws.s3')
def test_file_upload_not_iiif(self, s3_mock):
    """
    A JSON-LD upload whose @context is not the IIIF context must fall back to application/json
    """
    # ETag reported by the mocked S3 object
    # (presumably the MD5 of the payload below - verify against the upload view)
    s3_mock.Object.return_value.e_tag = "445d0c41d2f6188aaa1724c52f79dadd"

    # Presentation API "42" is deliberately a context the test expects not to match
    document = {'@context': 'http://iiif.io/api/presentation/42/context.json'}
    upload = SimpleUploadedFile('manifest', json.dumps(document).encode('utf-8'))
    upload_url = reverse('api:file-upload', kwargs={'pk': self.corpus.id})

    response = self.client.post(upload_url, data={'file': upload})
    self.assertEqual(response.status_code, status.HTTP_201_CREATED)

    body = response.json()
    self.assertIn('id', body)

    # Check what was persisted, not only what the API echoed back
    datafile = DataFile.objects.get(id=body['id'])
    self.assertEqual(datafile.name, 'manifest')
    self.assertEqual(datafile.content_type, 'application/json')
......@@ -2,6 +2,7 @@ from django.core.checks import register, Warning, Error
from ponos.recipe import parse_recipe
import os.path
import subprocess
import magic
import yaml
......@@ -205,3 +206,15 @@ def s3_check(*args, **kwargs):
id='arkindex.E011',
))
return errors
@register()
def libmagic_check(*args, **kwargs):
    """
    System check warning about libmagic builds that may mis-detect JSON.

    Reads the version number reported by ``magic.libmagic.magic_version()``
    and compares it against 535. Returns a list holding a single
    ``arkindex.W005`` warning when the reported version is lower,
    and an empty list otherwise.
    """
    libmagic_version = magic.libmagic.magic_version()

    # Recent enough: nothing to report
    if not libmagic_version < 535:
        return []

    message = (
        'Outdated libmagic version {}. '
        'Versions before 535 may detect JSON files as text/plain.'
    ).format(libmagic_version)
    return [Warning(message, id='arkindex.W005')]
......@@ -260,3 +260,17 @@ class ChecksTestCase(TestCase):
settings.AWS_THUMBNAIL_BUCKET = 'Thumbs.db'
settings.AWS_STAGING_BUCKET = 'buckette'
self.assertListEqual(s3_check(), [])
@patch('arkindex.project.checks.magic')
def test_libmagic_check(self, magic_mock):
    # Imported inside the test, while the patch on the checks module is active
    from arkindex.project.checks import libmagic_check

    version_mock = magic_mock.libmagic.magic_version

    # Just below the threshold: exactly one warning is expected
    version_mock.return_value = 534
    expected_warning = Warning(
        'Outdated libmagic version 534. Versions before 535 may detect JSON files as text/plain.',
        id='arkindex.W005',
    )
    self.assertListEqual(libmagic_check(), [expected_warning])

    # At the threshold: the check must stay silent
    version_mock.return_value = 535
    self.assertListEqual(libmagic_check(), [])
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment