From e2a92722856dded1ac1542ad00bef5c51a34ea04 Mon Sep 17 00:00:00 2001
From: Erwan Rouchet <rouchet@teklia.com>
Date: Tue, 9 Jun 2020 09:30:42 +0200
Subject: [PATCH] Remove JSON-LD detection on file upload

---
 arkindex/dataimport/api.py                 | 21 ++++-------
 arkindex/dataimport/serializers/imports.py |  2 +-
 arkindex/dataimport/tests/test_files.py    | 42 +---------------------
 arkindex/dataimport/tests/test_imports.py  |  2 +-
 4 files changed, 10 insertions(+), 57 deletions(-)

diff --git a/arkindex/dataimport/api.py b/arkindex/dataimport/api.py
index d0971a6b8d..2c42faa1a6 100644
--- a/arkindex/dataimport/api.py
+++ b/arkindex/dataimport/api.py
@@ -371,26 +371,19 @@ class DataFileUpload(CorpusACLMixin, APIView):
         file_type = magic.from_buffer(file_obj.read(1024), mime=True)
 
         # libmagic 5.35 recognizes JSON, but older versions detect it as text/plain.
-        # To allow for IIIF imports, if the file is small enough, try to read as JSON.
-        # JSON-LD files with an expected IIIF context will use application/ld+json.
-        # JSON and JSON-LD files without the IIIF context will use application/json.
-        if file_type in ('text/plain', 'application/json') and file_obj.size < 5e6:
+        # To allow for IIIF imports, if the file is small enough, try to read as JSON,
+        # and use application/json instead.
+        if file_type == 'text/plain' and file_obj.size < 5e6:
             # Reopen file to reread from beginning
             file_obj.open()
             try:
-                jsonld_context = next(ijson.items(file_obj, '@context'))
+                for _ in ijson.parse(file_obj):
+                    # Do nothing: stream through the whole file to check its syntax without loading it in memory
+                    pass
             except ijson.JSONError:
                 pass
-            except StopIteration:
-                file_type = 'application/json'
             else:
-                # The JSON-LD @context attribute can be a string or an array of strings
-                if isinstance(jsonld_context, str):
-                    jsonld_context = [jsonld_context]
-                if isinstance(jsonld_context, list) and settings.IIIF_PRESENTATION_CONTEXT in jsonld_context:
-                    file_type = 'application/ld+json'
-                else:
-                    file_type = 'application/json'
+                file_type = 'application/json'
 
         df = DataFile(
             corpus=corpus,
diff --git a/arkindex/dataimport/serializers/imports.py b/arkindex/dataimport/serializers/imports.py
index 49b56cee5e..9444f53df5 100644
--- a/arkindex/dataimport/serializers/imports.py
+++ b/arkindex/dataimport/serializers/imports.py
@@ -153,7 +153,7 @@ class DataImportFromFilesSerializer(serializers.Serializer):
                 self.fail('images_only')
 
         elif data['mode'] == DataImportMode.IIIF:
-            if not set(f.content_type for f in data['files']) == {'application/ld+json'}:
+            if not set(f.content_type for f in data['files']) <= {'application/json', 'application/ld+json'}:
                 self.fail('iiif_only')
 
         else:
diff --git a/arkindex/dataimport/tests/test_files.py b/arkindex/dataimport/tests/test_files.py
index 8deb01dce5..4bed87a259 100644
--- a/arkindex/dataimport/tests/test_files.py
+++ b/arkindex/dataimport/tests/test_files.py
@@ -186,7 +186,7 @@ class TestFiles(FixtureAPITestCase):
     @patch('arkindex.project.aws.s3')
     def test_file_upload_json(self, s3_mock):
         """
-        Assert uploading a JSON document (not JSON-LD) uses application/json
+        Assert uploading a JSON document uses application/json
         """
         f = SimpleUploadedFile('manifest', json.dumps({
             'a': 'b',
@@ -202,43 +202,3 @@ class TestFiles(FixtureAPITestCase):
         df = DataFile.objects.get(id=data['id'])
         self.assertEqual(df.name, 'manifest')
         self.assertEqual(df.content_type, 'application/json')
-
-    @patch('arkindex.project.aws.s3')
-    def test_file_upload_iiif(self, s3_mock):
-        """
-        Assert uploading a JSON-LD document with an IIIF context uses application/ld+json
-        """
-        f = SimpleUploadedFile('manifest', json.dumps({
-            '@context': 'http://iiif.io/api/presentation/2/context.json',
-        }).encode('utf-8'))
-        s3_mock.Object.return_value.content_length = 62
-        s3_mock.Object.return_value.content_type = 'application/ld+json'
-
-        response = self.client.post(reverse('api:file-upload', kwargs={'pk': self.corpus.id}), data={'file': f})
-        self.assertEqual(response.status_code, status.HTTP_201_CREATED)
-        data = response.json()
-        self.assertIn('id', data)
-
-        df = DataFile.objects.get(id=data['id'])
-        self.assertEqual(df.name, 'manifest')
-        self.assertEqual(df.content_type, 'application/ld+json')
-
-    @patch('arkindex.project.aws.s3')
-    def test_file_upload_not_iiif(self, s3_mock):
-        """
-        Assert uploading a JSON-LD document without an IIIF context uses application/json
-        """
-        f = SimpleUploadedFile('manifest', json.dumps({
-            '@context': 'http://iiif.io/api/presentation/42/context.json',
-        }).encode('utf-8'))
-        s3_mock.Object.return_value.content_length = 63
-        s3_mock.Object.return_value.content_type = 'application/json'
-
-        response = self.client.post(reverse('api:file-upload', kwargs={'pk': self.corpus.id}), data={'file': f})
-        self.assertEqual(response.status_code, status.HTTP_201_CREATED)
-        data = response.json()
-        self.assertIn('id', data)
-
-        df = DataFile.objects.get(id=data['id'])
-        self.assertEqual(df.name, 'manifest')
-        self.assertEqual(df.content_type, 'application/json')
diff --git a/arkindex/dataimport/tests/test_imports.py b/arkindex/dataimport/tests/test_imports.py
index b9bebeac79..a616e4f1bc 100644
--- a/arkindex/dataimport/tests/test_imports.py
+++ b/arkindex/dataimport/tests/test_imports.py
@@ -34,7 +34,7 @@ class TestImports(FixtureAPITestCase):
         cls.iiif_df = cls.corpus.files.create(
             name='test.json',
             size=42,
-            content_type='application/ld+json',
+            content_type='application/json',
         )
 
     def setUp(self):
-- 
GitLab