From ef71a48c91bf90ac28dd3488b371cec70fc19b9e Mon Sep 17 00:00:00 2001
From: Erwan Rouchet <rouchet@teklia.com>
Date: Thu, 28 Mar 2019 10:17:22 +0000
Subject: [PATCH] Fix PDF workflows with S3

---
 arkindex/dataimport/tasks/pdf.py      | 10 ++--------
 arkindex/dataimport/tests/test_pdf.py | 15 ++++++---------
 2 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/arkindex/dataimport/tasks/pdf.py b/arkindex/dataimport/tasks/pdf.py
index eeee64357c..dd03bb913e 100644
--- a/arkindex/dataimport/tasks/pdf.py
+++ b/arkindex/dataimport/tasks/pdf.py
@@ -2,10 +2,9 @@ import distutils.spawn
 import glob
 import os
 import subprocess
-import tempfile
 
 
-def extract_pdf_images(pdf_file, working_dir):
+def extract_pdf_images(pdf_file, pdf_path, working_dir):
     """
     Convert a PDF file to a list of images
     """
@@ -13,20 +12,15 @@ def extract_pdf_images(pdf_file, working_dir):
     assert pdf_file.exists(), 'File does not exist'
     assert distutils.spawn.find_executable('convert'), 'Missing convert in PATH'
 
-    _, path = tempfile.mkstemp()
-    pdf_file.s3_object.download_file(path)
-
     if not os.path.exists(working_dir):
         os.makedirs(working_dir)
 
     cmd = [
         'convert', '-density', '300',
-        'pdf:{}'.format(path),
+        'pdf:{}'.format(pdf_path),
         os.path.join(working_dir, 'pdf-%04d.jpg'),
     ]
     subprocess.run(cmd, check=True)
 
-    os.remove(path)
-
     # Dump all the images in the working dir
     return sorted(glob.glob(os.path.join(working_dir, '*.jpg')))
diff --git a/arkindex/dataimport/tests/test_pdf.py b/arkindex/dataimport/tests/test_pdf.py
index c40d95fbad..7e641aa8fb 100644
--- a/arkindex/dataimport/tests/test_pdf.py
+++ b/arkindex/dataimport/tests/test_pdf.py
@@ -20,14 +20,13 @@ class TestPdf(FixtureTestCase):
         cls.img_file = cls.corpus.files.create(name='sample.jpg', size=42, hash='abcd', content_type='image/jpg')
         cls.pdf_file = cls.corpus.files.create(name='sample.pdf', size=42, hash='dcba', content_type='application/pdf')
 
-        cls.media_root = tempfile.mkdtemp()
         cls.working_dir = tempfile.mkdtemp()
-        shutil.copyfile(os.path.join(FIXTURES, 'sample.pdf'), os.path.join(cls.media_root, str(cls.pdf_file.id)))
+        cls.pdf_path = os.path.join(cls.working_dir, str(cls.pdf_file.id))
+        shutil.copyfile(os.path.join(FIXTURES, 'sample.pdf'), cls.pdf_path)
 
     @classmethod
     def tearDownClass(cls):
         super().tearDownClass()
-        shutil.rmtree(cls.media_root)
         shutil.rmtree(cls.working_dir)
 
     def test_extract_pdf_images_filetype(self):
@@ -35,7 +34,7 @@ class TestPdf(FixtureTestCase):
         Test extract_pdf_images task the file's content type
         """
         with self.assertRaises(AssertionError):
-            extract_pdf_images(self.img_file, self.working_dir)
+            extract_pdf_images(self.img_file, self.pdf_path, self.working_dir)
 
     def test_extract_pdf_images_exists(self):
         """
@@ -46,7 +45,7 @@ class TestPdf(FixtureTestCase):
         file_mock.exists.return_value = False
 
         with self.assertRaises(AssertionError):
-            extract_pdf_images(file_mock, self.working_dir)
+            extract_pdf_images(file_mock, self.pdf_path, self.working_dir)
 
     @patch('arkindex.dataimport.models.s3')
     def test_extract_pdf_images_s3_error(self, s3_mock):
@@ -59,16 +58,14 @@ class TestPdf(FixtureTestCase):
         file_mock.exists.side_effect = ClientError({'Error': {'Code': '999'}}, 'head_object')
 
         with self.assertRaises(ClientError):
-            extract_pdf_images(file_mock, self.working_dir)
+            extract_pdf_images(file_mock, self.pdf_path, self.working_dir)
 
     @patch('arkindex.dataimport.models.s3')
     def test_extract_pdf_images(self, s3_mock):
         """
         Test extract_pdf_images runs ImageMagick and returns proper info
         """
-        s3_mock.Object.return_value.download_file.side_effect = \
-            lambda path: shutil.copyfile(os.path.join(FIXTURES, 'sample.pdf'), path)
-        result = extract_pdf_images(self.pdf_file, self.working_dir)
+        result = extract_pdf_images(self.pdf_file, self.pdf_path, self.working_dir)
 
         self.assertListEqual(result, [
             os.path.join(self.working_dir, 'pdf-0000.jpg'),
-- 
GitLab