From ef71a48c91bf90ac28dd3488b371cec70fc19b9e Mon Sep 17 00:00:00 2001 From: Erwan Rouchet <rouchet@teklia.com> Date: Thu, 28 Mar 2019 10:17:22 +0000 Subject: [PATCH] Fix PDF workflows with S3 --- arkindex/dataimport/tasks/pdf.py | 10 ++-------- arkindex/dataimport/tests/test_pdf.py | 15 ++++++--------- 2 files changed, 8 insertions(+), 17 deletions(-) diff --git a/arkindex/dataimport/tasks/pdf.py b/arkindex/dataimport/tasks/pdf.py index eeee64357c..dd03bb913e 100644 --- a/arkindex/dataimport/tasks/pdf.py +++ b/arkindex/dataimport/tasks/pdf.py @@ -2,10 +2,9 @@ import distutils.spawn import glob import os import subprocess -import tempfile -def extract_pdf_images(pdf_file, working_dir): +def extract_pdf_images(pdf_file, pdf_path, working_dir): """ Convert a PDF file to a list of images """ @@ -13,20 +12,15 @@ def extract_pdf_images(pdf_file, working_dir): assert pdf_file.exists(), 'File does not exist' assert distutils.spawn.find_executable('convert'), 'Missing convert in PATH' - _, path = tempfile.mkstemp() - pdf_file.s3_object.download_file(path) - if not os.path.exists(working_dir): os.makedirs(working_dir) cmd = [ 'convert', '-density', '300', - 'pdf:{}'.format(path), + 'pdf:{}'.format(pdf_path), os.path.join(working_dir, 'pdf-%04d.jpg'), ] subprocess.run(cmd, check=True) - os.remove(path) - # Dump all the images in the working dir return sorted(glob.glob(os.path.join(working_dir, '*.jpg'))) diff --git a/arkindex/dataimport/tests/test_pdf.py b/arkindex/dataimport/tests/test_pdf.py index c40d95fbad..7e641aa8fb 100644 --- a/arkindex/dataimport/tests/test_pdf.py +++ b/arkindex/dataimport/tests/test_pdf.py @@ -20,14 +20,13 @@ class TestPdf(FixtureTestCase): cls.img_file = cls.corpus.files.create(name='sample.jpg', size=42, hash='abcd', content_type='image/jpg') cls.pdf_file = cls.corpus.files.create(name='sample.pdf', size=42, hash='dcba', content_type='application/pdf') - cls.media_root = tempfile.mkdtemp() cls.working_dir = tempfile.mkdtemp() - shutil.copyfile(os.path.join(FIXTURES, 'sample.pdf'), os.path.join(cls.media_root, str(cls.pdf_file.id))) + cls.pdf_path = os.path.join(cls.working_dir, str(cls.pdf_file.id)) + shutil.copyfile(os.path.join(FIXTURES, 'sample.pdf'), cls.pdf_path) @classmethod def tearDownClass(cls): super().tearDownClass() - shutil.rmtree(cls.media_root) shutil.rmtree(cls.working_dir) def test_extract_pdf_images_filetype(self): @@ -35,7 +34,7 @@ class TestPdf(FixtureTestCase): Test extract_pdf_images task the file's content type """ with self.assertRaises(AssertionError): - extract_pdf_images(self.img_file, self.working_dir) + extract_pdf_images(self.img_file, self.pdf_path, self.working_dir) def test_extract_pdf_images_exists(self): """ @@ -46,7 +45,7 @@ class TestPdf(FixtureTestCase): file_mock.exists.return_value = False with self.assertRaises(AssertionError): - extract_pdf_images(file_mock, self.working_dir) + extract_pdf_images(file_mock, self.pdf_path, self.working_dir) @patch('arkindex.dataimport.models.s3') def test_extract_pdf_images_s3_error(self, s3_mock): @@ -59,16 +58,14 @@ class TestPdf(FixtureTestCase): file_mock.exists.side_effect = ClientError({'Error': {'Code': '999'}}, 'head_object') with self.assertRaises(ClientError): - extract_pdf_images(file_mock, self.working_dir) + extract_pdf_images(file_mock, self.pdf_path, self.working_dir) @patch('arkindex.dataimport.models.s3') def test_extract_pdf_images(self, s3_mock): """ Test extract_pdf_images runs ImageMagick and returns proper info """ - s3_mock.Object.return_value.download_file.side_effect = \ - lambda path: shutil.copyfile(os.path.join(FIXTURES, 'sample.pdf'), path) - result = extract_pdf_images(self.pdf_file, self.working_dir) + result = extract_pdf_images(self.pdf_file, self.pdf_path, self.working_dir) self.assertListEqual(result, [ os.path.join(self.working_dir, 'pdf-0000.jpg'), -- GitLab