diff --git a/arkindex/dataimport/tasks/pdf.py b/arkindex/dataimport/tasks/pdf.py index eeee64357c87eaf78ff07901bbcdf536f90908e4..dd03bb913e487b952655929d84e74a87ae0c9cd7 100644 --- a/arkindex/dataimport/tasks/pdf.py +++ b/arkindex/dataimport/tasks/pdf.py @@ -2,10 +2,9 @@ import distutils.spawn import glob import os import subprocess -import tempfile -def extract_pdf_images(pdf_file, working_dir): +def extract_pdf_images(pdf_file, pdf_path, working_dir): """ Convert a PDF file to a list of images """ @@ -13,20 +12,15 @@ def extract_pdf_images(pdf_file, working_dir): assert pdf_file.exists(), 'File does not exist' assert distutils.spawn.find_executable('convert'), 'Missing convert in PATH' - _, path = tempfile.mkstemp() - pdf_file.s3_object.download_file(path) - if not os.path.exists(working_dir): os.makedirs(working_dir) cmd = [ 'convert', '-density', '300', - 'pdf:{}'.format(path), + 'pdf:{}'.format(pdf_path), os.path.join(working_dir, 'pdf-%04d.jpg'), ] subprocess.run(cmd, check=True) - os.remove(path) - # Dump all the images in the working dir return sorted(glob.glob(os.path.join(working_dir, '*.jpg'))) diff --git a/arkindex/dataimport/tests/test_pdf.py b/arkindex/dataimport/tests/test_pdf.py index c40d95fbad56035d6504c82c67be24d4e59d9fc7..7e641aa8fbd41ab92dff76ca3339e932e0b77e4a 100644 --- a/arkindex/dataimport/tests/test_pdf.py +++ b/arkindex/dataimport/tests/test_pdf.py @@ -20,14 +20,13 @@ class TestPdf(FixtureTestCase): cls.img_file = cls.corpus.files.create(name='sample.jpg', size=42, hash='abcd', content_type='image/jpg') cls.pdf_file = cls.corpus.files.create(name='sample.pdf', size=42, hash='dcba', content_type='application/pdf') - cls.media_root = tempfile.mkdtemp() cls.working_dir = tempfile.mkdtemp() - shutil.copyfile(os.path.join(FIXTURES, 'sample.pdf'), os.path.join(cls.media_root, str(cls.pdf_file.id))) + cls.pdf_path = os.path.join(cls.working_dir, str(cls.pdf_file.id)) + shutil.copyfile(os.path.join(FIXTURES, 'sample.pdf'), cls.pdf_path) @classmethod def tearDownClass(cls): super().tearDownClass() - shutil.rmtree(cls.media_root) shutil.rmtree(cls.working_dir) def test_extract_pdf_images_filetype(self): @@ -35,7 +34,7 @@ class TestPdf(FixtureTestCase): Test extract_pdf_images task the file's content type """ with self.assertRaises(AssertionError): - extract_pdf_images(self.img_file, self.working_dir) + extract_pdf_images(self.img_file, self.pdf_path, self.working_dir) def test_extract_pdf_images_exists(self): """ @@ -46,7 +45,7 @@ class TestPdf(FixtureTestCase): file_mock.exists.return_value = False with self.assertRaises(AssertionError): - extract_pdf_images(file_mock, self.working_dir) + extract_pdf_images(file_mock, self.pdf_path, self.working_dir) @patch('arkindex.dataimport.models.s3') def test_extract_pdf_images_s3_error(self, s3_mock): @@ -59,16 +58,14 @@ class TestPdf(FixtureTestCase): file_mock.exists.side_effect = ClientError({'Error': {'Code': '999'}}, 'head_object') with self.assertRaises(ClientError): - extract_pdf_images(file_mock, self.working_dir) + extract_pdf_images(file_mock, self.pdf_path, self.working_dir) @patch('arkindex.dataimport.models.s3') def test_extract_pdf_images(self, s3_mock): """ Test extract_pdf_images runs ImageMagick and returns proper info """ - s3_mock.Object.return_value.download_file.side_effect = \ - lambda path: shutil.copyfile(os.path.join(FIXTURES, 'sample.pdf'), path) - result = extract_pdf_images(self.pdf_file, self.working_dir) + result = extract_pdf_images(self.pdf_file, self.pdf_path, self.working_dir) self.assertListEqual(result, [ os.path.join(self.working_dir, 'pdf-0000.jpg'),