Skip to content
Snippets Groups Projects
Commit ef71a48c authored by Erwan Rouchet, committed by Bastien Abadie
Browse files

Fix PDF workflows with S3

parent 6968998e
No related branches found
No related tags found
No related merge requests found
......@@ -2,10 +2,9 @@ import distutils.spawn
import glob
import os
import subprocess
import tempfile
def extract_pdf_images(pdf_file, working_dir):
def extract_pdf_images(pdf_file, pdf_path, working_dir):
"""
Convert a PDF file to a list of images
"""
......@@ -13,20 +12,15 @@ def extract_pdf_images(pdf_file, working_dir):
assert pdf_file.exists(), 'File does not exist'
assert distutils.spawn.find_executable('convert'), 'Missing convert in PATH'
_, path = tempfile.mkstemp()
pdf_file.s3_object.download_file(path)
if not os.path.exists(working_dir):
os.makedirs(working_dir)
cmd = [
'convert', '-density', '300',
'pdf:{}'.format(path),
'pdf:{}'.format(pdf_path),
os.path.join(working_dir, 'pdf-%04d.jpg'),
]
subprocess.run(cmd, check=True)
os.remove(path)
# Dump all the images in the working dir
return sorted(glob.glob(os.path.join(working_dir, '*.jpg')))
......@@ -20,14 +20,13 @@ class TestPdf(FixtureTestCase):
cls.img_file = cls.corpus.files.create(name='sample.jpg', size=42, hash='abcd', content_type='image/jpg')
cls.pdf_file = cls.corpus.files.create(name='sample.pdf', size=42, hash='dcba', content_type='application/pdf')
cls.media_root = tempfile.mkdtemp()
cls.working_dir = tempfile.mkdtemp()
shutil.copyfile(os.path.join(FIXTURES, 'sample.pdf'), os.path.join(cls.media_root, str(cls.pdf_file.id)))
cls.pdf_path = os.path.join(cls.working_dir, str(cls.pdf_file.id))
shutil.copyfile(os.path.join(FIXTURES, 'sample.pdf'), cls.pdf_path)
@classmethod
def tearDownClass(cls):
super().tearDownClass()
shutil.rmtree(cls.media_root)
shutil.rmtree(cls.working_dir)
def test_extract_pdf_images_filetype(self):
......@@ -35,7 +34,7 @@ class TestPdf(FixtureTestCase):
Test extract_pdf_images task the file's content type
"""
with self.assertRaises(AssertionError):
extract_pdf_images(self.img_file, self.working_dir)
extract_pdf_images(self.img_file, self.pdf_path, self.working_dir)
def test_extract_pdf_images_exists(self):
"""
......@@ -46,7 +45,7 @@ class TestPdf(FixtureTestCase):
file_mock.exists.return_value = False
with self.assertRaises(AssertionError):
extract_pdf_images(file_mock, self.working_dir)
extract_pdf_images(file_mock, self.pdf_path, self.working_dir)
@patch('arkindex.dataimport.models.s3')
def test_extract_pdf_images_s3_error(self, s3_mock):
......@@ -59,16 +58,14 @@ class TestPdf(FixtureTestCase):
file_mock.exists.side_effect = ClientError({'Error': {'Code': '999'}}, 'head_object')
with self.assertRaises(ClientError):
extract_pdf_images(file_mock, self.working_dir)
extract_pdf_images(file_mock, self.pdf_path, self.working_dir)
@patch('arkindex.dataimport.models.s3')
def test_extract_pdf_images(self, s3_mock):
"""
Test extract_pdf_images runs ImageMagick and returns proper info
"""
s3_mock.Object.return_value.download_file.side_effect = \
lambda path: shutil.copyfile(os.path.join(FIXTURES, 'sample.pdf'), path)
result = extract_pdf_images(self.pdf_file, self.working_dir)
result = extract_pdf_images(self.pdf_file, self.pdf_path, self.working_dir)
self.assertListEqual(result, [
os.path.join(self.working_dir, 'pdf-0000.jpg'),
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment