Skip to content
Snippets Groups Projects
Commit 2be1287e authored by Bastien Abadie's avatar Bastien Abadie
Browse files

Merge branch 'pdf-s3' into 'master'

Fix PDF workflows with S3

See merge request !259
parents 6968998e ef71a48c
No related branches found
No related tags found
1 merge request!259Fix PDF workflows with S3
......@@ -2,10 +2,9 @@ import distutils.spawn
import glob
import os
import subprocess
import tempfile
def extract_pdf_images(pdf_file, working_dir):
def extract_pdf_images(pdf_file, pdf_path, working_dir):
"""
Convert a PDF file to a list of images
"""
......@@ -13,20 +12,15 @@ def extract_pdf_images(pdf_file, working_dir):
assert pdf_file.exists(), 'File does not exist'
assert distutils.spawn.find_executable('convert'), 'Missing convert in PATH'
_, path = tempfile.mkstemp()
pdf_file.s3_object.download_file(path)
if not os.path.exists(working_dir):
os.makedirs(working_dir)
cmd = [
'convert', '-density', '300',
'pdf:{}'.format(path),
'pdf:{}'.format(pdf_path),
os.path.join(working_dir, 'pdf-%04d.jpg'),
]
subprocess.run(cmd, check=True)
os.remove(path)
# Dump all the images in the working dir
return sorted(glob.glob(os.path.join(working_dir, '*.jpg')))
......@@ -20,14 +20,13 @@ class TestPdf(FixtureTestCase):
cls.img_file = cls.corpus.files.create(name='sample.jpg', size=42, hash='abcd', content_type='image/jpg')
cls.pdf_file = cls.corpus.files.create(name='sample.pdf', size=42, hash='dcba', content_type='application/pdf')
cls.media_root = tempfile.mkdtemp()
cls.working_dir = tempfile.mkdtemp()
shutil.copyfile(os.path.join(FIXTURES, 'sample.pdf'), os.path.join(cls.media_root, str(cls.pdf_file.id)))
cls.pdf_path = os.path.join(cls.working_dir, str(cls.pdf_file.id))
shutil.copyfile(os.path.join(FIXTURES, 'sample.pdf'), cls.pdf_path)
@classmethod
def tearDownClass(cls):
super().tearDownClass()
shutil.rmtree(cls.media_root)
shutil.rmtree(cls.working_dir)
def test_extract_pdf_images_filetype(self):
......@@ -35,7 +34,7 @@ class TestPdf(FixtureTestCase):
Test extract_pdf_images task the file's content type
"""
with self.assertRaises(AssertionError):
extract_pdf_images(self.img_file, self.working_dir)
extract_pdf_images(self.img_file, self.pdf_path, self.working_dir)
def test_extract_pdf_images_exists(self):
"""
......@@ -46,7 +45,7 @@ class TestPdf(FixtureTestCase):
file_mock.exists.return_value = False
with self.assertRaises(AssertionError):
extract_pdf_images(file_mock, self.working_dir)
extract_pdf_images(file_mock, self.pdf_path, self.working_dir)
@patch('arkindex.dataimport.models.s3')
def test_extract_pdf_images_s3_error(self, s3_mock):
......@@ -59,16 +58,14 @@ class TestPdf(FixtureTestCase):
file_mock.exists.side_effect = ClientError({'Error': {'Code': '999'}}, 'head_object')
with self.assertRaises(ClientError):
extract_pdf_images(file_mock, self.working_dir)
extract_pdf_images(file_mock, self.pdf_path, self.working_dir)
@patch('arkindex.dataimport.models.s3')
def test_extract_pdf_images(self, s3_mock):
"""
Test extract_pdf_images runs ImageMagick and returns proper info
"""
s3_mock.Object.return_value.download_file.side_effect = \
lambda path: shutil.copyfile(os.path.join(FIXTURES, 'sample.pdf'), path)
result = extract_pdf_images(self.pdf_file, self.working_dir)
result = extract_pdf_images(self.pdf_file, self.pdf_path, self.working_dir)
self.assertListEqual(result, [
os.path.join(self.working_dir, 'pdf-0000.jpg'),
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment