diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d1249bac93680ebe8ca5f229b44ac2cdea5fc836..b0d47adb63c4e6003a04e32c0dee0e7c40a99305 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,4 +1,4 @@ -image: registry.gitlab.com/arkindex/backend:base-0.9.3 +image: registry.gitlab.com/arkindex/backend:base-0.9.4 stages: - test - build diff --git a/Dockerfile b/Dockerfile index ec9bd5704d7d939e1c35a0faea93dd3624468e93..f3bdcc71278cb7186093854af2e6159d1e5ccee1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM registry.gitlab.com/arkindex/backend:base-0.9.3 +FROM registry.gitlab.com/arkindex/backend:base-0.9.4 ARG COMMON_BRANCH=master ARG COMMON_ID=9855787 diff --git a/arkindex/dataimport/api.py b/arkindex/dataimport/api.py index 14d31776269af891b39e381e6dae3c9ced352045..b24fe8ad621c86cd22fa644e55040a65e2b8adee 100644 --- a/arkindex/dataimport/api.py +++ b/arkindex/dataimport/api.py @@ -140,6 +140,7 @@ class DataImportFromFiles(CreateAPIView): # Serializer validation codes returns a volume for 'volume_id' volume = serializer.validated_data.get('volume_id') volume_name = serializer.validated_data.get('volume_name') + pdf_engine = serializer.validated_data.get('pdf_engine') if volume: # The files' corpus is already validated as writable @@ -159,13 +160,19 @@ class DataImportFromFiles(CreateAPIView): ) volume.events.create(type=EventType.Addition) + payload = { + 'volume_id': str(volume.id), + 'folder_name': str(volume.id), + } + + if mode == DataImportMode.PDF: + assert pdf_engine is not None, 'PDF engine does not exist' + payload['pdf_engine'] = pdf_engine.value + self.dataimport = corpus.imports.create( creator=self.request.user, mode=mode, - payload={ - 'volume_id': str(volume.id), - 'folder_name': str(volume.id), - }, + payload=payload, ) for f in files: self.dataimport.files.add(f) diff --git a/arkindex/dataimport/management/commands/import.py b/arkindex/dataimport/management/commands/import.py index 
90cac6835c403a7559c1f4ff5b21fe6eb11444c9..65106cce007e3148a48b3b625c49cbb18bda4764 100644 --- a/arkindex/dataimport/management/commands/import.py +++ b/arkindex/dataimport/management/commands/import.py @@ -66,7 +66,7 @@ class Command(BaseCommand): assert pdf_file.content_type == 'application/pdf', 'File is not a PDF' # Extract images from the PDF into the task working dir and get their paths - images = extract_pdf_images(pdf_file, pdf_path, task_dir) + images = extract_pdf_images(pdf_file, pdf_path, task_dir, data_import.pdf_engine) # Add those images to the volume # all linked to the original pdf file diff --git a/arkindex/dataimport/models.py b/arkindex/dataimport/models.py index 8851870b4e326e9b7e3ff04ca54e38262c2333dd..9708e2823d1131597aee784712b2b1bc6c0c3808 100644 --- a/arkindex/dataimport/models.py +++ b/arkindex/dataimport/models.py @@ -25,6 +25,11 @@ class DataImportMode(Enum): Repository = 'repository' +class DataImportPDFEngine(Enum): + Convert = 'convert' + Poppler = 'poppler' + + class DataImport(IndexableModel): """ A single import workflow @@ -52,6 +57,14 @@ class DataImport(IndexableModel): else: return self.workflow.state + @property + def pdf_engine(self): + if not self.payload: + return + if not self.payload.get('pdf_engine'): + return DataImportPDFEngine.Convert + return DataImportPDFEngine(self.payload.get('pdf_engine')) + def build_workflow(self): ''' Create a ponos workflow with a recipe according to configuration diff --git a/arkindex/dataimport/serializers.py b/arkindex/dataimport/serializers.py index 4a2d25f597881736f61c2e2abb7db0259ba0cbf4..4a4f606cded3a8250ab0f1fc3390decf0ee6de01 100644 --- a/arkindex/dataimport/serializers.py +++ b/arkindex/dataimport/serializers.py @@ -3,7 +3,7 @@ from rest_framework.utils import model_meta from arkindex.project.serializer_fields import EnumField from arkindex.dataimport.models import ( DataImport, DataImportMode, DataImportFailure, DataFile, - Repository, Revision, Event, EventType + Repository, 
Revision, Event, EventType, DataImportPDFEngine ) from arkindex.documents.models import Corpus, Element, ElementType from arkindex.documents.serializers.light import ElementLightSerializer @@ -39,7 +39,7 @@ class DataImportLightSerializer(serializers.ModelSerializer): read_only_fields = ('id', 'state') -class FilesPayloadSerializer(serializers.Serializer): +class ImagesPayloadSerializer(serializers.Serializer): """ Serialize an image importing payload """ @@ -48,6 +48,14 @@ class FilesPayloadSerializer(serializers.Serializer): volume_id = serializers.CharField() +class PDFPayloadSerializer(ImagesPayloadSerializer): + """ + Serialize a pdf importing payload + """ + + pdf_engine = EnumField(DataImportPDFEngine, default=DataImportPDFEngine.Convert) + + class RevisionSerializer(serializers.ModelSerializer): """ Serialize a repository revision @@ -85,8 +93,8 @@ class DataImportSerializer(DataImportLightSerializer): if self.instance is None: return payload_serializers = { - DataImportMode.Images: FilesPayloadSerializer, - DataImportMode.PDF: FilesPayloadSerializer, + DataImportMode.Images: ImagesPayloadSerializer, + DataImportMode.PDF: PDFPayloadSerializer, } self.fields['payload'] = payload_serializers.get(self.instance.mode, serializers.JSONField)() @@ -121,6 +129,7 @@ class DataImportFromFilesSerializer(serializers.Serializer): files = serializers.PrimaryKeyRelatedField(queryset=DataFile.objects.all(), many=True) volume_id = serializers.UUIDField(required=False, allow_null=True) volume_name = serializers.CharField(max_length=250, required=False, allow_null=True) + pdf_engine = EnumField(DataImportPDFEngine, default=DataImportPDFEngine.Convert) def validate_mode(self, mode): if mode not in (DataImportMode.Images, DataImportMode.PDF): diff --git a/arkindex/dataimport/tasks/pdf.py b/arkindex/dataimport/tasks/pdf.py index dd03bb913e487b952655929d84e74a87ae0c9cd7..89a1bbbad8d7d4b78d5004f353899d95f072a91c 100644 --- a/arkindex/dataimport/tasks/pdf.py +++ 
b/arkindex/dataimport/tasks/pdf.py @@ -2,14 +2,37 @@ import distutils.spawn import glob import os import subprocess +import logging +from pdf2image import convert_from_path +from arkindex.dataimport.models import DataImportPDFEngine +from arkindex.project.tools import Timer +logger = logging.getLogger(__name__) -def extract_pdf_images(pdf_file, pdf_path, working_dir): + +def extract_pdf_images(pdf_file, pdf_path, working_dir, engine=DataImportPDFEngine.Convert): + assert pdf_file.content_type == 'application/pdf', 'File is not a PDF' + assert pdf_file.exists(), 'File does not exist' + + methods = { + DataImportPDFEngine.Convert: extract_pdf_images_convert, + DataImportPDFEngine.Poppler: extract_pdf_images_poppler, + } + + assert engine in methods, 'Unsupported engine {}'.format(str(engine)) + + logger.info('Convert PDF file with {}'.format(str(engine))) + method = methods[engine] + with Timer() as t: + images = method(pdf_file, pdf_path, working_dir) + logger.info('Time {}'.format(str(t.delta))) + return images + + +def extract_pdf_images_convert(pdf_file, pdf_path, working_dir): """ Convert a PDF file to a list of images """ - assert pdf_file.content_type == 'application/pdf', 'File is not a PDF' - assert pdf_file.exists(), 'File does not exist' assert distutils.spawn.find_executable('convert'), 'Missing convert in PATH' if not os.path.exists(working_dir): @@ -24,3 +47,20 @@ def extract_pdf_images(pdf_file, pdf_path, working_dir): # Dump all the images in the working dir return sorted(glob.glob(os.path.join(working_dir, '*.jpg'))) + + +def extract_pdf_images_poppler(pdf_file, pdf_path, working_dir): + """ + Convert a PDF file to a list of images with poppler + """ + assert distutils.spawn.find_executable('pdfimages'), 'Missing pdfimages in PATH' + + if not os.path.exists(working_dir): + os.makedirs(working_dir) + + images = convert_from_path(pdf_path) + for i, img in enumerate(images): + img.save(os.path.join(working_dir, 'pdf-{}.jpg'.format(i))) + + # Dump all 
the images in the working dir + return sorted(glob.glob(os.path.join(working_dir, '*.jpg'))) diff --git a/arkindex/dataimport/tests/test_imports.py b/arkindex/dataimport/tests/test_imports.py index 2fe9e385a7791bac92e56e2e902109f6e26c749d..1ed4ccac22341d0af09cdc7ef69b1423114af9e6 100644 --- a/arkindex/dataimport/tests/test_imports.py +++ b/arkindex/dataimport/tests/test_imports.py @@ -1,7 +1,7 @@ from rest_framework import status from django.urls import reverse from arkindex.dataimport.models import \ - DataImport, DataImportMode, DataFile + DataImport, DataImportMode, DataFile, DataImportPDFEngine from arkindex.documents.models import Element, ElementType, Corpus from arkindex.project.tests import FixtureAPITestCase from ponos.models import State @@ -230,6 +230,42 @@ class TestImports(FixtureAPITestCase): self.assertIn('folder_name', dataimport.payload) self.assertEqual(Element.objects.get(id=dataimport.payload['volume_id']).name, 'Import pdf test') + def test_from_files_pdf_convert(self): + self.client.force_login(self.user) + response = self.client.post( + reverse('api:import-from-files'), + {'files': [str(self.pdf_df.id)], 'mode': 'pdf', 'pdf_engine': 'convert'}, + format='json', + ) + self.assertEqual(response.status_code, status.HTTP_201_CREATED) + data = response.json() + dataimport = DataImport.objects.get(id=data['id']) + self.assertEqual(dataimport.mode, DataImportMode.PDF) + self.assertListEqual(list(dataimport.files.all()), [self.pdf_df]) + self.assertIn('volume_id', dataimport.payload) + self.assertIn('folder_name', dataimport.payload) + self.assertIn('pdf_engine', dataimport.payload) + self.assertEqual(dataimport.pdf_engine, DataImportPDFEngine.Convert) + self.assertEqual(Element.objects.get(id=dataimport.payload['volume_id']).name, 'Import pdf test') + + def test_from_files_pdf_poppler(self): + self.client.force_login(self.user) + response = self.client.post( + reverse('api:import-from-files'), + {'files': [str(self.pdf_df.id)], 'mode': 'pdf', 
'pdf_engine': 'poppler'}, + format='json', + ) + self.assertEqual(response.status_code, status.HTTP_201_CREATED) + data = response.json() + dataimport = DataImport.objects.get(id=data['id']) + self.assertEqual(dataimport.mode, DataImportMode.PDF) + self.assertListEqual(list(dataimport.files.all()), [self.pdf_df]) + self.assertIn('volume_id', dataimport.payload) + self.assertIn('folder_name', dataimport.payload) + self.assertIn('pdf_engine', dataimport.payload) + self.assertEqual(dataimport.pdf_engine, DataImportPDFEngine.Poppler) + self.assertEqual(Element.objects.get(id=dataimport.payload['volume_id']).name, 'Import pdf test') + def test_from_files_invalid_mode(self): self.client.force_login(self.user) response = self.client.post( diff --git a/arkindex/dataimport/tests/test_pdf.py b/arkindex/dataimport/tests/test_pdf.py index 7e641aa8fbd41ab92dff76ca3339e932e0b77e4a..da87d98f1f223b72d799d91d43ee946a2cd44569 100644 --- a/arkindex/dataimport/tests/test_pdf.py +++ b/arkindex/dataimport/tests/test_pdf.py @@ -1,9 +1,11 @@ from arkindex.project.tests import FixtureTestCase from arkindex.dataimport.tasks import extract_pdf_images +from arkindex.dataimport.models import DataImportPDFEngine from unittest.mock import patch, MagicMock from botocore.exceptions import ClientError import tempfile import shutil +import glob import os.path FIXTURES = os.path.join( @@ -61,7 +63,7 @@ class TestPdf(FixtureTestCase): extract_pdf_images(file_mock, self.pdf_path, self.working_dir) @patch('arkindex.dataimport.models.s3') - def test_extract_pdf_images(self, s3_mock): + def test_extract_pdf_images_with_convert(self, s3_mock): """ Test extract_pdf_images runs ImageMagick and returns proper info """ @@ -71,3 +73,19 @@ class TestPdf(FixtureTestCase): os.path.join(self.working_dir, 'pdf-0000.jpg'), os.path.join(self.working_dir, 'pdf-0001.jpg'), ]) + + @patch('arkindex.dataimport.models.s3') + def test_extract_pdf_images_with_poppler(self, s3_mock): + """ + Test extract_pdf_images runs 
Poppler (pdf2image) and returns proper info + """ + oldImages = glob.glob(os.path.join(self.working_dir, '*.jpg')) + for img in oldImages: + os.remove(os.path.join(self.working_dir, img)) + + result = extract_pdf_images(self.pdf_file, self.pdf_path, self.working_dir, DataImportPDFEngine.Poppler) + + self.assertListEqual(result, [ + os.path.join(self.working_dir, 'pdf-0.jpg'), + os.path.join(self.working_dir, 'pdf-1.jpg') + ]) diff --git a/arkindex/project/serializer_fields.py b/arkindex/project/serializer_fields.py index 6c3df8d5c75c23fca4e988199d5e02ea254a54e9..bc03c4f3eb8cbc8ed014c713e803995d6a14a32b 100644 --- a/arkindex/project/serializer_fields.py +++ b/arkindex/project/serializer_fields.py @@ -18,6 +18,8 @@ class EnumField(serializers.ChoiceField): super().__init__(choices, *args, **kwargs) def to_representation(self, obj): + if not isinstance(obj, self.enum): + obj = self.to_internal_value(obj) return obj.value def to_internal_value(self, data): diff --git a/base/Dockerfile b/base/Dockerfile index ef1d48c8f3808f1d5d2c413c2f6589899fdf34ef..40ddccc93f8411067a8492c6afd21526b3d14c73 100644 --- a/base/Dockerfile +++ b/base/Dockerfile @@ -19,7 +19,7 @@ COPY --from=staging /build /usr ENV PYTHONPATH=/usr/lib/python3.6/site-packages # Add runtime system deps -RUN apk add --update --no-cache wget gzip libmagic git unzip libpq libxslt libjpeg imagemagick +RUN apk add --update --no-cache wget gzip libmagic git unzip libpq libxslt libjpeg imagemagick poppler-utils # Add unprivilegied user RUN addgroup -g 1000 teklia && adduser -D -u 1000 -G teklia ark diff --git a/requirements.txt b/requirements.txt index 10dd5e991ad912f1250fcf1da4011d8b2c710665..da4456f39860e61f32fde0aff7a507cc2440a7af 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,6 +12,7 @@ gitpython==2.1.11 idna==2.6 jdcal==1.3 olefile==0.44 +pdf2image==1.5.1 python-gitlab==1.7.0 python-magic==0.4.15 python-memcached==1.59