Skip to content
Snippets Groups Projects
Commit a11ed3b6 authored by Bastien Abadie's avatar Bastien Abadie
Browse files

Merge branch 'import-pdf' into 'master'

PDF import

See merge request !132
parents d77303d3 79bad2e6
No related branches found
No related tags found
1 merge request!132PDF import
......@@ -3,7 +3,7 @@ stages:
backend-tests:
stage: test
image: registry.gitlab.com/arkindex/backend:base-0.8.0
image: registry.gitlab.com/arkindex/backend:base-0.8.7
services:
- postgres:latest
......
FROM registry.gitlab.com/arkindex/backend:base-0.8.0
FROM registry.gitlab.com/arkindex/backend:base-0.8.7
ARG FRONTEND_BRANCH=master
ARG GITLAB_TOKEN="gTPA5UQYesSuKMCRM2r_"
......
......@@ -26,6 +26,7 @@ from arkindex.users.models import OAuthCredentials
from datetime import datetime
import hashlib
import magic
import os.path
class DataImportsList(CorpusACLMixin, ListAPIView):
......@@ -95,16 +96,21 @@ class DataImportFromFiles(CreateAPIView):
def perform_create(self, serializer):
files = serializer.validated_data['files']
corpus = files[0].corpus
mode = serializer.validated_data['mode']
suffix = datetime.today().strftime('%Y-%m-%d %H:%M')
if mode == DataImportMode.PDF:
suffix = os.path.splitext(files[0].name)[0]
volume = corpus.elements.create(
name='Import images {:%Y-%m-%d %H:%M}'.format(datetime.today()),
name='Import {} {}'.format(mode.value, suffix),
type=ElementType.Volume,
)
volume.events.create(type=EventType.Addition)
self.dataimport = DataImport.objects.create(
self.dataimport = corpus.imports.create(
creator=self.request.user,
corpus=corpus,
state=DataImportState.Configured,
mode=DataImportMode.Images,
mode=mode,
payload={
'volume_id': str(volume.id),
'folder_name': str(volume.id),
......
......@@ -24,6 +24,7 @@ class DataImportState(Enum):
class DataImportMode(Enum):
Images = 'images'
PDF = 'pdf'
Repository = 'repository'
......@@ -69,22 +70,29 @@ class DataImport(IndexableModel):
return self.tasks[-1].result
def build_workflow(self):
if self.mode == DataImportMode.Images:
from arkindex.dataimport.tasks import check_images, import_images, save_ml_results
analyze = celery_app.signature('arkindex_ml.tasks.analyze_pages')
workflow = check_images.s(self) | import_images.s(self) | analyze
workflow |= save_ml_results.s(dataimport_id=self.id)
return workflow
elif self.mode == DataImportMode.Repository:
if self.mode == DataImportMode.Repository:
assert self.revision, 'A Revision instance is required for repository imports'
assert self.revision.repo.enabled, 'Repository does not have any credentials'
from arkindex.dataimport.tasks import clone_repo, diff_repo, dispatch_imports, cleanup_repo
return clone_repo.si(self) | diff_repo.si(self) | dispatch_imports.s(self) | cleanup_repo.si(self)
from arkindex.dataimport.tasks import import_pdf, check_images, import_images, build_volume, save_ml_results
from arkindex.documents.tasks import generate_thumbnail
if self.mode == DataImportMode.Images:
workflow = check_images.s(self) | import_images.s(self)
elif self.mode == DataImportMode.PDF:
workflow = import_pdf.s(self)
else:
raise NotImplementedError
analyze = celery_app.signature('arkindex_ml.tasks.analyze_pages')
workflow |= build_volume.s(self)
workflow |= analyze
workflow |= save_ml_results.s(dataimport_id=self.id)
workflow |= generate_thumbnail.si(self.payload['volume_id'])
return workflow
def get_task_count(self, signature):
assert isinstance(signature, Signature)
......
......@@ -165,8 +165,14 @@ class DataImportSerializer(DataImportLightSerializer):
class DataImportFromFilesSerializer(serializers.Serializer):
mode = EnumField(DataImportMode, default=DataImportMode.Images)
files = serializers.PrimaryKeyRelatedField(queryset=DataFile.objects.all(), many=True)
def validate_mode(self, mode):
if mode not in (DataImportMode.Images, DataImportMode.PDF):
raise serializers.ValidationError("This mode is not allowed when importing from files")
return mode
def validate_files(self, files):
corpora = set(f.corpus for f in files)
if len(corpora) > 1:
......@@ -176,6 +182,22 @@ class DataImportFromFilesSerializer(serializers.Serializer):
raise serializers.ValidationError('Cannot write in corpus')
return files
def validate(self, data):
if data['mode'] == DataImportMode.PDF:
if len(data['files']) > 1:
raise serializers.ValidationError('Cannot import more than one PDF at once')
if data['files'][0].content_type != 'application/pdf':
raise serializers.ValidationError('PDF imports can only import PDF files')
elif data['mode'] == DataImportMode.Images:
if not all(f.content_type.startswith('image/') for f in data['files']):
raise serializers.ValidationError('Image imports can only import image files')
else:
raise NotImplementedError
return data
class DataFileSerializer(serializers.ModelSerializer):
"""
......
......@@ -22,6 +22,8 @@ import logging
import shutil
import urllib.parse
import git
import subprocess
import distutils.spawn
root_logger = logging.getLogger(__name__)
logger = get_task_logger(__name__)
......@@ -65,35 +67,53 @@ def check_images(self, dataimport):
return valid_files
ImportedImage = namedtuple('ImportedImage', 'datafile, path, iiif_path')
@shared_task(bind=True, base=ReportingTask)
def import_images(self, valid_files, dataimport, server_id=settings.LOCAL_IMAGESERVER_ID):
def import_pdf(self, dataimport):
"""
Convert a PDF file directly into the IIIF server
"""
assert isinstance(dataimport, DataImport)
self.report_progress(0, 'Pre import checks...')
try:
server = ImageServer.objects.get(id=server_id)
except ImageServer.DoesNotExist:
raise ValueError('Image server {} does not exist'.format(server_id))
assert dataimport.files.count() == 1, 'Cannot import more than one PDF at once'
datafile = dataimport.files.get()
assert datafile.content_type == 'application/pdf', 'File is not a PDF'
assert os.path.exists(datafile.staging_path), 'File does not exist'
assert distutils.spawn.find_executable('convert'), 'Missing convert in PATH'
# Get volume
try:
volume = Element.objects.get(
type=ElementType.Volume,
corpus=dataimport.corpus,
pk=dataimport.payload['volume_id'],
if not os.path.exists(dataimport.iiif_path):
os.makedirs(dataimport.iiif_path)
cmd = [
'convert', '-density', '300',
'pdf:{}'.format(datafile.staging_path),
os.path.join(dataimport.iiif_path, '%04d.jpg'),
]
subprocess.run(cmd, check=True)
return [
ImportedImage(
datafile,
path,
urllib.parse.urljoin(dataimport.folder_name + '/', os.path.basename(path)),
)
except Element.DoesNotExist:
raise ValueError('Missing volume')
for path in sorted(glob.glob(os.path.join(dataimport.iiif_path, '*')))
]
@shared_task(bind=True, base=ReportingTask)
def import_images(self, valid_files, dataimport):
assert isinstance(dataimport, DataImport)
if not os.path.exists(dataimport.iiif_path):
os.makedirs(dataimport.iiif_path)
datafiles = dataimport.files.all()
pages = []
result = []
for i, datafile in enumerate(datafiles, 1):
self.report_progress(i / len(datafiles), 'Importing image {} of {}'.format(i, len(datafiles)))
pillow_img = Image.open(datafile.staging_path)
width, height = pillow_img.size
# Non-JPEG image formats that should not be converted
# Will default to .jpg if the image format is not in there
......@@ -117,8 +137,45 @@ def import_images(self, valid_files, dataimport, server_id=settings.LOCAL_IMAGES
destination_path = os.path.join(dataimport.iiif_path, newfilename)
pillow_img.save(destination_path)
result.append(ImportedImage(
datafile,
destination_path,
urllib.parse.urljoin(dataimport.folder_name + '/', newfilename),
))
assert len(result) > 0, 'No imported files'
return result
@shared_task(bind=True, base=ReportingTask)
def build_volume(self, files, dataimport, server_id=settings.LOCAL_IMAGESERVER_ID):
self.report_progress(0, 'Pre import checks...')
assert len(files) > 0, 'No files to import'
try:
server = ImageServer.objects.get(id=server_id)
except ImageServer.DoesNotExist:
raise ValueError('Image server {} does not exist'.format(server_id))
# Get volume
try:
volume = Element.objects.get(
type=ElementType.Volume,
corpus=dataimport.corpus,
pk=dataimport.payload['volume_id'],
)
except Element.DoesNotExist:
raise ValueError('Missing volume')
pages, count = [], len(files)
for i, (datafile, path, iiif_path) in enumerate(files):
self.report_progress(i / count, 'Adding page {} of {}'.format(i + 1, count))
pillow_img = Image.open(path)
width, height = pillow_img.size
img, _ = server.images.get_or_create(
path=urllib.parse.urljoin(dataimport.folder_name + '/', newfilename),
path=iiif_path,
defaults={
'width': width,
'height': height,
......@@ -131,8 +188,8 @@ def import_images(self, valid_files, dataimport, server_id=settings.LOCAL_IMAGES
page.events.create(type=EventType.Addition)
pages.append((page.id, img.get_thumbnail_url(max_width=500)))
assert len(pages) > 0, 'No imported files'
self.report_message("Imported files into {}".format(volume))
assert len(pages) > 0, 'No imported pages'
self.report_message("Imported {} pages into {}".format(len(pages), volume))
return {
'volume': str(volume.id),
'pages': pages,
......
File added
......@@ -6,6 +6,7 @@ from arkindex.dataimport.models import EventType, DataImportMode, DataImportStat
from arkindex.dataimport.iiif import ManifestParser
import os.path
import git
import tempfile
import shutil
FIXTURES = os.path.join(
......@@ -23,6 +24,16 @@ class TestManifestParser(RedisMockMixin, FixtureTestCase):
cls.repo = cls.creds.repos.get()
cls.rev = cls.repo.revisions.get()
@classmethod
def setUpClass(cls):
super().setUpClass()
cls.repo_dir = tempfile.mkdtemp()
@classmethod
def tearDownClass(cls):
super().tearDownClass()
shutil.rmtree(cls.repo_dir)
def _assert_first_import(self, first_rev):
"""
Check importing base.json
......@@ -197,18 +208,14 @@ class TestManifestParser(RedisMockMixin, FixtureTestCase):
Import manifest files from a Git repo
"""
# Create a Git repo
repo_dir = os.path.join(FIXTURES, 'repo')
if os.path.exists(repo_dir):
shutil.rmtree(repo_dir)
os.makedirs(repo_dir, exist_ok=True)
repo = git.Repo.init(repo_dir)
repo = git.Repo.init(self.repo_dir)
# Prevent cloning from anywhere else but this repo
clone_mock.side_effect = lambda src, dest, **kwargs: repo.clone(dest, **kwargs)
def copy_commit(message, src=[], dst=[]):
src = [os.path.join(FIXTURES, path) for path in src]
dst = [os.path.join(repo_dir, path) for path in dst]
dst = [os.path.join(self.repo_dir, path) for path in dst]
list(map(shutil.copyfile, src, dst))
repo.index.add(dst)
return repo.index.commit(message)
......@@ -258,6 +265,3 @@ class TestManifestParser(RedisMockMixin, FixtureTestCase):
second_rev = run_import(second_commit)
self._assert_second_import(first_rev, second_rev)
self.assertGreater(self.redis.llen('celery'), 0)
# Remove the repo
shutil.rmtree(repo_dir)
from arkindex.project.tests import FixtureTestCase
from arkindex.documents.models import ElementType
from arkindex.dataimport.models import DataImportMode, DataImportState
from arkindex.dataimport.tasks import import_pdf, ImportedImage
import tempfile
import shutil
import os.path
FIXTURES = os.path.join(
os.path.dirname(os.path.realpath(__file__)),
'pdf_samples',
)
class TestImportPdf(FixtureTestCase):
    """
    Tests for the import_pdf Celery task: PDF-specific pre-checks
    (single file, content type, file existence) and the actual
    ImageMagick conversion into per-page JPEG images.
    """

    @classmethod
    def setUpTestData(cls):
        # Build a running PDF DataImport targeting a fresh volume,
        # plus one image file and two PDF DataFiles to exercise the checks.
        super().setUpTestData()
        cls.vol = cls.corpus.elements.create(type=ElementType.Volume, name='A volume')
        cls.di = cls.corpus.imports.create(
            mode=DataImportMode.PDF,
            state=DataImportState.Running,
            creator=cls.user,
            payload={
                'volume_id': str(cls.vol.id),
                # folder_name drives both the staging path and the IIIF path
                'folder_name': 'somewhere',
            },
        )
        cls.img_df = cls.corpus.files.create(name='sample.jpg', size=42, hash='abcd', content_type='image/jpg')
        cls.pdf1 = cls.corpus.files.create(name='sample.pdf', size=42, hash='dcba', content_type='application/pdf')
        cls.pdf2 = cls.corpus.files.create(name='sample.pdf', size=43, hash='1337', content_type='application/pdf')
        # Temporary dirs stand in for MEDIA_ROOT / LOCAL_MEDIA_ROOT;
        # only pdf1 gets an actual file on disk (pdf2 stays missing on purpose)
        cls.media_root = tempfile.mkdtemp()
        cls.local_media_root = tempfile.mkdtemp()
        shutil.copyfile(os.path.join(FIXTURES, 'sample.pdf'), os.path.join(cls.media_root, str(cls.pdf1.id)))

    @classmethod
    def tearDownClass(cls):
        # Remove the temporary media directories created in setUpTestData
        super().tearDownClass()
        shutil.rmtree(cls.media_root)
        shutil.rmtree(cls.local_media_root)

    def test_import_pdf_task_single(self):
        """
        Test the import_pdf task only accepts a single PDF file
        """
        self.di.files.add(self.pdf1)
        self.di.files.add(self.pdf2)
        with self.assertRaises(Exception):
            import_pdf(self.di)

    def test_import_pdf_task_filetype(self):
        """
        Test the import_pdf task checks the file's content type
        """
        self.di.files.add(self.img_df)
        with self.assertRaises(Exception):
            import_pdf(self.di)

    def test_import_pdf_task_exists(self):
        """
        Test the import_pdf task checks the file's existence
        """
        # pdf2 has a DataFile row but no file on disk under MEDIA_ROOT
        self.di.files.add(self.pdf2)
        with self.settings(MEDIA_ROOT=self.media_root):
            with self.assertRaises(Exception):
                import_pdf(self.di)

    def test_import_pdf(self):
        """
        Test the import_pdf task runs ImageMagick and returns proper info
        """
        self.di.files.add(self.pdf1)
        with self.settings(MEDIA_ROOT=self.media_root, LOCAL_MEDIA_ROOT=self.local_media_root):
            result = import_pdf(self.di)
        # The sample fixture has two pages; each becomes an ImportedImage
        self.assertEqual(len(result), 2)
        for i in result:
            self.assertIsInstance(i, ImportedImage)
            self.assertEqual(i.datafile, self.pdf1)
        img1, img2 = result
        # Local paths follow convert's %04d numbering under folder_name
        self.assertEqual(img1.path, os.path.join(self.local_media_root, 'somewhere/0000.jpg'))
        self.assertEqual(img2.path, os.path.join(self.local_media_root, 'somewhere/0001.jpg'))
        self.assertEqual(img1.iiif_path, 'somewhere/0000.jpg')
        self.assertEqual(img2.iiif_path, 'somewhere/0001.jpg')
......@@ -18,8 +18,10 @@ class TestImports(RedisMockMixin, FixtureAPITestCase):
cls.repo = cls.creds.repos.get()
cls.rev = cls.repo.revisions.get()
cls.demo_volume = Element.objects.create(type=ElementType.Volume, name='Demo', corpus=cls.corpus)
cls.df = DataFile.objects.create(
name='test.txt', size=42, hash='aaaa', content_type='text/plain', corpus=cls.corpus)
cls.img_df = DataFile.objects.create(
name='test.jpg', size=42, hash='aaaa', content_type='image/jpg', corpus=cls.corpus)
cls.pdf_df = DataFile.objects.create(
name='test.pdf', size=42, hash='bbbb', content_type='application/pdf', corpus=cls.corpus)
def setUp(self):
super().setUp()
......@@ -54,18 +56,18 @@ class TestImports(RedisMockMixin, FixtureAPITestCase):
self.assertEqual(data['id'], str(self.dataimport.id))
def test_start_demo_requires_login(self):
response = self.client.post(reverse('api:import-demo', kwargs={'pk': self.df.id}))
response = self.client.post(reverse('api:import-demo', kwargs={'pk': self.img_df.id}))
self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
def test_start_demo(self):
self.client.force_login(self.user)
response = self.client.post(reverse('api:import-demo', kwargs={'pk': self.df.id}))
response = self.client.post(reverse('api:import-demo', kwargs={'pk': self.img_df.id}))
self.assertEqual(response.status_code, status.HTTP_201_CREATED)
data = response.json()
dataimport = DataImport.objects.get(id=data['id'])
self.assertEqual(dataimport.mode, DataImportMode.Images)
self.assertEqual(dataimport.state, DataImportState.Running)
self.assertListEqual(list(dataimport.files.all()), [self.df])
self.assertListEqual(list(dataimport.files.all()), [self.img_df])
self.assertEqual(dataimport.payload['volume_id'], str(self.demo_volume.id))
self.assertEqual(dataimport.payload['folder_name'], 'user-{}-demo'.format(self.user.id))
self.assertGreaterEqual(self.redis.llen('celery'), 1)
......@@ -99,17 +101,72 @@ class TestImports(RedisMockMixin, FixtureAPITestCase):
self.assertEqual(self.dataimport.failures.get().message, "Test failure")
def test_from_files_requires_login(self):
response = self.client.post(reverse('api:import-from-files'), {'files': [str(self.df.id)]}, format='json')
response = self.client.post(reverse('api:import-from-files'), {'files': [str(self.img_df.id)]}, format='json')
self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
def test_from_files(self):
self.client.force_login(self.user)
response = self.client.post(reverse('api:import-from-files'), {'files': [str(self.df.id)]}, format='json')
response = self.client.post(reverse('api:import-from-files'), {'files': [str(self.img_df.id)]}, format='json')
self.assertEqual(response.status_code, status.HTTP_201_CREATED)
data = response.json()
dataimport = DataImport.objects.get(id=data['id'])
self.assertEqual(dataimport.mode, DataImportMode.Images)
self.assertListEqual(list(dataimport.files.all()), [self.df])
self.assertListEqual(list(dataimport.files.all()), [self.img_df])
self.assertIn('volume_id', dataimport.payload)
self.assertIn('folder_name', dataimport.payload)
self.assertGreaterEqual(self.redis.llen('celery'), 1)
def test_from_files_pdf(self):
self.client.force_login(self.user)
response = self.client.post(
reverse('api:import-from-files'),
{'files': [str(self.pdf_df.id)], 'mode': 'pdf'},
format='json',
)
self.assertEqual(response.status_code, status.HTTP_201_CREATED)
data = response.json()
dataimport = DataImport.objects.get(id=data['id'])
self.assertEqual(dataimport.mode, DataImportMode.PDF)
self.assertListEqual(list(dataimport.files.all()), [self.pdf_df])
self.assertIn('volume_id', dataimport.payload)
self.assertIn('folder_name', dataimport.payload)
self.assertGreaterEqual(self.redis.llen('celery'), 1)
self.assertEqual(Element.objects.get(id=dataimport.payload['volume_id']).name, 'Import pdf test')
def test_from_files_invalid_mode(self):
self.client.force_login(self.user)
response = self.client.post(
reverse('api:import-from-files'),
{'files': [str(self.pdf_df.id)], 'mode': 'repository'},
format='json',
)
self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
def test_from_files_pdf_wrong_type(self):
self.client.force_login(self.user)
response = self.client.post(
reverse('api:import-from-files'),
{'files': [str(self.img_df.id)], 'mode': 'pdf'},
format='json',
)
self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
def test_from_files_pdf_single(self):
self.client.force_login(self.user)
pdf2 = DataFile.objects.create(
name='test2.pdf', size=1337, hash='0', content_type='application/pdf', corpus=self.corpus)
response = self.client.post(
reverse('api:import-from-files'),
{'files': [str(self.pdf_df.id), str(pdf2.id)], 'mode': 'pdf'},
format='json',
)
self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
def test_from_files_images_wrong_type(self):
self.client.force_login(self.user)
response = self.client.post(
reverse('api:import-from-files'),
{'files': [str(self.pdf_df.id)]},
format='json',
)
self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
from django.views.generic import TemplateView, DetailView
from django.contrib.auth.mixins import LoginRequiredMixin
from arkindex.documents.models import Corpus
from arkindex.dataimport.models import DataImport, DataImportState
......@@ -18,7 +19,9 @@ class DataImportStatus(LoginRequiredMixin, DetailView):
context_object_name = 'dataimport'
def get_queryset(self):
return DataImport.objects.filter(creator=self.request.user).exclude(
return DataImport.objects.filter(
corpus__in=Corpus.objects.readable(self.request.user),
).exclude(
state__in=[DataImportState.Created, DataImportState.Configured],
)
......@@ -31,7 +34,9 @@ class DataImportFailures(LoginRequiredMixin, DetailView):
context_object_name = 'dataimport'
def get_queryset(self):
return DataImport.objects.filter(creator=self.request.user).exclude(
return DataImport.objects.filter(
corpus__in=Corpus.objects.readable(self.request.user),
).exclude(
state__in=[DataImportState.Created, DataImportState.Configured],
)
......
......@@ -19,7 +19,7 @@ COPY --from=staging /build /usr
ENV PYTHONPATH=/usr/lib/python3.5/site-packages
# Add runtime system deps
RUN apk add --update --no-cache wget gzip libmagic git unzip libpq libxslt libjpeg
RUN apk add --update --no-cache wget gzip libmagic git unzip libpq libxslt libjpeg imagemagick
# Add unprivilegied user
RUN addgroup -g 1000 teklia && adduser -D -u 1000 -G teklia ark
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment