diff --git a/arkindex/dataimport/git.py b/arkindex/dataimport/git.py
index 02938476affb1279fcdbd5f4630781d7e52aa63f..f48a2649678f93710b8078adee8deae6719f4954 100644
--- a/arkindex/dataimport/git.py
+++ b/arkindex/dataimport/git.py
@@ -1,12 +1,14 @@
 from collections import namedtuple
 from enum import Enum
 from django.core.exceptions import ValidationError
+from arkindex_common.ml_tool import MLToolType
 from arkindex_common.enums import DataImportMode
 from arkindex.documents.models import Element, ElementType, Page
 from arkindex.dataimport.config import ConfigFile, ImportType
 from arkindex.dataimport.models import DataImport
 from arkindex.dataimport.filetypes import FileType
 from ponos.models import State
+import json
 import logging
 import os
 import shutil
@@ -47,6 +49,7 @@ class GitFlow(object):
         assert os.path.isdir(working_directory), \
             'Invalid working directory {}'.format(working_directory)
 
+        self.base_dir = working_directory
         self.repo_dir = os.path.join(working_directory, 'repo')
         self.img_dir = os.path.join(working_directory, 'git_images')
 
@@ -59,6 +62,7 @@ class GitFlow(object):
         diffs = self.diff()
         pages = self.dispatch_imports(diffs)
         self.cleanup()
+        self.setup_ml_analysis(pages)
         return pages
 
     def parse(self):
@@ -217,3 +221,36 @@ class GitFlow(object):
     def cleanup(self):
         logger.info('Cleaning up...')
         shutil.rmtree(self.repo_dir)
+
+    def setup_ml_analysis(self, pages):
+        """
+        Write the JSON config file for the ML workers
+        """
+        config = {
+            'pages': [
+                {
+                    'id': str(page.id),
+                    'path': path,
+                }
+                for page, path in pages
+            ],
+            'tools': [],
+        }
+
+        if self.config.classifier:
+            config['tools'].append({
+                'tool': MLToolType.Classifier.value,
+                'slug': self.config.classifier.slug,
+            })
+        if self.config.recognizer:
+            config['tools'].append({
+                'tool': MLToolType.Recognizer.value,
+                'slug': self.config.recognizer.slug,
+            })
+
+        if not config['tools']:
+            # TODO: Abort the ML task when there are no tools
+            return
+
+        with open(os.path.join(self.base_dir, 'ml_analysis.json'), 'w') as f:
+            json.dump(config, f, indent=4)
diff --git a/arkindex/dataimport/management/commands/import.py b/arkindex/dataimport/management/commands/import.py
index c89b92a12cd60879537f33632e13b900811a3cd0..a382f20c5e9d0277bed2caa7d743b362a4dff6d2 100644
--- a/arkindex/dataimport/management/commands/import.py
+++ b/arkindex/dataimport/management/commands/import.py
@@ -1,18 +1,9 @@
 #!/usr/bin/env python3
 from django.core.management.base import BaseCommand
-from arkindex_common.ml_tool import MLToolType
 from arkindex_common.enums import DataImportMode
 from arkindex.project.argparse import DataImportArgument
-from arkindex.dataimport.tasks import (
-    download_files,
-    check_images,
-    extract_pdf_images,
-    populate_volume,
-    setup_ml_analysis,
-)
 from arkindex.dataimport.git import GitFlow
 import tempfile
-import json
 import os
 import logging
 import shutil
@@ -35,8 +26,6 @@ class Command(BaseCommand):
         )
 
     def handle(self, *args, data_import=None, **options):
-        ml_tools = data_import.ml_tools
-
         # Use shared directory when running in docker
         # Fallback to a temp directory while developing
         task_dir = os.environ.get('PONOS_DATA', tempfile.mkdtemp(suffix='-ponos'))
@@ -46,59 +35,10 @@ class Command(BaseCommand):
         # Use temp folder for anything that does not need to be shared
         temp_dir = tempfile.mkdtemp(suffix='-ponostmp')
         logger.debug('Using temp dir: {}'.format(temp_dir))
+        assert data_import.mode == DataImportMode.Repository, \
+            'Only Repository imports are supported via this command'
 
-        pages, files = None, None
-        if data_import.files.exists():
-            files = download_files(data_import, temp_dir)
-
-        if data_import.mode == DataImportMode.Images:
-            # Validate images from data import
-            files = check_images(files)
-
-        elif data_import.mode == DataImportMode.PDF:
-            assert len(files) == 1, 'Only one file in PDF mode'
-            pdf_file, pdf_path = files[0]
-            assert pdf_file.content_type == 'application/pdf', 'File is not a PDF'
-
-            # Extract images from the PDF into the task working dir and get their paths
-            images = extract_pdf_images(pdf_file, pdf_path, task_dir, data_import.pdf_engine)
-
-            # Add those images to the volume
-            # all linked to the original pdf file
-            files = [
-                (pdf_file, img_path)
-                for img_path in images
-            ]
-
-        elif data_import.mode == DataImportMode.Repository:
-            git = GitFlow(data_import, task_dir)
-            pages = git.run()
-
-            # use ml_tools from Repo
-            ml_tools = []
-            if git.config.classifier:
-                ml_tools.append((MLToolType.Classifier, git.config.classifier.slug))
-            if git.config.recognizer:
-                ml_tools.append((MLToolType.Recognizer, git.config.recognizer.slug))
-
-        else:
-            # Should never happen
-            raise NotImplementedError
-
-        # Load all image files into the volume
-        if pages is None and files is not None:
-
-            # Load or create volume from dataimport
-            volume = data_import.get_volume()
-            logger.info('Using volume: {}'.format(volume))
-
-            pages = populate_volume(volume, files)
+        git = GitFlow(data_import, task_dir)
+        git.run()
 
         shutil.rmtree(temp_dir)
-
-        # Setup the analysis process through a json config
-        ml_analysis = os.path.join(task_dir, 'ml_analysis.json')
-        if os.path.exists(ml_analysis) or not ml_tools:
-            return
-        with open(ml_analysis, 'w') as f:
-            json.dump(setup_ml_analysis(pages, ml_tools), f, indent=4)
diff --git a/arkindex/dataimport/models.py b/arkindex/dataimport/models.py
index 55e60b8ca0d37fa98ab951f9a905c9c17ddb7d24..1e7407bdab3e5cc3f5c10ae09db3e71d1aa2df8c 100644
--- a/arkindex/dataimport/models.py
+++ b/arkindex/dataimport/models.py
@@ -60,12 +60,20 @@ class DataImport(IndexableModel):
             raise ValidationError('Git repository does not have any valid credentials')
 
         # Import data in Arkindex, then do the ML analysis
-        tasks = {
-            'import': {
-                'image': settings.ARKINDEX_APP_IMAGE,
-                'command': 'manage.py import {}'.format(self.id),
-            },
-        }
+        if self.mode == DataImportMode.Repository:
+            tasks = {
+                'import': {
+                    'image': settings.ARKINDEX_APP_IMAGE,
+                    'command': 'manage.py import {}'.format(self.id),
+                },
+            }
+        else:
+            tasks = {
+                'import': {
+                    'image': settings.ARKINDEX_TASKS_IMAGE,
+                    'command': 'python -m arkindex_tasks.import_files {}'.format(self.id),
+                },
+            }
 
         if self.mode == DataImportMode.Repository or self.ml_tools:
             # Add the ML task if ML is required - for Git, let the Git import decide
diff --git a/arkindex/dataimport/tasks/__init__.py b/arkindex/dataimport/tasks/__init__.py
deleted file mode 100644
index ec6c2212b09aa0d8d8202112cd8955ba306714d9..0000000000000000000000000000000000000000
--- a/arkindex/dataimport/tasks/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# flake8: noqa
-
-from arkindex.dataimport.tasks.base import download_files, populate_volume, setup_ml_analysis
-from arkindex.dataimport.tasks.image import check_images, build_iiif_image
-from arkindex.dataimport.tasks.pdf import extract_pdf_images
diff --git a/arkindex/dataimport/tasks/base.py b/arkindex/dataimport/tasks/base.py
deleted file mode 100644
index ab4dfb354ac5383af289c020ab846737f9e0a8bb..0000000000000000000000000000000000000000
--- a/arkindex/dataimport/tasks/base.py
+++ /dev/null
@@ -1,89 +0,0 @@
-from arkindex.documents.models import Element, ElementType
-from arkindex.documents.importer import import_page
-from arkindex.dataimport.tasks.image import build_iiif_image
-from arkindex.dataimport.models import EventType, DataImport
-from botocore.exceptions import ClientError
-import logging
-import os.path
-
-logger = logging.getLogger(__name__)
-
-
-def download_files(dataimport, dest_dir):
-    assert isinstance(dataimport, DataImport)
-    assert os.access(dest_dir, os.W_OK | os.X_OK), 'Destination directory is read-only'
-
-    datafiles = dataimport.files.all()
-    valid_files = []
-    filecount = len(datafiles)
-
-    if filecount < 1:
-        logger.info('No files to download - skipping')
-        return []
-
-    for i, datafile in enumerate(datafiles):
-        logger.info('Downloading file {} of {}'.format(i + 1, filecount))
-        path = os.path.join(dest_dir, datafile.s3_key)
-
-        try:
-            datafile.download_to(path)
-        except (IOError, ClientError) as e:
-            logger.warning('Failed downloading file {} ({}): {}'.format(datafile.name, str(datafile.id), str(e)))
-            continue
-
-        valid_files.append((datafile, path))
-
-    assert len(valid_files) > 0, 'No files were successfully downloaded'
-    return valid_files
-
-
-def populate_volume(volume, files):
-    '''
-    Import files into the volume, and post on IIIF server
-    TODO: this could use an API endpoint to ingest a new page
-    '''
-    logger.info('Pre import checks...')
-    assert len(files) > 0, 'No files to import'
-    assert isinstance(volume, Element)
-    assert volume.type == ElementType.Volume
-
-    pages, count = [], len(files)
-    for i, (data_file, staging_path) in enumerate(files):
-        logger.info('Adding page {} of {}'.format(i + 1, count))
-
-        # Build local IIIF image
-        img = build_iiif_image(volume, staging_path, data_file, suffix=str(i))
-
-        # Build page with image
-        page = import_page(volume, img, volume.name)
-        page.events.create(type=EventType.Addition)
-        pages.append((page, img.s3_url))
-
-    assert len(pages) > 0, 'No imported pages'
-    logger.info("Imported {} pages into {}".format(len(pages), volume.name))
-    volume.generate_thumbnail()
-
-    return pages
-
-
-def setup_ml_analysis(pages, tools):
-    '''
-    Build a JSON serializable configuration for ML analysis
-    Using the structure from populate_volume
-    '''
-    return {
-        'pages': [
-            {
-                'id': str(page.id),
-                'path': path,
-            }
-            for page, path in pages
-        ],
-        'tools': [
-            {
-                'tool': tool.value,
-                'slug': slug,
-            }
-            for tool, slug in tools
-        ]
-    }
diff --git a/arkindex/dataimport/tasks/image.py b/arkindex/dataimport/tasks/image.py
deleted file mode 100644
index 9e71c991851d169bfcaee972644d390a06ca89e8..0000000000000000000000000000000000000000
--- a/arkindex/dataimport/tasks/image.py
+++ /dev/null
@@ -1,90 +0,0 @@
-from PIL import Image as PillowImage
-from arkindex.dataimport.models import DataFile
-from arkindex.images.models import ImageServer, Image
-from arkindex.documents.models import Element, ElementType
-from arkindex.project.aws import S3FileStatus
-from urllib.parse import quote
-import logging
-
-logger = logging.getLogger(__name__)
-
-
-def check_images(files):
-    assert len(files), 'No files to check'
-
-    valid_files = []
-    filecount = len(files)
-
-    for i, (datafile, path) in enumerate(files):
-        logger.info("Checking image {} of {}".format(i + 1, filecount))
-
-        try:
-            img = PillowImage.open(path)
-            assert max(img.size) >= 500, "Image {} is too small".format(datafile.name)
-        except IOError:
-            logger.warn("File {} is not a valid image".format(datafile.name))
-            continue
-        except AssertionError as e:
-            logger.warn(str(e))
-            continue
-
-        valid_files.append((datafile, path))
-
-    assert len(valid_files) > 0, "No valid images in selected files"
-    return valid_files
-
-
-def build_iiif_image(volume, path, data_file, suffix=None):
-    '''
-    Import a staging image into the local IIIF server
-    '''
-    assert isinstance(volume, Element)
-    assert volume.type == ElementType.Volume
-    assert isinstance(path, str)
-    assert isinstance(data_file, DataFile)
-
-    pillow_img = PillowImage.open(path)
-    width, height = pillow_img.size
-
-    # Non-JPEG image formats that should not be converted
-    # Will default to .jpg if the image format is not in there
-    # Formats are Pillow images formats,
-    # see https://pillow.readthedocs.io/en/5.1.x/handbook/image-file-formats.html
-    known_exts = {
-        "JPEG2000": ".jp2",
-        "PNG": ".png",
-        "TIFF": ".tif",
-    }
-    img_format = pillow_img.format
-    if img_format not in known_exts:
-        img_format = 'JPEG'
-    ext = known_exts.get(img_format, '.jpg')
-
-    # Build image path
-    filename = str(data_file.id)
-    if suffix is not None:
-        filename += '-{}'.format(suffix)
-    filename += ext
-    iiif_path = quote('{}/{}'.format(volume.id, filename), safe='')
-
-    # Get Image instance
-    try:
-        img = ImageServer.objects.local.images.get(path=iiif_path)
-    except Image.DoesNotExist:
-        img = Image(
-            server=ImageServer.objects.local,
-            path=iiif_path,
-            width=width,
-            height=height,
-            datafile=data_file,
-        )
-
-    if img.exists():
-        logger.warning('Image already exists on the IIIF server')
-    else:
-        # Save to S3 using optional image type conversion
-        img.pillow_save(pillow_img, format=img_format)
-        img.status = S3FileStatus.Checked
-        img.save()
-
-    return img
diff --git a/arkindex/dataimport/tasks/pdf.py b/arkindex/dataimport/tasks/pdf.py
deleted file mode 100644
index f3550bd7ee0168e9178c5575e2891ae734b9291e..0000000000000000000000000000000000000000
--- a/arkindex/dataimport/tasks/pdf.py
+++ /dev/null
@@ -1,70 +0,0 @@
-import distutils.spawn
-import glob
-import os
-import subprocess
-import logging
-from pdf2image import convert_from_path
-from arkindex_common.enums import DataImportPDFEngine
-from arkindex_common.tools import Timer
-
-logger = logging.getLogger(__name__)
-
-
-def extract_pdf_images(pdf_file, pdf_path, working_dir, engine=DataImportPDFEngine.Convert):
-    assert pdf_file.content_type == 'application/pdf', 'File is not a PDF'
-    assert pdf_file.exists(), 'File does not exist'
-
-    methods = {
-        DataImportPDFEngine.Convert: extract_pdf_images_convert,
-        DataImportPDFEngine.Poppler: extract_pdf_images_poppler,
-    }
-
-    assert engine in methods, 'Unsupported engine {}'.format(str(engine))
-
-    logger.info('Convert PDF file with {}'.format(str(engine)))
-    method = methods[engine]
-    with Timer() as t:
-        images = method(pdf_file, pdf_path, working_dir)
-    logger.info('Time {}'.format(str(t.delta)))
-    return images
-
-
-def extract_pdf_images_convert(pdf_file, pdf_path, working_dir):
-    """
-    Convert a PDF file to a list of images
-    """
-    assert distutils.spawn.find_executable('convert'), 'Missing convert in PATH'
-
-    if not os.path.exists(working_dir):
-        os.makedirs(working_dir)
-
-    cmd = [
-        'convert', '-density', '300',
-        'pdf:{}'.format(pdf_path),
-        os.path.join(working_dir, 'pdf-%04d.jpg'),
-    ]
-    subprocess.run(cmd, check=True)
-
-    # Dump all the images in the working dir
-    return sorted(glob.glob(os.path.join(working_dir, '*.jpg')))
-
-
-def extract_pdf_images_poppler(pdf_file, pdf_path, working_dir):
-    """
-    Convert a PDF file to a list of images with poppler
-    """
-    assert distutils.spawn.find_executable('pdfimages'), 'Missing pdfimages in PATH'
-
-    if not os.path.exists(working_dir):
-        os.makedirs(working_dir)
-
-    convert_from_path(
-        pdf_path,
-        output_folder=working_dir,
-        output_file='pdf',  # The pdf- prefix on images
-        dpi=300,
-        fmt='jpg',
-    )
-
-    # Dump all the images in the working dir
-    return sorted(glob.glob(os.path.join(working_dir, '*.jpg')))
diff --git a/arkindex/dataimport/tests/test_image.py b/arkindex/dataimport/tests/test_image.py
deleted file mode 100644
index 059142eaedb7287dbfa733bb25f7c3309f87e91a..0000000000000000000000000000000000000000
--- a/arkindex/dataimport/tests/test_image.py
+++ /dev/null
@@ -1,91 +0,0 @@
-from unittest.mock import patch, call
-from arkindex.project.tests import FixtureTestCase
-from arkindex.documents.models import ElementType
-from arkindex.dataimport.tasks import check_images, build_iiif_image
-from arkindex.project.aws import S3FileStatus
-from botocore.exceptions import ClientError
-
-
-class TestImageTasks(FixtureTestCase):
-
-    @classmethod
-    def setUpTestData(cls):
-        super().setUpTestData()
-        cls.vol = cls.corpus.elements.create(name='Test volume', type=ElementType.Volume)
-        cls.df = cls.corpus.files.create(
-            name='file.jpg',
-            size=1234,
-            hash='cafe',
-            content_type='image/jpeg',
-        )
-
-    @patch('arkindex.dataimport.tasks.image.PillowImage')
-    def test_check_images(self, image_mock):
-        image_mock.open.return_value.size = (1000, 1000)
-
-        result = check_images([(self.df, '/some/path'), ])
-
-        self.assertEqual(image_mock.open.call_count, 1)
-        self.assertEqual(image_mock.open.call_args, call('/some/path'))
-        self.assertListEqual(result, [
-            (self.df, '/some/path'),
-        ])
-
-    @patch('arkindex.project.aws.s3.Object')
-    @patch('arkindex.dataimport.tasks.image.PillowImage')
-    def test_build_iiif_image(self, image_mock, s3obj_mock):
-        image_mock.open.return_value.format = 'BMP'
-        image_mock.open.return_value.size = (400, 900)
-        s3obj_mock.return_value.load.side_effect = ClientError({'Error': {'Code': '404'}}, 'head_object')
-
-        with self.settings(LOCAL_IMAGESERVER_ID=self.imgsrv.id, AWS_IIIF_BUCKET='iiif'):
-            img = build_iiif_image(
-                self.vol,
-                '/somewhere/Untitled.bmp',
-                self.df,
-                suffix='42',
-            )
-
-        expected_path = '{}/{}-42.jpg'.format(str(self.vol.id), str(self.df.id))
-        self.assertEqual(image_mock.open.call_count, 1)
-        self.assertEqual(image_mock.open.call_args, call('/somewhere/Untitled.bmp'))
-        self.assertEqual(s3obj_mock.call_count, 1)
-        self.assertEqual(s3obj_mock.call_args, call('iiif', expected_path))
-        self.assertEqual(s3obj_mock().load.call_count, 1)
-        self.assertEqual(image_mock.open().save.call_count, 1)
-        self.assertEqual(s3obj_mock().upload_fileobj.call_count, 1)
-        self.assertEqual(img.server, self.imgsrv)
-        self.assertEqual(img.path, expected_path.replace('/', '%2F'))
-        self.assertEqual(img.width, 400)
-        self.assertEqual(img.height, 900)
-        self.assertEqual(img.status, S3FileStatus.Checked)
-        self.assertEqual(img.datafile, self.df)
-
-    @patch('arkindex.project.aws.s3.Object')
-    @patch('arkindex.dataimport.tasks.image.PillowImage')
-    def test_build_iiif_image_retry(self, image_mock, s3obj_mock):
-        """
-        Test build_iiif_image just returns existing images if they already exist
-        """
-        image_mock.open.return_value.format = 'JPEG2000'
-        image_mock.open.return_value.size = (400, 900)
-        original_img = self.imgsrv.images.create(
-            path='{}%2F{}.jp2'.format(str(self.vol.id), str(self.df.id)),
-            datafile=self.df,
-            width=900,
-            height=400,
-            status=S3FileStatus.Checked,
-        )
-
-        with self.settings(LOCAL_IMAGESERVER_ID=self.imgsrv.id, AWS_IIIF_BUCKET='iiif'):
-            new_img = build_iiif_image(
-                self.vol,
-                '/somewhere/Untitled.bmp',
-                self.df,
-            )
-
-        self.assertEqual(original_img.id, new_img.id)
-        self.assertEqual(image_mock.open.call_count, 1)
-        self.assertEqual(image_mock.open.call_args, call('/somewhere/Untitled.bmp'))
-        self.assertEqual(s3obj_mock().load.call_count, 1)
-        self.assertFalse(image_mock.open().save.called)
diff --git a/arkindex/dataimport/tests/test_pdf.py b/arkindex/dataimport/tests/test_pdf.py
deleted file mode 100644
index bbb882295c16aed9677fa60e967448ba9e9e3f4b..0000000000000000000000000000000000000000
--- a/arkindex/dataimport/tests/test_pdf.py
+++ /dev/null
@@ -1,91 +0,0 @@
-from arkindex_common.enums import DataImportPDFEngine
-from arkindex.project.tests import FixtureTestCase
-from arkindex.dataimport.tasks import extract_pdf_images
-from unittest.mock import patch, MagicMock
-from botocore.exceptions import ClientError
-import tempfile
-import shutil
-import glob
-import os.path
-
-FIXTURES = os.path.join(
-    os.path.dirname(os.path.realpath(__file__)),
-    'pdf_samples',
-)
-
-
-class TestPdf(FixtureTestCase):
-
-    @classmethod
-    def setUpTestData(cls):
-        super().setUpTestData()
-        cls.img_file = cls.corpus.files.create(name='sample.jpg', size=42, hash='abcd', content_type='image/jpg')
-        cls.pdf_file = cls.corpus.files.create(name='sample.pdf', size=42, hash='dcba', content_type='application/pdf')
-
-        cls.working_dir = tempfile.mkdtemp()
-        cls.pdf_path = os.path.join(cls.working_dir, str(cls.pdf_file.id))
-        shutil.copyfile(os.path.join(FIXTURES, 'sample.pdf'), cls.pdf_path)
-
-    @classmethod
-    def tearDownClass(cls):
-        super().tearDownClass()
-        shutil.rmtree(cls.working_dir)
-
-    def test_extract_pdf_images_filetype(self):
-        """
-        Test extract_pdf_images task checks the file's content type
-        """
-        with self.assertRaises(AssertionError):
-            extract_pdf_images(self.img_file, self.pdf_path, self.working_dir)
-
-    def test_extract_pdf_images_exists(self):
-        """
-        Test extract_pdf_images task checks the file's existence
-        """
-        file_mock = MagicMock()
-        file_mock.content_type = 'application/pdf'
-        file_mock.exists.return_value = False
-
-        with self.assertRaises(AssertionError):
-            extract_pdf_images(file_mock, self.pdf_path, self.working_dir)
-
-    @patch('arkindex.project.aws.s3')
-    def test_extract_pdf_images_s3_error(self, s3_mock):
-        """
-        Test extract_pdf_images task lets S3 errors through
-        """
-        file_mock = MagicMock()
-        file_mock.content_type = 'application/pdf'
-        # Any ClientError with a code other than 404
-        file_mock.exists.side_effect = ClientError({'Error': {'Code': '999'}}, 'head_object')
-
-        with self.assertRaises(ClientError):
-            extract_pdf_images(file_mock, self.pdf_path, self.working_dir)
-
-    @patch('arkindex.project.aws.s3')
-    def test_extract_pdf_images_with_convert(self, s3_mock):
-        """
-        Test extract_pdf_images runs ImageMagick and returns proper info
-        """
-        result = extract_pdf_images(self.pdf_file, self.pdf_path, self.working_dir)
-
-        self.assertListEqual(result, [
-            os.path.join(self.working_dir, 'pdf-0000.jpg'),
-            os.path.join(self.working_dir, 'pdf-0001.jpg'),
-        ])
-
-    @patch('arkindex.project.aws.s3')
-    def test_extract_pdf_images_with_poppler(self, s3_mock):
-        """
-        Test extract_pdf_images runs Poppler and returns proper info
-        """
-        oldImages = glob.glob(os.path.join(self.working_dir, '*.jpg'))
-        for img in oldImages:
-            os.remove(os.path.join(self.working_dir, img))
-
-        result = extract_pdf_images(self.pdf_file, self.pdf_path, self.working_dir, DataImportPDFEngine.Poppler)
-
-        self.assertListEqual(result, [
-            os.path.join(self.working_dir, 'pdf-1.jpg'),
-            os.path.join(self.working_dir, 'pdf-2.jpg')
-        ])
diff --git a/arkindex/dataimport/tests/test_tasks.py b/arkindex/dataimport/tests/test_tasks.py
deleted file mode 100644
index 553d66c83900f1c6dadfba6c91e55f982a551658..0000000000000000000000000000000000000000
--- a/arkindex/dataimport/tests/test_tasks.py
+++ /dev/null
@@ -1,105 +0,0 @@
-from unittest.mock import patch, call
-from arkindex_common.enums import DataImportMode
-from arkindex.project.tests import FixtureTestCase
-from arkindex.dataimport.tasks import download_files
-
-
-class TestTasks(FixtureTestCase):
-
-    @classmethod
-    def setUpTestData(cls):
-        super().setUpTestData()
-        cls.df1 = cls.corpus.files.create(
-            name='file1.jpg',
-            size=1234,
-            hash='cafe',
-            content_type='image/jpeg',
-        )
-        cls.df2 = cls.corpus.files.create(
-            name='file2.png',
-            size=5678,
-            hash='beef',
-            content_type='image/png',
-        )
-        cls.di = cls.corpus.imports.create(
-            mode=DataImportMode.Images,
-            creator=cls.user,
-        )
-
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-        cls.s3obj_patch = patch('arkindex.project.aws.s3.Object')
-        cls.s3obj_mock = cls.s3obj_patch.start()
-        cls.access_patch = patch('arkindex.dataimport.tasks.base.os.access')
-        cls.access_mock = cls.access_patch.start()
-
-    @classmethod
-    def tearDownClass(cls):
-        cls.s3obj_patch.stop()
-        cls.access_patch.stop()
-        super().tearDownClass()
-
-    def setUp(self):
-        super().setUp()
-        self.s3obj_mock.reset_mock()
-        self.access_mock.reset_mock()
-        self.access_mock.return_value = True
-        self.s3obj_mock.return_value.download_file.side_effect = None
-
-    def test_download_files(self):
-        self.di.files.set([self.df1])
-        expected_path = '/somewhere/{}'.format(str(self.df1.id))
-        self.assertListEqual(
-            download_files(self.di, '/somewhere'),
-            [(self.df1, expected_path)],
-        )
-        self.assertEqual(self.s3obj_mock().download_file.call_count, 1)
-        self.assertEqual(self.s3obj_mock().download_file.call_args, call(expected_path))
-
-    def test_download_files_fail(self):
-        self.di.files.set([self.df1, self.df2])
-        # Fail only once
-        self.s3obj_mock().download_file.side_effect = [IOError, None]
-
-        expected_path1 = '/somewhere/{}'.format(str(self.df1.id))
-        expected_path2 = '/somewhere/{}'.format(str(self.df2.id))
-
-        self.assertListEqual(
-            download_files(self.di, '/somewhere'),
-            [(self.df2, expected_path2)],
-        )
-
-        self.assertEqual(self.s3obj_mock().download_file.call_count, 2)
-        self.assertEqual(self.s3obj_mock().download_file.call_args_list, [
-            call(expected_path1),
-            call(expected_path2),
-        ])
-
-    def test_download_files_epic_fail(self):
-        self.di.files.set([self.df1, self.df2])
-        # Fail all the time
-        self.s3obj_mock().download_file.side_effect = IOError
-
-        expected_path1 = '/somewhere/{}'.format(str(self.df1.id))
-        expected_path2 = '/somewhere/{}'.format(str(self.df2.id))
-
-        with self.assertRaisesRegex(AssertionError, 'No files'):
-            download_files(self.di, '/somewhere')
-
-        self.assertEqual(self.s3obj_mock().download_file.call_count, 2)
-        self.assertEqual(self.s3obj_mock().download_file.call_args_list, [
-            call(expected_path1),
-            call(expected_path2),
-        ])
-
-    def test_download_files_empty(self):
-        self.di.files.set([])
-        self.assertListEqual(download_files(self.di, '/somewhere'), [])
-        self.assertFalse(self.s3obj_mock.called)
-
-    def test_download_files_read_only(self):
-        self.access_mock.return_value = False
-        self.di.files.set([self.df1])
-        with self.assertRaisesRegex(AssertionError, 'read-only'):
-            download_files(self.di, '/somewhere')
diff --git a/arkindex/project/checks.py b/arkindex/project/checks.py
index d63eb1ef7f9f0bb76d5873f2601578649ebd7978..8109259f61a10640de359ac0d300e25b23850f1a 100644
--- a/arkindex/project/checks.py
+++ b/arkindex/project/checks.py
@@ -114,6 +114,7 @@ def docker_images_check(*args, **kwargs):
     images = (
         (settings.ARKINDEX_APP_IMAGE, 'ARKINDEX_APP_IMAGE'),
         (settings.ARKINDEX_ML_IMAGE, 'ARKINDEX_ML_IMAGE'),
+        (settings.ARKINDEX_TASKS_IMAGE, 'ARKINDEX_TASKS_IMAGE'),
     )
     for image_tag, setting_name in images:
         try:
diff --git a/arkindex/project/settings.py b/arkindex/project/settings.py
index 5238babadc316bce08713b380c89693b6503bca9..12cc187bde0d7536f987d0f25fa0f467da23dbaa 100644
--- a/arkindex/project/settings.py
+++ b/arkindex/project/settings.py
@@ -393,6 +393,7 @@ CORS_URLS_REGEX = r'^/(api|ponos)/.*$'
 # Docker images used by our ponos workflow
 ARKINDEX_APP_IMAGE = os.environ.get('ARKINDEX_APP_IMAGE', 'arkindex-app')
 ARKINDEX_ML_IMAGE = os.environ.get('ARKINDEX_ML_IMAGE', 'arkindex-worker-ml')
+ARKINDEX_TASKS_IMAGE = os.environ.get('ARKINDEX_TASKS_IMAGE', 'arkindex-tasks')
 
 # ML worker
 ML_DEFAULT_CLASSIFIER = 'tobacco'
diff --git a/arkindex/project/tests/test_checks.py b/arkindex/project/tests/test_checks.py
index 97e88213a5f8d99f88d7cd811a152b98781baa75..13e5fa5c66012fb746bad18bbd02fc26da74871d 100644
--- a/arkindex/project/tests/test_checks.py
+++ b/arkindex/project/tests/test_checks.py
@@ -99,21 +99,15 @@ class ChecksTestCase(TestCase):
             srv.delete()
 
     @patch('arkindex.project.checks.subprocess.run')
+    @override_settings(
+        ARKINDEX_APP_IMAGE='nope',
+        ARKINDEX_ML_IMAGE='me-neither',
+        ARKINDEX_TASKS_IMAGE='nuh',
+    )
     def test_docker_images_check(self, run_mock):
         from arkindex.project.checks import docker_images_check
 
-        run_mock.side_effect = [CalledProcessError(1, ''), None]
-        with self.settings(ARKINDEX_APP_IMAGE='nope', ARKINDEX_ML_IMAGE='me-neither'):
-            self.assertListEqual(docker_images_check(), [
-                Error(
-                    'Docker image with tag "nope" was not found.',
-                    hint='settings.ARKINDEX_APP_IMAGE = "nope"',
-                    id='arkindex.E006',
-                )
-            ])
-
-        self.assertEqual(run_mock.call_count, 2)
-        self.assertEqual(run_mock.call_args_list, [
+        expected_calls = [
             call(
                 ['docker', 'image', 'inspect', 'nope'],
                 stdout=subprocess.PIPE,
@@ -126,35 +120,44 @@ class ChecksTestCase(TestCase):
                 stderr=subprocess.STDOUT,
                 check=True,
             ),
-        ])
-
-        run_mock.reset_mock()
-        run_mock.side_effect = [None, CalledProcessError(1, '')]
-        with self.settings(ARKINDEX_APP_IMAGE='nope', ARKINDEX_ML_IMAGE='me-neither'):
-            self.assertListEqual(docker_images_check(), [
-                Error(
-                    'Docker image with tag "me-neither" was not found.',
-                    hint='settings.ARKINDEX_ML_IMAGE = "me-neither"',
-                    id='arkindex.E006',
-                )
-            ])
-
-        self.assertEqual(run_mock.call_count, 2)
-        self.assertEqual(run_mock.call_args_list, [
             call(
-                ['docker', 'image', 'inspect', 'nope'],
+                ['docker', 'image', 'inspect', 'nuh'],
                 stdout=subprocess.PIPE,
                 stderr=subprocess.STDOUT,
                 check=True,
             ),
-            call(
-                ['docker', 'image', 'inspect', 'me-neither'],
-                stdout=subprocess.PIPE,
-                stderr=subprocess.STDOUT,
-                check=True,
+        ]
+
+        run_mock.side_effect = [CalledProcessError(1, ''), None, None]
+        self.assertListEqual(docker_images_check(), [
+            Error(
+                'Docker image with tag "nope" was not found.',
+                hint='settings.ARKINDEX_APP_IMAGE = "nope"',
+                id='arkindex.E006',
+            )
+        ])
+
+        self.assertEqual(run_mock.call_count, 3)
+        self.assertEqual(run_mock.call_args_list, expected_calls)
+
+        run_mock.reset_mock()
+        run_mock.side_effect = [None, CalledProcessError(1, ''), CalledProcessError(1, '')]
+        self.assertListEqual(docker_images_check(), [
+            Error(
+                'Docker image with tag "me-neither" was not found.',
+                hint='settings.ARKINDEX_ML_IMAGE = "me-neither"',
+                id='arkindex.E006',
             ),
+            Error(
+                'Docker image with tag "nuh" was not found.',
+                hint='settings.ARKINDEX_TASKS_IMAGE = "nuh"',
+                id='arkindex.E006',
+            )
         ])
 
+        self.assertEqual(run_mock.call_count, 3)
+        self.assertEqual(run_mock.call_args_list, expected_calls)
+
     @patch('arkindex.project.checks.subprocess.run')
     def test_docker_images_check_missing_client(self, run_mock):
         """
@@ -163,10 +166,10 @@ class ChecksTestCase(TestCase):
         from arkindex.project.checks import docker_images_check
 
         run_mock.side_effect = FileNotFoundError
-        with self.settings(ARKINDEX_APP_IMAGE='nope', ARKINDEX_ML_IMAGE='me-neither'):
+        with self.settings(ARKINDEX_APP_IMAGE='nope', ARKINDEX_ML_IMAGE='me-neither', ARKINDEX_TASKS_IMAGE='nuh'):
             self.assertListEqual(docker_images_check(), [])
 
-        self.assertEqual(run_mock.call_count, 2)
+        self.assertEqual(run_mock.call_count, 3)
         self.assertEqual(run_mock.call_args_list, [
             call(
                 ['docker', 'image', 'inspect', 'nope'],
@@ -180,6 +183,12 @@ class ChecksTestCase(TestCase):
                 stderr=subprocess.STDOUT,
                 check=True,
             ),
+            call(
+                ['docker', 'image', 'inspect', 'nuh'],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,
+                check=True,
+            ),
         ])
 
     @patch('arkindex.project.checks.parse_recipe')
diff --git a/base/Dockerfile b/base/Dockerfile
index 40ddccc93f8411067a8492c6afd21526b3d14c73..ef1d48c8f3808f1d5d2c413c2f6589899fdf34ef 100644
--- a/base/Dockerfile
+++ b/base/Dockerfile
@@ -19,7 +19,7 @@ COPY --from=staging /build /usr
 ENV PYTHONPATH=/usr/lib/python3.6/site-packages
 
 # Add runtime system deps
-RUN apk add --update --no-cache wget gzip libmagic git unzip libpq libxslt libjpeg imagemagick poppler-utils
+RUN apk add --update --no-cache wget gzip libmagic git unzip libpq libxslt libjpeg imagemagick
 
 # Add unprivileged user
 RUN addgroup -g 1000 teklia && adduser -D -u 1000 -G teklia ark
diff --git a/requirements.txt b/requirements.txt
index 58b5c2dabe260ebe46094f5281bad3694f587f09..7f406b55fdcfe49bcef22ffaeaee9fcfa1478ee4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,7 +13,6 @@ gitpython==2.1.11
 idna==2.6
 jdcal==1.3
 olefile==0.44
-pdf2image==1.5.1
 python-gitlab==1.7.0
 python-magic==0.4.15
 python-memcached==1.59
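
For reference, a minimal sketch of how a downstream ML task might consume the ml_analysis.json file that GitFlow.setup_ml_analysis now writes. The load_ml_config helper is hypothetical and not part of this patch; only the file name and the 'pages'/'tools' structure come from the diff above.

import json
import os


def load_ml_config(task_dir):
    """Read the ML analysis config written by GitFlow.setup_ml_analysis.

    Returns None when no config file exists, i.e. the repository configured
    neither a classifier nor a recognizer (see the TODO in setup_ml_analysis).
    """
    path = os.path.join(task_dir, 'ml_analysis.json')
    if not os.path.exists(path):
        # No tools were configured, so the ML task has nothing to do
        return None
    with open(path) as f:
        config = json.load(f)
    # config['pages'] holds {'id': <page UUID>, 'path': <image path>} entries;
    # config['tools'] holds {'tool': <MLToolType value>, 'slug': <tool slug>} entries
    return config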