diff --git a/arkindex/dataimport/git.py b/arkindex/dataimport/git.py
index 02938476affb1279fcdbd5f4630781d7e52aa63f..f48a2649678f93710b8078adee8deae6719f4954 100644
--- a/arkindex/dataimport/git.py
+++ b/arkindex/dataimport/git.py
@@ -1,12 +1,14 @@
 from collections import namedtuple
 from enum import Enum
 from django.core.exceptions import ValidationError
+from arkindex_common.ml_tool import MLToolType
 from arkindex_common.enums import DataImportMode
 from arkindex.documents.models import Element, ElementType, Page
 from arkindex.dataimport.config import ConfigFile, ImportType
 from arkindex.dataimport.models import DataImport
 from arkindex.dataimport.filetypes import FileType
 from ponos.models import State
+import json
 import logging
 import os
 import shutil
@@ -47,6 +49,7 @@ class GitFlow(object):
         assert os.path.isdir(working_directory), \
             'Invalid working directory {}'.format(working_directory)
 
+        self.base_dir = working_directory
         self.repo_dir = os.path.join(working_directory, 'repo')
         self.img_dir = os.path.join(working_directory, 'git_images')
 
@@ -59,6 +62,7 @@ class GitFlow(object):
         diffs = self.diff()
         pages = self.dispatch_imports(diffs)
         self.cleanup()
+        self.setup_ml_analysis(pages)
         return pages
 
     def parse(self):
@@ -217,3 +221,36 @@ class GitFlow(object):
     def cleanup(self):
         logger.info('Cleaning up...')
         shutil.rmtree(self.repo_dir)
+
+    def setup_ml_analysis(self, pages):
+        """
+        Write the JSON config file for the ML workers
+        """
+        config = {
+            'pages': [
+                {
+                    'id': str(page.id),
+                    'path': path,
+                }
+                for page, path in pages
+            ],
+            'tools': [],
+        }
+
+        if self.config.classifier:
+            config['tools'].append({
+                'tool': MLToolType.Classifier.value,
+                'slug': self.config.classifier.slug,
+            })
+        if self.config.recognizer:
+            config['tools'].append({
+                'tool': MLToolType.Recognizer.value,
+                'slug': self.config.recognizer.slug,
+            })
+
+        if not config['tools']:
+            # TODO: Abort the ML task when there are no tools
+            return
+
+        with open(os.path.join(self.base_dir, 'ml_analysis.json'), 'w') as f:
+            json.dump(config, f, indent=4)
diff --git a/arkindex/dataimport/management/commands/import.py b/arkindex/dataimport/management/commands/import.py
index c89b92a12cd60879537f33632e13b900811a3cd0..a382f20c5e9d0277bed2caa7d743b362a4dff6d2 100644
--- a/arkindex/dataimport/management/commands/import.py
+++ b/arkindex/dataimport/management/commands/import.py
@@ -1,18 +1,9 @@
 #!/usr/bin/env python3
 from django.core.management.base import BaseCommand
-from arkindex_common.ml_tool import MLToolType
 from arkindex_common.enums import DataImportMode
 from arkindex.project.argparse import DataImportArgument
-from arkindex.dataimport.tasks import (
-    download_files,
-    check_images,
-    extract_pdf_images,
-    populate_volume,
-    setup_ml_analysis,
-)
 from arkindex.dataimport.git import GitFlow
 import tempfile
-import json
 import os
 import logging
 import shutil
@@ -35,8 +26,6 @@ class Command(BaseCommand):
         )
 
     def handle(self, *args, data_import=None, **options):
-        ml_tools = data_import.ml_tools
-
         # Use shared directory when running in docker
         # Fallback to a temp directory while developing
         task_dir = os.environ.get('PONOS_DATA', tempfile.mkdtemp(suffix='-ponos'))
@@ -46,59 +35,10 @@ class Command(BaseCommand):
         # Use temp folder for anything that does not need to be shared
         temp_dir = tempfile.mkdtemp(suffix='-ponostmp')
         logger.debug('Using temp dir: {}'.format(temp_dir))
+        assert data_import.mode == DataImportMode.Repository, \
+            'Only Repository imports are supported via this command'
 
-        pages, files = None, None
-        if data_import.files.exists():
-            files = download_files(data_import, temp_dir)
-
-        if data_import.mode == DataImportMode.Images:
-            # Validate images from data import
-            files = check_images(files)
-
-        elif data_import.mode == DataImportMode.PDF:
-            assert len(files) == 1, 'Only one file in PDF mode'
-            pdf_file, pdf_path = files[0]
-            assert pdf_file.content_type == 'application/pdf', 'File is not a PDF'
-
-            # Extract images from the PDF into the task working dir and get their paths
-            images = extract_pdf_images(pdf_file, pdf_path, task_dir, data_import.pdf_engine)
-
-            # Add those images to the volume
-            # all linked to the original pdf file
-            files = [
-                (pdf_file, img_path)
-                for img_path in images
-            ]
-
-        elif data_import.mode == DataImportMode.Repository:
-            git = GitFlow(data_import, task_dir)
-            pages = git.run()
-
-            # use ml_tools from Repo
-            ml_tools = []
-            if git.config.classifier:
-                ml_tools.append((MLToolType.Classifier, git.config.classifier.slug))
-            if git.config.recognizer:
-                ml_tools.append((MLToolType.Recognizer, git.config.recognizer.slug))
-
-        else:
-            # Should never happen
-            raise NotImplementedError
-
-        # Load all image files into the volume
-        if pages is None and files is not None:
-
-            # Load or create volume from dataimport
-            volume = data_import.get_volume()
-            logger.info('Using volume: {}'.format(volume))
-
-            pages = populate_volume(volume, files)
+        git = GitFlow(data_import, task_dir)
+        git.run()
 
         shutil.rmtree(temp_dir)
-
-        # Setup the analysis process through a json config
-        ml_analysis = os.path.join(task_dir, 'ml_analysis.json')
-        if os.path.exists(ml_analysis) or not ml_tools:
-            return
-        with open(ml_analysis, 'w') as f:
-            json.dump(setup_ml_analysis(pages, ml_tools), f, indent=4)
diff --git a/arkindex/dataimport/models.py b/arkindex/dataimport/models.py
index 55e60b8ca0d37fa98ab951f9a905c9c17ddb7d24..1e7407bdab3e5cc3f5c10ae09db3e71d1aa2df8c 100644
--- a/arkindex/dataimport/models.py
+++ b/arkindex/dataimport/models.py
@@ -60,12 +60,20 @@ class DataImport(IndexableModel):
             raise ValidationError('Git repository does not have any valid credentials')
 
         # Import data in Arkindex, then do the ML analysis
-        tasks = {
-            'import': {
-                'image': settings.ARKINDEX_APP_IMAGE,
-                'command': 'manage.py import {}'.format(self.id),
-            },
-        }
+        if self.mode == DataImportMode.Repository:
+            tasks = {
+                'import': {
+                    'image': settings.ARKINDEX_APP_IMAGE,
+                    'command': 'manage.py import {}'.format(self.id),
+                },
+            }
+        else:
+            tasks = {
+                'import': {
+                    'image': settings.ARKINDEX_TASKS_IMAGE,
+                    'command': 'python -m arkindex_tasks.import_files {}'.format(self.id),
+                },
+            }
 
         if self.mode == DataImportMode.Repository or self.ml_tools:
             # Add the ML task if ML is required - for Git, let the Git import decide
diff --git a/arkindex/dataimport/tasks/__init__.py b/arkindex/dataimport/tasks/__init__.py
deleted file mode 100644
index ec6c2212b09aa0d8d8202112cd8955ba306714d9..0000000000000000000000000000000000000000
--- a/arkindex/dataimport/tasks/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# flake8: noqa
-
-from arkindex.dataimport.tasks.base import download_files, populate_volume, setup_ml_analysis
-from arkindex.dataimport.tasks.image import check_images, build_iiif_image
-from arkindex.dataimport.tasks.pdf import extract_pdf_images
diff --git a/arkindex/dataimport/tasks/base.py b/arkindex/dataimport/tasks/base.py
deleted file mode 100644
index ab4dfb354ac5383af289c020ab846737f9e0a8bb..0000000000000000000000000000000000000000
--- a/arkindex/dataimport/tasks/base.py
+++ /dev/null
@@ -1,89 +0,0 @@
-from arkindex.documents.models import Element, ElementType
-from arkindex.documents.importer import import_page
-from arkindex.dataimport.tasks.image import build_iiif_image
-from arkindex.dataimport.models import EventType, DataImport
-from botocore.exceptions import ClientError
-import logging
-import os.path
-
-logger = logging.getLogger(__name__)
-
-
-def download_files(dataimport, dest_dir):
-    assert isinstance(dataimport, DataImport)
-    assert os.access(dest_dir, os.W_OK | os.X_OK), 'Destination directory is read-only'
-
-    datafiles = dataimport.files.all()
-    valid_files = []
-    filecount = len(datafiles)
-
-    if filecount < 1:
-        logger.info('No files to download - skipping')
-        return []
-
-    for i, datafile in enumerate(datafiles):
-        logger.info('Downloading file {} of {}'.format(i + 1, filecount))
-        path = os.path.join(dest_dir, datafile.s3_key)
-
-        try:
-            datafile.download_to(path)
-        except (IOError, ClientError) as e:
-            logger.warning('Failed downloading file {} ({}): {}'.format(datafile.name, str(datafile.id), str(e)))
-            continue
-
-        valid_files.append((datafile, path))
-
-    assert len(valid_files) > 0, 'No files were successfully downloaded'
-    return valid_files
-
-
-def populate_volume(volume, files):
-    '''
-    Import files into the volume, and post on IIIF server
-    TODO: this could use an API endpoint to ingest a new page
-    '''
-    logger.info('Pre import checks...')
-    assert len(files) > 0, 'No files to import'
-    assert isinstance(volume, Element)
-    assert volume.type == ElementType.Volume
-
-    pages, count = [], len(files)
-    for i, (data_file, staging_path) in enumerate(files):
-        logger.info('Adding page {} of {}'.format(i + 1, count))
-
-        # Build local IIIF image
-        img = build_iiif_image(volume, staging_path, data_file, suffix=str(i))
-
-        # Build page with image
-        page = import_page(volume, img, volume.name)
-        page.events.create(type=EventType.Addition)
-        pages.append((page, img.s3_url))
-
-    assert len(pages) > 0, 'No imported pages'
-    logger.info("Imported {} pages into {}".format(len(pages), volume.name))
-    volume.generate_thumbnail()
-
-    return pages
-
-
-def setup_ml_analysis(pages, tools):
-    '''
-    Build a JSON serializable configuration for ML analysis
-    Using the structure from populate_volume
-    '''
-    return {
-        'pages': [
-            {
-                'id': str(page.id),
-                'path': path,
-            }
-            for page, path in pages
-        ],
-        'tools': [
-            {
-                'tool': tool.value,
-                'slug': slug,
-            }
-            for tool, slug in tools
-        ]
-    }
diff --git a/arkindex/dataimport/tasks/image.py b/arkindex/dataimport/tasks/image.py
deleted file mode 100644
index 9e71c991851d169bfcaee972644d390a06ca89e8..0000000000000000000000000000000000000000
--- a/arkindex/dataimport/tasks/image.py
+++ /dev/null
@@ -1,90 +0,0 @@
-from PIL import Image as PillowImage
-from arkindex.dataimport.models import DataFile
-from arkindex.images.models import ImageServer, Image
-from arkindex.documents.models import Element, ElementType
-from arkindex.project.aws import S3FileStatus
-from urllib.parse import quote
-import logging
-
-logger = logging.getLogger(__name__)
-
-
-def check_images(files):
-    assert len(files), 'No files to check'
-
-    valid_files = []
-    filecount = len(files)
-
-    for i, (datafile, path) in enumerate(files):
-        logger.info("Checking image {} of {}".format(i + 1, filecount))
-
-        try:
-            img = PillowImage.open(path)
-            assert max(img.size) >= 500, "Image {} is too small".format(datafile.name)
-        except IOError:
-            logger.warn("File {} is not a valid image".format(datafile.name))
-            continue
-        except AssertionError as e:
-            logger.warn(str(e))
-            continue
-
-        valid_files.append((datafile, path))
-
-    assert len(valid_files) > 0, "No valid images in selected files"
-    return valid_files
-
-
-def build_iiif_image(volume, path, data_file, suffix=None):
-    '''
-    Import a staging image into the local IIIF server
-    '''
-    assert isinstance(volume, Element)
-    assert volume.type == ElementType.Volume
-    assert isinstance(path, str)
-    assert isinstance(data_file, DataFile)
-
-    pillow_img = PillowImage.open(path)
-    width, height = pillow_img.size
-
-    # Non-JPEG image formats that should not be converted
-    # Will default to .jpg if the image format is not in there
-    # Formats are Pillow images formats,
-    # see https://pillow.readthedocs.io/en/5.1.x/handbook/image-file-formats.html
-    known_exts = {
-        "JPEG2000": ".jp2",
-        "PNG": ".png",
-        "TIFF": ".tif",
-    }
-    img_format = pillow_img.format
-    if img_format not in known_exts:
-        img_format = 'JPEG'
-    ext = known_exts.get(img_format, '.jpg')
-
-    # Build image path
-    filename = str(data_file.id)
-    if suffix is not None:
-        filename += '-{}'.format(suffix)
-    filename += ext
-    iiif_path = quote('{}/{}'.format(volume.id, filename), safe='')
-
-    # Get Image instance
-    try:
-        img = ImageServer.objects.local.images.get(path=iiif_path)
-    except Image.DoesNotExist:
-        img = Image(
-            server=ImageServer.objects.local,
-            path=iiif_path,
-            width=width,
-            height=height,
-            datafile=data_file,
-        )
-
-    if img.exists():
-        logger.warning('Image already exists on the IIIF server')
-    else:
-        # Save to S3 using optional image type conversion
-        img.pillow_save(pillow_img, format=img_format)
-        img.status = S3FileStatus.Checked
-        img.save()
-
-    return img
diff --git a/arkindex/dataimport/tasks/pdf.py b/arkindex/dataimport/tasks/pdf.py
deleted file mode 100644
index f3550bd7ee0168e9178c5575e2891ae734b9291e..0000000000000000000000000000000000000000
--- a/arkindex/dataimport/tasks/pdf.py
+++ /dev/null
@@ -1,70 +0,0 @@
-import distutils.spawn
-import glob
-import os
-import subprocess
-import logging
-from pdf2image import convert_from_path
-from arkindex_common.enums import DataImportPDFEngine
-from arkindex_common.tools import Timer
-
-logger = logging.getLogger(__name__)
-
-
-def extract_pdf_images(pdf_file, pdf_path, working_dir, engine=DataImportPDFEngine.Convert):
-    assert pdf_file.content_type == 'application/pdf', 'File is not a PDF'
-    assert pdf_file.exists(), 'File does not exist'
-
-    methods = {
-        DataImportPDFEngine.Convert: extract_pdf_images_convert,
-        DataImportPDFEngine.Poppler: extract_pdf_images_poppler,
-    }
-
-    assert engine in methods, 'Unsupported engine {}'.format(str(engine))
-
-    logger.info('Convert PDF file with {}'.format(str(engine)))
-    method = methods[engine]
-    with Timer() as t:
-        images = method(pdf_file, pdf_path, working_dir)
-    logger.info('Time {}'.format(str(t.delta)))
-    return images
-
-
-def extract_pdf_images_convert(pdf_file, pdf_path, working_dir):
-    """
-    Convert a PDF file to a list of images
-    """
-    assert distutils.spawn.find_executable('convert'), 'Missing convert in PATH'
-
-    if not os.path.exists(working_dir):
-        os.makedirs(working_dir)
-
-    cmd = [
-        'convert', '-density', '300',
-        'pdf:{}'.format(pdf_path),
-        os.path.join(working_dir, 'pdf-%04d.jpg'),
-    ]
-    subprocess.run(cmd, check=True)
-
-    # Dump all the images in the working dir
-    return sorted(glob.glob(os.path.join(working_dir, '*.jpg')))
-
-
-def extract_pdf_images_poppler(pdf_file, pdf_path, working_dir):
-    """
-    Convert a PDF file to a list of images with poppler
-    """
-    assert distutils.spawn.find_executable('pdfimages'), 'Missing pdfimages in PATH'
-
-    if not os.path.exists(working_dir):
-        os.makedirs(working_dir)
-
-    convert_from_path(
-        pdf_path,
-        output_folder=working_dir,
-        output_file='pdf',  # The pdf- prefix on images
-        dpi=300,
-        fmt='jpg',
-    )
-
-    # Dump all the images in the working dir
-    return sorted(glob.glob(os.path.join(working_dir, '*.jpg')))
diff --git a/arkindex/dataimport/tests/test_image.py b/arkindex/dataimport/tests/test_image.py
deleted file mode 100644
index 059142eaedb7287dbfa733bb25f7c3309f87e91a..0000000000000000000000000000000000000000
--- a/arkindex/dataimport/tests/test_image.py
+++ /dev/null
@@ -1,91 +0,0 @@
-from unittest.mock import patch, call
-from arkindex.project.tests import FixtureTestCase
-from arkindex.documents.models import ElementType
-from arkindex.dataimport.tasks import check_images, build_iiif_image
-from arkindex.project.aws import S3FileStatus
-from botocore.exceptions import ClientError
-
-
-class TestImageTasks(FixtureTestCase):
-
-    @classmethod
-    def setUpTestData(cls):
-        super().setUpTestData()
-        cls.vol = cls.corpus.elements.create(name='Test volume', type=ElementType.Volume)
-        cls.df = cls.corpus.files.create(
-            name='file.jpg',
-            size=1234,
-            hash='cafe',
-            content_type='image/jpeg',
-        )
-
-    @patch('arkindex.dataimport.tasks.image.PillowImage')
-    def test_check_images(self, image_mock):
-        image_mock.open.return_value.size = (1000, 1000)
-
-        result = check_images([(self.df, '/some/path'), ])
-
-        self.assertEqual(image_mock.open.call_count, 1)
-        self.assertEqual(image_mock.open.call_args, call('/some/path'))
-        self.assertListEqual(result, [
-            (self.df, '/some/path'),
-        ])
-
-    @patch('arkindex.project.aws.s3.Object')
-    @patch('arkindex.dataimport.tasks.image.PillowImage')
-    def test_build_iiif_image(self, image_mock, s3obj_mock):
-        image_mock.open.return_value.format = 'BMP'
-        image_mock.open.return_value.size = (400, 900)
-        s3obj_mock.return_value.load.side_effect = ClientError({'Error': {'Code': '404'}}, 'head_object')
-
-        with self.settings(LOCAL_IMAGESERVER_ID=self.imgsrv.id, AWS_IIIF_BUCKET='iiif'):
-            img = build_iiif_image(
-                self.vol,
-                '/somewhere/Untitled.bmp',
-                self.df,
-                suffix='42',
-            )
-
-        expected_path = '{}/{}-42.jpg'.format(str(self.vol.id), str(self.df.id))
-        self.assertEqual(image_mock.open.call_count, 1)
-        self.assertEqual(image_mock.open.call_args, call('/somewhere/Untitled.bmp'))
-        self.assertEqual(s3obj_mock.call_count, 1)
-        self.assertEqual(s3obj_mock.call_args, call('iiif', expected_path))
-        self.assertEqual(s3obj_mock().load.call_count, 1)
-        self.assertEqual(image_mock.open().save.call_count, 1)
-        self.assertEqual(s3obj_mock().upload_fileobj.call_count, 1)
-        self.assertEqual(img.server, self.imgsrv)
-        self.assertEqual(img.path, expected_path.replace('/', '%2F'))
-        self.assertEqual(img.width, 400)
-        self.assertEqual(img.height, 900)
-        self.assertEqual(img.status, S3FileStatus.Checked)
-        self.assertEqual(img.datafile, self.df)
-
-    @patch('arkindex.project.aws.s3.Object')
-    @patch('arkindex.dataimport.tasks.image.PillowImage')
-    def test_build_iiif_image_retry(self, image_mock, s3obj_mock):
-        """
-        Test build_iiif_image just returns existing images if they already exist
-        """
-        image_mock.open.return_value.format = 'JPEG2000'
-        image_mock.open.return_value.size = (400, 900)
-        original_img = self.imgsrv.images.create(
-            path='{}%2F{}.jp2'.format(str(self.vol.id), str(self.df.id)),
-            datafile=self.df,
-            width=900,
-            height=400,
-            status=S3FileStatus.Checked,
-        )
-
-        with self.settings(LOCAL_IMAGESERVER_ID=self.imgsrv.id, AWS_IIIF_BUCKET='iiif'):
-            new_img = build_iiif_image(
-                self.vol,
-                '/somewhere/Untitled.bmp',
-                self.df,
-            )
-
-        self.assertEqual(original_img.id, new_img.id)
-        self.assertEqual(image_mock.open.call_count, 1)
-        self.assertEqual(image_mock.open.call_args, call('/somewhere/Untitled.bmp'))
-        self.assertEqual(s3obj_mock().load.call_count, 1)
-        self.assertFalse(image_mock.open().save.called)
diff --git a/arkindex/dataimport/tests/test_pdf.py b/arkindex/dataimport/tests/test_pdf.py
deleted file mode 100644
index bbb882295c16aed9677fa60e967448ba9e9e3f4b..0000000000000000000000000000000000000000
--- a/arkindex/dataimport/tests/test_pdf.py
+++ /dev/null
@@ -1,91 +0,0 @@
-from arkindex_common.enums import DataImportPDFEngine
-from arkindex.project.tests import FixtureTestCase
-from arkindex.dataimport.tasks import extract_pdf_images
-from unittest.mock import patch, MagicMock
-from botocore.exceptions import ClientError
-import tempfile
-import shutil
-import glob
-import os.path
-
-FIXTURES = os.path.join(
-    os.path.dirname(os.path.realpath(__file__)),
-    'pdf_samples',
-)
-
-
-class TestPdf(FixtureTestCase):
-
-    @classmethod
-    def setUpTestData(cls):
-        super().setUpTestData()
-        cls.img_file = cls.corpus.files.create(name='sample.jpg', size=42, hash='abcd', content_type='image/jpg')
-        cls.pdf_file = cls.corpus.files.create(name='sample.pdf', size=42, hash='dcba', content_type='application/pdf')
-
-        cls.working_dir = tempfile.mkdtemp()
-        cls.pdf_path = os.path.join(cls.working_dir, str(cls.pdf_file.id))
-        shutil.copyfile(os.path.join(FIXTURES, 'sample.pdf'), cls.pdf_path)
-
-    @classmethod
-    def tearDownClass(cls):
-        super().tearDownClass()
-        shutil.rmtree(cls.working_dir)
-
-    def test_extract_pdf_images_filetype(self):
-        """
-        Test extract_pdf_images task checks the file's content type
-        """
-        with self.assertRaises(AssertionError):
-            extract_pdf_images(self.img_file, self.pdf_path, self.working_dir)
-
-    def test_extract_pdf_images_exists(self):
-        """
-        Test extract_pdf_images task checks the file's existence
-        """
-        file_mock = MagicMock()
-        file_mock.content_type = 'application/pdf'
-        file_mock.exists.return_value = False
-
-        with self.assertRaises(AssertionError):
-            extract_pdf_images(file_mock, self.pdf_path, self.working_dir)
-
-    @patch('arkindex.project.aws.s3')
-    def test_extract_pdf_images_s3_error(self, s3_mock):
-        """
-        Test extract_pdf_images task lets S3 errors through
-        """
-        file_mock = MagicMock()
-        file_mock.content_type = 'application/pdf'
-        # Any ClientError with a code other than 404
-        file_mock.exists.side_effect = ClientError({'Error': {'Code': '999'}}, 'head_object')
-
-        with self.assertRaises(ClientError):
-            extract_pdf_images(file_mock, self.pdf_path, self.working_dir)
-
-    @patch('arkindex.project.aws.s3')
-    def test_extract_pdf_images_with_convert(self, s3_mock):
-        """
-        Test extract_pdf_images runs ImageMagick and returns proper info
-        """
-        result = extract_pdf_images(self.pdf_file, self.pdf_path, self.working_dir)
-
-        self.assertListEqual(result, [
-            os.path.join(self.working_dir, 'pdf-0000.jpg'),
-            os.path.join(self.working_dir, 'pdf-0001.jpg'),
-        ])
-
-    @patch('arkindex.project.aws.s3')
-    def test_extract_pdf_images_with_poppler(self, s3_mock):
-        """
-        Test extract_pdf_images runs Poppler and returns proper info
-        """
-        oldImages = glob.glob(os.path.join(self.working_dir, '*.jpg'))
-        for img in oldImages:
-            os.remove(os.path.join(self.working_dir, img))
-
-        result = extract_pdf_images(self.pdf_file, self.pdf_path, self.working_dir, DataImportPDFEngine.Poppler)
-
-        self.assertListEqual(result, [
-            os.path.join(self.working_dir, 'pdf-1.jpg'),
-            os.path.join(self.working_dir, 'pdf-2.jpg')
-        ])
diff --git a/arkindex/dataimport/tests/test_tasks.py b/arkindex/dataimport/tests/test_tasks.py
deleted file mode 100644
index 553d66c83900f1c6dadfba6c91e55f982a551658..0000000000000000000000000000000000000000
--- a/arkindex/dataimport/tests/test_tasks.py
+++ /dev/null
@@ -1,105 +0,0 @@
-from unittest.mock import patch, call
-from arkindex_common.enums import DataImportMode
-from arkindex.project.tests import FixtureTestCase
-from arkindex.dataimport.tasks import download_files
-
-
-class TestTasks(FixtureTestCase):
-
-    @classmethod
-    def setUpTestData(cls):
-        super().setUpTestData()
-        cls.df1 = cls.corpus.files.create(
-            name='file1.jpg',
-            size=1234,
-            hash='cafe',
-            content_type='image/jpeg',
-        )
-        cls.df2 = cls.corpus.files.create(
-            name='file2.png',
-            size=5678,
-            hash='beef',
-            content_type='image/png',
-        )
-        cls.di = cls.corpus.imports.create(
-            mode=DataImportMode.Images,
-            creator=cls.user,
-        )
-
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-        cls.s3obj_patch = patch('arkindex.project.aws.s3.Object')
-        cls.s3obj_mock = cls.s3obj_patch.start()
-        cls.access_patch = patch('arkindex.dataimport.tasks.base.os.access')
-        cls.access_mock = cls.access_patch.start()
-
-    @classmethod
-    def tearDownClass(cls):
-        cls.s3obj_patch.stop()
-        cls.access_patch.stop()
-        super().tearDownClass()
-
-    def setUp(self):
-        super().setUp()
-        self.s3obj_mock.reset_mock()
-        self.access_mock.reset_mock()
-        self.access_mock.return_value = True
-        self.s3obj_mock.return_value.download_file.side_effect = None
-
-    def test_download_files(self):
-        self.di.files.set([self.df1])
-        expected_path = '/somewhere/{}'.format(str(self.df1.id))
-        self.assertListEqual(
-            download_files(self.di, '/somewhere'),
-            [(self.df1, expected_path)],
-        )
-        self.assertEqual(self.s3obj_mock().download_file.call_count, 1)
-        self.assertEqual(self.s3obj_mock().download_file.call_args, call(expected_path))
-
-    def test_download_files_fail(self):
-        self.di.files.set([self.df1, self.df2])
-        # Fail only once
-        self.s3obj_mock().download_file.side_effect = [IOError, None]
-
-        expected_path1 = '/somewhere/{}'.format(str(self.df1.id))
-        expected_path2 = '/somewhere/{}'.format(str(self.df2.id))
-
-        self.assertListEqual(
-            download_files(self.di, '/somewhere'),
-            [(self.df2, expected_path2)],
-        )
-
-        self.assertEqual(self.s3obj_mock().download_file.call_count, 2)
-        self.assertEqual(self.s3obj_mock().download_file.call_args_list, [
-            call(expected_path1),
-            call(expected_path2),
-        ])
-
-    def test_download_files_epic_fail(self):
-        self.di.files.set([self.df1, self.df2])
-        # Fail all the time
-        self.s3obj_mock().download_file.side_effect = IOError
-
-        expected_path1 = '/somewhere/{}'.format(str(self.df1.id))
-        expected_path2 = '/somewhere/{}'.format(str(self.df2.id))
-
-        with self.assertRaisesRegex(AssertionError, 'No files'):
-            download_files(self.di, '/somewhere')
-
-        self.assertEqual(self.s3obj_mock().download_file.call_count, 2)
-        self.assertEqual(self.s3obj_mock().download_file.call_args_list, [
-            call(expected_path1),
-            call(expected_path2),
-        ])
-
-    def test_download_files_empty(self):
-        self.di.files.set([])
-        self.assertListEqual(download_files(self.di, '/somewhere'), [])
-        self.assertFalse(self.s3obj_mock.called)
-
-    def test_download_files_read_only(self):
-        self.access_mock.return_value = False
-        self.di.files.set([self.df1])
-        with self.assertRaisesRegex(AssertionError, 'read-only'):
-            download_files(self.di, '/somewhere')
diff --git a/arkindex/project/checks.py b/arkindex/project/checks.py
index d63eb1ef7f9f0bb76d5873f2601578649ebd7978..8109259f61a10640de359ac0d300e25b23850f1a 100644
--- a/arkindex/project/checks.py
+++ b/arkindex/project/checks.py
@@ -114,6 +114,7 @@ def docker_images_check(*args, **kwargs):
     images = (
         (settings.ARKINDEX_APP_IMAGE, 'ARKINDEX_APP_IMAGE'),
         (settings.ARKINDEX_ML_IMAGE, 'ARKINDEX_ML_IMAGE'),
+        (settings.ARKINDEX_TASKS_IMAGE, 'ARKINDEX_TASKS_IMAGE'),
     )
     for image_tag, setting_name in images:
         try:
diff --git a/arkindex/project/settings.py b/arkindex/project/settings.py
index 5238babadc316bce08713b380c89693b6503bca9..12cc187bde0d7536f987d0f25fa0f467da23dbaa 100644
--- a/arkindex/project/settings.py
+++ b/arkindex/project/settings.py
@@ -393,6 +393,7 @@ CORS_URLS_REGEX = r'^/(api|ponos)/.*$'
 # Docker images used by our ponos workflow
 ARKINDEX_APP_IMAGE = os.environ.get('ARKINDEX_APP_IMAGE', 'arkindex-app')
 ARKINDEX_ML_IMAGE = os.environ.get('ARKINDEX_ML_IMAGE', 'arkindex-worker-ml')
+ARKINDEX_TASKS_IMAGE = os.environ.get('ARKINDEX_TASKS_IMAGE', 'arkindex-tasks')
 
 # ML worker
 ML_DEFAULT_CLASSIFIER = 'tobacco'
diff --git a/arkindex/project/tests/test_checks.py b/arkindex/project/tests/test_checks.py
index 97e88213a5f8d99f88d7cd811a152b98781baa75..13e5fa5c66012fb746bad18bbd02fc26da74871d 100644
--- a/arkindex/project/tests/test_checks.py
+++ b/arkindex/project/tests/test_checks.py
@@ -99,21 +99,15 @@ class ChecksTestCase(TestCase):
             srv.delete()
 
     @patch('arkindex.project.checks.subprocess.run')
+    @override_settings(
+        ARKINDEX_APP_IMAGE='nope',
+        ARKINDEX_ML_IMAGE='me-neither',
+        ARKINDEX_TASKS_IMAGE='nuh',
+    )
     def test_docker_images_check(self, run_mock):
         from arkindex.project.checks import docker_images_check
 
-        run_mock.side_effect = [CalledProcessError(1, ''), None]
-        with self.settings(ARKINDEX_APP_IMAGE='nope', ARKINDEX_ML_IMAGE='me-neither'):
-            self.assertListEqual(docker_images_check(), [
-                Error(
-                    'Docker image with tag "nope" was not found.',
-                    hint='settings.ARKINDEX_APP_IMAGE = "nope"',
-                    id='arkindex.E006',
-                )
-            ])
-
-        self.assertEqual(run_mock.call_count, 2)
-        self.assertEqual(run_mock.call_args_list, [
+        expected_calls = [
             call(
                 ['docker', 'image', 'inspect', 'nope'],
                 stdout=subprocess.PIPE,
@@ -126,35 +120,44 @@ class ChecksTestCase(TestCase):
                 stderr=subprocess.STDOUT,
                 check=True,
             ),
-        ])
-
-        run_mock.reset_mock()
-        run_mock.side_effect = [None, CalledProcessError(1, '')]
-        with self.settings(ARKINDEX_APP_IMAGE='nope', ARKINDEX_ML_IMAGE='me-neither'):
-            self.assertListEqual(docker_images_check(), [
-                Error(
-                    'Docker image with tag "me-neither" was not found.',
-                    hint='settings.ARKINDEX_ML_IMAGE = "me-neither"',
-                    id='arkindex.E006',
-                )
-            ])
-
-        self.assertEqual(run_mock.call_count, 2)
-        self.assertEqual(run_mock.call_args_list, [
             call(
-                ['docker', 'image', 'inspect', 'nope'],
+                ['docker', 'image', 'inspect', 'nuh'],
                 stdout=subprocess.PIPE,
                 stderr=subprocess.STDOUT,
                 check=True,
             ),
-            call(
-                ['docker', 'image', 'inspect', 'me-neither'],
-                stdout=subprocess.PIPE,
-                stderr=subprocess.STDOUT,
-                check=True,
+        ]
+
+        run_mock.side_effect = [CalledProcessError(1, ''), None, None]
+        self.assertListEqual(docker_images_check(), [
+            Error(
+                'Docker image with tag "nope" was not found.',
+                hint='settings.ARKINDEX_APP_IMAGE = "nope"',
+                id='arkindex.E006',
+            )
+        ])
+
+        self.assertEqual(run_mock.call_count, 3)
+        self.assertEqual(run_mock.call_args_list, expected_calls)
+
+        run_mock.reset_mock()
+        run_mock.side_effect = [None, CalledProcessError(1, ''), CalledProcessError(1, '')]
+        self.assertListEqual(docker_images_check(), [
+            Error(
+                'Docker image with tag "me-neither" was not found.',
+                hint='settings.ARKINDEX_ML_IMAGE = "me-neither"',
+                id='arkindex.E006',
             ),
+            Error(
+                'Docker image with tag "nuh" was not found.',
+                hint='settings.ARKINDEX_TASKS_IMAGE = "nuh"',
+                id='arkindex.E006',
+            )
         ])
 
+        self.assertEqual(run_mock.call_count, 3)
+        self.assertEqual(run_mock.call_args_list, expected_calls)
+
     @patch('arkindex.project.checks.subprocess.run')
     def test_docker_images_check_missing_client(self, run_mock):
         """
@@ -163,10 +166,10 @@ class ChecksTestCase(TestCase):
         from arkindex.project.checks import docker_images_check
 
         run_mock.side_effect = FileNotFoundError
-        with self.settings(ARKINDEX_APP_IMAGE='nope', ARKINDEX_ML_IMAGE='me-neither'):
+        with self.settings(ARKINDEX_APP_IMAGE='nope', ARKINDEX_ML_IMAGE='me-neither', ARKINDEX_TASKS_IMAGE='nuh'):
             self.assertListEqual(docker_images_check(), [])
 
-        self.assertEqual(run_mock.call_count, 2)
+        self.assertEqual(run_mock.call_count, 3)
         self.assertEqual(run_mock.call_args_list, [
             call(
                 ['docker', 'image', 'inspect', 'nope'],
@@ -180,6 +183,12 @@ class ChecksTestCase(TestCase):
                 stderr=subprocess.STDOUT,
                 check=True,
             ),
+            call(
+                ['docker', 'image', 'inspect', 'nuh'],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,
+                check=True,
+            ),
         ])
 
     @patch('arkindex.project.checks.parse_recipe')
diff --git a/base/Dockerfile b/base/Dockerfile
index 40ddccc93f8411067a8492c6afd21526b3d14c73..ef1d48c8f3808f1d5d2c413c2f6589899fdf34ef 100644
--- a/base/Dockerfile
+++ b/base/Dockerfile
@@ -19,7 +19,7 @@ COPY --from=staging /build /usr
 ENV PYTHONPATH=/usr/lib/python3.6/site-packages
 
 # Add runtime system deps
-RUN apk add --update --no-cache wget gzip libmagic git unzip libpq libxslt libjpeg imagemagick poppler-utils
+RUN apk add --update --no-cache wget gzip libmagic git unzip libpq libxslt libjpeg imagemagick
 
 # Add unprivileged user
 RUN addgroup -g 1000 teklia && adduser -D -u 1000 -G teklia ark
diff --git a/requirements.txt b/requirements.txt
index 58b5c2dabe260ebe46094f5281bad3694f587f09..7f406b55fdcfe49bcef22ffaeaee9fcfa1478ee4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,7 +13,6 @@ gitpython==2.1.11
 idna==2.6
 jdcal==1.3
 olefile==0.44
-pdf2image==1.5.1
 python-gitlab==1.7.0
 python-magic==0.4.15
 python-memcached==1.59
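
For reference, a minimal sketch of how a downstream ML task might consume the ml_analysis.json file that GitFlow.setup_ml_analysis now writes. The load_ml_config helper is hypothetical and not part of this patch; only the file name and the 'pages'/'tools' structure come from the diff above.

import json
import os


def load_ml_config(task_dir):
    """Read the ML analysis config written by GitFlow.setup_ml_analysis.

    Returns None when no config file exists, i.e. the repository configured
    neither a classifier nor a recognizer (see the TODO in setup_ml_analysis).
    """
    path = os.path.join(task_dir, 'ml_analysis.json')
    if not os.path.exists(path):
        # No tools were configured, so the ML task has nothing to do
        return None
    with open(path) as f:
        config = json.load(f)
    # config['pages'] holds {'id': <page UUID>, 'path': <image path>} entries;
    # config['tools'] holds {'tool': <MLToolType value>, 'slug': <tool slug>} entries
    return config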