Skip to content
Snippets Groups Projects

Download files from S3 before workflows

Merged Erwan Rouchet requested to merge download-workflow into master
All threads resolved!
7 files
+195 −63
Compare changes
  • Side-by-side
  • Inline
Files
7
@@ -3,13 +3,20 @@ from django.core.management.base import BaseCommand
from arkindex_common.ml_tool import MLToolType
from arkindex.project.argparse import DataImportArgument
from arkindex.dataimport.models import DataImportMode
from arkindex.dataimport.tasks import extract_pdf_images, populate_volume, setup_ml_analysis, check_images
from arkindex.dataimport.tasks import (
download_files,
check_images,
extract_pdf_images,
populate_volume,
setup_ml_analysis,
)
from arkindex.dataimport.git import GitFlow
from django.conf import settings
import tempfile
import json
import os
import logging
import shutil
logging.basicConfig(
level=logging.INFO,
@@ -39,26 +46,33 @@ class Command(BaseCommand):
# Fallback to a temp directory while developing
task_dir = os.environ.get('PONOS_DATA', tempfile.mkdtemp(suffix='-ponos'))
assert os.path.isdir(task_dir), 'Invalid task dir {}'.format(task_dir)
logging.info('Using task dir: {}'.format(task_dir))
logger.info('Using task dir: {}'.format(task_dir))
# Use temp folder for anything that does not need to be shared
temp_dir = tempfile.mkdtemp(suffix='-ponostmp')
logger.debug('Using temp dir: {}'.format(temp_dir))
pages, files = None, None
if data_import.files.exists():
files = download_files(data_import, temp_dir)
if data_import.mode == DataImportMode.Images:
# Validate images from data import
files = check_images(data_import)
files = check_images(files)
elif data_import.mode == DataImportMode.PDF:
assert data_import.files.count() == 1, 'Only one file in PDF mode'
pdf_file = data_import.files.first()
assert len(files) == 1, 'Only one file in PDF mode'
pdf_file, pdf_path = files[0]
assert pdf_file.content_type == 'application/pdf', 'File is not a PDF'
# Get images from pdf into the task working dir
images = extract_pdf_images(pdf_file, task_dir)
# Extract images from the PDF into the task working dir and get their paths
images = extract_pdf_images(pdf_file, pdf_path, task_dir)
# Add those images to the volume
# all linked to the original pdf file
files = [
(pdf_file, img)
for img in images
(pdf_file, img_path)
for img_path in images
]
elif data_import.mode == DataImportMode.Repository:
@@ -81,10 +95,12 @@ class Command(BaseCommand):
# Load or create volume from dataimport
volume = data_import.get_volume()
logging.info('Using volume: {}'.format(volume))
logger.info('Using volume: {}'.format(volume))
pages = populate_volume(volume, files)
shutil.rmtree(temp_dir)
# Setup the analysis process through a json config
ml_analysis = os.path.join(task_dir, 'ml_analysis.json')
if os.path.exists(ml_analysis) or not ml_tools:
Loading