Skip to content
Snippets Groups Projects

Download files from S3 before workflows

Merged Erwan Rouchet requested to merge download-workflow into master
All threads resolved!
7 files
+195 −63
Compare changes
  • Side-by-side
  • Inline
Files
7
@@ -3,13 +3,20 @@ from django.core.management.base import BaseCommand
from arkindex_common.ml_tool import MLToolType
from arkindex.project.argparse import DataImportArgument
from arkindex.dataimport.models import DataImportMode
from arkindex.dataimport.tasks import extract_pdf_images, populate_volume, setup_ml_analysis, check_images
from arkindex.dataimport.tasks import (
download_files,
check_images,
extract_pdf_images,
populate_volume,
setup_ml_analysis,
)
from arkindex.dataimport.git import GitFlow
from django.conf import settings
import tempfile
import json
import os
import logging
import shutil
logging.basicConfig(
level=logging.INFO,
@@ -39,26 +46,33 @@ class Command(BaseCommand):
# Fallback to a temp directory while developing
task_dir = os.environ.get('PONOS_DATA', tempfile.mkdtemp(suffix='-ponos'))
assert os.path.isdir(task_dir), 'Invalid task dir {}'.format(task_dir)
logging.info('Using task dir: {}'.format(task_dir))
logger.info('Using task dir: {}'.format(task_dir))
# Use temp folder for anything that does not need to be shared
temp_dir = tempfile.mkdtemp(suffix='-ponostmp')
logger.debug('Using temp dir: {}'.format(temp_dir))
pages, files = None, None
if data_import.files.exists():
files = download_files(data_import, temp_dir)
if data_import.mode == DataImportMode.Images:
# Validate images from data import
files = check_images(data_import)
files = check_images(files)
elif data_import.mode == DataImportMode.PDF:
assert data_import.files.count() == 1, 'Only one file in PDF mode'
pdf_file = data_import.files.first()
assert len(files) == 1, 'Only one file in PDF mode'
pdf_file, pdf_path = files[0]
assert pdf_file.content_type == 'application/pdf', 'File is not a PDF'
# Get images from pdf into the task working dir
images = extract_pdf_images(pdf_file, task_dir)
# Extract images from the PDF into the task working dir and get their paths
images = extract_pdf_images(pdf_file, pdf_path, task_dir)
# Add those images to the volume
# all linked to the original pdf file
files = [
(pdf_file, img)
for img in images
(pdf_file, img_path)
for img_path in images
]
elif data_import.mode == DataImportMode.Repository:
@@ -81,10 +95,12 @@ class Command(BaseCommand):
# Load or create volume from dataimport
volume = data_import.get_volume()
logging.info('Using volume: {}'.format(volume))
logger.info('Using volume: {}'.format(volume))
pages = populate_volume(volume, files)
shutil.rmtree(temp_dir)
# Setup the analysis process through a json config
ml_analysis = os.path.join(task_dir, 'ml_analysis.json')
if os.path.exists(ml_analysis) or not ml_tools:
Loading