diff --git a/.gitignore b/.gitignore
index 7c465bc5323fb49bcde6898ba2a7926fc640aee4..659d9c69c125a52defc6ec55d8efd995afdfafb1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,7 @@ build
 dist
 .eggs
 logs
+media
+.vscode
+local_settings.py
+arkindex/iiif-users/
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 59be52cea6b5a7d85f3433aa1d70a0e735daf317..a96f7a8683efeb6f4b7d2c80fbffd26dd97c08cf 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -20,7 +20,7 @@ backend-tests:
     DB_PORT: 5432
 
   before_script:
-  - apk --update add postgresql-dev libjpeg-turbo-dev gcc musl-dev zlib-dev
+  - apk --update add postgresql-dev libjpeg-turbo-dev gcc musl-dev zlib-dev libmagic
 
   script:
   - python setup.py test
diff --git a/Dockerfile b/Dockerfile
index 561732db526e3f7770e827c87c18df0b21410f77..d00ddb1cf66ad0605c70c21f6832b5ca2269debc 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,11 +1,25 @@
 FROM python:3.5-alpine
 ENV PYTHONUNBUFFERED 1
+ENV PYTHON_EGG_CACHE /tmp/python-eggs
 
 ARG FRONTEND_BRANCH=master
 ARG GITLAB_TOKEN="gTPA5UQYesSuKMCRM2r_"
 
+# Add unprivileged user
+RUN addgroup -g 1000 teklia && adduser -D -u 1000 -G teklia ark
+
+# Allow access to python egg cache
+RUN mkdir -p $PYTHON_EGG_CACHE && chmod a+rxw $PYTHON_EGG_CACHE
+
+# Allow access to media
+RUN mkdir -p /medias/staging /medias/iiif
+RUN chown -R ark:teklia /medias
+
+# Allow access to default logs dir
+RUN mkdir -p /logs && chmod a+rxw /logs
+
 # Add system dependencies
-RUN apk add --update --no-cache postgresql-dev jpeg-dev build-base wget gzip zlib-dev
+RUN apk add --update --no-cache postgresql-dev jpeg-dev build-base wget gzip zlib-dev libmagic
 
 # Setup frontend
 ENV FRONTEND_DIR="/frontend/dist"
diff --git a/arkindex/dataimport/__init__.py b/arkindex/dataimport/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/arkindex/dataimport/admin.py b/arkindex/dataimport/admin.py
new file mode 100644
index 0000000000000000000000000000000000000000..9aa7539ca6730c76c942cd2e3ea96361338221b0
--- /dev/null
+++ b/arkindex/dataimport/admin.py
@@ -0,0 +1,21 @@
+from django.contrib import admin
+from enumfields.admin import EnumFieldListFilter
+from arkindex.dataimport.models import DataImport, DataFile
+
+
+class DataImportAdmin(admin.ModelAdmin):
+    list_display = ('id', 'creator', 'corpus', 'state', 'mode')
+    list_filter = [('state', EnumFieldListFilter), ('mode', EnumFieldListFilter)]
+    fields = ('id', 'creator', 'corpus', 'state', 'mode', 'payload', 'root_id')
+    readonly_fields = ('id', 'root_id', )
+
+
+class DataFileAdmin(admin.ModelAdmin):
+    list_display = ('id', 'name', 'corpus')
+    list_filter = ('corpus', )
+    fields = ('id', 'name', 'size', 'hash', 'content_type', 'corpus')
+    readonly_fields = ('id', 'size', 'hash', )
+
+
+admin.site.register(DataImport, DataImportAdmin)
+admin.site.register(DataFile, DataFileAdmin)
diff --git a/arkindex/dataimport/api.py b/arkindex/dataimport/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..c667164a10ef2a4e4a2493ec4bed8f4bac62ce42
--- /dev/null
+++ b/arkindex/dataimport/api.py
@@ -0,0 +1,137 @@
+from rest_framework.generics import \
+    ListAPIView, ListCreateAPIView, RetrieveUpdateDestroyAPIView
+from rest_framework.views import APIView
+from rest_framework.parsers import MultiPartParser, FileUploadParser
+from rest_framework.permissions import IsAuthenticated
+from rest_framework.response import Response
+from rest_framework import status
+from rest_framework.exceptions import ValidationError
+from arkindex.documents.models import Corpus
+from arkindex.dataimport.models import DataImport, DataFile, DataImportState, DataImportMode
+from arkindex.dataimport.serializers import \
+    DataImportLightSerializer, DataImportSerializer, DataFileSerializer
+import hashlib
+import magic
+
+
+class DataImportsList(ListCreateAPIView):
+    """
+    List data imports
+    """
+    permission_classes = (IsAuthenticated, )
+    serializer_class = DataImportLightSerializer
+
+    def get_queryset(self):
+        return DataImport.objects.filter(creator=self.request.user)
+
+    def perform_create(self, serializer):
+        # Temporary limitation
+        if serializer.validated_data['mode'] not in (DataImportMode.Images, ):
+            raise ValidationError('Unsupported mode for now, sorry.')
+
+        return super().perform_create(serializer)
+
+
+class DataImportDetails(RetrieveUpdateDestroyAPIView):
+    """
+    Retrieve and edit a data import
+    """
+    permission_classes = (IsAuthenticated, )
+    serializer_class = DataImportSerializer
+
+    def get_queryset(self):
+        return DataImport.objects.filter(creator=self.request.user)
+
+    def perform_update(self, serializer):
+        dataimport = serializer.instance
+        if dataimport.state not in (DataImportState.Created, DataImportState.Configured):
+            raise ValidationError("Cannot update a workflow after it has been started")
+
+        # Allow updating the state from Configured to Running only
+        if dataimport.state != serializer.validated_data['state']:
+            if dataimport.state != DataImportState.Configured:
+                raise ValidationError({'state': ['Cannot change state on non-configured or started workflows']})
+            if serializer.validated_data['state'] != DataImportState.Running:
+                raise ValidationError(
+                    {'state': ['The state of a configured workflow can only be set to configured or running']})
+            dataimport.start()
+            return
+
+        super().perform_update(serializer)
+
+        # Automatically change from Created to Configured if a payload has been set
+        if dataimport.state == DataImportState.Created and dataimport.payload is not None:
+            dataimport.state = DataImportState.Configured
+            dataimport.save()
+
+    def perform_destroy(self, instance):
+        if instance.state == DataImportState.Running:
+            raise ValidationError("Cannot delete a workflow while it is running")
+        super().perform_destroy(instance)
+
+
+class DataFileList(ListAPIView):
+    """
+    List uploaded files
+    """
+    permission_classes = (IsAuthenticated, )
+    serializer_class = DataFileSerializer
+
+    def get_queryset(self):
+        return DataFile.objects.filter(corpus_id=self.kwargs['pk'])
+
+
+class DataFileRetrieve(RetrieveUpdateDestroyAPIView):
+    """
+    Get one file
+    """
+    permission_classes = (IsAuthenticated, )
+    serializer_class = DataFileSerializer
+    # TODO: Restrict to user's corpora
+    queryset = DataFile.objects.all()
+
+
+class DataFileUpload(APIView):
+    """
+    Upload a new file to a corpus
+    """
+    permission_classes = (IsAuthenticated, )
+    parser_classes = (MultiPartParser, FileUploadParser)
+
+    def post(self, request, pk=None, format=None):
+        corpus_qs = Corpus.objects.filter(id=pk)
+        if not corpus_qs.exists():
+            raise ValidationError({'corpus': ['Corpus not found']})
+        corpus = corpus_qs.get()
+
+        file_obj = request.FILES['file']
+
+        md5hash = hashlib.md5()
+        for chunk in file_obj.chunks():
+            md5hash.update(chunk)
+        file_hash = md5hash.hexdigest()
+
+        # Reopen file to reread from beginning
+        file_obj.open()
+        file_type = magic.from_buffer(file_obj.read(1024), mime=True)
+
+        if DataFile.objects.filter(hash=file_hash, corpus=corpus).exists():
+            raise ValidationError('File already exists')
+
+        df = DataFile.objects.create(
+            hash=file_hash,
+            corpus=corpus,
+            name=file_obj.name,
+            size=file_obj.size,
+            content_type=file_type,
+        )
+
+        file_obj.open()
+        with open(df.staging_path, 'wb') as destfile:
+            for chunk in file_obj.chunks():
+                destfile.write(chunk)
+
+        return Response(
+            data=DataFileSerializer(df).data,
+            status=status.HTTP_201_CREATED)
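For reference, a minimal client sketch against the upload endpoint above (routed as `/api/v1/imports/upload/<corpus id>` further down in this diff); the host, cookies and corpus UUID are placeholders, not values from this change:

```python
# Sketch: calling DataFileUpload from a script. Host, cookies and the corpus
# UUID are placeholders; any authenticated Django session works.
import requests

session = requests.Session()
session.cookies.set('sessionid', '<session-cookie>')
session.cookies.set('csrftoken', '<csrf-token>')
session.headers['X-CSRFToken'] = '<csrf-token>'

corpus_id = '<corpus-uuid>'
with open('page0001.jpg', 'rb') as f:
    resp = session.post(
        'http://localhost:8000/api/v1/imports/upload/{}'.format(corpus_id),
        files={'file': f},
    )

resp.raise_for_status()  # HTTP 400 when the corpus is unknown or the hash already exists
print(resp.json())       # serialized DataFile: id, name, content_type, size
```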
diff --git a/arkindex/dataimport/apps.py b/arkindex/dataimport/apps.py
new file mode 100644
index 0000000000000000000000000000000000000000..67f6c2b447fcf5b4ac2ff5edbd2edf81d357a47f
--- /dev/null
+++ b/arkindex/dataimport/apps.py
@@ -0,0 +1,5 @@
+from django.apps import AppConfig
+
+
+class ImportConfig(AppConfig):
+    name = 'arkindex.dataimport'
diff --git a/arkindex/dataimport/migrations/0001_initial.py b/arkindex/dataimport/migrations/0001_initial.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9699764e44f196ee4e3a1e539326520961124a7
--- /dev/null
+++ b/arkindex/dataimport/migrations/0001_initial.py
@@ -0,0 +1,56 @@
+# Generated by Django 2.0 on 2018-07-18 12:32
+
+import arkindex.dataimport.models
+from django.conf import settings
+import django.contrib.postgres.fields.jsonb
+from django.db import migrations, models
+import django.db.models.deletion
+import enumfields.fields
+import uuid
+
+
+class Migration(migrations.Migration):
+
+    initial = True
+
+    dependencies = [
+        ('documents', '0018_remove_elementlink'),
+        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='DataFile',
+            fields=[
+                ('id', models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False)),
+                ('name', models.CharField(max_length=100)),
+                ('size', models.PositiveIntegerField()),
+                ('hash', models.CharField(max_length=32)),
+                ('content_type', models.CharField(max_length=50)),
+                ('corpus', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='files', to='documents.Corpus')),
+            ],
+        ),
+        migrations.CreateModel(
+            name='DataImport',
+            fields=[
+                ('id', models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False)),
+                ('created', models.DateTimeField(auto_now_add=True)),
+                ('updated', models.DateTimeField(auto_now=True)),
+                ('state', enumfields.fields.EnumField(default='created', enum=arkindex.dataimport.models.DataImportState, max_length=30)),
+                ('mode', enumfields.fields.EnumField(enum=arkindex.dataimport.models.DataImportMode, max_length=30)),
+                ('payload', django.contrib.postgres.fields.jsonb.JSONField(blank=True, null=True)),
+                ('root_id', models.UUIDField(blank=True, null=True)),
+                ('task_count', models.PositiveSmallIntegerField(blank=True, null=True)),
+                ('corpus', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='imports', to='documents.Corpus')),
+                ('creator', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='imports', to=settings.AUTH_USER_MODEL)),
+                ('files', models.ManyToManyField(related_name='imports', to='dataimport.DataFile')),
+            ],
+            options={
+                'abstract': False,
+            },
+        ),
+        migrations.AlterUniqueTogether(
+            name='datafile',
+            unique_together={('corpus', 'hash')},
+        ),
+    ]
diff --git a/arkindex/dataimport/migrations/__init__.py b/arkindex/dataimport/migrations/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/arkindex/dataimport/models.py b/arkindex/dataimport/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f8aaa13d4cdf3789b41bbcdda604eeb8318314e
--- /dev/null
+++ b/arkindex/dataimport/models.py
@@ -0,0 +1,120 @@
+from django.db import models
+from django.contrib.postgres.fields import JSONField
+from django.template.defaultfilters import slugify
+from django.conf import settings
+from celery.canvas import Signature
+from enumfields import EnumField, Enum
+from celery.result import AsyncResult, GroupResult
+from arkindex.project.models import IndexableModel
+import uuid
+import os
+
+
+class DataImportState(Enum):
+    Created = 'created'
+    Configured = 'configured'
+    Running = 'running'
+    Done = 'done'
+    Error = 'error'
+
+
+class DataImportMode(Enum):
+    Images = 'images'
+    Annotations = 'annotations'
+    Surfaces = 'surfaces'
+    Acts = 'acts'
+
+
+class DataImport(IndexableModel):
+    """
+    A single import workflow
+    """
+
+    creator = models.ForeignKey('users.User', on_delete=models.CASCADE, related_name='imports')
+    corpus = models.ForeignKey('documents.Corpus', on_delete=models.CASCADE, related_name='imports')
+    state = EnumField(DataImportState, default=DataImportState.Created, max_length=30)
+    mode = EnumField(DataImportMode, max_length=30)
+    files = models.ManyToManyField('dataimport.DataFile', related_name='imports')
+    payload = JSONField(null=True, blank=True)
+    root_id = models.UUIDField(null=True, blank=True)
+    task_count = models.PositiveSmallIntegerField(null=True, blank=True)
+
+    @property
+    def celery_root(self):
+        return AsyncResult(str(self.root_id)) if self.root_id else None
+
+    @property
+    def tasks(self):
+        """
+        Get a list of all tasks associated with the import
+        """
+        return filter(
+            lambda task: not isinstance(task, GroupResult),
+            reversed(self.celery_root.build_graph(True).topsort())
+        ) if self.celery_root else None
+
+    def build_workflow(self):
+        # Only Images import is supported
+        assert self.mode == DataImportMode.Images
+
+        # Prevent circular imports
+        from arkindex.dataimport.tasks import check_images, import_images
+        return check_images.s(self) | import_images.s(self)
+
+    def get_task_count(self, signature):
+        assert isinstance(signature, Signature)
+
+        def _size(sig):
+            # Recursively count tasks in chains, groups and chord bodies
+            out = 0
+            if hasattr(sig, 'body'):
+                out += _size(sig.body)
+            if hasattr(sig, 'tasks'):
+                out += sum(map(_size, sig.tasks))
+            else:
+                out = 1
+            return out
+
+        return _size(signature)
+
+    def start(self):
+        assert self.state != DataImportState.Running
+        signature = self.build_workflow()
+
+        parent_task = signature.delay()
+        # Walk up the chain to get the parent task ID (root ID)
+        while parent_task.parent is not None:
+            parent_task = parent_task.parent
+        self.root_id = parent_task.id
+
+        self.task_count = self.get_task_count(signature)
+        self.state = DataImportState.Running
+        self.save()
+
+    @property
+    def folder_name(self):
+        if not self.payload or 'folder_name' not in self.payload:
+            return
+        return slugify(self.payload['folder_name'])
+
+    @property
+    def iiif_path(self):
+        if not self.folder_name:
+            return
+        return os.path.join(settings.LOCAL_MEDIA_ROOT, self.folder_name)
+
+
+class DataFile(models.Model):
+    id = models.UUIDField(primary_key=True, default=uuid.uuid4)
+    name = models.CharField(max_length=100)
+    size = models.PositiveIntegerField()
+    hash = models.CharField(max_length=32)
+    content_type = models.CharField(max_length=50)
+    corpus = models.ForeignKey('documents.Corpus', on_delete=models.CASCADE, related_name='files')
+
+    class Meta:
+        unique_together = (('corpus', 'hash'), )
+
+    @property
+    def staging_path(self):
+        return os.path.join(settings.MEDIA_ROOT, str(self.id))
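Since building celery signatures sends nothing to the broker, the counting recursion used by `get_task_count` can be sanity-checked standalone; the app and task below are throwaway examples, not part of this change:

```python
# Standalone check of the signature-counting recursion; 'scratch' and step()
# are throwaway examples. Nothing is executed, so no broker is needed.
from celery import Celery, group

app = Celery('scratch')

@app.task
def step():
    pass

def size(sig):
    out = 0
    if hasattr(sig, 'body'):   # chord callback
        out += size(sig.body)
    if hasattr(sig, 'tasks'):  # chain or group
        out += sum(map(size, sig.tasks))
    else:
        out = 1                # plain signature
    return out

assert size(step.si()) == 1
assert size(step.si() | step.si()) == 2          # a two-task chain, like build_workflow()
assert size(group(step.si(), step.si())) == 2    # groups are counted member by member
```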
diff --git a/arkindex/dataimport/serializers.py b/arkindex/dataimport/serializers.py
new file mode 100644
index 0000000000000000000000000000000000000000..92b9cdbf535323c9e868e24f0a94da967c11991d
--- /dev/null
+++ b/arkindex/dataimport/serializers.py
@@ -0,0 +1,148 @@
+from rest_framework import serializers
+from rest_framework.utils import model_meta
+from arkindex.project.serializer_fields import EnumField
+from arkindex.dataimport.models import DataImport, DataImportMode, DataImportState, DataFile
+import celery.states
+
+
+class TaskSerializer(serializers.Serializer):
+    """
+    Serialize a task and its status info
+    """
+
+    id = serializers.UUIDField()
+    name = serializers.SerializerMethodField()
+    state = serializers.CharField()
+    progress = serializers.SerializerMethodField()
+    messages = serializers.SerializerMethodField()  # TODO: use a custom serializer ?
+    parent_id = serializers.SerializerMethodField()
+
+    def get_name(self, obj):
+        name = obj._get_task_meta().get('task_name')
+
+        # Remove prefix
+        prefix = 'arkindex.dataimport.tasks.'
+        if name and name.startswith(prefix):
+            return name[len(prefix):]
+        return name
+
+    def get_messages(self, obj):
+        return obj._get_task_meta().get('messages', [])
+
+    def get_progress(self, obj):
+        if obj.state in celery.states.READY_STATES:
+            return 1.0
+        if obj.state == 'PROGRESS':
+            result = obj._get_task_meta().get('result') or {}
+            return result.get('progress', 0.0)
+        return 0.0
+
+    def get_parent_id(self, obj):
+        return obj.parent and obj.parent.id
+
+    class Meta:
+        fields = (
+            'id',
+            'name',
+            'state',
+            'progress',
+            'messages',
+            'parent_id',
+        )
+        read_only_fields = fields
+
+
+class DataImportLightSerializer(serializers.ModelSerializer):
+    """
+    Serialize a data importing workflow
+    """
+
+    state = EnumField(DataImportState, read_only=True)
+    mode = EnumField(DataImportMode)
+    creator = serializers.HiddenField(default=serializers.CurrentUserDefault())
+
+    class Meta:
+        model = DataImport
+        fields = (
+            'id',
+            'state',
+            'mode',
+            'corpus',
+            'creator',
+        )
+        read_only_fields = ('id', )
+
+
+class ImagesPayloadSerializer(serializers.Serializer):
+    """
+    Serialize an image importing payload
+    """
+
+    folder_name = serializers.CharField()
+    volume_name = serializers.CharField()
+
+
+class DataImportSerializer(DataImportLightSerializer):
+    """
+    Serialize a data importing workflow with its payload
+    """
+
+    # Redefine state as writable
+    state = EnumField(DataImportState)
+    payload = serializers.JSONField()
+    tasks = TaskSerializer(many=True, read_only=True)
+    task_count = serializers.IntegerField(read_only=True)
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if self.instance is None:
+            return
+        payload_serializers = {
+            DataImportMode.Images: ImagesPayloadSerializer,
+        }
+        self.fields['payload'] = payload_serializers.get(self.instance.mode, serializers.JSONField)()
+
+    def update(self, instance, validated_data):
+        # ModelSerializer disallows updating with nested serializers.
+        # We dynamically inject a payload serializer, but it is useless for the database
+        # since the payload field is a JSONField with no particular checks.
+        # We don't have to care about a nested serializer and we can treat it just as a dict.
+        # This code is copy-pasted from DRF's ModelSerializer.
+        info = model_meta.get_field_info(instance)
+        for attr, value in validated_data.items():
+            if attr in info.relations and info.relations[attr].to_many:
+                field = getattr(instance, attr)
+                field.set(value)
+            else:
+                setattr(instance, attr, value)
+        instance.save()
+        return instance
+
+    class Meta(DataImportLightSerializer.Meta):
+        fields = (
+            'id',
+            'state',
+            'mode',
+            'corpus',
+            'creator',
+            'payload',
+            'files',
+            'tasks',
+            'task_count',
+        )
+
+
+class DataFileSerializer(serializers.ModelSerializer):
+    """
+    Serialize a single uploaded file
+    """
+
+    class Meta:
+        model = DataFile
+        fields = (
+            'id',
+            'name',
+            'content_type',
+            'size',
+        )
+        read_only_fields = ('id', 'size', 'content_type', )
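The payload serializer injected in `__init__` gives images imports real field validation; a quick sketch of what it enforces, assuming the project's Django environment (e.g. the test runner or `./manage.py shell`):

```python
# Sketch: the validation performed by the payload serializer injected for
# DataImportMode.Images.
from arkindex.dataimport.serializers import ImagesPayloadSerializer

s = ImagesPayloadSerializer(data={'folder_name': 'my-batch', 'volume_name': 'Volume 1'})
assert s.is_valid()

s = ImagesPayloadSerializer(data={'folder_name': 'my-batch'})
assert not s.is_valid()
print(s.errors)  # {'volume_name': ['This field is required.']}
```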
diff --git a/arkindex/dataimport/tasks.py b/arkindex/dataimport/tasks.py
new file mode 100644
index 0000000000000000000000000000000000000000..a418ea0c69c98786a4dbd6d1829547eb735311bd
--- /dev/null
+++ b/arkindex/dataimport/tasks.py
@@ -0,0 +1,120 @@
+from celery import shared_task
+from celery.utils.log import get_task_logger
+from celery.signals import task_postrun
+from celery.states import EXCEPTION_STATES
+from django.conf import settings
+from arkindex.project.celery import ReportingTask
+from arkindex.documents.models import Element, ElementType
+from arkindex.documents.importer import import_page
+from arkindex.images.models import ImageServer
+from arkindex.dataimport.models import DataImport, DataImportState
+from PIL import Image
+from shutil import copyfile
+import os
+import logging
+import urllib.parse
+
+logger = get_task_logger(__name__)
+
+
+@shared_task(bind=True, base=ReportingTask)
+def check_images(self, dataimport):
+    assert isinstance(dataimport, DataImport)
+
+    datafiles = dataimport.files.all()
+    valid_files = []
+    filecount = len(datafiles)
+
+    for i, datafile in enumerate(datafiles):
+        self.report_progress(i / filecount, "Checking image {} of {}".format(i + 1, filecount))
+
+        try:
+            fmt = Image.open(datafile.staging_path).format
+            assert fmt in ('JPEG', 'JPEG2000')
+        except IOError:
+            self.report_message("File {} is not a valid image".format(datafile.name), level=logging.WARNING)
+            continue
+        except AssertionError:
+            self.report_message("File {} is not a valid JPEG image".format(datafile.name), level=logging.WARNING)
+            continue
+
+        valid_files.append(datafile)
+
+    assert len(valid_files) > 0, "No valid JPEG images in selected files"
+    return valid_files
+
+
+@shared_task(bind=True, base=ReportingTask)
+def import_images(self, valid_files, dataimport, server_id=settings.LOCAL_IMAGESERVER_ID):
+    assert isinstance(dataimport, DataImport)
+    self.report_progress(0, 'Pre import checks...')
+    try:
+        server = ImageServer.objects.get(id=server_id)
+    except ImageServer.DoesNotExist:
+        raise ValueError('Image server {} does not exist'.format(server_id))
+
+    if os.path.exists(dataimport.iiif_path):
+        raise ValueError('Folder with name {} already exists'.format(dataimport.folder_name))
+    os.makedirs(dataimport.iiif_path)
+
+    # Get volume
+    volume_name = dataimport.payload['volume_name']
+    vol, _ = Element.objects.get_or_create(
+        type=ElementType.Volume,
+        name=volume_name,
+        corpus=dataimport.corpus,
+    )
+
+    datafiles = dataimport.files.all()
+    for i, datafile in enumerate(datafiles, 1):
+        self.report_progress(i / len(datafiles), 'Importing image {} of {}'.format(i, len(datafiles)))
+
+        pillow_img = Image.open(datafile.staging_path)
+        width, height = pillow_img.size
+
+        ext = '.jp2' if pillow_img.format == 'JPEG2000' else '.jpg'
+        newfilename = str(datafile.id) + ext
+        copyfile(datafile.staging_path, os.path.join(dataimport.iiif_path, newfilename))
+
+        img, _ = server.images.get_or_create(
+            path=urllib.parse.urljoin(dataimport.folder_name + '/', newfilename),
+            defaults={
+                'width': width,
+                'height': height
+            }
+        )
+
+        import_page(vol, img, volume_name, str(i), i)
+
+    self.report_message("Imported files into {}".format(vol))
+    return {'volume': str(vol.id)}
+
+
+@task_postrun.connect
+def dataimport_postrun(task_id, task, state, args=(), **kwargs):
+    '''
+    Update DataImport state after task exit
+    '''
+
+    # Only for dataimport tasks
+    if not task.name.startswith('arkindex.dataimport.tasks'):
+        return
+
+    # Look for the DataImport in the task args
+    imps = [a for a in args if isinstance(a, DataImport)]
+    assert len(imps) == 1, 'Expected exactly one DataImport argument on task {}'.format(task)
+    dataimport = imps[0]
+    dataimport.refresh_from_db()  # avoid inconsistency
+
+    # Manage error state
+    if any(t.failed() for t in dataimport.tasks) or state in EXCEPTION_STATES:
+        dataimport.state = DataImportState.Error
+
+        # Report exception message
+        if 'retval' in kwargs:
+            task.report_message(str(kwargs['retval']), logging.ERROR)
+
+    # Manage completion when all tasks are completed
+    elif sum(t.successful() for t in dataimport.tasks) == dataimport.task_count:
+        dataimport.state = DataImportState.Done
+
+    dataimport.save()
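When debugging a rejected upload, the format whitelist from `check_images` is easy to reproduce outside a worker; `'scan.jpg'` below is a placeholder path:

```python
# Sketch: the same Pillow-based check that check_images applies to each
# staged file, runnable on a local path.
from PIL import Image

def is_importable(path):
    try:
        fmt = Image.open(path).format
    except IOError:
        return False  # not an image Pillow can read
    return fmt in ('JPEG', 'JPEG2000')

print(is_importable('scan.jpg'))
```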
diff --git a/arkindex/dataimport/tests/__init__.py b/arkindex/dataimport/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/arkindex/dataimport/tests/test_files.py b/arkindex/dataimport/tests/test_files.py
new file mode 100644
index 0000000000000000000000000000000000000000..03a2335003d6c013c0cd2728d73bfb6898556402
--- /dev/null
+++ b/arkindex/dataimport/tests/test_files.py
@@ -0,0 +1,105 @@
+from rest_framework.test import APITestCase
+from rest_framework import status
+from django.urls import reverse
+from django.core.files.uploadedfile import SimpleUploadedFile
+from django.conf import settings
+from arkindex.documents.models import Corpus
+from arkindex.dataimport.models import DataFile
+from arkindex.users.models import User
+import os
+
+
+class TestFiles(APITestCase):
+    """
+    Test file management
+    """
+
+    def setUp(self):
+        self.corpus = Corpus.objects.create(id='test', name='Unit Tests')
+        self.user = User.objects.create_user('test@test.test', 'testtest')
+
+    def test_file_list(self):
+        df = DataFile.objects.create(
+            name='test.txt', size=42, hash='aaaa', content_type='text/plain', corpus=self.corpus)
+        self.client.force_login(self.user)
+
+        response = self.client.get(reverse('api:file-list', kwargs={'pk': self.corpus.id}))
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        data = response.json()
+        self.assertIn('results', data)
+        self.assertEqual(len(data['results']), 1)
+
+        file = data['results'][0]
+        self.assertIn('id', file)
+        self.assertIn('name', file)
+        self.assertIn('size', file)
+        self.assertEqual(file['id'], str(df.id))
+        self.assertEqual(file['name'], df.name)
+        self.assertEqual(file['size'], df.size)
+
+    def test_file_list_requires_login(self):
+        response = self.client.get(reverse('api:file-list', kwargs={'pk': self.corpus.id}))
+        self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
+
+    def test_file_upload(self):
+        """
+        Assert a file upload creates a database instance and saves the file
+        """
+        f = SimpleUploadedFile('test.txt', b'This is a text file')
+        self.client.force_login(self.user)
+
+        response = self.client.post(reverse('api:file-upload', kwargs={'pk': self.corpus.id}), data={'file': f})
+        self.assertEqual(response.status_code, status.HTTP_201_CREATED)
+        data = response.json()
+        self.assertIn('id', data)
+
+        df = DataFile.objects.get(id=data['id'])
+        self.assertEqual(df.name, 'test.txt')
+        self.assertEqual(df.size, 19)
+        self.assertEqual(df.content_type, 'text/plain')
+        self.assertEqual(df.corpus, self.corpus)
+        self.assertTrue(os.path.exists(os.path.join(settings.MEDIA_ROOT, str(df.id))))
+
+    def test_file_upload_requires_login(self):
+        """
+        Assert file uploads are rejected for anonymous users
+        """
+        f = SimpleUploadedFile('test.txt', b'This is a text file')
+        response = self.client.post(reverse('api:file-upload', kwargs={'pk': self.corpus.id}), data={'file': f})
+        self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
+
+    def test_file_upload_unknown_corpus(self):
+        """
+        Assert file upload fails on an unknown corpus with HTTP 400
+        """
+        f = SimpleUploadedFile('test.txt', b'This is a text file')
+        self.client.force_login(self.user)
+        response = self.client.post(reverse('api:file-upload', kwargs={'pk': 'nope'}), data={'file': f})
+        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
+
+    def test_file_upload_wrong_content_type(self):
+        """
+        Assert file upload does not trust the client defined content type
+        """
+        f = SimpleUploadedFile('test.mp3', b'This actually is a text file', content_type='audio/mpeg')
+        self.client.force_login(self.user)
+
+        response = self.client.post(reverse('api:file-upload', kwargs={'pk': self.corpus.id}), data={'file': f})
+        self.assertEqual(response.status_code, status.HTTP_201_CREATED)
+        data = response.json()
+        self.assertIn('id', data)
+
+        df = DataFile.objects.get(id=data['id'])
+        self.assertEqual(df.name, 'test.mp3')
+        self.assertEqual(df.content_type, 'text/plain')
+
+    def test_file_upload_unique(self):
+        self.client.force_login(self.user)
+
+        f = SimpleUploadedFile('test.txt', b'This is a text file')
+        response = self.client.post(reverse('api:file-upload', kwargs={'pk': self.corpus.id}), data={'file': f})
+        self.assertEqual(response.status_code, status.HTTP_201_CREATED)
+
+        f = SimpleUploadedFile('test2.txt', b'This is a text file')
+        response = self.client.post(reverse('api:file-upload', kwargs={'pk': self.corpus.id}), data={'file': f})
+        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
diff --git a/arkindex/dataimport/tests/test_imports.py b/arkindex/dataimport/tests/test_imports.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e05e8e79d22d67f85bbaffe1509a07828ac0aa6
--- /dev/null
+++ b/arkindex/dataimport/tests/test_imports.py
@@ -0,0 +1,45 @@
+from rest_framework.test import APITestCase
+from rest_framework import status
+from django.urls import reverse
+from arkindex.dataimport.models import DataImport, DataImportMode, DataImportState
+from arkindex.documents.models import Corpus
+from arkindex.users.models import User
+
+
+class TestImports(APITestCase):
+    """
+    Test data imports management
+    """
+
+    def setUp(self):
+        self.corpus = Corpus.objects.create(id='test', name='Unit Tests')
+        self.user = User.objects.create_user('test@test.test', 'testtest')
+        self.dataimport = DataImport.objects.create(
+            creator=self.user, corpus=self.corpus, mode=DataImportMode.Images)
+
+    def test_list_requires_login(self):
+        response = self.client.get(reverse('api:import-list'))
+        self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
+
+    def test_list(self):
+        self.client.force_login(self.user)
+        response = self.client.get(reverse('api:import-list'))
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        data = response.json()
+        self.assertEqual(len(data['results']), 1)
+        result = data['results'][0]
+        self.assertEqual(result['id'], str(self.dataimport.id))
+        self.assertEqual(result['state'], DataImportState.Created.value)
+        self.assertEqual(result['mode'], DataImportMode.Images.value)
+        self.assertEqual(result['corpus'], str(self.corpus.id))
+
+    def test_details_requires_login(self):
+        response = self.client.get(reverse('api:import-details', kwargs={'pk': self.dataimport.id}))
+        self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
+
+    def test_details(self):
+        self.client.force_login(self.user)
+        response = self.client.get(reverse('api:import-details', kwargs={'pk': self.dataimport.id}))
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        data = response.json()
+        self.assertEqual(data['id'], str(self.dataimport.id))
diff --git a/arkindex/dataimport/urls.py b/arkindex/dataimport/urls.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdbf5b45844275aec7fea7655fd6c07623da4aef
--- /dev/null
+++ b/arkindex/dataimport/urls.py
@@ -0,0 +1,12 @@
+from django.conf.urls import url
+from arkindex.dataimport.views import \
+    DataImportsList, DataImportCreate, DataImportConfig, DataImportStatus, DataFileList
+
+
+urlpatterns = [
+    url(r'^$', DataImportsList.as_view(), name='imports'),
+    url(r'^new/?$', DataImportCreate.as_view(), name='import-create'),
+    url(r'^(?P<pk>[\w\-]+)/config/?$', DataImportConfig.as_view(), name='import-config'),
+    url(r'^(?P<pk>[\w\-]+)/status/?$', DataImportStatus.as_view(), name='import-status'),
+    url(r'^files/?$', DataFileList.as_view(), name='files'),
+]
diff --git a/arkindex/dataimport/views.py b/arkindex/dataimport/views.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba0c6a77a3f51e3644653eed7ececbb474ba6562
--- /dev/null
+++ b/arkindex/dataimport/views.py
@@ -0,0 +1,50 @@
+from django.views.generic import TemplateView, DetailView
+from django.contrib.auth.mixins import LoginRequiredMixin
+from arkindex.dataimport.models import DataImport, DataImportState
+
+
+class DataImportsList(LoginRequiredMixin, TemplateView):
+    """
+    List data imports using Vue JS + API
+    """
+    template_name = 'dataimport/list.html'
+
+
+class DataImportCreate(LoginRequiredMixin, TemplateView):
+    """
+    Create a new workflow
+    """
+    template_name = 'dataimport/new.html'
+
+
+class DataImportConfig(LoginRequiredMixin, DetailView):
+    """
+    Configure a single data import workflow
+    """
+    template_name = 'dataimport/config.html'
+    context_object_name = 'dataimport'
+
+    def get_queryset(self):
+        return DataImport.objects.filter(
+            state__in=[DataImportState.Created, DataImportState.Configured],
+            creator=self.request.user)
+
+
+class DataImportStatus(LoginRequiredMixin, DetailView):
+    """
+    View a data import workflow's status
+    """
+    template_name = 'dataimport/status.html'
+    context_object_name = 'dataimport'
+
+    def get_queryset(self):
+        return DataImport.objects.filter(creator=self.request.user).exclude(
+            state__in=[DataImportState.Created, DataImportState.Configured],
+        )
+
+
+class DataFileList(LoginRequiredMixin, TemplateView):
+    """
+    View and manage uploaded files
+    """
+    template_name = 'dataimport/files.html'
diff --git a/arkindex/documents/management/commands/import_images.py b/arkindex/documents/management/commands/import_images.py
deleted file mode 100644
index a01ce693b490bd61eb8cfcdc4c4cae640bcd18b5..0000000000000000000000000000000000000000
--- a/arkindex/documents/management/commands/import_images.py
+++ /dev/null
@@ -1,80 +0,0 @@
-from django.core.management.base import BaseCommand, CommandError
-from arkindex.images.models import ImageServer
-from arkindex.documents.models import Element, ElementType, Corpus
-from arkindex.documents.importer import import_page
-import logging
-import argparse
-
-logging.basicConfig(
-    level=logging.INFO,
-    format='[%(levelname)s] %(message)s',
-)
-logger = logging.getLogger(__name__)
-
-
-class Command(BaseCommand):
-    help = 'Import images & documents from a unique CSV file'
-    volumes = {}  # cache
-
-    def add_arguments(self, parser):
-        parser.add_argument(
-            'image_list',
-            help='Text file that lists all images',
-            type=argparse.FileType('r'),
-        )
-        parser.add_argument(
-            '--iiif-server',
-            type=str,
-            help='IIIF server ID or name',
-            required=True,
-        )
-        parser.add_argument(
-            '--volume-name',
-            type=str,
-            help='Name of the volume to import images in',
-            required=True,
-        )
-        parser.add_argument(
-            '--corpus-id',
-            type=str,
-            help='Slug of the corpus to import images in',
-            required=True,
-        )
-        parser.add_argument(
-            '--offline',
-            action='store_true',
-            default=False,
-            help='Do not execute network queries to check images',
-        )
-
-    def handle(self, *args, **options):
-        # TODO: move in task
-
-        # Handle verbosity level
-        verbosity = int(options['verbosity'])
-        root_logger = logging.getLogger('')
-        if verbosity > 1:
-            root_logger.setLevel(logging.DEBUG)
-        logger = logging.getLogger(__name__)
-
-        # Loading image server
-        logger.info('Loading image server')
-        try:
-            server_id = options['iiif_server']
-            if server_id.isdigit():
-                server = ImageServer.objects.get(id=server_id)
-            else:
-                server = ImageServer.objects.get(name=server_id)
-        except ImageServer.DoesNotExist:
-            raise CommandError('No image server found !')
-
-        logger.info('Loading corpus and volume')
-        corpus = Corpus.objects.get(id=options['corpus_id'])
-        vol, _ = Element.objects.get_or_create(type=ElementType.Volume, name=options['volume_name'], corpus=corpus)
-
-        for i, path in enumerate(options['image_list'].readlines(), 1):
-            img = server.find_image(path.strip(), offline=options['offline'])
-            p = import_page(vol, img, options['volume_name'], str(i), i)
-            logger.info('Created page {} for image {}'.format(p, img))
-
-        logger.info('All done.')
diff --git a/arkindex/documents/serializers.py b/arkindex/documents/serializers.py
index d4149b3627f93f21fd498a45a5296c19eaefe3b9..fd8d81356bbb593f933db47373f8975041af1703 100644
--- a/arkindex/documents/serializers.py
+++ b/arkindex/documents/serializers.py
@@ -3,9 +3,9 @@ from django.conf import settings
 from rest_framework import serializers
 from arkindex.documents.models import \
     Element, ElementType, Transcription, Page, PageType, PageDirection, Act, Corpus
-from arkindex.documents.serializer_fields import EnumField, ViewerURLField
 from arkindex.images.models import Image, Zone
 from arkindex.images.serializers import ZoneSerializer, ImageSerializer
+from arkindex.project.serializer_fields import EnumField, ViewerURLField
 from arkindex.project.tools import sslify_url
 import urllib.parse
diff --git a/arkindex/project/api_v1.py b/arkindex/project/api_v1.py
index 881a383089e04e9ce8f38deb5248fdd183d9d080..9f00ae8136f34418f64aa7e052cb51362a3d4136 100644
--- a/arkindex/project/api_v1.py
+++ b/arkindex/project/api_v1.py
@@ -6,6 +6,8 @@ from arkindex.documents.api import \
     PageAnnotationList, PageActAnnotationList, SurfaceAnnotationList, \
     TranscriptionSearch, ActSearch, TranscriptionSearchAnnotationList, \
     ActEdit, TranscriptionCreate, TranscriptionBulk, SurfaceDetails
+from arkindex.dataimport.api import \
+    DataImportsList, DataImportDetails, DataFileList, DataFileRetrieve, DataFileUpload
 
 
 api = [
@@ -64,4 +66,11 @@ api = [
     # Ingest transcriptions
     url(r'^transcription/?$', TranscriptionCreate.as_view(), name='transcription-create'),
     url(r'^transcription/bulk/?$', TranscriptionBulk.as_view(), name='transcription-bulk'),
+
+    # Import workflows
+    url(r'^imports/$', DataImportsList.as_view(), name='import-list'),
+    url(r'^imports/(?P<pk>[\w\-]+)$', DataImportDetails.as_view(), name='import-details'),
+    url(r'^imports/files/(?P<pk>[\w\-]+)$', DataFileList.as_view(), name='file-list'),
+    url(r'^imports/file/(?P<pk>[\w\-]+)$', DataFileRetrieve.as_view(), name='file-retrieve'),
+    url(r'^imports/upload/(?P<pk>[\w\-]+)$', DataFileUpload.as_view(), name='file-upload'),
 ]
diff --git a/arkindex/project/celery.py b/arkindex/project/celery.py
index 970f198a89cd581a47776fad5f86093e77fc31b4..02a18c1823f79f8a698d1394268a235c183db725 100644
--- a/arkindex/project/celery.py
+++ b/arkindex/project/celery.py
@@ -1,6 +1,11 @@
 from __future__ import absolute_import, unicode_literals
 import os
-from celery import Celery
+import time
+import logging
+from celery import Celery, Task
+from celery.utils.log import get_task_logger
+from celery.backends.redis import RedisBackend
+from kombu.utils.encoding import bytes_to_str
 from django.conf import settings
 
 # set the default Django settings module for the 'celery' program.
@@ -8,6 +13,8 @@ os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'arkindex.project.settings')
 
 app = Celery('arkindex')
 
+app.loader.override_backends['redis'] = 'arkindex.project.celery:ExtendedRedisBackend'
+
 # Using a string here means the worker doesn't have to serialize
 # the configuration object to child processes.
 # - namespace='CELERY' means all celery-related configuration keys
@@ -22,6 +29,71 @@ app.conf.ONCE = settings.CELERY_ONCE
 
 app.autodiscover_tasks()
 
+logger = get_task_logger(__name__)
+
+
+class ExtendedRedisBackend(RedisBackend):
+
+    def add_message(self, task_id, message):
+        '''
+        Add a message into the backend, on a separate redis key
+        Used by our code only
+        '''
+        key = self.get_key_for_task(task_id)
+        existing = self.get(key)
+        meta = existing and self.decode(existing) or {}
+        meta.setdefault('messages', []).append(message)
+        self.set(key, self.encode(meta))
+
+    def _store_result(self, task_id, result, state,
+                      traceback=None, request=None, **kwargs):
+        key = self.get_key_for_task(task_id)
+
+        # Load existing messages
+        existing = self.get(key)
+        messages = existing and self.decode(existing).get('messages') or []
+
+        meta = {
+            'status': state,
+            'result': result,
+            'traceback': traceback,
+            'children': self.current_task_children(request),
+            'task_id': bytes_to_str(task_id),
+
+            # Added for our usage
+            'messages': messages,
+            'task_name': request and str(request.task),
+        }
+        if request and getattr(request, 'group', None):
+            meta['group_id'] = request.group
+        self.set(key, self.encode(meta))
+        return result
+
+
+class ReportingTask(Task):
+
+    def report_progress(self, progress, message=None):
+        assert 0.0 <= progress <= 1.0
+        self.update_state(state='PROGRESS', meta={'progress': progress})
+
+        # Report message as info
+        if not isinstance(message, str):
+            message = '{:.0%}'.format(progress)
+        self.report_message(message)
+
+    def report_message(self, message, level=logging.INFO):
+        assert isinstance(message, str)
+        logger.log(msg=message, level=level)
+        self.backend.add_message(
+            self.request.id,
+            {
+                'level': level,
+                'message': message,
+                'time': time.time(),
+            }
+        )
+
+
 @app.task(bind=True)
 def debug_task(self):
     print('Request: {0!r}'.format(self.request))
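Any task can opt into this reporting by declaring `ReportingTask` as its base, as the dataimport tasks do; a sketch with a hypothetical task (not part of this change):

```python
# Sketch: a hypothetical task using ReportingTask. Progress and messages end
# up in the result meta that TaskSerializer reads; running it needs a worker.
from celery import shared_task
from arkindex.project.celery import ReportingTask

@shared_task(bind=True, base=ReportingTask)
def crunch(self, items):
    for i, item in enumerate(items, 1):
        self.report_progress(i / len(items), 'Processing {} of {}'.format(i, len(items)))
    self.report_message('All done')
    return len(items)
```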
diff --git a/arkindex/project/runtests.py b/arkindex/project/runtests.py
index b6c8e3426f4a7e5be55e938bae793b6d17939c13..0bc48d7c474f059f903acef6878b83887b2f9ee7 100644
--- a/arkindex/project/runtests.py
+++ b/arkindex/project/runtests.py
@@ -21,6 +21,7 @@ def run():
         'arkindex.documents.tests',
         'arkindex.images.tests',
         'arkindex.project.tests',
+        'arkindex.dataimport.tests',
     ])
     sys.exit(failures)
 
diff --git a/arkindex/documents/serializer_fields.py b/arkindex/project/serializer_fields.py
similarity index 85%
rename from arkindex/documents/serializer_fields.py
rename to arkindex/project/serializer_fields.py
index 82a8af19d177dfe2a5d363bd6ddb068836d57702..19c286f470216965afc5549ab3da93f197f5da6c 100644
--- a/arkindex/documents/serializer_fields.py
+++ b/arkindex/project/serializer_fields.py
@@ -4,7 +4,7 @@ from rest_framework.reverse import reverse
 from enum import Enum
 
 
-class EnumField(serializers.Field):
+class EnumField(serializers.ChoiceField):
     """
     Serializes an enum field into a JSON value
     """
@@ -12,7 +12,8 @@ class EnumField(serializers.Field):
     def __init__(self, enum, *args, **kwargs):
         assert issubclass(enum, Enum) or not enum
         self.enum = enum
-        super().__init__(*args, **kwargs)
+        choices = [(item.value, item.name) for item in self.enum]
+        super().__init__(choices, *args, **kwargs)
 
     def to_representation(self, obj):
         return obj.value
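With `ChoiceField` as the base, an `EnumField` now advertises its choices and validates input against them; a short sketch, assuming the project's Django environment (e.g. `./manage.py shell`):

```python
# Sketch: EnumField as a ChoiceField. The choices mapping comes from the
# enum's values and names.
from arkindex.project.serializer_fields import EnumField
from arkindex.dataimport.models import DataImportMode

field = EnumField(DataImportMode)
print(dict(field.choices))                             # {'images': 'Images', 'annotations': 'Annotations', ...}
print(field.to_representation(DataImportMode.Images))  # 'images'
```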
diff --git a/arkindex/project/settings.py b/arkindex/project/settings.py
index e71bda688e6af4f481f14e7683018557b8406c7d..a24251d25b7e1690780ee46c0167c7cf415b5249 100644
--- a/arkindex/project/settings.py
+++ b/arkindex/project/settings.py
@@ -57,6 +57,7 @@ INSTALLED_APPS = [
     'arkindex.images',
     'arkindex.documents',
     'arkindex.users',
+    'arkindex.dataimport',
 ]
 
 MIDDLEWARE = [
@@ -162,6 +163,14 @@ STATICFILES_DIRS = [
     FRONTEND_DIR,
 ]
 
+# File uploads
+MEDIA_URL = '/media/'
+MEDIA_ROOT = os.environ.get('MEDIA_ROOT', os.path.join(BASE_DIR, 'media'))
+if not os.path.isdir(MEDIA_ROOT):
+    os.makedirs(MEDIA_ROOT)
+LOCAL_MEDIA_ROOT = os.environ.get('LOCAL_MEDIA_ROOT', os.path.join(BASE_DIR, 'iiif-users'))
+LOCAL_IMAGESERVER_ID = os.environ.get('LOCAL_IMAGESERVER_ID', 1)
+
 # Frontend
 WEBPACK_LOADER = {
     'DEFAULT': {
@@ -288,6 +297,10 @@ LOGGING = {
 # Async Workers
 CELERY_BROKER_URL = os.environ.get('QUEUE_URL', 'redis://localhost:6379/0')
 CELERY_RESULT_BACKEND = os.environ.get('QUEUE_URL', 'redis://localhost:6379/0')
+CELERY_TRACK_STARTED = True
+CELERY_TASK_SERIALIZER = 'pickle'
+CELERY_RESULT_SERIALIZER = 'pickle'
+CELERY_ACCEPT_CONTENT = ['json', 'pickle']
 CELERY_ONCE = {
     'backend': 'celery_once.backends.Redis',
     'settings': {
diff --git a/arkindex/project/urls.py b/arkindex/project/urls.py
index 394159ad226cc159b2219c2b4855b488c5c34644..337266deec14fa2ef42b92f3d0ca2f6ab81a5837 100644
--- a/arkindex/project/urls.py
+++ b/arkindex/project/urls.py
@@ -6,5 +6,6 @@ urlpatterns = [
     url(r'^api/v1/', include((api, 'api'), namespace='api')),
     url(r'^admin/', admin.site.urls),
     url(r'^user/', include('django.contrib.auth.urls')),
+    url(r'^imports/', include('arkindex.dataimport.urls')),
     url(r'^', include('arkindex.documents.urls')),
 ]
diff --git a/arkindex/templates/base.html b/arkindex/templates/base.html
index 48c3168e26ecc1d72424c54e0066991a4982fa00..097bd77b6c151001d9cd1b0da810521c4210f608 100644
--- a/arkindex/templates/base.html
+++ b/arkindex/templates/base.html
@@ -26,6 +26,14 @@
         <a class="navbar-item" href="{% url 'acts' %}">
           Acts
         </a>
+        {% if user.is_authenticated %}
+        <a class="navbar-item" href="{% url 'imports' %}">
+          Imports
+        </a>
+        <a class="navbar-item" href="{% url 'files' %}">
+          Files
+        </a>
+        {% endif %}
       </div>
 
       <div class="navbar-end">
diff --git a/arkindex/templates/dataimport/config.html b/arkindex/templates/dataimport/config.html
new file mode 100644
index 0000000000000000000000000000000000000000..0018aa27bc481a3ad1df036eb1953dc766bc6f96
--- /dev/null
+++ b/arkindex/templates/dataimport/config.html
@@ -0,0 +1,10 @@
+{% extends 'base.html' %}
+
+{% block content %}
+<h1 class="title">Workflow configuration</h1>
+<h2 class="subtitle">Configure a specific workflow</h2>
+
+<div id="app">
+  <Import-Config id="{{ dataimport.id }}" />
+</div>
+{% endblock %}
diff --git a/arkindex/templates/dataimport/files.html b/arkindex/templates/dataimport/files.html
new file mode 100644
index 0000000000000000000000000000000000000000..f9974cc03b2ec378531b484b09d2b76367f1371d
--- /dev/null
+++ b/arkindex/templates/dataimport/files.html
@@ -0,0 +1,10 @@
+{% extends 'base.html' %}
+
+{% block content %}
+<h1 class="title">Data files</h1>
+<h2 class="subtitle">List uploaded files in corpora</h2>
+
+<div id="app">
+  <Files-List />
+</div>
+{% endblock %}
diff --git a/arkindex/templates/dataimport/list.html b/arkindex/templates/dataimport/list.html
new file mode 100644
index 0000000000000000000000000000000000000000..5acbdfc0ab7b4badeb6f0b22cabadeec9268cf4c
--- /dev/null
+++ b/arkindex/templates/dataimport/list.html
@@ -0,0 +1,10 @@
+{% extends 'base.html' %}
+
+{% block content %}
+<h1 class="title">Data imports</h1>
+<h2 class="subtitle">List all data importation workflows</h2>
+
+<div id="app">
+  <Imports-List />
+</div>
+{% endblock %}
diff --git a/arkindex/templates/dataimport/new.html b/arkindex/templates/dataimport/new.html
new file mode 100644
index 0000000000000000000000000000000000000000..481098d2e8589f62d36a8bcad74f2f6ad96e48d1
--- /dev/null
+++ b/arkindex/templates/dataimport/new.html
@@ -0,0 +1,10 @@
+{% extends 'base.html' %}
+
+{% block content %}
+<h1 class="title">New workflow</h1>
+<h2 class="subtitle">Create an importation workflow on a corpus</h2>
+
+<div id="app">
+  <Import-Create />
+</div>
+{% endblock %}
diff --git a/arkindex/templates/dataimport/status.html b/arkindex/templates/dataimport/status.html
new file mode 100644
index 0000000000000000000000000000000000000000..ae580348f2e742a309fd4798ae58b40f06c4624e
--- /dev/null
+++ b/arkindex/templates/dataimport/status.html
@@ -0,0 +1,10 @@
+{% extends 'base.html' %}
+
+{% block content %}
+<h1 class="title">Workflow status</h1>
+<h2 class="subtitle">Monitor a workflow's tasks</h2>
+
+<div id="app">
+  <Import-Status id="{{ dataimport.id }}" />
+</div>
+{% endblock %}
diff --git a/requirements.txt b/requirements.txt
index c7491ff1a04f361459b3925b7d08849cfb447bca..b28bcfc6927e30a3202e79512259a73cf7d19392 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-celery==4.1.1
+celery==4.2.0
 celery_once==2.0.0
 certifi==2017.7.27.1
 chardet==3.0.4
@@ -15,6 +15,7 @@ olefile==0.44
 openpyxl==2.4.9
 Pillow==4.3.0
 psycopg2==2.7.3.2
+python-magic==0.4.15
 python-memcached==1.59
 pytz==2017.2
 redis==2.10.6
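Putting the new endpoints together, a hedged sketch of the whole flow this change enables: upload a file, create an images import, configure it, then start it. Host, cookies and IDs are placeholders; state handling follows `DataImportDetails` above (send the current state while configuring, then flip it to running).

```python
# Sketch of the end-to-end import flow over the new API; placeholder host,
# cookies and IDs throughout.
import requests

API = 'http://localhost:8000/api/v1'
session = requests.Session()
session.cookies.set('sessionid', '<session-cookie>')
session.cookies.set('csrftoken', '<csrf-token>')
session.headers['X-CSRFToken'] = '<csrf-token>'

corpus_id = '<corpus-uuid>'

# 1. Upload a file to the corpus
with open('page0001.jpg', 'rb') as f:
    datafile = session.post(
        '{}/imports/upload/{}'.format(API, corpus_id), files={'file': f}).json()

# 2. Create an images import
imp = session.post(
    '{}/imports/'.format(API), json={'mode': 'images', 'corpus': corpus_id}).json()

# 3. Configure it; the server flips Created to Configured once a payload is set
session.patch('{}/imports/{}'.format(API, imp['id']), json={
    'state': 'created',  # unchanged here; state transitions are validated server-side
    'payload': {'folder_name': 'my-batch', 'volume_name': 'Volume 1'},
    'files': [datafile['id']],
})

# 4. Start the workflow
session.patch('{}/imports/{}'.format(API, imp['id']), json={'state': 'running'})
```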