diff --git a/arkindex/dataimport/admin.py b/arkindex/dataimport/admin.py index 70978a4399b6a1c0d40da0d3caf27777c405b05d..f4624405c9498a5c6b88b304c91fb895ab49ff00 100644 --- a/arkindex/dataimport/admin.py +++ b/arkindex/dataimport/admin.py @@ -30,7 +30,7 @@ class RevisionInline(admin.StackedInline): class RepositoryAdmin(admin.ModelAdmin): list_display = ('id', 'url', 'corpus') list_filter = ('corpus', ) - fields = ('id', 'url', 'corpus', 'hook_token', 'watched_branches') + fields = ('id', 'url', 'corpus', 'hook_token') readonly_fields = ('id', ) inlines = [RevisionInline, ] diff --git a/arkindex/dataimport/api.py b/arkindex/dataimport/api.py index 587a12fbda62faa63391180de934cd94d8ef417e..92df9250564ed55c8d68108212345a9b11f5a8f8 100644 --- a/arkindex/dataimport/api.py +++ b/arkindex/dataimport/api.py @@ -14,10 +14,11 @@ from rest_framework.exceptions import ValidationError from arkindex.project.mixins import CorpusACLMixin from arkindex.documents.models import Corpus, Right, Element, ElementType from arkindex.dataimport.models import \ - DataImport, DataFile, DataImportState, DataImportMode, DataImportFailure, Repository -from arkindex.dataimport.serializers import \ - DataImportLightSerializer, DataImportSerializer, DataImportFailureSerializer, DataFileSerializer, \ - RepositoryLightSerializer, RepositorySerializer, ExternalRepositorySerializer + DataImport, DataFile, DataImportState, DataImportMode, DataImportFailure, Repository, Event +from arkindex.dataimport.serializers import ( + DataImportLightSerializer, DataImportSerializer, DataImportFailureSerializer, DataFileSerializer, + RepositorySerializer, ExternalRepositorySerializer, EventSerializer +) from arkindex.users.models import OAuthCredentials import hashlib import magic @@ -251,7 +252,7 @@ class GitRepositoryImportHook(APIView): class RepositoryList(ListAPIView): permission_classes = (IsAuthenticated, ) - serializer_class = RepositoryLightSerializer + serializer_class = RepositorySerializer def get_queryset(self): return Repository.objects.filter( @@ -325,3 +326,14 @@ class RepositoryStartImport(RetrieveAPIView): raise ValidationError("An import is already running for the latest revision") return Response(data={'import_id': str(rev.start_import().id)}) + + +class ElementHistory(ListAPIView): + permission_classes = (IsAuthenticated, ) + serializer_class = EventSerializer + + def get_queryset(self): + return Event.objects.filter( + element_id=self.kwargs['pk'], + element__corpus__in=Corpus.objects.readable(self.request.user), + ) diff --git a/arkindex/dataimport/config.py b/arkindex/dataimport/config.py new file mode 100644 index 0000000000000000000000000000000000000000..2b5871c5a453321cc54a81750b0b0d85b69be025 --- /dev/null +++ b/arkindex/dataimport/config.py @@ -0,0 +1,189 @@ +import fnmatch +import yaml +from enum import Enum +from django.core.validators import URLValidator +from django.core.exceptions import ValidationError +from django.utils.functional import cached_property +from arkindex.documents.models import Corpus +from arkindex.images.models import ImageServer + + +class ImportType(Enum): + Volumes = 'volumes' + Transcriptions = 'transcriptions' + Surfaces = 'surfaces' + Acts = 'acts' + Metadata = 'metadata' + + +class VolumesImportFormat(Enum): + IIIF = 'iiif' + TXT = 'txt' + + +class TranscriptionsImportFormat(Enum): + IIIF = 'iiif' + Index = 'index' + GzippedIndex = 'index-gzip' + + +class SurfacesImportFormat(Enum): + XML = 'xml' + + +class ActsImportFormat(Enum): + CSV = 'csv' + + +class 
MetadataImportFormat(Enum): + TEI = 'tei' + + +class ConfigFile(object): + """ + A .arkindex.yml configuration file + """ + + FILE_NAME = '.arkindex.yml' + REQUIRED_ITEMS = ('version', 'branches', 'corpus') + FORMAT_ENUMS = { + ImportType.Volumes: VolumesImportFormat, + ImportType.Transcriptions: TranscriptionsImportFormat, + ImportType.Surfaces: SurfacesImportFormat, + ImportType.Acts: ActsImportFormat, + ImportType.Metadata: MetadataImportFormat, + } + + def __init__(self, data): + self._config = yaml.load(data) + self.validate() + self.setattrs() + + @staticmethod + def from_path(path): + with open(path, 'rb') as f: + return ConfigFile(f) + + def validate(self): + """ + Validate an Arkindex configuration file. Will raise ValidationErrors for validation errors + """ + + # Required first level items + for item in self.REQUIRED_ITEMS: + if item not in self._config: + raise ValidationError("Missing '{}' setting".format(item)) + + # Format version + if self._config['version'] != 1: + raise ValidationError("Unsupported format version '{}'".format(self._config['version'])) + + # Branches list + if any(not isinstance(branch, str) for branch in self._config['branches']): + raise ValidationError("Bad 'branches' format: should be a list of branch names") + + # Corpus info + for item in ('name', 'description'): + if item not in self._config['corpus'].keys(): + raise ValidationError("Missing '{}' parameter in 'corpus'".format(item)) + + # At least one import type required + if not any(it.value in self._config for it in ImportType): + raise ValidationError("No import types were specified") + + # Required 'paths' for each type + for it in ImportType: + if item not in self._config: + continue + if 'paths' not in self._config[it.value]: + raise ValidationError("Missing 'paths' parameter in '{}'".format(it.value)) + if not all(isinstance(path, str) for path in self._config[it.value]['paths']): + raise ValidationError("Bad 'paths' format in '{}': should be a list of patterns".format(it.value)) + + # Import file formats + for import_type, format_enum in self.FORMAT_ENUMS.items(): + if import_type.value not in self._config: + continue + if 'format' not in self._config[import_type.value]: + continue + try: + format_enum(self._config[import_type.value]['format']) + except KeyError: + raise ValidationError("Format setting in '{}' should be one of '{}'".format( + import_type.value, "', '".join(fmt.value for fmt in format_enum))) + + # Manifest import-specific validation + if ImportType.Volumes.value in self._config: + volumes = self._config[ImportType.Volumes.value] + + if 'image_servers' in volumes: + if len(volumes['image_servers'].values()) != len(set(volumes['image_servers'].values())): + raise ValueError("Duplicate server URLs in 'volumes.image_servers'") + + validate_url = URLValidator( + schemes=['http', 'https'], + message="Invalid IIIF server URL in '{}.image_servers'".format(ImportType.Volumes.value), + ) + for name, url in volumes['image_servers'].items(): + validate_url(url) + + def setattrs(self): + """ + Set attributes on this class from a valid configration + """ + self.version = self._config['version'] + self.branches = self._config['branches'] + self.imports = list(filter(lambda it: it.value in self._config, ImportType)) + + # Default formats + self.volumes_format = VolumesImportFormat.IIIF + self.transcriptions_format = TranscriptionsImportFormat.GzippedIndex + self.surfaces_format = SurfacesImportFormat.XML + self.acts_format = ActsImportFormat.CSV + self.metadata_format = 
MetadataImportFormat.TEI + + for import_type, format_enum in self.FORMAT_ENUMS.items(): + if import_type.value not in self._config or 'format' not in self._config[import_type.value]: + continue + setattr( + self, + '{}_format'.format(import_type.value), + format_enum(self._config[import_type.value]['format']), + ) + + if ImportType.Volumes in self.imports: + self.volumes_lazy_checks = self._config[ImportType.Volumes.value].get('lazy_checks', False) + self.volumes_autoconvert_https = self._config[ImportType.Volumes.value].get('convert_https', False) + + def path_match(self, import_type, path): + """ + Check a given path matches any paths configured for a given import type. + """ + if import_type not in self.imports: + return False + return any(fnmatch.fnmatch(path, pattern) for pattern in self._config[import_type.value]['paths']) + + @cached_property + def corpus(self): + c, _ = Corpus.objects.get_or_create(name=self._config['corpus']['name'], defaults={'public': False}) + if 'description' in self._config['corpus']: + c.description = self._config['corpus']['description'] + if 'public' in self._config['corpus']: + c.public = bool(self._config['corpus']['public']) + c.save() + return c + + @cached_property + def volumes_image_servers(self): + if ImportType.Volumes not in self.imports: + return [] + + servers_config = self._config[ImportType.Volumes.value].get('image_servers') + if not servers_config: + return list(ImageServer.objects.all()) + + servers = [] + for name, url in servers_config.items(): + s, _ = ImageServer.objects.get_or_create(url=url, defaults={'name': name}) + servers.append(s) + return servers diff --git a/arkindex/dataimport/filetypes.py b/arkindex/dataimport/filetypes.py new file mode 100644 index 0000000000000000000000000000000000000000..b7af7bd234e1430c8584cd3e8ea2f51733a793e4 --- /dev/null +++ b/arkindex/dataimport/filetypes.py @@ -0,0 +1,221 @@ +import os +import logging +from abc import ABC, abstractmethod +from arkindex.dataimport.config import ConfigFile, ImportType, VolumesImportFormat, TranscriptionsImportFormat +from arkindex.dataimport.iiif import ManifestParser + + +logger = logging.getLogger(__name__) + + +class FileType(ABC): + """ + A file type that can be handled by Git import workflows + """ + + @staticmethod + def identify(path, config): + """ + Return a FileType class corresponding to a given file path, or None if it is unknown. + """ + return next((ft for ft in file_types if ft.match(path, config)), None) + + @classmethod + @abstractmethod + def match(cls, path, config): + """ + Returns True if the specified path matches this file type. + """ + + @classmethod + @abstractmethod + def handle(cls, dataimport, change_type, old_path, new_path, config): + """ + Handle a Git diff on a single file. Should run synchronously. 
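+ dataimport is the running DataImport; change_type, old_path and new_path come from a single Git diff entry; config is the repository's parsed ConfigFile.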
+ """ + + +class ManifestFileType(FileType): + """ + IIIF manifests describing volumes and pages, with the volume name as their file name + """ + + @classmethod + def match(cls, path, config): + assert isinstance(config, ConfigFile) + return config.path_match(ImportType.Volumes, path) and \ + config.volumes_format == VolumesImportFormat.IIIF and \ + path.endswith('.json') + + @classmethod + def handle(cls, dataimport, change_type, old_path, new_path, config): + assert isinstance(config, ConfigFile) + + # TODO: Handle a pure deletion diff + cls.run(dataimport, os.path.join(dataimport.revision.repo.clone_dir, new_path), config) + + @classmethod + def run(cls, dataimport, path, config): + ManifestParser( + path, + dataimport.revision, + config.corpus, + lazy=config.volumes_lazy_checks, + servers=config.volumes_image_servers, + autocreate_servers=not config.volumes_image_servers, # Autocreate if there are no listed servers + autoconvert_https=config.volumes_autoconvert_https, + ).run() + + +class ManifestListFileType(FileType): + """ + Lists of IIIF manifest URLs in text files + """ + + @classmethod + def match(cls, path, config): + assert isinstance(config, ConfigFile) + return config.path_match(ImportType.Volumes, path) and \ + config.volumes_format == VolumesImportFormat.TXT and \ + path.endswith('.txt') + + @classmethod + def handle(cls, dataimport, change_type, old_path, new_path, config): + assert isinstance(config, ConfigFile) + + # TODO: Handle a pure deletion diff + # TODO: Maybe look at the actual Git diff to see which manifests in the list were modified? + with open(os.path.join(dataimport.revision.repo.clone_dir, new_path)) as f: + paths = [line.strip() for line in f.read().splitlines()] + + for path in paths: + logger.info('Parsing manifest {}'.format(path)) + ManifestFileType.run(dataimport, path, config) + + +class GzippedIndexFileType(FileType): + """ + Index files, organized in folders corresponding to volumes and compressed with gzip, + with a part of the image path as their file name + """ + + @classmethod + def match(cls, path, config): + assert isinstance(config, ConfigFile) + return config.path_match(ImportType.Transcriptions, path) and \ + config.transcriptions_format == TranscriptionsImportFormat.GzippedIndex and \ + path.endswith('.idx.gz') + + @classmethod + def handle(cls, dataimport, change_type, old_path, new_path, config): + logger.warning('Transcription imports are not yet supported') + + +class IndexFileType(FileType): + """ + Index files, organized in folders corresponding to volumes, + with a part of the image path as their file name + """ + + @classmethod + def match(cls, path, config): + assert isinstance(config, ConfigFile) + return config.path_match(ImportType.Transcriptions, path) and \ + config.transcriptions_format == TranscriptionsImportFormat.Index and \ + path.endswith('.idx') + + @classmethod + def handle(cls, dataimport, change_type, old_path, new_path, config): + logger.warning('Transcription imports are not yet supported') + + +class AnnotationListFileType(FileType): + """ + IIIF annotation lists, organized in folders corresponding to volumes, + with a part of the image path as their file name + """ + + @classmethod + def match(cls, path, config): + assert isinstance(config, ConfigFile) + return config.path_match(ImportType.Transcriptions, path) and \ + config.transcriptions_format == TranscriptionsImportFormat.IIIF and \ + path.endswith('.json') + + @classmethod + def handle(cls, dataimport, change_type, old_path, new_path, config): + 
logger.warning('Transcription imports are not yet supported') + + +class ActsListFileType(FileType): + """ + CSV files with a volume name as their filename and two columns: [act number, first folio] + """ + + @classmethod + def match(cls, path, config): + assert isinstance(config, ConfigFile) + return config.path_match(ImportType.Acts, path) and path.endswith('.csv') + + @classmethod + def handle(cls, dataimport, change_type, old_path, new_path, config): + logger.warning('Acts imports are not yet supported') + + +class SurfaceFileType(FileType): + """ + XML surface files + """ + + @classmethod + def match(cls, path, config): + assert isinstance(config, ConfigFile) + return config.path_match(ImportType.Surfaces, path) and path.endswith('.xml') + + @classmethod + def handle(cls, dataimport, change_type, old_path, new_path, config): + logger.warning('Surface imports are not yet supported') + + +class MetadataFileType(FileType): + """ + TEI-XML files + """ + + @classmethod + def match(cls, path, config): + assert isinstance(config, ConfigFile) + return config.path_match(ImportType.Metadata, path) and path.endswith('.xml') + + @classmethod + def handle(cls, dataimport, change_type, old_path, new_path, config): + logger.warning('Metadata imports are not yet supported') + + +class ConfigFileType(FileType): + """ + An Arkindex repository configuration file + """ + + @classmethod + def match(cls, path, config): + return os.path.basename(path) == ConfigFile.FILE_NAME + + @classmethod + def handle(cls, dataimport, change_type, old_path, new_path, config): + # Do nothing (config files are already handled in every other import) + return + + +# Registered file types for the FileType.identify static method +file_types = [ + ManifestFileType, + ManifestListFileType, + GzippedIndexFileType, + IndexFileType, + AnnotationListFileType, + ActsListFileType, + SurfaceFileType, + MetadataFileType, + ConfigFileType, +] diff --git a/arkindex/dataimport/iiif.py b/arkindex/dataimport/iiif.py new file mode 100644 index 0000000000000000000000000000000000000000..940e8afb4c859b03824be4f0367c124c20c3014a --- /dev/null +++ b/arkindex/dataimport/iiif.py @@ -0,0 +1,387 @@ +import os +import ijson +import logging +import requests +import urllib.parse +from io import BytesIO +from django.core.validators import URLValidator +from django.core.exceptions import ValidationError +from django.db import transaction +from arkindex.documents.models import Corpus, Element, ElementType, Page, MetaData, MetaType +from arkindex.documents.importer import parse_folio +from arkindex.images.models import ImageServer, Zone +from arkindex.dataimport.models import Revision, EventType +from arkindex.project.polygon import Polygon +from arkindex.project.tools import random_string + + +logger = logging.getLogger(__name__) + + +class ManifestParser(object): + """ + A class that parses a single IIIF manifest into a volume, a register, pages and images. + """ + + def __init__(self, path, revision, corpus, + lazy=False, servers=[], + autocreate_servers=False, autoconvert_https=False, volume_name=None): + """ + path: Path or URL to the IIIF manifest to parse. + revision: A Revision instance that should be used for change events on elements + corpus: An instance of Corpus which will hold all the elements. 
+ lazy: Boolean indicating whether the parser should not perform checks on image existence + servers: An optional list of servers to restrict image server search + autocreate_servers: If the ImageServer for a given image URL cannot be found, + will try to autodetect the server's URL and create new image servers + autoconvert_https: If the ImageServer for a given image URL cannot be found and the URL uses HTTP, + will check the HTTPS version exists and try to match a server. + volume_name: Set to a non-empty str to override the default volume name. + """ + self.path = path + assert isinstance(revision, Revision) + self.revision = revision + assert isinstance(corpus, Corpus) + self.corpus = corpus + assert isinstance(lazy, bool) + self.lazy = lazy + if servers: + assert isinstance(servers, list) + self.servers = servers + else: + self.servers = list(ImageServer.objects.all()) + assert isinstance(autoconvert_https, bool) + self.autoconvert_https = autoconvert_https + assert isinstance(autocreate_servers, bool) + self.autocreate_servers = autocreate_servers + self.stream = None + + # ImageServer instances that may get created on autocreate + self.created_servers = [] + + self.volume_changed, self.register_changed = False, False + self.volume_name = None + if volume_name: + assert isinstance(volume_name, str) + self.volume_name = volume_name + + def _get_or_instance(self, model, defaults={}, **filters): + """ + Like model.objects.get_or_create(), + except it creates a Python instance that is not saved into DB. + """ + try: + return model.objects.get(**filters), False + except model.DoesNotExist: + filters.update(defaults) + kwargs = { # Filter to remove Django lookups + k: v for k, v in filters.items() + if '__' not in k + } + return model(**kwargs), True + + def _first_or_instance(self, model, defaults={}, **filters): + """ + Like model.objects.get_or_create(), + except it does not fail if there are multiple items (using filter().first()), + and it creates a Python instance that is not saved into DB. + """ + result = model.objects.filter(**filters).first() + if result: + return result, False + filters.update(defaults) + kwargs = { # Filter to remove Django lookups + k: v for k, v in filters.items() + if '__' not in k + } + return model(**kwargs), True + + def open(self): + """ + Open a stream for the given manifest and save the volume name. + """ + try: + URLValidator(schemes=['http', 'https'])(self.path) + resp = requests.get(self.path) + resp.raise_for_status() + # Cannot use stream=True here: the parser uses the seek(int) method which is unsupported + self.stream = BytesIO(resp.content) + name, ext = os.path.splitext(os.path.basename(urllib.parse.urlparse(self.path).path)) + except ValidationError: + assert os.path.isfile(self.path), "File does not exist" + name, ext = os.path.splitext(os.path.basename(self.path)) + self.stream = open(self.path, 'rb') + assert ext == '.json', "File does not have a JSON extension" + self.volume_name = self.volume_name or name + + def close(self): + """ + Close the stream if it is opened. + """ + if self.stream and not self.stream.closed: + self.stream.close() + + def check_manifest_type(self): + """ + Check the file is an actual IIIF manifest. 
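+ Raises ValueError if the @type property is missing or is not sc:Manifest.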
+ """ + jsonld_type = next(( + value + for prefix, event, value in ijson.parse(self.stream) + if (prefix, event) == ('@type', 'string') + ), None) + + if jsonld_type is None: + raise ValueError("Missing @type property in JSON data") + + if jsonld_type != 'sc:Manifest': + raise ValueError("JSON file is not a IIIF manifest") + + self.stream.seek(0) + + def make_parents(self): + self.volume, new_volume = self._get_or_instance( + Element, type=ElementType.Volume, name=self.volume_name, corpus=self.corpus) + self.register, new_register = self._get_or_instance( + Element, type=ElementType.Register, name=self.volume_name, corpus=self.corpus) + self.is_new = new_volume and new_register + + def find_image_server(self, image_url): + try: # Look in the specified servers + return next(server for server in self.servers if image_url.startswith(server.url)) + except StopIteration: + pass + + splat = urllib.parse.urlsplit(image_url) + if self.autoconvert_https and splat.scheme == 'http': # Try again but with HTTPS + splat.scheme = 'https' + new_url = urllib.parse.urlunparse(splat) + logger.info("Trying '{}' instead of '{}'".format(new_url, image_url)) + try: + # Look for a server before checking the server exists; + # no need to wait for a HTTP request if there is no associated server + serv = next(server for server in self.servers if new_url.startswith(server.url)) + requests.head(new_url, timeout=5).raise_for_status() + return serv + except (requests.exceptions.RequestException, StopIteration): + pass + + if not self.autocreate_servers: + return + + logger.warning("No known image server for image {} - attempting autodetection".format(image_url)) + + if splat.path.startswith('/iiif'): + server_url = "{0}://{1}/iiif".format(splat.scheme, splat.netloc) + server_name = '_'.join(self.corpus.name, random_string(5)) + new_server = ImageServer(name=server_name, url=server_url) + self.created_servers.append(new_server) + + logger.info("Created IIIF image server '{0}' with URL '{1}'".format(server_name, server_url)) + return new_server + + def parse_metadata(self): + """ + Parse a manifest's metadata property into MetaData elements linked to the volume. 
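+ Marks the volume as changed when a metadata entry is created or its value differs from the manifest.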
+ """ + logger.info("Parsing metadata") + self.metadata = [] + + for item in ijson.items(self.stream, 'metadata.item'): + if not all(prop in item for prop in ('label', 'value')): + logger.warning('Metadata does not have the required label and value properties') + continue + + md, created = self._get_or_instance( + MetaData, + element=self.volume, + type=MetaType.Text, + name=item['label'], + defaults={ + 'revision': self.revision, + 'value': item['value'], + } + ) + # Set volume as changed whenever any metadata is created + self.volume_changed = self.volume_changed or created + # Only update existing metadata revisions when values get updated + if not created and md.value != item['value']: + # Set volume as changed whenever any metadata is updated + self.volume_changed = True + md.value = item['value'] + md.revision = self.revision + self.metadata.append(md) + + self.stream.seek(0) + + def parse_canvases(self): + """ + Parse all canvases in all sequences of the manifest into Page + """ + logger.info("Parsing canvases") + self.change_count, self.pages, self.images = 0, [], [] + for canvas in ijson.items(self.stream, 'sequences.item.canvases.item'): + # Label contains the folio + folio = canvas.get('label') + if folio is None: + logger.warning( + "Found an image canvas with ID {}, but no folio (label) was specified".format(canvas.get('@id'))) + continue + + if 'images' not in canvas or len(canvas['images']) < 1: + logger.warning("Canvas {} has no image".format(canvas.get('@id'))) + continue + + # Get the image resource + resource = canvas['images'][0].get('resource') + if resource is None: + logger.warning("Canvas {} has no image resource".format(canvas.get('@id'))) + continue + + # Go find the service ID to get the image URL + if 'service' not in resource: + logger.warning("Found an image resource with ID {} on canvas {}, but no service was specified".format( + resource.get('@id'), canvas.get('@id'))) + continue + if '@id' not in resource['service']: + logger.warning( + "Found an image service on canvas {}, but no service ID was specified".format(canvas.get('@id'))) + continue + service_id = resource['service']['@id'] + + # Find the right server + image_server = self.find_image_server(service_id) + if image_server is None: + logger.warning("No image server found for image {}".format(service_id)) + continue + + # Strip server URL to get just the path + image_path = service_id[len(image_server.url):].lstrip('/') + image = image_server.find_image( + image_path, + offline=self.lazy, + width=canvas['width'], + height=canvas['height'], + ) + self.images.append(image) + + page_name = "Page {0} du volume {1}".format(folio, self.volume.name) + page_type, page_nb, page_direction, page_complement = parse_folio(folio) + + poly = Polygon.from_coords(0, 0, canvas['width'], canvas['height']) + zone, created = self._first_or_instance( + Zone, + image=image, + polygon=poly, + elements__type=ElementType.Page, + elements__corpus=self.corpus, + ) + + new_page = Page( + zone=zone, + corpus_id=self.corpus.id, + name=page_name, + folio=folio, + page_type=page_type, + nb=page_nb, + direction=page_direction, + complement=page_complement, + ) + + if created: + self.change_count += 1 + self.pages.append((new_page, EventType.Addition)) + else: + # Get a page in that zone + old_page = zone.elements.filter(type=ElementType.Page, corpus_id=self.corpus.id).first().page + if old_page.same_as(new_page): + self.pages.append((old_page, None)) + else: + new_page.id = old_page.id + new_page.created = old_page.created + 
self.change_count += 1 + self.pages.append((new_page, EventType.Edit)) + + logger.debug('Parsed page {}'.format(folio)) + + def parse(self): + """ + Run the full parsing process. + """ + self.open() + self.check_manifest_type() + self.make_parents() + self.parse_metadata() + self.parse_canvases() + + @transaction.atomic + def save(self): + logger.info('Saving volume and register...') + # There is no point of comparison between volumes and registers if their names changed; + # we can only recreate them. + self.volume.save() + self.register.save() + self.register.add_parent(self.volume) + + # Volume metadata can on the other hand get updated or deleted + logger.info('Saving metadata...') + for md in self.metadata: + md.save() + deleted_metadatas = self.volume.metadatas.exclude(id__in=[md.id for md in self.metadata]) + if deleted_metadatas.exists(): + self.volume_changed = True + deleted_metadatas.delete() + + if self.created_servers: + logger.info('Saving new servers...') + ImageServer.objects.bulk_create(self.created_servers) + + if self.images: + logger.info('Saving images...') + for image in self.images: + image.save() + + if self.pages: + logger.info('Saving changed pages...') + for i, (page, event_type) in enumerate(self.pages): + if event_type: # Addition or edit + # Set volume and register as changed since a page changed + self.volume_changed, self.register_changed = True, True + page.zone.save() + page.save() + page.events.create( + revision=self.revision, + type=event_type, + ) + # Create path or update ordering + page.add_parent(self.register, order=i) + + if not self.is_new: # Deleted pages cannot happen on a new volume + deleted_pages = Page.objects.get_descending(self.volume.id).exclude(id__in=[p.id for p, _ in self.pages]) + if deleted_pages.exists(): + # Set volume and register as changed since one or more pages got deleted + self.volume_changed, self.register_changed = True, True + logger.info('Removing {} deleted pages...'.format(len(deleted_pages))) + deleted_pages.delete() + + if self.is_new or self.volume_changed: + self.volume.events.create( + revision=self.revision, + type=EventType.Addition if self.is_new else EventType.Edit, + ) + if self.is_new or self.register_changed: + self.register.events.create( + revision=self.revision, + type=EventType.Addition if self.is_new else EventType.Edit, + ) + + def run(self): + try: + self.parse() + logger.info("Parsed volume {}: {} metadata in volume, {} pages ({} changed), {} new servers".format( + self.volume.name, + len(self.metadata), len(self.pages), self.change_count, len(self.created_servers) + )) + self.save() + finally: + self.close() diff --git a/arkindex/dataimport/management/commands/import_repo.py b/arkindex/dataimport/management/commands/import_repo.py new file mode 100644 index 0000000000000000000000000000000000000000..4d3f343c15152874f6ac4c0874b0845c94544327 --- /dev/null +++ b/arkindex/dataimport/management/commands/import_repo.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 +from django.core.management.base import BaseCommand, CommandError +from arkindex.dataimport.models import DataImport, DataImportMode, DataImportState, Repository +from arkindex.dataimport.tasks import clone_repo, diff_repo, cleanup_repo +from pprint import pprint + + +class Command(BaseCommand): + help = 'Detect changes in a repository' + + def add_arguments(self, parser): + parser.add_argument( + 'repository', + help='ID of the repository to check on', + ) + parser.add_argument( + '--hash', + help='Hash of a revision in the repository to check on. 
Defaults to latest commit on master', + default=None, + ) + parser.add_argument( + '--sync', + help='Run synchronously and show the diff, but do not actually import', + action='store_true', + default=False, + ) + + def handle(self, *args, **options): + try: + repo = Repository.objects.get(id=options['repository']) + except Repository.DoesNotExist: + raise CommandError('Repository {} not found'.format(options['repository'])) + + if repo.provider_class is None: + raise ValueError("No repository provider found for {}".format(repo.url)) + + if 'hash' in options and options['hash'] is not None: + rev, created = repo.provider.get_or_create_revision(repo, options['hash']) + else: + rev, created = repo.provider.get_or_create_latest_revision(repo) + + if created: + print('Created revision {} "{}" on repository {}'.format(rev.hash, rev.message, repo.url)) + + di = DataImport.objects.create( + creator=repo.credentials.user, + corpus=repo.corpus, + mode=DataImportMode.Repository, + state=DataImportState.Configured, + revision=rev, + task_count=3, + ) + + if options['sync']: + # Run synchronously and print results + # For now, sync run means no progress, no messages, no logging, nothing. + di.state = DataImportState.Running + di.save() + print('Cloning repo...') + clone_repo(di) + print('Computing diff...') + pprint(diff_repo(di)) + print('Cleaning up...') + cleanup_repo(di) + di.state = DataImportState.Done + di.save() + else: + di.start() diff --git a/arkindex/dataimport/migrations/0002_repository_revision.py b/arkindex/dataimport/migrations/0002_repository_revision.py index e97c33f85bb45d43abf2e99cc6f2d6674cb886f9..9f14677c0680653b374405728b15419a9445d028 100644 --- a/arkindex/dataimport/migrations/0002_repository_revision.py +++ b/arkindex/dataimport/migrations/0002_repository_revision.py @@ -82,7 +82,8 @@ class Migration(migrations.Migration): name='watched_branches', field=arkindex.project.fields.ArrayField( base_field=models.CharField(max_length=50), - default=arkindex.dataimport.models.repository_default_branches, + # arkindex.dataimport.models.repository_default_branches has been removed + default=lambda: ['refs/heads/master'], size=None, ), ), diff --git a/arkindex/dataimport/migrations/0008_events.py b/arkindex/dataimport/migrations/0008_events.py new file mode 100644 index 0000000000000000000000000000000000000000..83d0746242876c6da45295efa18a3f2df153f918 --- /dev/null +++ b/arkindex/dataimport/migrations/0008_events.py @@ -0,0 +1,66 @@ +# Generated by Django 2.1 on 2018-10-02 08:33 + +import arkindex.dataimport.models +from django.db import migrations, models +import django.db.models.deletion +import django.utils.timezone +import enumfields.fields +import uuid + + +class Migration(migrations.Migration): + + dependencies = [ + ('documents', '0025_avoid_doublons'), + ('dataimport', '0007_datafile_image'), + ] + + operations = [ + migrations.CreateModel( + name='Event', + fields=[ + ('id', models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False)), + ('type', enumfields.fields.EnumField(enum=arkindex.dataimport.models.EventType, max_length=10)), + ('element', models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name='events', + to='documents.Element', + )), + ('revision', models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name='events', + to='dataimport.Revision', + )), + ('created', models.DateTimeField(auto_now_add=True)), + ('updated', models.DateTimeField(auto_now=True)), + ], + ), + migrations.AddField( + model_name='revision', 
+ name='elements', + field=models.ManyToManyField(related_name='revisions', through='dataimport.Event', to='documents.Element'), + ), + migrations.AlterUniqueTogether( + name='event', + unique_together={('element', 'revision')}, + ), + migrations.AlterModelOptions( + name='event', + options={'ordering': ['element_id', 'created']}, + ), + migrations.AddField( + model_name='revision', + name='created', + field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now), + preserve_default=False, + ), + migrations.AddField( + model_name='revision', + name='updated', + field=models.DateTimeField(auto_now=True), + ), + migrations.RemoveField( + model_name='repository', + name='watched_branches', + ), + ] diff --git a/arkindex/dataimport/models.py b/arkindex/dataimport/models.py index 91af5dccbaa819b299d4ba38b595e18c62194071..b13d3bc8d1c84fe6ab92ffc114c2b923e60b8e99 100644 --- a/arkindex/dataimport/models.py +++ b/arkindex/dataimport/models.py @@ -9,7 +9,6 @@ from enumfields import EnumField, Enum from arkindex.project.celery import app as celery_app from arkindex.dataimport.providers import git_providers, get_provider from arkindex.project.models import IndexableModel -from arkindex.project.fields import ArrayField import uuid import os import re @@ -81,8 +80,8 @@ class DataImport(IndexableModel): return workflow elif self.mode == DataImportMode.Repository: - from arkindex.dataimport.tasks import download_repo, import_repo, cleanup_repo - return download_repo.si(self) | import_repo.si(self) | cleanup_repo.si(self) + from arkindex.dataimport.tasks import clone_repo, diff_repo, dispatch_imports, cleanup_repo + return clone_repo.si(self) | diff_repo.si(self) | dispatch_imports.s(self) | cleanup_repo.si(self) else: raise NotImplementedError @@ -179,15 +178,6 @@ class DataFile(models.Model): return os.path.join(settings.MEDIA_ROOT, str(self.id)) -def repository_default_branches(): - ''' - This is needed to avoid re-using the same list instance - as Repository.watched_branches default on new instances - See Django warning postgres.E003 - ''' - return ['refs/heads/master'] - - class Repository(models.Model): id = models.UUIDField(primary_key=True, default=uuid.uuid4) url = models.URLField(unique=True) @@ -195,7 +185,6 @@ class Repository(models.Model): corpus = models.ForeignKey('documents.Corpus', on_delete=models.CASCADE, related_name='repos') credentials = models.ForeignKey( 'users.OAuthCredentials', on_delete=models.CASCADE, related_name='repos', blank=True, null=True) - watched_branches = ArrayField(models.CharField(max_length=50), default=repository_default_branches) provider_name = models.CharField( max_length=50, choices=[(p.__name__, p.display_name) for p in git_providers], @@ -217,13 +206,14 @@ class Repository(models.Model): return os.path.join(settings.CELERY_WORKING_DIR, str(self.id)) -class Revision(models.Model): +class Revision(IndexableModel): id = models.UUIDField(primary_key=True, default=uuid.uuid4) repo = models.ForeignKey('dataimport.Repository', on_delete=models.CASCADE, related_name='revisions') hash = models.CharField(max_length=50) ref = models.CharField(max_length=50) message = models.TextField() author = models.CharField(max_length=50) + elements = models.ManyToManyField('documents.Element', through='dataimport.Event', related_name='revisions') class Meta: unique_together = (('repo', 'hash'), ) @@ -241,3 +231,25 @@ class Revision(models.Model): ) dataimport.start() return dataimport + + def __str__(self): + return '{} "{}" by {}'.format(self.hash[:8], 
self.message.splitlines()[0], self.author) + + +class EventType(Enum): + Addition = 'A' + Edit = 'M' + Deletion = 'D' + + +class Event(models.Model): + id = models.UUIDField(primary_key=True, default=uuid.uuid4) + created = models.DateTimeField(auto_now_add=True) + updated = models.DateTimeField(auto_now=True) + element = models.ForeignKey('documents.Element', on_delete=models.CASCADE, related_name='events') + revision = models.ForeignKey('dataimport.Revision', on_delete=models.CASCADE, related_name='events') + type = EnumField(EventType, max_length=10) + + class Meta: + unique_together = (('element', 'revision'), ) + ordering = ['element_id', 'created'] diff --git a/arkindex/dataimport/providers.py b/arkindex/dataimport/providers.py index a73ce6c2a6be83628a18de7efb3e3cb3c77a1cd1..12457a72747c96819c29b65700d2a911a9530656 100644 --- a/arkindex/dataimport/providers.py +++ b/arkindex/dataimport/providers.py @@ -3,9 +3,11 @@ from django.urls import reverse from rest_framework.exceptions import NotAuthenticated, AuthenticationFailed, APIException, ValidationError from gitlab import Gitlab, GitlabGetError, GitlabCreateError from arkindex.documents.models import Corpus +from arkindex.dataimport.config import ConfigFile import urllib.parse import base64 import uuid +import git class GitProvider(ABC): @@ -33,16 +35,44 @@ class GitProvider(ABC): Create a Repository instance from an external repository """ + @abstractmethod + def clone_repo(self, repo, dest_dir, **kwargs): + """ + Get a git.Repo instance for a repository cloned in a given destination directory. + """ + @abstractmethod def download_archive(self, revision, path): """ Download an archive for a given Revision instance. """ + def get_or_create_revision(self, repo, sha): + from arkindex.dataimport.models import Revision + try: + return self.get_revision(repo, sha), False + except Revision.DoesNotExist: + return self.create_revision(repo, sha), True + + def get_revision(self, repo, sha): + return repo.revisions.get(hash=sha) + + @abstractmethod + def create_revision(self, repo, sha): + """ + Create a Revision instance for a given commit hash of a given repository. + """ + @abstractmethod def get_or_create_latest_revision(self, repo): """ - Get a Revision instance for the last revision on the main branch of a given repository. + Get or create a Revision instance for the last revision on the main branch of a given repository. + """ + + @abstractmethod + def get_file_content(self, repo, path, ref="master"): + """ + Get the contents of a given file on a given repository. 
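+ In this workflow it is used to read .arkindex.yml at a given ref without cloning the repository (see handle_webhook below).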
""" @abstractmethod @@ -57,6 +87,12 @@ class GitLabProvider(GitProvider): display_name = "GitLab" url = 'https://gitlab.com' + def _try_get_project(self, gl, id): + try: + return gl.projects.get(id) + except GitlabGetError as e: + raise APIException("Error while fetching GitLab project: {}".format(str(e))) + def list_repos(self, query=None): if not self.credentials: raise NotAuthenticated() @@ -68,10 +104,7 @@ class GitLabProvider(GitProvider): if not self.credentials and request: raise NotAuthenticated() gl = Gitlab(self.url, oauth_token=self.credentials.token) - try: - project = gl.projects.get(int(id)) - except GitlabGetError as e: - raise APIException("Error while fetching GitLab project: {}".format(str(e))) + project = self._try_get_project(gl, int(id)) from arkindex.dataimport.models import Repository if Repository.objects.filter(url=project.web_url).exists(): @@ -80,7 +113,6 @@ class GitLabProvider(GitProvider): repo = self.credentials.repos.create( corpus=corpus, url=project.web_url, - watched_branches=['refs/heads/{}'.format(project.default_branch)], hook_token=str(base64.b64encode(uuid.uuid4().bytes)), provider_name=self.__class__.__name__, ) @@ -98,26 +130,41 @@ class GitLabProvider(GitProvider): return repo + def clone_repo(self, repo, dest_dir, **kwargs): + parsed = list(urllib.parse.urlsplit(repo.url)) + + # Clone over HTTPS using an OAuth token is an undocumented feature supported since GitLab 8.12 + # https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/10677 + parsed[1] = 'oauth2:{}@{}'.format(repo.credentials.token, parsed[1]) + + return git.Repo.clone_from(urllib.parse.urlunsplit(parsed), dest_dir, **kwargs) + def download_archive(self, revision, path): gl = Gitlab(self.url, oauth_token=revision.repo.credentials.token) - try: - project = gl.projects.get(urllib.parse.urlsplit(revision.repo.url).path.strip('/')) - except GitlabGetError as e: - raise APIException("Error while fetching GitLab project: {}".format(str(e))) + project = self._try_get_project(gl, urllib.parse.urlsplit(revision.repo.url).path.strip('/')) with open(path, 'wb') as f: project.repository_archive(sha=revision.hash, streamed=True, action=f.write) + def create_revision(self, repo, sha): + gl = Gitlab(self.url, oauth_token=repo.credentials.token) + project = self._try_get_project(gl, urllib.parse.urlsplit(repo.url).path.strip('/')) + + commit = project.commits.get(sha) + + return repo.revisions.create( + hash=sha, + ref=commit.refs()[0]['name'], + message=commit.message, + author=commit.author_name, + ) + def get_or_create_latest_revision(self, repo): gl = Gitlab(self.url, oauth_token=repo.credentials.token) - try: - project = gl.projects.get(urllib.parse.urlsplit(repo.url).path.strip('/')) - except GitlabGetError as e: - raise APIException("Error while fetching GitLab project: {}".format(str(e))) + project = self._try_get_project(gl, urllib.parse.urlsplit(repo.url).path.strip('/')) latest_commit = project.commits.list()[0] return repo.revisions.get_or_create( - repo=repo, hash=latest_commit.id, defaults={ 'ref': latest_commit.refs()[0]['name'], @@ -126,6 +173,12 @@ class GitLabProvider(GitProvider): }, ) + def get_file_content(self, repo, path, ref="master"): + gl = Gitlab(self.url, oauth_token=repo.credentials.token) + project = self._try_get_project(gl, urllib.parse.urlsplit(repo.url).path.strip('/')) + + return project.files.get(file_path=path, ref=ref).decode() + def handle_webhook(self, repo, request): if 'HTTP_X_GITLAB_EVENT' not in request.META: raise ValidationError("Missing GitLab event type") 
@@ -140,15 +193,20 @@ class GitLabProvider(GitProvider): assert isinstance(request.data, dict) assert request.data['object_kind'] == 'push' - if request.data['ref'] not in repo.watched_branches: - return - # Already took care of this event if repo.revisions.filter( ref=request.data['ref'], hash=request.data['checkout_sha']).exists(): return + # Filter on configured branches + config = ConfigFile(self.get_file_content(repo, ConfigFile.FILE_NAME, ref=request.data['ref'])) + human_ref = request.data['ref'] + if human_ref.startswith('refs/heads/'): + human_ref = human_ref[11:] + if human_ref not in config.branches: + return + rev = repo.revisions.create( hash=request.data['checkout_sha'], ref=request.data['ref'], diff --git a/arkindex/dataimport/serializers.py b/arkindex/dataimport/serializers.py index 17c79039f97496403429c679fac97c947b8938c7..1280e8ebf90c0d93e075dceebe14efcaa3d56fdb 100644 --- a/arkindex/dataimport/serializers.py +++ b/arkindex/dataimport/serializers.py @@ -1,8 +1,10 @@ from rest_framework import serializers from rest_framework.utils import model_meta from arkindex.project.serializer_fields import EnumField -from arkindex.dataimport.models import \ - DataImport, DataImportMode, DataImportState, DataImportFailure, DataFile, Repository, Revision +from arkindex.dataimport.models import ( + DataImport, DataImportMode, DataImportState, DataImportFailure, DataFile, + Repository, Revision, Event, EventType +) from arkindex.documents.models import Corpus from arkindex.documents.serializers.light import ElementLightSerializer import gitlab.v4.objects @@ -94,10 +96,13 @@ class RevisionSerializer(serializers.ModelSerializer): Serialize a repository revision """ + date = serializers.DateTimeField(source='created') + class Meta: model = Revision fields = ( 'id', + 'date', 'hash', 'ref', 'message', @@ -191,7 +196,7 @@ class DataImportFailureSerializer(serializers.ModelSerializer): ) -class RepositoryLightSerializer(serializers.ModelSerializer): +class RepositorySerializer(serializers.ModelSerializer): """ Serialize a repository """ @@ -209,20 +214,6 @@ class RepositoryLightSerializer(serializers.ModelSerializer): } -class RepositorySerializer(RepositoryLightSerializer): - """ - Fully serialize a repository - """ - - class Meta(RepositoryLightSerializer.Meta): - fields = ( - 'id', - 'url', - 'corpus', - 'watched_branches', - ) - - class ExternalRepositorySerializer(serializers.BaseSerializer): """ Serialize a Git repository from an external API @@ -253,6 +244,23 @@ class ExternalRepositorySerializer(serializers.BaseSerializer): return { 'id': data['id'], - 'corpus': Corpus.objects.writable(self.request.user) + 'corpus': Corpus.objects.writable(self.context['request'].user) .get(id=data['corpus']) } + + +class EventSerializer(serializers.ModelSerializer): + """ + Serialize a diff event for an element on a revision + """ + + type = EnumField(EventType) + revision = RevisionSerializer() + + class Meta: + model = Event + fields = ( + 'id', + 'type', + 'revision', + ) diff --git a/arkindex/dataimport/tasks.py b/arkindex/dataimport/tasks.py index 616f2bb053aed3b646f0b6bcea6ad561cde8c7a3..179a9cf3f3d34f851e338dcae28e6cc89dda2220 100644 --- a/arkindex/dataimport/tasks.py +++ b/arkindex/dataimport/tasks.py @@ -4,18 +4,24 @@ from celery.signals import task_postrun from celery.states import EXCEPTION_STATES from django.conf import settings from django.db import transaction +from django.core.exceptions import ValidationError from arkindex.project.celery import ReportingTask from 
arkindex.documents.models import Element, ElementType, Page from arkindex.documents.importer import import_page from arkindex.documents.tei import TeiParser from arkindex.images.models import ImageServer, ImageStatus from arkindex.dataimport.models import DataImport, DataImportState, DataImportMode +from arkindex.dataimport.config import ConfigFile +from arkindex.dataimport.filetypes import FileType +from collections import namedtuple +from enum import Enum from PIL import Image import os import glob import logging import shutil import urllib.parse +import git root_logger = logging.getLogger(__name__) logger = get_task_logger(__name__) @@ -165,6 +171,7 @@ def download_repo(self, dataimport): repo_dir = dataimport.revision.repo.clone_dir if os.path.exists(repo_dir): shutil.rmtree(repo_dir) + archive_path = "{}.tar.gz".format(repo_dir) commit_hash = dataimport.revision.hash @@ -180,7 +187,139 @@ def download_repo(self, dataimport): @shared_task(bind=True, base=ReportingTask) -def import_repo(self, dataimport): +def clone_repo(self, dataimport): + assert isinstance(dataimport, DataImport) + assert dataimport.mode == DataImportMode.Repository + assert dataimport.revision is not None + + repo_dir = dataimport.revision.repo.clone_dir + commit_hash = dataimport.revision.hash + + if os.path.exists(repo_dir): + shutil.rmtree(repo_dir) + + if dataimport.revision.repo.provider_class is None: + raise ValueError("No repository provider found for {}".format(dataimport.revision.repo.url)) + + self.report_progress(0, "Cloning repository...") + try: + repo = dataimport.revision.repo.provider.clone_repo(dataimport.revision.repo, repo_dir, no_checkout=True) + except Exception: + raise Exception("An error occured while cloning the repository.") + + self.report_progress(0.5, "Checking out commit {}...".format(commit_hash)) + try: + repo.head.reference = repo.create_head('commit_{}'.format(commit_hash), commit_hash) + repo.head.reset(index=True, working_tree=True) + except Exception: + raise Exception("An error occured while checking out commit {}".format(commit_hash)) + + config_path = os.path.join(repo_dir, ConfigFile.FILE_NAME) + if not os.path.isfile(config_path): + raise IOError("An Arkindex repository configuration file (.arkindex.yml) is required.") + + try: + ConfigFile.from_path(config_path) + except ValidationError as e: + raise ValueError("Invalid Arkindex config file (.arkindex.yml): {}".format(str(e))) + except Exception: + raise Exception("An error occured while opening the .arkindex.yml file.") + + +class DiffType(Enum): + Addition = 'A' + Modification = 'M' + Deletion = 'D' + Rename = 'R' + Transtype = 'T' + # The following types exist but should never happen in a simple cloning + Copy = 'C' + Unmerged = 'U' + BrokenPairing = 'B' + Unknown = 'X' + + +SimpleDiff = namedtuple('SimpleDiff', 'type, old_path, new_path') + + +@shared_task(bind=True, base=ReportingTask) +def diff_repo(self, dataimport): + assert isinstance(dataimport, DataImport) + assert dataimport.mode == DataImportMode.Repository + assert dataimport.revision is not None + + commit_hash = dataimport.revision.hash + repo = git.Repo(dataimport.revision.repo.clone_dir) + current_commit = repo.commit(commit_hash) + + # Iterate over the commit's tree and exclude the current commit to get all parents + parent_commits = { + c.hexsha: c + for c in repo.iter_commits(commit_hash) + if not c.hexsha == commit_hash + } + + # Look for revisions that match the commit hashes + parent_revisions = 
dataimport.revision.repo.revisions.filter(hash__in=parent_commits.keys()).order_by('-created') + + if not parent_revisions.exists(): + self.report_message("No known parent revision found.") + # No known revision, just return all the repo's files as additions + # Call git ls-files directly + return [SimpleDiff(DiffType.Addition, path, path) for path in repo.git.ls_files().splitlines()] + + # Pick the Git commit from the latest revision and perform the diff + diffs = parent_commits[parent_revisions.first().hash].diff(current_commit) + + # Return diff types and paths + return [SimpleDiff(DiffType(diff.change_type), diff.a_path, diff.b_path) for diff in diffs] + + +@shared_task(bind=True, base=ReportingTask) +def dispatch_imports(self, diffs, dataimport): + assert isinstance(dataimport, DataImport) + assert all(isinstance(diff, SimpleDiff) for diff in diffs) + + handler = TaskLoggingHandler(self) + root_logger.addHandler(handler) + + try: + config = ConfigFile.from_path(os.path.join(dataimport.revision.repo.clone_dir, ConfigFile.FILE_NAME)) + except ValidationError as e: + raise ValueError("YAML configuration file validation failed: {}".format(str(e))) + except Exception: + raise Exception("An error occured while loading the .arkindex.yml file.") + + self.report_progress(0, "Fetching file types...") + actions = [( + diff, + FileType.identify(diff.old_path, config), + FileType.identify(diff.new_path, config), + ) for diff in diffs] + + count = len(actions) + for i, (diff, old_type, new_type) in enumerate(actions): + self.report_progress(i / count, "Parsing diff {} from '{}' to '{}'".format( + diff.type.value, diff.old_path, diff.new_path)) + + if not new_type: + self.report_message("Unknown file type for '{}'".format(diff.new_path), level=logging.WARNING) + continue + + if old_type != new_type: + self.report_message("File type changes are not supported", level=logging.WARNING) + continue + try: + new_type.handle(dataimport, *diff, config) + except Exception as e: + self.report_message("Error while parsing diff {} from '{}' to '{}': {}".format( + diff.type.value, diff.old_path, diff.new_path, str(e)), level=logging.WARNING) + + root_logger.removeHandler(handler) + + +@shared_task(bind=True, base=ReportingTask) +def import_metadata_repo(self, dataimport): handler = TaskLoggingHandler(self) root_logger.addHandler(handler) diff --git a/arkindex/dataimport/tests/manifest_samples/.arkindex.yml b/arkindex/dataimport/tests/manifest_samples/.arkindex.yml new file mode 100644 index 0000000000000000000000000000000000000000..558726ce7b8370daf6deee1bfb98c670f991a1f9 --- /dev/null +++ b/arkindex/dataimport/tests/manifest_samples/.arkindex.yml @@ -0,0 +1,14 @@ +version: 1 +branches: + - master + +corpus: + name: Unit Tests + description: Unit Tests corpus + +volumes: + format: iiif + paths: + - "*" + lazy_checks: true + autoconvert_https: false diff --git a/arkindex/dataimport/tests/manifest_samples/base.json b/arkindex/dataimport/tests/manifest_samples/base.json new file mode 100644 index 0000000000000000000000000000000000000000..f5bdb03b7fde48ea29b97dbf8fcb7c9c81523741 --- /dev/null +++ b/arkindex/dataimport/tests/manifest_samples/base.json @@ -0,0 +1,98 @@ +{ + "@context": "http://iiif.io/api/presentation/2/context.json", + "@type": "sc:Manifest", + "@id": "http://server/manifest", + "viewingDirection": "left-to-right", + "viewingHint": "individuals", + "label": "Manifest label", + "description": "", + "thumbnail": { + "@id": "http://server/thumbnail/full/150,/0/default.jpg", + "service": { + "@context": 
"http://iiif.io/api/image/2/context.json", + "profile": "http://iiif.io/api/image/2/level2.json", + "@id": "http://server/thumbnail" + } + }, + "metadata": [ + { + "label": "Label 1", + "value": "Value 1" + }, + { + "label": "Label 2", + "value": "Value 2" + }, + { + "label": "Label 3", + "value": "Value 3" + } + ], + "license": "http://creativecommons.org/licenses/by-nc/3.0/deed.fr", + "attribution": "Archives nationales de PARIS", + "logo": "http://server/logo", + "related": [], + "seeAlso": "", + "within": "http://server/collection", + "sequences": [ + { + "@id": "http://server/sequence", + "@type": "sc:Sequence", + "label": "Reproduction intégrale", + "canvases": [ + { + "@id": "http://server/canvas-1", + "@type": "sc:Canvas", + "label": "plat supérieur", + "height": 1000, + "width": 2000, + "images": [ + { + "@type": "oa:Annotation", + "motivation": "sc:painting", + "resource": { + "@id": "http://server/image-1/full/full/0/default.jpg", + "@type": "dctypes:Image", + "format": "image/jpeg", + "height": 1000, + "width": 2000, + "service": { + "@context": "http://iiif.io/api/image/2/context.json", + "@id": "http://server/image-1", + "profile": "http://iiif.io/api/image/2/level2.json" + } + }, + "on": "http://server/canvas-1" + } + ] + }, + { + "@id": "http://server/canvas-2", + "@type": "sc:Canvas", + "label": "001r", + "height": 1000, + "width": 2000, + "images": [ + { + "@type": "oa:Annotation", + "motivation": "sc:painting", + "resource": { + "@id": "http://server/image-2/full/full/0/default.jpg", + "@type": "dctypes:Image", + "format": "image/jpeg", + "height": 1000, + "width": 2000, + "service": { + "@context": "http://iiif.io/api/image/2/context.json", + "@id": "http://server/image-2", + "profile": "http://iiif.io/api/image/2/level2.json" + } + }, + "on": "http://server/canvas-2" + } + ] + } + ] + } + ] +} diff --git a/arkindex/dataimport/tests/manifest_samples/changed.json b/arkindex/dataimport/tests/manifest_samples/changed.json new file mode 100644 index 0000000000000000000000000000000000000000..2c42ee53bd5c59000f6b59f47701c90d9b86d5e1 --- /dev/null +++ b/arkindex/dataimport/tests/manifest_samples/changed.json @@ -0,0 +1,98 @@ +{ + "@context": "http://iiif.io/api/presentation/2/context.json", + "@type": "sc:Manifest", + "@id": "http://server/manifest", + "viewingDirection": "left-to-right", + "viewingHint": "individuals", + "label": "Manifest label", + "description": "", + "thumbnail": { + "@id": "http://server/thumbnail/full/150,/0/default.jpg", + "service": { + "@context": "http://iiif.io/api/image/2/context.json", + "profile": "http://iiif.io/api/image/2/level2.json", + "@id": "http://server/thumbnail" + } + }, + "metadata": [ + { + "label": "Label 1", + "value": "Updated value 1" + }, + { + "label": "Label 2", + "value": "Value 2" + }, + { + "label": "Label 4", + "value": "Value 4" + } + ], + "license": "http://creativecommons.org/licenses/by-nc/3.0/deed.fr", + "attribution": "Archives nationales de PARIS", + "logo": "http://server/logo", + "related": [], + "seeAlso": "", + "within": "http://server/collection", + "sequences": [ + { + "@id": "http://server/sequence", + "@type": "sc:Sequence", + "label": "Reproduction intégrale", + "canvases": [ + { + "@id": "http://server/canvas-3", + "@type": "sc:Canvas", + "label": "plat supérieur", + "height": 1000, + "width": 2000, + "images": [ + { + "@type": "oa:Annotation", + "motivation": "sc:painting", + "resource": { + "@id": "http://server/image-3/full/full/0/default.jpg", + "@type": "dctypes:Image", + "format": "image/jpeg", + 
"height": 1000, + "width": 2000, + "service": { + "@context": "http://iiif.io/api/image/2/context.json", + "@id": "http://server/image-3", + "profile": "http://iiif.io/api/image/2/level2.json" + } + }, + "on": "http://server/image-3" + } + ] + }, + { + "@id": "http://server/canvas-2", + "@type": "sc:Canvas", + "label": "002r", + "height": 1000, + "width": 2000, + "images": [ + { + "@type": "oa:Annotation", + "motivation": "sc:painting", + "resource": { + "@id": "http://server/image-2/full/full/0/default.jpg", + "@type": "dctypes:Image", + "format": "image/jpeg", + "height": 1000, + "width": 2000, + "service": { + "@context": "http://iiif.io/api/image/2/context.json", + "@id": "http://server/image-2", + "profile": "http://iiif.io/api/image/2/level2.json" + } + }, + "on": "http://server/canvas-2" + } + ] + } + ] + } + ] +} diff --git a/arkindex/dataimport/tests/test_gitlab_provider.py b/arkindex/dataimport/tests/test_gitlab_provider.py new file mode 100644 index 0000000000000000000000000000000000000000..aa65cbe983e49e1c21c39901eaa42796552a6b7a --- /dev/null +++ b/arkindex/dataimport/tests/test_gitlab_provider.py @@ -0,0 +1,462 @@ +from arkindex.project.tests import FixtureTestCase, RedisMockMixin +from arkindex.dataimport.models import Repository +from arkindex.users.models import OAuthCredentials +from arkindex.dataimport.providers import GitLabProvider +from rest_framework.exceptions import APIException, NotAuthenticated, AuthenticationFailed, ValidationError +from gitlab.exceptions import GitlabGetError, GitlabCreateError +from unittest.mock import patch, MagicMock +import yaml + + +class TestGitLabProvider(RedisMockMixin, FixtureTestCase): + """ + Test the GitLabProvider class + """ + + @classmethod + def setUpTestData(cls): + super().setUpTestData() + cls.creds = OAuthCredentials.objects.create( + user=cls.user, + provider_name='GitLabOAuthProvider', + provider_url='https://somewhere', + token='oauth-token' + ) + cls.repo = Repository.objects.create( + url='http://gitlab/repo', + hook_token='hook-token', + corpus=cls.corpus, + credentials=cls.creds, + provider_name='GitLabProvider', + ) + cls.rev = cls.repo.revisions.create( + hash='42', + ref='refs/heads/master', + message='a', + author='me', + ) + cls.gl_patch = patch('arkindex.dataimport.providers.Gitlab') + + def setUp(self): + super().setUp() + self.gl_mock = self.gl_patch.start() + + def tearDown(self): + super().tearDown() + self.gl_patch.stop() + + def test_list_repos(self): + """ + Test GitLabProvider can list repositories from GitLab + """ + GitLabProvider(url='http://aaa', credentials=self.creds).list_repos() + + self.assertEqual(self.gl_mock.call_count, 1) + args, kwargs = self.gl_mock.call_args + self.assertTupleEqual(args, ('http://aaa', )) + self.assertDictEqual(kwargs, {'oauth_token': self.creds.token}) + + self.assertEqual(self.gl_mock().projects.list.call_count, 1) + args, kwargs = self.gl_mock().projects.list.call_args + self.assertTupleEqual(args, ()) + self.assertDictEqual(kwargs, {'membership': True, 'search': None}) + + def test_list_repos_query(self): + """ + Test GitLabProvider can search repositories from GitLab + """ + GitLabProvider(url='http://aaa', credentials=self.creds).list_repos(query='meh') + + self.assertEqual(self.gl_mock.call_count, 1) + args, kwargs = self.gl_mock.call_args + self.assertTupleEqual(args, ('http://aaa', )) + self.assertDictEqual(kwargs, {'oauth_token': self.creds.token}) + + self.assertEqual(self.gl_mock().projects.list.call_count, 1) + args, kwargs = 
self.gl_mock().projects.list.call_args + self.assertTupleEqual(args, ()) + self.assertDictEqual(kwargs, {'membership': True, 'search': 'meh'}) + + def test_list_repos_requires_credentials(self): + """ + Test GitLabProvider checks for credentials when requesting repositories list + """ + with self.assertRaises(NotAuthenticated): + GitLabProvider(url='http://aaa').list_repos() + + def test_create_repo(self): + """ + Test GitLabProvider can create a Repository instance from a GitLab repo + """ + self.gl_mock().projects.get.return_value.web_url = 'http://new_repo_url' + self.gl_mock().projects.get.return_value.default_branch = 'branchname' + + request_mock = MagicMock() + request_mock.build_absolute_uri.return_value = 'http://hook' + new_repo = GitLabProvider(url='http://aaa', credentials=self.creds).create_repo( + id='1337', request=request_mock, corpus=self.corpus) + + self.assertEqual(self.gl_mock().projects.get.call_count, 1) + args, kwargs = self.gl_mock().projects.get.call_args + self.assertTupleEqual(args, (1337, )) + self.assertDictEqual(kwargs, {}) + + self.assertEqual(new_repo.url, 'http://new_repo_url') + self.assertEqual(new_repo.corpus, self.corpus) + self.assertEqual(new_repo.provider_name, 'GitLabProvider') + + self.assertEqual(self.gl_mock().projects.get().hooks.create.call_count, 1) + args, kwargs = self.gl_mock().projects.get().hooks.create.call_args + self.assertEqual(len(args), 1) + self.assertDictEqual(kwargs, {}) + self.assertDictEqual(args[0], { + 'url': 'http://hook', + 'push_events': True, + 'token': new_repo.hook_token, + }) + + def test_create_repo_requires_credentials(self): + """ + Test GitLabProvider checks for credentials when requesting a repository creation + """ + request_mock = MagicMock() + request_mock.build_absolute_uri.return_value = 'http://hook' + with self.assertRaises(NotAuthenticated): + GitLabProvider(url='http://aaa').create_repo( + id='repo_id', request=request_mock, corpus=self.corpus) + + def test_create_repo_already_exists(self): + """ + Test GitLabProvider checks for duplicate repositories + """ + self.gl_mock().projects.get.return_value.web_url = 'http://new_repo_url' + self.gl_mock().projects.get.return_value.default_branch = 'branchname' + + request_mock = MagicMock() + request_mock.build_absolute_uri.return_value = 'http://hook' + GitLabProvider(url='http://aaa', credentials=self.creds).create_repo( + id='1337', request=request_mock, corpus=self.corpus) + + with self.assertRaises(ValidationError): + GitLabProvider(url='http://aaa', credentials=self.creds).create_repo( + id='1337', request=request_mock, corpus=self.corpus) + + def test_create_repo_handle_get_error(self): + """ + Test GitLabProvider handles GitLab repo GET errors + """ + self.gl_mock().projects.get.side_effect = GitlabGetError + + request_mock = MagicMock() + request_mock.build_absolute_uri.return_value = 'http://hook' + + with self.assertRaises(APIException): + GitLabProvider(url='http://aaa', credentials=self.creds).create_repo( + id='1337', request=request_mock, corpus=self.corpus) + + self.assertEqual(self.gl_mock().projects.get.call_count, 1) + + def test_create_repo_handle_hook_create_error(self): + """ + Test GitLabProvider handles GitLab hook creation errors + """ + self.gl_mock().projects.get.return_value.web_url = 'http://new_repo_url' + self.gl_mock().projects.get.return_value.default_branch = 'branchname' + self.gl_mock().projects.get.return_value.hooks.create.side_effect = GitlabCreateError + + request_mock = MagicMock() + 
request_mock.build_absolute_uri.return_value = 'http://hook' + + with self.assertRaises(APIException): + GitLabProvider(url='http://aaa', credentials=self.creds).create_repo( + id='1337', request=request_mock, corpus=self.corpus) + + self.assertEqual(self.gl_mock().projects.get.call_count, 1) + self.assertEqual(self.gl_mock().projects.get().hooks.create.call_count, 1) + + @patch('arkindex.dataimport.providers.git') + def test_clone_repo(self, git_mock): + """ + Test GitLabProvider can clone a repository + """ + GitLabProvider(url='http://aaa', credentials=self.creds).clone_repo(self.repo, 'somewhere', a='a', b='b') + + self.assertEqual(git_mock.Repo.clone_from.call_count, 1) + args, kwargs = git_mock.Repo.clone_from.call_args + self.assertTupleEqual(args, ('http://oauth2:oauth-token@gitlab/repo', 'somewhere')) + self.assertDictEqual(kwargs, {'a': 'a', 'b': 'b'}) + + @patch('arkindex.dataimport.providers.open') + def test_download_archive(self, open_mock): + """ + Test GitLabProvider can download an archive of a revision onto a specified path + """ + GitLabProvider(url='http://aaa', credentials=self.creds).download_archive(self.rev, 'somewhere') + + self.assertEqual(self.gl_mock().projects.get.call_count, 1) + args, kwargs = self.gl_mock().projects.get.call_args + self.assertTupleEqual(args, ('repo', )) + self.assertDictEqual(kwargs, {}) + + self.assertEqual(open_mock.call_count, 1) + self.assertEqual(self.gl_mock().projects.get().repository_archive.call_count, 1) + args, kwargs = self.gl_mock().projects.get().repository_archive.call_args + self.assertTupleEqual(args, ()) + self.assertDictEqual(kwargs, { + 'sha': '42', + 'streamed': True, + 'action': open_mock().__enter__().write, + }) + + @patch('arkindex.dataimport.providers.open') + def test_download_archive_get_error(self, open_mock): + """ + Test GitLabProvider can handle GitLab errors while downloading an archive + """ + self.gl_mock().projects.get.side_effect = GitlabGetError + + with self.assertRaises(APIException): + GitLabProvider(url='http://aaa', credentials=self.creds).download_archive(self.rev, 'somewhere') + + self.assertEqual(self.gl_mock().projects.get.call_count, 1) + self.assertEqual(open_mock.call_count, 0) + + def test_get_revision(self): + """ + Test GitLabProvider can create a Revision instance for a repo by hash + """ + revision, created = GitLabProvider(url='http://aaa', credentials=self.creds) \ + .get_or_create_revision(self.repo, '42') + + self.assertEqual(revision, self.rev) + self.assertFalse(created) + self.assertEqual(self.gl_mock.call_count, 0) + + def test_create_revision(self): + """ + Test GitLabProvider can create a Revision instance for a repo by hash + """ + self.gl_mock().projects.get.return_value.commits.get.return_value.refs.return_value = [ + {'name': 'refs/heads/branch1'}, + {'name': 'refs/heads/branch2'}, + ] + self.gl_mock().projects.get.return_value.commits.get.return_value.message = 'commit message' + self.gl_mock().projects.get.return_value.commits.get.return_value.author_name = 'bob' + + revision, created = GitLabProvider(url='http://aaa', credentials=self.creds) \ + .get_or_create_revision(self.repo, '1337') + + self.assertTrue(created) + self.assertEqual(revision.hash, '1337') + self.assertEqual(revision.ref, 'refs/heads/branch1') + self.assertEqual(revision.message, 'commit message') + self.assertEqual(revision.author, 'bob') + + self.assertEqual(self.gl_mock().projects.get.call_count, 1) + self.assertEqual(self.gl_mock().projects.get().commits.get.call_count, 1) + args, kwargs = 
self.gl_mock().projects.get().commits.get.call_args + self.assertTupleEqual(args, ('1337', )) + self.assertDictEqual(kwargs, {}) + + def test_get_latest_revision(self): + """ + Test GitLabProvider can get the latest revision on a repo + """ + latest_commit = MagicMock() + latest_commit.id = '42' + latest_commit.refs.return_value = [ + {'name': 'refs/heads/master'}, + ] + latest_commit.message = 'a' + latest_commit.author_name = 'me' + + self.gl_mock().projects.get.return_value.commits.list.return_value = [latest_commit, ] + + revision, created = GitLabProvider(url='http://aaa', credentials=self.creds) \ + .get_or_create_latest_revision(self.repo) + + self.assertFalse(created) + self.assertEqual(revision, self.rev) + + self.assertEqual(self.gl_mock().projects.get.call_count, 1) + self.assertEqual(self.gl_mock().projects.get().commits.list.call_count, 1) + + def test_create_latest_revision(self): + """ + Test GitLabProvider can create the latest revision on a repo + """ + latest_commit = MagicMock() + latest_commit.id = '1337' + latest_commit.refs.return_value = [ + {'name': 'refs/heads/branch1'}, + {'name': 'refs/heads/branch2'}, + ] + latest_commit.message = 'commit message' + latest_commit.author_name = 'bob' + + self.gl_mock().projects.get.return_value.commits.list.return_value = [latest_commit, ] + + revision, created = GitLabProvider(url='http://aaa', credentials=self.creds) \ + .get_or_create_latest_revision(self.repo) + + self.assertTrue(created) + self.assertEqual(revision.hash, '1337') + self.assertEqual(revision.ref, 'refs/heads/branch1') + self.assertEqual(revision.message, 'commit message') + self.assertEqual(revision.author, 'bob') + + self.assertEqual(self.gl_mock().projects.get.call_count, 1) + self.assertEqual(self.gl_mock().projects.get().commits.list.call_count, 1) + + def test_handle_webhook(self): + """ + Test GitLabProvider correctly handles GitLab webhook push events + """ + request_mock = MagicMock() + request_mock.META = { + 'HTTP_X_GITLAB_EVENT': 'Push Hook', + 'HTTP_X_GITLAB_TOKEN': 'hook-token', + } + request_mock.data = { + 'object_kind': 'push', + 'ref': 'refs/heads/master', + 'checkout_sha': '1337', + 'commits': [ + { + 'message': 'commit message', + 'author': { + 'name': 'bob', + } + } + ] + } + self.gl_mock().projects.get().files.get.return_value.decode.return_value = yaml.dump({ + 'version': 1, + 'branches': ['master'], + 'corpus': {'name': 'Unit Tests', 'description': 'Unit Tests', 'public': False}, + 'volumes': {'paths': ['*'], 'format': 'iiif'}, + }) + + GitLabProvider(url='http://aaa', credentials=self.creds).handle_webhook(self.repo, request_mock) + self.assertGreater(self.redis.llen('celery'), 0) + + self.assertEqual(self.gl_mock().projects.get().files.get.call_count, 1) + self.assertEqual(self.gl_mock().projects.get().files.get().decode.call_count, 1) + + revision = self.repo.revisions.get(hash='1337') + self.assertEqual(revision.ref, 'refs/heads/master') + self.assertEqual(revision.message, 'commit message') + self.assertEqual(revision.author, 'bob') + self.assertEqual(len(revision.dataimports.all()), 1) + + def test_handle_webhook_missing_headers(self): + """ + Test GitLabProvider checks HTTP headers on webhooks + """ + glp = GitLabProvider(url='http://aaa', credentials=self.creds) + + request_mock = MagicMock() + request_mock.data = { + 'object_kind': 'push', + 'ref': 'refs/heads/master', + 'checkout_sha': '1337', + 'commits': [ + { + 'message': 'commit message', + 'author': { + 'name': 'bob', + } + } + ] + } + + # Missing HTTP_X_GITLAB_TOKEN + 
request_mock.META = { + 'HTTP_X_GITLAB_EVENT': 'Push Hook', + } + with self.assertRaises(NotAuthenticated): + glp.handle_webhook(self.repo, request_mock) + + # Missing HTTP_X_GITLAB_EVENT + request_mock.META = { + 'HTTP_X_GITLAB_TOKEN': 'hook-token', + } + with self.assertRaises(ValidationError): + glp.handle_webhook(self.repo, request_mock) + + # Wrong HTTP_X_GITLAB_EVENT + request_mock.META = { + 'HTTP_X_GITLAB_EVENT': 'Not a Push Hook', + 'HTTP_X_GITLAB_TOKEN': 'hook-token', + } + with self.assertRaises(ValidationError): + glp.handle_webhook(self.repo, request_mock) + + # Wrong HTTP_X_GITLAB_TOKEN + request_mock.META = { + 'HTTP_X_GITLAB_EVENT': 'Push Hook', + 'HTTP_X_GITLAB_TOKEN': 'not-the-hook-token', + } + with self.assertRaises(AuthenticationFailed): + glp.handle_webhook(self.repo, request_mock) + + def test_handle_webhook_duplicate_events(self): + """ + Test GitLabProvider checks for already handled events + """ + request_mock = MagicMock() + request_mock.META = { + 'HTTP_X_GITLAB_EVENT': 'Push Hook', + 'HTTP_X_GITLAB_TOKEN': 'hook-token', + } + request_mock.data = { + 'object_kind': 'push', + 'ref': 'refs/heads/master', + 'checkout_sha': '42', + 'commits': [ + { + 'message': 'a', + 'author': { + 'name': 'me', + } + } + ] + } + + GitLabProvider(url='http://aaa', credentials=self.creds).handle_webhook(self.repo, request_mock) + self.assertEqual(self.redis.llen('celery'), 0) + + def test_handle_webhook_watched_branches(self): + """ + Test GitLabProvider only accepts events for watched branches + """ + request_mock = MagicMock() + request_mock.META = { + 'HTTP_X_GITLAB_EVENT': 'Push Hook', + 'HTTP_X_GITLAB_TOKEN': 'hook-token', + } + request_mock.data = { + 'object_kind': 'push', + 'ref': 'refs/heads/nope', + 'checkout_sha': '1337', + 'commits': [ + { + 'message': 'commit message', + 'author': { + 'name': 'bob', + } + } + ] + } + self.gl_mock().projects.get().files.get.return_value.decode.return_value = yaml.dump({ + 'version': 1, + 'branches': ['master'], + 'corpus': {'name': 'Unit Tests', 'description': 'Unit Tests', 'public': False}, + 'volumes': {'paths': ['*'], 'format': 'iiif'}, + }) + + GitLabProvider(url='http://aaa', credentials=self.creds).handle_webhook(self.repo, request_mock) + + self.assertEqual(self.gl_mock().projects.get().files.get.call_count, 1) + self.assertEqual(self.gl_mock().projects.get().files.get().decode.call_count, 1) + self.assertEqual(self.redis.llen('celery'), 0) diff --git a/arkindex/dataimport/tests/test_iiif.py b/arkindex/dataimport/tests/test_iiif.py new file mode 100644 index 0000000000000000000000000000000000000000..654d459ee24d341ddf76aa18af54d1f0004ff5cc --- /dev/null +++ b/arkindex/dataimport/tests/test_iiif.py @@ -0,0 +1,271 @@ +from unittest.mock import patch +from arkindex.project.tests import FixtureTestCase +from arkindex.documents.models import Element, ElementType, Page, MetaType +from arkindex.images.models import ImageStatus +from arkindex.dataimport.models import EventType, DataImportMode, DataImportState +from arkindex.dataimport.iiif import ManifestParser +import os.path +import git +import shutil + +FIXTURES = os.path.join( + os.path.dirname(os.path.realpath(__file__)), + 'manifest_samples', +) + + +class TestManifestParser(FixtureTestCase): + + @classmethod + def setUpTestData(cls): + super().setUpTestData() + cls.creds = cls.user.credentials.create( + provider_name='GitLabOAuthProvider', + provider_url='https://somewhere', + token='oauth-token' + ) + cls.repo = cls.creds.repos.create( + url='http://gitlab/repo', + 
hook_token='hook-token', + corpus=cls.corpus, + provider_name='GitLabProvider', + ) + cls.rev = cls.repo.revisions.create( + hash='42', + ref='refs/heads/master', + message='a', + author='me', + ) + + def _assert_first_import(self, first_rev): + """ + Check importing base.json + """ + + vol = Element.objects.get(type=ElementType.Volume, name='ParserTest') + reg = Element.objects.get(type=ElementType.Register, name='ParserTest') + pages = Page.objects.get_descending(vol.id) + + # Volume metadata + self.assertEqual(vol.metadatas.count(), 3) + self.assertCountEqual( + vol.metadatas.values_list('name', 'type', 'revision', 'value'), + [ + ('Label 1', MetaType.Text, first_rev.id, 'Value 1'), + ('Label 2', MetaType.Text, first_rev.id, 'Value 2'), + ('Label 3', MetaType.Text, first_rev.id, 'Value 3'), + ] + ) + self.assertEqual(reg.metadatas.count(), 0) + + # Pages + self.assertEqual(pages.count(), 2) + first_page, second_page = pages + self.assertEqual(first_page.folio, "plat supérieur") + self.assertEqual(second_page.folio, "001r") + self.assertEqual(first_page.zone.image.path, 'image-1') + self.assertEqual(second_page.zone.image.path, 'image-2') + + for p in (first_page, second_page): + self.assertEqual(p.zone.polygon.x, 0) + self.assertEqual(p.zone.polygon.y, 0) + self.assertEqual(p.zone.polygon.width, 2000) + self.assertEqual(p.zone.polygon.height, 1000) + self.assertEqual(p.zone.image.status, ImageStatus.Unchecked) + self.assertEqual(p.zone.image.server, self.imgsrv) + self.assertEqual(p.zone.image.width, 2000) + self.assertEqual(p.zone.image.height, 1000) + + # Events + for elt in (vol, reg, first_page, second_page): + self.assertEqual(elt.events.count(), 1) + event = elt.events.get() + self.assertEqual(event.type, EventType.Addition) + self.assertEqual(event.revision, first_rev) + + def _assert_second_import(self, first_rev, second_rev): + """ + Check importing changed.json after base.json + """ + vol = Element.objects.get(type=ElementType.Volume, name='ParserTest') + reg = Element.objects.get(type=ElementType.Register, name='ParserTest') + pages = Page.objects.get_descending(vol.id) + + # Volume metadata + self.assertEqual(vol.metadatas.count(), 3) + self.assertCountEqual( + vol.metadatas.values_list('name', 'type', 'revision', 'value'), + [ + ('Label 1', MetaType.Text, second_rev.id, 'Updated value 1'), + ('Label 2', MetaType.Text, first_rev.id, 'Value 2'), + ('Label 4', MetaType.Text, second_rev.id, 'Value 4'), + ] + ) + self.assertEqual(reg.metadatas.count(), 0) + + # Pages + self.assertEqual(pages.count(), 2) + first_page, second_page = pages + self.assertEqual(first_page.folio, "plat supérieur") + self.assertEqual(second_page.folio, "002r") + self.assertEqual(first_page.zone.image.path, 'image-3') + self.assertEqual(second_page.zone.image.path, 'image-2') + + for p in (first_page, second_page): + self.assertEqual(p.zone.polygon.x, 0) + self.assertEqual(p.zone.polygon.y, 0) + self.assertEqual(p.zone.polygon.width, 2000) + self.assertEqual(p.zone.polygon.height, 1000) + self.assertEqual(p.zone.image.status, ImageStatus.Unchecked) + self.assertEqual(p.zone.image.server, self.imgsrv) + self.assertEqual(p.zone.image.width, 2000) + self.assertEqual(p.zone.image.height, 1000) + + # Events + self.assertCountEqual( + vol.events.values_list('type', 'revision'), + [ + (EventType.Addition, first_rev.id), + (EventType.Edit, second_rev.id), + ], + ) + self.assertCountEqual( + reg.events.values_list('type', 'revision'), + [ + (EventType.Addition, first_rev.id), + (EventType.Edit, second_rev.id), + 
], + ) + self.assertCountEqual( + first_page.events.values_list('type', 'revision'), + [(EventType.Addition, second_rev.id)], + ) + self.assertCountEqual( + second_page.events.values_list('type', 'revision'), + [ + (EventType.Addition, first_rev.id), + (EventType.Edit, second_rev.id), + ], + ) + + def test_import_once(self): + """ + Import a manifest file from scratch + """ + self.assertFalse(Element.objects.filter(type=ElementType.Volume, name='ParserTest').exists()) + self.assertFalse(Element.objects.filter(type=ElementType.Register, name='ParserTest').exists()) + ManifestParser( + os.path.join(FIXTURES, 'base.json'), + self.rev, + self.corpus, + servers=[self.imgsrv], + lazy=True, + autocreate_servers=False, + autoconvert_https=False, + volume_name='ParserTest', + ).run() + self._assert_first_import(self.rev) + + def test_import_changes(self): + """ + Import a manifest file from scratch, then apply another manifest with some changes + """ + self.assertFalse(Element.objects.filter(type=ElementType.Volume, name='ParserTest').exists()) + self.assertFalse(Element.objects.filter(type=ElementType.Register, name='ParserTest').exists()) + + # First import + ManifestParser( + os.path.join(FIXTURES, 'base.json'), + self.rev, + self.corpus, + servers=[self.imgsrv], + lazy=True, + autocreate_servers=False, + autoconvert_https=False, + volume_name='ParserTest', + ).run() + self._assert_first_import(self.rev) + + # Second import + new_rev = self.repo.revisions.create( + hash='1337', + ref='refs/heads/master', + message='b', + author='me', + ) + ManifestParser( + os.path.join(FIXTURES, 'changed.json'), + new_rev, + self.corpus, + servers=[self.imgsrv], + lazy=True, + autocreate_servers=False, + autoconvert_https=False, + volume_name='ParserTest', + ).run() + self._assert_second_import(self.rev, new_rev) + + @patch('arkindex.dataimport.providers.git.Repo.clone_from') + def test_git_import(self, clone_mock): + """ + Import manifest files from a Git repo + """ + # Create a Git repo + repo_dir = os.path.join(FIXTURES, 'repo') + if os.path.exists(repo_dir): + shutil.rmtree(repo_dir) + os.makedirs(repo_dir, exist_ok=True) + repo = git.Repo.init(repo_dir) + + # Prevent cloning from anywhere else but this repo + clone_mock.side_effect = lambda src, dest, **kwargs: repo.clone(dest, **kwargs) + + def copy_commit(message, src=[], dst=[]): + src = [os.path.join(FIXTURES, path) for path in src] + dst = [os.path.join(repo_dir, path) for path in dst] + list(map(shutil.copyfile, src, dst)) + repo.index.add(dst) + return repo.index.commit(message) + + def run_import(commit): + """ + Create a revision and run a synchronous import + """ + new_rev = self.repo.revisions.create( + hash=commit.hexsha, + message=commit.message, + ref='refs/heads/master', + author=commit.author, + ) + workflow = new_rev.dataimports.create( + corpus=self.corpus, + creator=self.user, + state=DataImportState.Running, + mode=DataImportMode.Repository, + ).build_workflow() + # Run synchronously + workflow.apply() + return new_rev + + # Make commits + first_commit = copy_commit( + 'First commit', + src=['.arkindex.yml', 'base.json'], + dst=['.arkindex.yml', 'ParserTest.json'], + ) + second_commit = copy_commit( + 'Second commit', + src=['changed.json'], + dst=['ParserTest.json'], + ) + + # Run first import + first_rev = run_import(first_commit) + self._assert_first_import(first_rev) + + # Run second import + second_rev = run_import(second_commit) + self._assert_second_import(first_rev, second_rev) + + # Remove the repo + shutil.rmtree(repo_dir) 
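For reference, the ManifestParser exercised by these tests can also be driven directly outside the test suite. A minimal sketch, assuming existing Revision, Corpus and ImageServer instances and a local manifest file (the helper name and argument values are placeholders mirroring the constructor arguments used in the tests above):

from arkindex.dataimport.iiif import ManifestParser

def import_manifest(path, revision, corpus, image_server, volume_name):
    # Hypothetical helper mirroring the keyword arguments used in the tests above.
    ManifestParser(
        path,
        revision,
        corpus,
        servers=[image_server],
        lazy=True,
        autocreate_servers=False,
        autoconvert_https=False,
        volume_name=volume_name,
    ).run()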
diff --git a/arkindex/dataimport/tests/test_tasks.py b/arkindex/dataimport/tests/test_tasks.py index 9c4580c069f7174439578eb3493ad5d25ed4c7a7..8b3fcbd6811db1b077359eca51821c031ca3b560 100644 --- a/arkindex/dataimport/tests/test_tasks.py +++ b/arkindex/dataimport/tests/test_tasks.py @@ -1,7 +1,7 @@ from django.core.management import call_command from arkindex.project.tests import RedisMockAPITestCase, FixtureMixin from arkindex.dataimport.tasks import save_ml_results -from arkindex.documents.models import Page, Corpus, Element, ElementType +from arkindex.documents.models import Page, Element, ElementType class TestTasks(FixtureMixin, RedisMockAPITestCase): @@ -9,9 +9,8 @@ class TestTasks(FixtureMixin, RedisMockAPITestCase): Test data imports tasks """ def test_save_ml_results(self): - corpus = Corpus.objects.create(name='test class') - dog = Page.objects.create(corpus=corpus, name='A dog') - cat = Page.objects.create(corpus=corpus, name='A cat') + dog = Page.objects.create(corpus=self.corpus, name='A dog') + cat = Page.objects.create(corpus=self.corpus, name='A cat') classification = { dog.id: { diff --git a/arkindex/documents/admin.py b/arkindex/documents/admin.py index 67aad328999ae8c6ac8c1839b81eb354a0fdd128..d016502169b3bb66663368673442b841fedd7ae6 100644 --- a/arkindex/documents/admin.py +++ b/arkindex/documents/admin.py @@ -4,6 +4,7 @@ from django.urls import reverse from django.utils.html import format_html from arkindex.documents.models import Corpus, Page, Element, ElementType, Act, Transcription, MetaData from arkindex.documents.views import DumpActs +from arkindex.dataimport.models import Event from enumfields.admin import EnumFieldListFilter @@ -11,11 +12,16 @@ class CorpusAdmin(admin.ModelAdmin): list_display = ('id', 'name', 'public', ) +class EventInline(admin.TabularInline): + model = Event + + class PageAdmin(admin.ModelAdmin): list_display = ('id', 'name', 'page_type', 'nb', 'direction', ) list_filter = [('page_type', EnumFieldListFilter)] fields = ('id', 'name', 'folio', 'page_type', 'nb', 'direction', 'classification', 'text') readonly_fields = ('id', ) + inlines = (EventInline, ) class MetaDataInline(admin.TabularInline): @@ -27,7 +33,7 @@ class ElementAdmin(admin.ModelAdmin): list_filter = [('type', EnumFieldListFilter), 'corpus'] fields = ('id', 'type', 'name', 'corpus') readonly_fields = ('id', 'element_actions') - inlines = (MetaDataInline, ) + inlines = (MetaDataInline, EventInline) def get_urls(self): urls = super().get_urls() @@ -53,7 +59,7 @@ class ActAdmin(admin.ModelAdmin): list_display = ('id', 'name') fields = ('id', 'name', 'folio', 'number') readonly_fields = ('id', ) - inlines = (MetaDataInline, ) + inlines = (MetaDataInline, EventInline) class TranscriptionAdmin(admin.ModelAdmin): diff --git a/arkindex/documents/apps.py b/arkindex/documents/apps.py index ad5872afc6428463700cfb2e9884fe49c7405c63..22b2c724fdc7f9cc9f05e449f5341786e4cdcdd1 100644 --- a/arkindex/documents/apps.py +++ b/arkindex/documents/apps.py @@ -11,7 +11,8 @@ class DocumentsConfig(AppConfig): def _package_version(self, name): try: - return open(os.path.join(settings.BASE_DIR, '..', 'VERSION')).read() + with open(os.path.join(settings.BASE_DIR, '..', 'VERSION')) as f: + return f.read() except (OSError, AttributeError, ImproperlyConfigured) as e: # File not found or settings module not ready pass diff --git a/arkindex/documents/importer.py b/arkindex/documents/importer.py index 2f3fd3a3bf40e2c80ca19d87f019783af8ed2ac6..98cbd24727d257567c7878204d009ba100c85962 100644 --- 
a/arkindex/documents/importer.py
+++ b/arkindex/documents/importer.py
@@ -11,6 +11,7 @@ import os
 import fnmatch
 import uuid
 import ijson
+import warnings
 logger = logging.getLogger(__name__)
@@ -132,6 +133,11 @@ class ManifestsImporter(ABC):
         """Initialize a manifest importer
         `imgserv` can be either one ImageServer or a list of ImageServers.
         When `volume_name` is set, it overrides the manifest volume name."""
+
+        warnings.warn(
+            "ManifestsImporter and subclasses are deprecated; use arkindex.dataimport.iiif.ManifestParser instead",
+            category=DeprecationWarning, stacklevel=2)
+
         if isinstance(imgserv, ImageServer):
             self.imgserv = [imgserv]
         else:
diff --git a/arkindex/documents/migrations/0026_corpus_description.py b/arkindex/documents/migrations/0026_corpus_description.py
new file mode 100644
index 0000000000000000000000000000000000000000..1433476e6a024c16ce3465f61ea5948f9ccf20ad
--- /dev/null
+++ b/arkindex/documents/migrations/0026_corpus_description.py
@@ -0,0 +1,18 @@
+# Generated by Django 2.1 on 2018-10-02 13:51
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('documents', '0025_avoid_doublons'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='corpus',
+            name='description',
+            field=models.TextField(default=''),
+        ),
+    ]
diff --git a/arkindex/documents/models.py b/arkindex/documents/models.py
index 5115a00d096eb15d7d6cf65040daa8e289d18ed8..48937a7061a029c4a781e5398f7ba17fc5261733 100644
--- a/arkindex/documents/models.py
+++ b/arkindex/documents/models.py
@@ -27,6 +27,7 @@ class Corpus(models.Model):
     '''
     id = models.UUIDField(default=uuid.uuid4, primary_key=True)
     name = models.CharField(max_length=250)
+    description = models.TextField(default="")
     # Is this corpus publicly readable ?
     public = models.BooleanField(default=False)
@@ -331,6 +332,19 @@ class Page(Element):
         self.type = ElementType.Page
         super().save(*args, **kwargs)
+    def same_as(self, other):
+        """
+        Python's comparison methods are used on this class to perform sorting.
+        This method instead checks for full equality, i.e. that there are no differences between the two pages.
+        Used by Git imports to tell whether a page has been modified in a newer revision.
+ """ + # Prevent AttributeError when a Page is an Element and its subclass has not been used + if not isinstance(other, Page): + if not isinstance(other, Element) and hasattr(other, 'page'): + return False + other = other.page + return self.zone_id == other.zone_id and self.folio == other.folio + def __lt__(self, other): if self.page_type == other.page_type: if self.nb == other.nb: diff --git a/arkindex/documents/serializers/elements.py b/arkindex/documents/serializers/elements.py index 4b0112d4e536304aeb4b33080925169fe3faaa39..6d660013ac5bfa3fcb29e7d88aecebfbf9ee1c07 100644 --- a/arkindex/documents/serializers/elements.py +++ b/arkindex/documents/serializers/elements.py @@ -69,6 +69,7 @@ class CorpusSerializer(serializers.ModelSerializer): fields = ( 'id', 'name', + 'description', 'public', 'rights', ) diff --git a/arkindex/documents/tests/test_corpus.py b/arkindex/documents/tests/test_corpus.py index 8c5eaaf0fe9efd9e0fe8c2dce4b409fedb92ca3b..d0fb5dbbbe8f11ac48d6272e3bfbf9430e86f89a 100644 --- a/arkindex/documents/tests/test_corpus.py +++ b/arkindex/documents/tests/test_corpus.py @@ -30,6 +30,7 @@ class TestCorpus(FixtureAPITestCase): 'rights': ['read'], 'public': True, 'name': 'Unit Tests', + 'description': '', } ] ) @@ -48,12 +49,14 @@ class TestCorpus(FixtureAPITestCase): 'rights': ['read', 'write'], 'public': False, 'name': 'B Private', + 'description': '', }, { 'id': str(self.corpus_public.id), 'rights': ['read', 'write', 'admin'], 'public': True, 'name': 'Unit Tests', + 'description': '', } ] ) @@ -72,6 +75,7 @@ class TestCorpus(FixtureAPITestCase): 'rights': ['read', 'write', 'admin'], 'public': c.public, 'name': c.name, + 'description': '', } for c in Corpus.objects.all().order_by('name') ] diff --git a/arkindex/images/models.py b/arkindex/images/models.py index 3fecb557ddda83ca70b9461a43d0ed3717edff91..007353308b5f8c1661bbe78c505c856d75503741 100644 --- a/arkindex/images/models.py +++ b/arkindex/images/models.py @@ -26,61 +26,30 @@ class ImageServer(models.Model): def __str__(self): return self.name - def find_image(self, path, offline=False, width=None, height=None): + def find_image(self, path, offline=False, width=None, height=None, save=True): """ - Lookup an image on server + Lookup an image on a server This is the preferred way to construct an image """ img = None # Try direct access to path - try: - img = self.images.get(path=path) - except Image.DoesNotExist: - pass + img = self.images.filter(path=path).first() if img is None: # Try the url encoded path - try: - img = self.images.get( - path=urllib.parse.quote_plus(path), - ) - except Image.DoesNotExist: - pass + img = self.images.filter( + path=urllib.parse.quote_plus(path), + ).first() - # Support offline queries - if offline: - if img is None: - img = self.images.create(path=path, width=width, height=height) - return img + if img is None: + img = Image(server=self, path=path, width=width, height=height) - # Check the source - if not path.endswith('/'): - path += '/' - info_url = urllib.parse.urljoin(self.build_url(path), 'info.json') - resp = requests.get(info_url, allow_redirects=True) - resp.raise_for_status() - data = resp.json() + # Support offline queries + if not offline: + img.perform_check(save=save) - if img is None: - # Use Image id from IIIF server and create image - image_id = data.get('@id') - assert image_id is not None, \ - 'Missing image id in server response' - assert image_id.startswith(self.url), \ - 'Image id does not start with server url ({} vs. 
{})'.format( - image_id, self.url) - image_path = image_id[len(self.url) + 1:] - img = self.images.create(path=image_path) - - assert 'width' in data, 'Missing image width in server response' - assert 'height' in data, 'Missing image height in server response' - if img.width != data['width'] or img.height != data['height']: - # Missing width or height data in image - img.width = data['width'] - img.height = data['height'] - - img.status = ImageStatus.Checked - img.save() + if save: + img.save() return img @@ -144,6 +113,37 @@ class Image(IndexableModel): resp.raise_for_status() return PIL.Image.open(resp.raw) + def perform_check(self, save=True): + """ + Check the image's existence and update width, height and status properties + """ + + path = self.path + if not path.endswith('/'): + path += '/' + info_url = urllib.parse.urljoin(self.url, 'info.json') + resp = requests.get(info_url, allow_redirects=True) + if not resp.ok: + self.status = ImageStatus.Error + return + + data = resp.json() + if any(item not in data for item in ('@id', 'width', 'height')): + self.status = ImageStatus.Error + return + + image_id = data['@id'] + assert image_id.startswith(self.url), \ + 'Image id does not start with server url ({} vs. {})'.format(image_id, self.url) + + # Use image resource ID from IIIF server to update the image path if needed + self.path = image_id[len(self.url) + 1:] + self.width, self.height = int(data['width']), int(data['height']) + + self.status = ImageStatus.Checked + if save: + self.save() + def __str__(self): return '{} - {}'.format(self.id, self.url) diff --git a/arkindex/project/api_v1.py b/arkindex/project/api_v1.py index e49f47af73e2bb28a73e8371cef8642b710989d4..5d932eb86ad23d43e5aaadf4470fcb6657578429 100644 --- a/arkindex/project/api_v1.py +++ b/arkindex/project/api_v1.py @@ -1,21 +1,26 @@ from django.conf.urls import url from django.views.generic.base import RedirectView -from arkindex.documents.api.elements import \ - ElementsList, RelatedElementsList, ElementRetrieve, ElementPages, ElementSurfaces, CorpusList, CorpusPages, \ - ActEdit, PageDetails, SurfaceDetails +from arkindex.documents.api.elements import ( + ElementsList, RelatedElementsList, ElementRetrieve, ElementPages, ElementSurfaces, + CorpusList, CorpusPages, ActEdit, PageDetails, SurfaceDetails, +) from arkindex.documents.api.search import PageSearch, ActSearch from arkindex.documents.api.transcriptions import TranscriptionCreate, TranscriptionBulk -from arkindex.documents.api.iiif import \ - VolumeManifest, ActManifest, PageAnnotationList, PageActAnnotationList, SurfaceAnnotationList, \ - TranscriptionSearchAnnotationList -from arkindex.dataimport.api import \ - DataImportsList, DataImportDetails, DataImportFailures, DataImportDemo, \ - DataFileList, DataFileRetrieve, DataFileUpload, \ - GitRepositoryImportHook, RepositoryList, AvailableRepositoriesList, RepositoryRetrieve, RepositoryStartImport -from arkindex.users.api import \ - ProvidersList, CredentialsList, CredentialsRetrieve, \ +from arkindex.documents.api.iiif import ( + VolumeManifest, ActManifest, PageAnnotationList, PageActAnnotationList, SurfaceAnnotationList, + TranscriptionSearchAnnotationList, +) +from arkindex.dataimport.api import ( + DataImportsList, DataImportDetails, DataImportFailures, DataImportDemo, + DataFileList, DataFileRetrieve, DataFileUpload, + RepositoryList, RepositoryRetrieve, RepositoryStartImport, + GitRepositoryImportHook, AvailableRepositoriesList, ElementHistory, +) +from arkindex.users.api import ( + ProvidersList, 
CredentialsList, CredentialsRetrieve, UserRetrieve, UserCreate, UserEmailLogin, UserEmailVerification +) api = [ @@ -28,6 +33,7 @@ api = [ RelatedElementsList.as_view(), name='related-elements'), url(r'elements/$', ElementsList.as_view(), name='elements'), url(r'element/(?P<pk>[\w\-]+)/$', ElementRetrieve.as_view(), name='element-retrieve'), + url(r'element/(?P<pk>[\w\-]+)/history/$', ElementHistory.as_view(), name='element-history'), url(r'page/(?P<pk>[\w\-]+)/$', PageDetails.as_view(), name='page-details'), url(r'surface/(?P<pk>[\w\-]+)/$', SurfaceDetails.as_view(), name='surface-details'), url(r'corpus/$', CorpusList.as_view(), name='corpus'), diff --git a/arkindex/project/celery.py b/arkindex/project/celery.py index 702479d4cbe07a09b0bc7216bc073f3573ff4814..0539a52d8d3deff25e6d3eedd8b5974a0af4de36 100644 --- a/arkindex/project/celery.py +++ b/arkindex/project/celery.py @@ -76,7 +76,9 @@ class ReportingTask(Task): def report_progress(self, progress, message=None): assert 0.0 <= progress <= 1.0 - self.update_state(state='PROGRESS', meta={'progress': progress}) + # State only works on tasks run in workers, not locally + if not self.request.is_eager and not self.request.called_directly: + self.update_state(state='PROGRESS', meta={'progress': progress}) # Report message as info if not isinstance(message, str): @@ -86,6 +88,9 @@ class ReportingTask(Task): def report_message(self, message, level=logging.INFO): assert isinstance(message, str) logger.log(msg=message, level=level) + # Do not use a result backend for tasks run locally + if self.request.is_eager or self.request.called_directly: + return self.backend.add_message( self.request.id, { diff --git a/arkindex/project/settings.py b/arkindex/project/settings.py index 856019d9dd787af3377e19c149760e4bbbe0cd15..a8f9f60cef63f50af6bce66955520245e912b1bf 100644 --- a/arkindex/project/settings.py +++ b/arkindex/project/settings.py @@ -363,6 +363,11 @@ CORS_ORIGIN_WHITELIST = env2list('CORS_ORIGIN_WHITELIST') CORS_ALLOW_CREDENTIALS = True CORS_URLS_REGEX = r'^/api/.*$' +# Show all warnings in debug mode +if DEBUG: + import warnings + warnings.simplefilter('default') + # Optional unit tests runner with code coverage try: import django_nose # noqa diff --git a/requirements.txt b/requirements.txt index 4c81764cd7d48b9fefa5c00354c851aacb620f1d..ec73c8bdf8dff1b456a0e57e9bd8b1b7a7be9e69 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,6 +17,7 @@ python-gitlab==1.5.1 python-magic==0.4.15 python-memcached==1.59 pytz==2017.2 +PyYAML==3.13 requests==2.18.4 roman==2.0.0 urllib3==1.22
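The reworked ImageServer.find_image() and the new Image.perform_check() above also allow the IIIF check and the database write to be deferred separately. A minimal sketch, assuming an existing ImageServer instance (the helper name and control flow are illustrative only, not part of this change):

from arkindex.images.models import ImageServer, ImageStatus

def check_and_store(server: ImageServer, path: str):
    # Hypothetical helper: offline=True and save=False build the Image in memory only;
    # perform_check() then fetches info.json to fill width/height and set the status.
    image = server.find_image(path, offline=True, save=False)
    image.perform_check(save=False)
    if image.status == ImageStatus.Checked:
        image.save()
    return image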