Commit 211bdf65 authored by Erwan Rouchet, committed by Bastien Abadie

Import TEI metadata from Git repos

parent 2c4e3734
Showing 412 additions and 50 deletions
@@ -12,11 +12,11 @@ RUN addgroup -g 1000 teklia && adduser -D -u 1000 -G teklia ark
RUN mkdir -p $PYTHON_EGG_CACHE && chmod a+rxw $PYTHON_EGG_CACHE
# Allow access to medias and logs
-RUN mkdir -p /medias/staging /medias/iiif /logs
-RUN chown -R ark:teklia /medias /logs
+RUN mkdir -p /medias/staging /medias/iiif /logs /workers
+RUN chown -R ark:teklia /medias /logs /workers
# Add system dependencies
-RUN apk add --update --no-cache postgresql-dev jpeg-dev build-base wget gzip zlib-dev libmagic libxml2-dev libxslt-dev
+RUN apk add --update --no-cache postgresql-dev jpeg-dev build-base wget gzip zlib-dev libmagic libxml2-dev libxslt-dev git
# Setup frontend
ENV FRONTEND_DIR="/frontend/dist"
include arkindex/documents/*.xsl
from django.contrib import admin
from enumfields.admin import EnumFieldListFilter
-from arkindex.dataimport.models import DataImport, DataFile
+from arkindex.dataimport.models import DataImport, DataFile, Repository, Revision
class DataFileInline(admin.StackedInline):
@@ -23,5 +23,18 @@ class DataFileAdmin(admin.ModelAdmin):
inlines = [DataFileInline, ]
class RevisionInline(admin.StackedInline):
model = Revision
class RepositoryAdmin(admin.ModelAdmin):
list_display = ('id', 'url', 'user', 'corpus')
list_filter = ('corpus', )
fields = ('id', 'url', 'user', 'corpus', 'clone_user', 'clone_token', 'hook_token', 'watched_branches')
readonly_fields = ('id', )
inlines = [RevisionInline, ]
admin.site.register(DataImport, DataImportAdmin)
admin.site.register(DataFile, DataFileAdmin)
admin.site.register(Repository, RepositoryAdmin)
from django.shortcuts import get_object_or_404
from rest_framework.generics import \
ListAPIView, ListCreateAPIView, RetrieveUpdateDestroyAPIView
from rest_framework.views import APIView
@@ -5,9 +6,10 @@ from rest_framework.parsers import MultiPartParser, FileUploadParser
from rest_framework.permissions import IsAuthenticated
from rest_framework.response import Response
from rest_framework import status
-from rest_framework.exceptions import ValidationError
+from rest_framework.exceptions import ValidationError, NotAuthenticated, AuthenticationFailed
from arkindex.documents.models import Corpus
-from arkindex.dataimport.models import DataImport, DataFile, DataImportState, DataImportMode
+from arkindex.dataimport.models import \
+    DataImport, DataFile, DataImportState, DataImportMode, Repository, RepositorySource, Revision
from arkindex.dataimport.serializers import \
DataImportLightSerializer, DataImportSerializer, DataFileSerializer
import hashlib
@@ -135,3 +137,49 @@ class DataFileUpload(APIView):
return Response(
data=DataFileSerializer(df).data,
status=status.HTTP_201_CREATED)
class GitRepositoryImportHook(APIView):
"""
Handle Git push events
"""
def post(self, request, pk=None, **kwargs):
repo = get_object_or_404(Repository, id=pk)
if repo.source == RepositorySource.GitLab:
if 'HTTP_X_GITLAB_EVENT' not in request.META:
raise ValidationError("Missing GitLab event type")
if request.META['HTTP_X_GITLAB_EVENT'] != 'Push Hook':
raise ValidationError("Unsupported GitLab event type")
if 'HTTP_X_GITLAB_TOKEN' not in request.META:
raise NotAuthenticated("Missing GitLab secret token")
if request.META['HTTP_X_GITLAB_TOKEN'] != repo.hook_token:
raise AuthenticationFailed("Invalid GitLab secret token")
assert isinstance(request.data, dict)
assert request.data['object_kind'] == 'push'
if request.data['ref'] not in repo.watched_branches:
return Response(status=status.HTTP_204_NO_CONTENT)
# Already took care of this event
if Revision.objects.filter(
repo=repo,
ref=request.data['ref'],
hash=request.data['checkout_sha']).exists():
return Response(status=status.HTTP_204_NO_CONTENT)
rev = Revision.objects.create(
repo=repo,
hash=request.data['checkout_sha'],
ref=request.data['ref'],
message=request.data['commits'][-1]['message'],
author=request.data['commits'][-1]['author']['name'],
)
else:
raise NotImplementedError
rev.start_import()
return Response(status=status.HTTP_204_NO_CONTENT)
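
For illustration, a minimal sketch of how this hook could be exercised from a test. The `repo` fixture, the eager or mocked Celery setup, and the payload values are assumptions for the example; only the `import-hook` URL name comes from the URL configuration further down.

from django.urls import reverse
from rest_framework.test import APIClient

client = APIClient()
response = client.post(
    reverse('import-hook', kwargs={'pk': str(repo.id)}),  # name from urls.py below; namespacing may differ
    {
        'object_kind': 'push',
        'ref': 'refs/heads/master',  # must appear in repo.watched_branches
        'checkout_sha': '0' * 40,    # hypothetical commit hash
        'commits': [
            {'message': 'Update TEI files', 'author': {'name': 'Jane Doe'}},
        ],
    },
    format='json',
    HTTP_X_GITLAB_EVENT='Push Hook',      # event type checked above
    HTTP_X_GITLAB_TOKEN=repo.hook_token,  # must match the stored secret
)
assert response.status_code == 204  # a Revision was created and its import started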
# Generated by Django 2.0 on 2018-07-26 09:46
import arkindex.project.fields
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion
import uuid
class Migration(migrations.Migration):
dependencies = [
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
('dataimport', '0001_initial'),
('documents', '0019_metadatas'),
]
operations = [
migrations.CreateModel(
name='Repository',
fields=[
('id', models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False)),
('url', models.URLField(unique=True)),
('hook_token', models.CharField(max_length=250, unique=True)),
('clone_user', models.CharField(max_length=100)),
('clone_token', models.CharField(max_length=250)),
('corpus', models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name='repos',
to='documents.Corpus',
)),
('user', models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name='repos',
to=settings.AUTH_USER_MODEL,
)),
],
options={
'verbose_name_plural': 'repositories',
},
),
migrations.AlterModelOptions(
name='datafile',
options={'ordering': ['corpus', 'name']},
),
migrations.AlterModelOptions(
name='dataimport',
options={'ordering': ['corpus', '-created']},
),
migrations.CreateModel(
name='Revision',
fields=[
('id', models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False)),
('hash', models.CharField(max_length=50)),
('ref', models.CharField(max_length=50)),
('message', models.TextField()),
('author', models.CharField(max_length=50)),
('repo', models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name='revisions',
to='dataimport.Repository',
)),
],
),
migrations.AddField(
model_name='dataimport',
name='revision',
field=models.OneToOneField(
blank=True,
null=True,
on_delete=django.db.models.deletion.CASCADE,
related_name='dataimport',
to='dataimport.Revision',
),
),
migrations.AlterUniqueTogether(
name='revision',
unique_together={('repo', 'hash')},
),
migrations.AddField(
model_name='repository',
name='watched_branches',
field=arkindex.project.fields.ArrayField(
base_field=models.CharField(max_length=50),
default=['refs/heads/master'],
size=None,
),
),
]
@@ -7,8 +7,10 @@ from celery.canvas import Signature
from celery.result import AsyncResult, GroupResult
from enumfields import EnumField, Enum
from arkindex.project.models import IndexableModel
from arkindex.project.fields import ArrayField
import uuid
import os
import urllib.parse
class DataImportState(Enum):
@@ -24,6 +26,7 @@ class DataImportMode(Enum):
Annotations = 'annotations'
Surfaces = 'surfaces'
Acts = 'acts'
Repository = 'repository'
class DataImport(IndexableModel):
@@ -36,6 +39,8 @@ class DataImport(IndexableModel):
state = EnumField(DataImportState, default=DataImportState.Created, max_length=30)
mode = EnumField(DataImportMode, max_length=30)
files = models.ManyToManyField('dataimport.DataFile', related_name='imports')
revision = models.OneToOneField(
'dataimport.Revision', related_name='dataimport', on_delete=models.CASCADE, blank=True, null=True)
payload = JSONField(null=True, blank=True)
root_id = models.UUIDField(null=True, blank=True)
task_count = models.PositiveSmallIntegerField(null=True, blank=True)
@@ -66,12 +71,15 @@ class DataImport(IndexableModel):
return self.tasks[-1].result
def build_workflow(self):
-        # Only Images import is supported
-        assert self.mode == DataImportMode.Images
-        # Prevent circular imports
-        from arkindex.dataimport.tasks import check_images, import_images
-        return check_images.s(self) | import_images.s(self)
+        if self.mode == DataImportMode.Images:
+            # Prevent circular imports
+            from arkindex.dataimport.tasks import check_images, import_images
+            return check_images.s(self) | import_images.s(self)
+        elif self.mode == DataImportMode.Repository:
+            from arkindex.dataimport.tasks import clone_repo, import_repo, cleanup_repo
+            return clone_repo.si(self) | import_repo.si(self) | cleanup_repo.si(self)
+        else:
+            raise NotImplementedError
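
One detail worth spelling out: the images chain uses mutable signatures (`.s()`), so `import_images` receives `check_images`' return value as an extra first argument, while the repository chain uses immutable signatures (`.si()`), so each task is called with the DataImport alone. A tiny sketch of the distinction, using stand-in tasks rather than the ones from this commit:

from celery import shared_task

@shared_task
def first(x):
    return x * 2

@shared_task
def second(previous_result, x):
    # .s(): the previous task's return value is prepended to the arguments
    return (previous_result, x)

@shared_task
def second_immutable(x):
    # .si(): the previous task's return value is discarded
    return x

(first.s(3) | second.s('a'))()             # executes second(6, 'a')
(first.s(3) | second_immutable.si('a'))()  # executes second_immutable('a')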
def get_task_count(self, signature):
assert isinstance(signature, Signature)
@@ -131,3 +139,70 @@ class DataFile(models.Model):
@property
def staging_path(self):
return os.path.join(settings.MEDIA_ROOT, str(self.id))
class RepositorySource(Enum):
GitHub = 'github'
GitLab = 'gitlab'
class Repository(models.Model):
id = models.UUIDField(primary_key=True, default=uuid.uuid4)
url = models.URLField(unique=True)
hook_token = models.CharField(max_length=250, unique=True)
clone_user = models.CharField(max_length=100)
clone_token = models.CharField(max_length=250)
corpus = models.ForeignKey('documents.Corpus', on_delete=models.CASCADE, related_name='repos')
user = models.ForeignKey('users.User', on_delete=models.CASCADE, related_name='repos')
watched_branches = ArrayField(models.CharField(max_length=50), default=['refs/heads/master'])
class Meta:
verbose_name_plural = 'repositories'
@property
def auth_url(self):
"""Repository URL with added credentials"""
parsed = list(urllib.parse.urlsplit(self.url))
if '@' in parsed[1]: # URL seems to already have credentials
return self.url
parsed[1] = '{}:{}@{}'.format(self.clone_user, self.clone_token, parsed[1])
return urllib.parse.urlunsplit(parsed)
@property
def source(self):
parsed = urllib.parse.urlsplit(self.url)
if parsed.netloc == 'gitlab.com':
return RepositorySource.GitLab
elif parsed.netloc == 'github.com':
return RepositorySource.GitHub
else:
raise ValueError('Unknown repository source')
@property
def clone_dir(self):
return os.path.join(settings.CELERY_WORKING_DIR, str(self.id))
class Revision(models.Model):
id = models.UUIDField(primary_key=True, default=uuid.uuid4)
repo = models.ForeignKey('dataimport.Repository', on_delete=models.CASCADE, related_name='revisions')
hash = models.CharField(max_length=50)
ref = models.CharField(max_length=50)
message = models.TextField()
author = models.CharField(max_length=50)
class Meta:
unique_together = (('repo', 'hash'), )
@property
def commit_url(self):
return '{}/commit/{}'.format(self.repo.url.rstrip('/'), self.hash)
def start_import(self):
DataImport.objects.create(
creator=self.repo.user,
corpus=self.repo.corpus,
mode=DataImportMode.Repository,
state=DataImportState.Configured,
revision=self,
).start()
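
To make the URL helpers concrete, a hedged illustration with made-up values; none of these objects exist in the commit:

repo = Repository(
    url='https://gitlab.com/teklia/corpus-tei',
    clone_user='arkindex-bot',
    clone_token='s3cr3t',
)
repo.source    # RepositorySource.GitLab, since the netloc is gitlab.com
repo.auth_url  # 'https://arkindex-bot:s3cr3t@gitlab.com/teklia/corpus-tei'

rev = Revision(repo=repo, hash='211bdf65', ref='refs/heads/master')
rev.commit_url  # 'https://gitlab.com/teklia/corpus-tei/commit/211bdf65'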
from rest_framework import serializers
from rest_framework.utils import model_meta
from arkindex.project.serializer_fields import EnumField
-from arkindex.dataimport.models import DataImport, DataImportMode, DataImportState, DataFile
+from arkindex.dataimport.models import DataImport, DataImportMode, DataImportState, DataFile, Revision
import celery.states
@@ -145,3 +145,20 @@ class DataFileSerializer(serializers.ModelSerializer):
'size',
)
read_only_fields = ('id', 'size', 'content_type', )
class RevisionSerializer(serializers.ModelSerializer):
"""
Serialize a repository revision
"""
class Meta:
model = Revision
fields = (
'id',
'hash',
'ref',
'message',
'author',
'commit_url',
)
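
Serialized, a revision like the hypothetical one above would come out along these lines (all values illustrative):

RevisionSerializer(rev).data
# {
#     'id': '6e830d43-9a9a-4169-b2d4-3e6a28917bc8',
#     'hash': '211bdf65',
#     'ref': 'refs/heads/master',
#     'message': 'Import TEI metadata from Git repos',
#     'author': 'Erwan Rouchet',
#     'commit_url': 'https://gitlab.com/teklia/corpus-tei/commit/211bdf65',
# }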
@@ -3,20 +3,36 @@ from celery.utils.log import get_task_logger
from celery.signals import task_postrun
from celery.states import EXCEPTION_STATES
from django.conf import settings
from django.db import transaction
from arkindex.project.celery import ReportingTask
from arkindex.documents.models import Element, ElementType
from arkindex.documents.importer import import_page
from arkindex.documents.tei import TeiParser
from arkindex.images.models import ImageServer, ImageStatus
-from arkindex.dataimport.models import DataImport, DataImportState
+from arkindex.dataimport.models import DataImport, DataImportState, DataImportMode
from PIL import Image
from shutil import copyfile
import os
import glob
import logging
import shutil
import git
import urllib.parse
root_logger = logging.getLogger(__name__)
logger = get_task_logger(__name__)
class TaskLoggingHandler(logging.Handler):
def __init__(self, task, level=logging.WARNING):
assert isinstance(task, ReportingTask)
super().__init__(level=level)
self.task = task
def emit(self, record):
self.task.report_message(record.getMessage(), level=record.levelno)
@shared_task(bind=True, base=ReportingTask)
def check_images(self, dataimport):
assert isinstance(dataimport, DataImport)
@@ -75,7 +91,7 @@ def import_images(self, valid_files, dataimport, server_id=settings.LOCAL_IMAGES
ext = '.jp2' if pillow_img.format == 'JPEG2000' else '.jpg'
newfilename = str(datafile.id) + ext
-copyfile(datafile.staging_path, os.path.join(dataimport.iiif_path, newfilename))
+shutil.copyfile(datafile.staging_path, os.path.join(dataimport.iiif_path, newfilename))
img, _ = server.images.get_or_create(
path=urllib.parse.urljoin(dataimport.folder_name + '/', newfilename),
@@ -92,6 +108,69 @@
return {'volume': str(vol.id)}
@shared_task(bind=True, base=ReportingTask)
def clone_repo(self, dataimport):
assert isinstance(dataimport, DataImport)
assert dataimport.mode == DataImportMode.Repository
assert dataimport.revision is not None
self.report_progress(0, "Cloning repository...")
repo_dir = dataimport.revision.repo.clone_dir
if os.path.exists(repo_dir):
shutil.rmtree(repo_dir)
repo = git.Repo.clone_from(dataimport.revision.repo.auth_url, repo_dir, no_checkout=True)
commit_hash = dataimport.revision.hash
self.report_progress(0.5, "Checking out commit {}...".format(commit_hash))
repo.head.reference = repo.create_head('commit_{}'.format(commit_hash), commit_hash)
repo.head.reset(index=True, working_tree=True)
@shared_task(bind=True, base=ReportingTask)
def import_repo(self, dataimport):
handler = TaskLoggingHandler(self)
root_logger.addHandler(handler)
self.report_progress(0, "Finding XML files...")
xml_files = glob.glob(os.path.join(dataimport.revision.repo.clone_dir, '**/*.xml'), recursive=True)
for i, xml_file in enumerate(xml_files, 1):
filename = os.path.basename(xml_file)
self.report_progress(i / len(xml_files), 'Importing file {} of {}: {}'.format(i, len(xml_files), filename))
try:
parser = TeiParser(xml_file)
parser.check()
matches = parser.match_database(dataimport.corpus)
for db_elt, tei_elt in matches:
with transaction.atomic():
# Remove old metadatas
db_elt.metadatas.all().delete()
# Create new ones
for name, meta in tei_elt.build_metadata().items():
if not meta[1]:
continue
db_elt.metadatas.create(
name=name,
type=meta[0],
value=meta[1],
revision=dataimport.revision,
)
except Exception as e:
self.report_message(
    "Import of {} failed: {}".format(filename, str(e)), level=logging.WARNING)
root_logger.removeHandler(handler)
@shared_task(bind=True, base=ReportingTask)
def cleanup_repo(self, dataimport):
shutil.rmtree(dataimport.revision.repo.clone_dir)
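
End to end, a push to a watched branch therefore flows from the webhook to a Revision, then to a DataImport whose workflow is clone_repo | import_repo | cleanup_repo. The same pipeline could be triggered by hand; a sketch against a hypothetical existing Repository:

rev = Revision.objects.create(
    repo=repo,  # hypothetical Repository row
    hash='a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9b0',  # made-up commit hash
    ref='refs/heads/master',
    message='Import TEI metadata',
    author='Jane Doe',
)
rev.start_import()  # creates a Configured DataImport and runs
                    # clone_repo.si() | import_repo.si() | cleanup_repo.si()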
@task_postrun.connect
def dataimport_postrun(task_id, task, state, args=(), **kwargs):
'''
@@ -25,7 +25,7 @@ class Command(BaseCommand):
parser.add_argument(
'--corpus',
type=str,
-    help='Slug of corpus to import manifests into',
+    help='Slug of corpus to import metadata into',
required=True,
)
parser.add_argument(
# Generated by Django 2.0 on 2018-07-25 09:52
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
dependencies = [
('dataimport', '0002_repository_revision'),
('documents', '0019_metadatas'),
]
operations = [
migrations.AddField(
model_name='metadata',
name='revision',
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.CASCADE,
to='dataimport.Revision',
),
),
]
@@ -415,6 +415,7 @@ class MetaData(models.Model):
name = models.CharField(max_length=250)
type = EnumField(MetaType, max_length=50, db_index=True)
value = models.TextField()
revision = models.ForeignKey('dataimport.Revision', on_delete=models.CASCADE, blank=True, null=True)
class Meta:
ordering = ('element', 'name')
@@ -5,6 +5,7 @@ from arkindex.documents.models import \
Element, ElementType, Transcription, Page, PageType, PageDirection, Act, Corpus, MetaData, MetaType
from arkindex.images.models import Image, Zone
from arkindex.images.serializers import ZoneSerializer, ImageSerializer
from arkindex.dataimport.serializers import RevisionSerializer
from arkindex.project.serializer_fields import EnumField, ViewerURLField
from arkindex.project.tools import sslify_url
import urllib.parse
@@ -15,6 +16,7 @@ class MetaDataSerializer(serializers.ModelSerializer):
Serialises some Metadata for any Element
"""
type = EnumField(MetaType)
revision = RevisionSerializer()
class Meta:
model = MetaData
@@ -23,6 +25,7 @@
'type',
'name',
'value',
'revision',
)
@@ -247,9 +247,9 @@ class TeiParser(object):
self.corpus = Corpus(root)
def check(self):
-        logging.info(self.corpus)
+        logger.info(self.corpus)
        for tei in self.corpus.tei:
-            logging.info('{} - completion {:.1%}'.format(tei, tei.completion))
+            logger.info('{} - completion {:.1%}'.format(tei, tei.completion))
def match_database(self, corpus):
'''
@@ -262,37 +262,40 @@
        # Match volumes
        out = []
        for tei in self.corpus.tei:
+            if not tei.witness:
+                logger.warning('No witness in {}'.format(str(tei)))
+                continue
            tei_name = tei.witness.id or tei.witness.repository_id
            volume = find_closest(tei_name, volumes)
-            if volume:
-                out.append((volume, tei))
-                logger.info('Matched {} with {}'.format(volume, tei))
-                # Load volume acts
-                volume_acts = Element.objects.get_descending(volume.id, type=ElementType.Act)
-                if not volume_acts.exists():
-                    logger.warn('No acts in DB for {}'.format(volume))
+            if not volume:
+                logger.warning('No match for {}'.format(tei))
+                continue
+            out.append((volume, tei))
+            logger.info('Matched {} with {}'.format(volume, tei))
+            # Load volume acts
+            volume_acts = Element.objects.get_descending(volume.id, type=ElementType.Act)
+            if not volume_acts.exists():
+                logger.warning('No acts in DB for {}'.format(volume))
+                continue
+            # Match acts
+            for text in tei.texts:
+                if text.witness is None:
+                    logger.warning('No witness on text, skipping.')
+                    continue
-                # Match acts
-                for text in tei.texts:
-                    if text.witness is None:
-                        logger.warn('No witness on text, skipping.')
-                        continue
-                    act = Act.objects.filter(
-                        id__in=volume_acts,
-                        number=text.witness.id,
-                    ).first()
-                    if act:
-                        out.append((act, text))
-                        logger.info('Matched {} with {}'.format(act, text))
-                    else:
-                        logger.warn('No match for {}'.format(text))
-            else:
-                logger.warn('No match for {}'.format(tei))
+                act = Act.objects.filter(
+                    id__in=volume_acts,
+                    number=text.witness.id,
+                ).first()
+                if act:
+                    out.append((act, text))
+                    logger.info('Matched {} with {}'.format(act, text))
+                else:
+                    logger.warning('No match for {}'.format(text))
        return out
@@ -85,13 +85,16 @@ class TestAct(FixtureAPITestCase):
[{'id': str(metas[1].id),
'name': 'origin',
'type': 'date',
-    'value': '2010/01'},
+    'value': '2010/01',
+    'revision': None},
{'id': str(metas[2].id),
'name': 'place',
'type': 'location',
-    'value': 'somewhere'},
+    'value': 'somewhere',
+    'revision': None},
{'id': str(metas[0].id),
'name': 'test',
'type': 'text',
-    'value': 'aha'}]
+    'value': 'aha',
+    'revision': None}]
)
@@ -7,7 +7,7 @@ from arkindex.documents.api import \
TranscriptionSearch, ActSearch, TranscriptionSearchAnnotationList, \
ActEdit, TranscriptionCreate, TranscriptionBulk, SurfaceDetails
from arkindex.dataimport.api import \
-    DataImportsList, DataImportDetails, DataFileList, DataFileRetrieve, DataFileUpload
+    DataImportsList, DataImportDetails, DataFileList, DataFileRetrieve, DataFileUpload, GitRepositoryImportHook
api = [
@@ -73,4 +73,5 @@
url(r'^imports/files/(?P<pk>[\w\-]+)$', DataFileList.as_view(), name='file-list'),
url(r'^imports/file/(?P<pk>[\w\-]+)$', DataFileRetrieve.as_view(), name='file-retrieve'),
url(r'^imports/upload/(?P<pk>[\w\-]+)$', DataFileUpload.as_view(), name='file-upload'),
url(r'^imports/hook/(?P<pk>[\w\-]+)$', GitRepositoryImportHook.as_view(), name='import-hook'),
]
@@ -227,6 +227,7 @@ IIIF_TRANSCRIPTION_LIST = False
# TEI XSLT file path
TEI_XSLT_PATH = os.path.join(BASE_DIR, 'documents/teitohtml.xsl')
# Cache into memcached
CACHES = {
'default': {
@@ -311,6 +312,7 @@ CELERY_ONCE = {
'default_timeout': 3600,
}
}
CELERY_WORKING_DIR = os.environ.get('CELERY_WORKING_DIR', os.path.join(BASE_DIR, 'workers'))
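
Combined with Repository.clone_dir above, this setting decides where checkouts land. A sketch, assuming the /workers directory created in the Dockerfile is what CELERY_WORKING_DIR points to in production (an assumption, not stated in the commit):

import os

working_dir = os.environ.get('CELERY_WORKING_DIR', '/workers')
repo_id = '6e830d43-9a9a-4169-b2d4-3e6a28917bc8'  # hypothetical Repository id
clone_dir = os.path.join(working_dir, repo_id)
# '/workers/6e830d43-9a9a-4169-b2d4-3e6a28917bc8'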
# Email
EMAIL_SUBJECT_PREFIX = '[Arkindex {}] '.format(ARKINDEX_ENV)
@@ -8,6 +8,7 @@ djangorestframework==3.7.1
django-webpack-loader==0.5.0
elasticsearch==6.2.0
et-xmlfile==1.0.1
gitpython==2.1.11
idna==2.6
ijson==2.3
jdcal==1.3
@@ -24,6 +24,7 @@ setup(
'test': tests_requirements,
},
packages=find_packages(),
include_package_data=True,
py_modules=['arkindex', ],
scripts=[
'arkindex/manage.py',