
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

arkindex/backend
Commits on Source (13)
Showing 906 additions and 449 deletions
0.14.3-beta2
0.14.3-rc1
......@@ -694,9 +694,11 @@ class WorkerVersionList(ListCreateAPIView):
}
def get_queryset(self):
return WorkerVersion.objects.filter(
worker_id=self.kwargs['pk']
).prefetch_related('revision').order_by('-revision__created')
return WorkerVersion.objects \
.filter(worker_id=self.kwargs['pk']) \
.select_related('revision__repo', 'worker__repository') \
.prefetch_related('revision__refs', 'revision__versions') \
.order_by('-revision__created')
def create(self, request, *args, **kwargs):
serializer = self.get_serializer(data=request.data)
......
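Note on the queryset change above: the previous shape only prefetched the revision, so serializing each version still issued extra queries for the revision's repository, the worker's repository and the revision's refs. A minimal, hypothetical sketch of how the two shapes compare in query count (the count_queries helper and the worker_id variable are illustrative, not part of this diff):

# Hypothetical sketch, not part of the diff: comparing query counts for the two queryset shapes.
from django.db import connection
from django.test.utils import CaptureQueriesContext

from arkindex.dataimport.models import WorkerVersion


def count_queries(queryset):
    # Evaluate the queryset and touch the relations the serializer needs
    with CaptureQueriesContext(connection) as ctx:
        for version in queryset:
            _ = version.revision.repo               # JOINed by select_related in the new shape
            _ = list(version.revision.refs.all())   # served from the prefetch cache in the new shape
    return len(ctx)


# Old shape: revision is prefetched, but repo and refs trigger one query per row (N+1)
old_queries = count_queries(
    WorkerVersion.objects.filter(worker_id=worker_id).prefetch_related('revision')
)
# New shape: a single JOINed query plus one query per prefetched relation
new_queries = count_queries(
    WorkerVersion.objects
    .filter(worker_id=worker_id)
    .select_related('revision__repo', 'worker__repository')
    .prefetch_related('revision__refs', 'revision__versions')
)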
......@@ -352,13 +352,24 @@ class Revision(IndexableModel):
@property
def state(self):
# Computes revision state according to its versions one
"""
Computes the revision state according to its versions' states:
# If there is one version in error, revision state is too
# Else if there is one version processing, revision state is too
# Else if all versions are available, then the revision is too
# Else, the revision is created since it has either no version or versions mixing processing/created states
states = set(self.versions.values_list('state', flat=True))
If any version is in error, the revision is in error too
Else, if any version is processing, the revision is processing too
Else, if all versions are available, the revision is available too
Else, the revision is created, since it has either no versions or versions mixing processing/created states
"""
# This prevents performing another SQL request when versions have already been prefetched.
# See https://stackoverflow.com/a/19651840/5990435
if (
hasattr(self, "_prefetched_objects_cache")
and self.versions.field.remote_field.get_cache_name()
in self._prefetched_objects_cache
):
states = set(version.state for version in self.versions.all())
else:
states = set(self.versions.values_list('state', flat=True))
if WorkerVersionState.Error in states:
return WorkerVersionState.Error
......
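The hunk above is truncated after the Error branch; following the docstring's priority order, the remaining branches presumably look like the sketch below (only WorkerVersionState.Error appears in the diff, the other member names are assumed from the docstring wording):

# Sketch of the remaining branches of Revision.state, assuming WorkerVersionState
# also has Processing, Available and Created members as suggested by the docstring.
if WorkerVersionState.Error in states:
    return WorkerVersionState.Error
if WorkerVersionState.Processing in states:
    return WorkerVersionState.Processing
if states == {WorkerVersionState.Available}:
    return WorkerVersionState.Available
# No versions at all, or versions mixing Processing/Created states
return WorkerVersionState.Created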
......@@ -46,6 +46,11 @@ class WorkerVersionSerializer(serializers.ModelSerializer):
'worker',
)
read_only_fields = ('docker_image_name',)
# Avoid loading all revisions and all Ponos artifacts when opening this endpoint in a browser
extra_kwargs = {
'revision': {'style': {'base_template': 'input.html'}},
'docker_image': {'style': {'base_template': 'input.html'}},
}
def to_representation(self, instance):
self.fields['revision'] = RevisionWithRefsSerializer(read_only=True)
......
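For context on the extra_kwargs added above: in DRF's browsable API, a writable related field renders as a <select> listing every candidate object, which is what loads all revisions and Ponos artifacts. Overriding the widget through extra_kwargs is shorthand for declaring the fields explicitly with a style, roughly like this sketch (the explicit field declarations, and the Artifact model name, are illustrative rather than the serializer's actual code):

# Rough equivalent of the extra_kwargs override (a sketch, not the real serializer):
# rendering the relations as plain text inputs keeps the browsable API from building
# a <select> that evaluates the whole Revision and Artifact querysets.
class WorkerVersionSerializer(serializers.ModelSerializer):
    revision = serializers.PrimaryKeyRelatedField(
        queryset=Revision.objects.all(),
        style={'base_template': 'input.html'},
    )
    docker_image = serializers.PrimaryKeyRelatedField(
        queryset=Artifact.objects.all(),  # assumed Ponos artifact model
        style={'base_template': 'input.html'},
    )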
......@@ -679,7 +679,7 @@ class TestProcessElements(FixtureAPITestCase):
def test_elements_count(self):
"""
Elements count can be retrieved when no cursor is set
Elements count can be retrieved with the with_count parameter
"""
self.client.force_login(self.superuser)
with self.assertNumQueries(7):
......@@ -697,7 +697,8 @@ class TestProcessElements(FixtureAPITestCase):
second_page = self.client.get(next_url)
data = second_page.json()
self.assertIsNone(data['count'])
# The count should still be present when following the next page URL
self.assertEqual(data['count'], 12)
self.assertIsNone(data['next'])
self.assertEqual(len(data['results']), 6)
......
......@@ -162,12 +162,14 @@ class TestWorkersWorkerVersions(FixtureAPITestCase):
# Tests on get_queryset for WorkerVersionList
def test_versions_list_requires_login(self):
response = self.client.get(reverse('api:worker-versions', kwargs={'pk': str(self.worker_1.id)}))
with self.assertNumQueries(0):
response = self.client.get(reverse('api:worker-versions', kwargs={'pk': str(self.worker_1.id)}))
self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
def test_versions_list(self):
self.client.force_login(self.user)
response = self.client.get(reverse('api:worker-versions', kwargs={'pk': str(self.worker_1.id)}))
with self.assertNumQueries(5):
response = self.client.get(reverse('api:worker-versions', kwargs={'pk': str(self.worker_1.id)}))
self.assertEqual(response.status_code, status.HTTP_200_OK)
data = response.json()
self.assertEqual(len(data), 1)
......@@ -190,7 +192,8 @@ class TestWorkersWorkerVersions(FixtureAPITestCase):
configuration={"test": "test2"}
)
response = self.client.get(reverse('api:worker-versions', kwargs={'pk': str(worker_2.id)}))
with self.assertNumQueries(5):
response = self.client.get(reverse('api:worker-versions', kwargs={'pk': str(worker_2.id)}))
self.assertEqual(response.status_code, status.HTTP_200_OK)
data = response.json()
self.assertEqual(len(data), 1)
......
......@@ -74,9 +74,9 @@ class ElementAdmin(admin.ModelAdmin):
class TranscriptionAdmin(admin.ModelAdmin):
list_display = ('id', 'text', 'score', 'element', )
list_filter = [('type', EnumFieldListFilter), 'source']
fields = ('id', 'text', 'score', 'element', 'zone', 'source', )
fields = ('id', 'text', 'score', 'element', 'source', )
readonly_fields = ('id', )
raw_id_fields = ('element', 'zone', )
raw_id_fields = ('element', )
class MLClassAdmin(admin.ModelAdmin):
......
......@@ -3,7 +3,8 @@ from datetime import datetime, timezone
from psycopg2.extras import execute_values
from django.conf import settings
from django.db import transaction, connection
from django.db.models import Q, Prefetch, Max, QuerySet
from django.db.models import Q, Prefetch, Max, QuerySet, CharField
from django.db.models.functions import Cast
from django.shortcuts import get_object_or_404
from django.utils.functional import cached_property
from rest_framework.exceptions import ValidationError, NotFound
......@@ -790,13 +791,11 @@ class ElementTranscriptions(ListAPIView):
))
self.check_object_permissions(self.request, element)
# ORDER BY casting IDs as char to avoid the PostgreSQL optimizer's inefficient scan
queryset = Transcription.objects \
.prefetch_related('zone__image__server', 'source') \
.extra(
# ORDER BY casting IDs as char to avoid PostgreSQL optimizer inefficient scan
select={'char_id': 'CAST(id AS CHAR(36))'},
order_by=['char_id']
)
.prefetch_related('element__zone__image__server', 'source') \
.annotate(char_id=Cast('id', output_field=CharField())) \
.order_by('char_id')
if self.is_recursive:
queryset = queryset.filter(
......@@ -1020,7 +1019,7 @@ class ElementBulkCreate(CreateAPIView):
# Use WKB representation to compare existing zones
# to avoid comparing references or slower coordinates
polygon.wkb: zone_id
for polygon, zone_id in Zone.objects.filter(image_id=image_id).values_list('polygon', 'id')
for polygon, zone_id in Zone.objects.using('default').filter(image_id=image_id).values_list('polygon', 'id')
}
# Retrieve or create required zones
......@@ -1047,6 +1046,7 @@ class ElementBulkCreate(CreateAPIView):
int,
ElementPath
.objects
.using('default')
.filter(
path__last=self.element.id,
element__type_id__in=set(element_data['type'] for element_data in elements)
......
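The .using('default') calls added here (and the one on the MLClass queryset later in this diff) presumably pin these read queries to the primary database so that rows created earlier in the same bulk request are guaranteed to be visible. A sketch of the kind of read/write-splitting router that makes this necessary (router and alias names are assumptions, not taken from this diff):

# Hypothetical database router: with such routing in place, a replica may lag behind
# the primary, so lookups that must see freshly written rows use .using('default').
class PrimaryReplicaRouter:
    def db_for_read(self, model, **hints):
        return 'replica'    # reads normally go to a replica

    def db_for_write(self, model, **hints):
        return 'default'    # writes always go to the primary

# settings.py (assumed):
# DATABASES = {'default': {...}, 'replica': {...}}
# DATABASE_ROUTERS = ['arkindex.project.routers.PrimaryReplicaRouter']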
......@@ -109,14 +109,14 @@ class EntityElements(ListAPIView):
corpus__in=Corpus.objects.readable(self.request.user),
metadatas__entity_id=pk
) \
.select_related('type') \
.prefetch_related('metadatas__entity', 'metadatas__revision', 'corpus')
.select_related('type', 'corpus') \
.prefetch_related('metadatas__entity', 'metadatas__revision', 'zone__image__server')
transcription_elements = Element.objects \
.filter(
corpus__in=Corpus.objects.readable(self.request.user),
transcriptions__transcription_entities__entity_id=pk
).select_related('type') \
.prefetch_related('metadatas__entity', 'metadatas__revision', 'corpus')
).select_related('type', 'corpus') \
.prefetch_related('metadatas__entity', 'metadatas__revision', 'zone__image__server')
return metadata_elements.union(transcription_elements) \
.order_by('name', 'type')
......
......@@ -278,7 +278,6 @@ class ElementTranscriptionsBulk(CreateAPIView):
transcriptions.append(Transcription(
element=annotation['element'],
type=tr_type,
zone=None,
worker_version=worker_version,
text=annotation['text'],
score=annotation['score']
......@@ -508,7 +507,12 @@ class ClassificationReject(ClassificationModerationActionsMixin):
def put(self, request, *args, **kwargs):
instance = self.get_object()
if instance.source.slug == 'manual':
manual = (
instance.source and instance.source.slug == 'manual'
or not instance.source and not instance.worker_version
)
if manual:
# Delete manual classifications upon rejection
instance.delete()
return Response(None, status=status.HTTP_204_NO_CONTENT)
......
......@@ -75,29 +75,28 @@ class ReindexConsumer(SyncConsumer):
elif element_id or corpus_id:
if element_id:
# Pick this element, and all its children
elements_queryset = list(Element.objects.get_descending(element_id))
elements_queryset.append(Element.objects.get(id=element_id))
elements_queryset = Element.objects.filter(Q(id=element_id) | Q(paths__path__contains=[element_id]))
else:
# Pick all elements in the corpus
elements_queryset = Element.objects.filter(corpus_id=corpus_id)
transcriptions_queryset = Transcription.objects.filter(
element__in=elements_queryset,
zone__isnull=False
)
transcriptions_queryset = Transcription.objects.filter(element__in=elements_queryset)
entities_queryset = Entity.objects.filter(
Q(metadatas__element__in=elements_queryset)
| Q(transcriptions__element__in=elements_queryset)
)
else:
transcriptions_queryset = Transcription.objects.filter(zone__isnull=False)
transcriptions_queryset = Transcription.objects.all()
elements_queryset = Element.objects.all()
entities_queryset = Entity.objects.all()
if transcriptions:
indexer.run_index(transcriptions_queryset, bulk_size=400)
indexer.run_index(transcriptions_queryset.select_related('element'), bulk_size=400)
if elements:
indexer.run_index(elements_queryset, bulk_size=100)
indexer.run_index(
elements_queryset.select_related('type').prefetch_related('metadatas', 'transcriptions'),
bulk_size=100,
)
if entities:
indexer.run_index(entities_queryset, bulk_size=400)
......
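The element_id branch above now expresses "this element and all of its descendants" as a single queryset instead of materialising a list: every descendant stores its chain of ancestors in ElementPath.path, so an array-contains lookup on that field matches the whole subtree. A small illustration (the vol_id variable is hypothetical):

# Illustration of the descendant lookup used above (vol_id is hypothetical).
# A page stored under volume vol_id has an ElementPath whose path array contains vol_id,
# so paths__path__contains=[vol_id] matches every element below the volume,
# while Q(id=vol_id) adds the volume itself.
subtree = Element.objects.filter(
    Q(id=vol_id) | Q(paths__path__contains=[vol_id])
)
# A .distinct() may be needed on top of this if elements can appear under several parents.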
This diff is collapsed.
......@@ -2,7 +2,7 @@
from django.core.management.base import BaseCommand
from arkindex_common.ml_tool import MLToolType
from arkindex_common.enums import TranscriptionType, MetaType
from arkindex.documents.models import Corpus, Element, Transcription, DataSource, MetaData
from arkindex.documents.models import Corpus, Element, DataSource, MetaData
from arkindex.dataimport.models import RepositoryType, WorkerVersion, WorkerVersionState, Workflow
from arkindex.images.models import ImageServer, Image, Zone
from arkindex.users.models import User, CorpusRight
......@@ -150,6 +150,11 @@ class Command(BaseCommand):
# Allows manual transcriptions of type 'line' for text lines
allowed_transcription=TranscriptionType.Line
)
word_type = corpus.types.create(
slug='word',
display_name='Word',
allowed_transcription=TranscriptionType.Word
)
# Create 2 volumes
vol1 = Element.objects.create(
......@@ -217,21 +222,24 @@ class Command(BaseCommand):
# Create transcriptions on images of volume 1
for page in (p1_1, p1_2, p1_3):
for word, pos in [("PARIS", 100), ("ROY", 400), ("DATUM", 700)]:
Transcription.objects.create(
element=page,
element = corpus.elements.create(
type=word_type,
name=word,
zone=makezone(page.zone.image, pos, pos + 100)
)
element.add_parent(page)
element.transcriptions.create(
source=recognizer_source,
text=word,
type=TranscriptionType.Word,
zone=makezone(page.zone.image, pos, pos + 100),
score=1.0,
)
# Create a page transcriptions on page 1 with no zone
# Create a page transcription on page 1
p1_1.transcriptions.create(
source=recognizer_source,
text='Lorem ipsum dolor sit amet',
type=TranscriptionType.Page,
zone=None,
score=1.0,
)
......
......@@ -16,13 +16,15 @@ logger = logging.getLogger(__name__)
def get_transcriptions(corpus=None, folder=None):
if folder:
# Lookup all the transcriptions linked to a folder
return Transcription.objects.filter(
queryset = Transcription.objects.filter(
element__in=Element.objects.get_descending(folder.id)
).distinct()
elif corpus:
return Transcription.objects.filter(element__corpus=corpus)
queryset = Transcription.objects.filter(element__corpus=corpus)
else:
queryset = Transcription.objects.all()
return Transcription.objects.all()
return queryset.select_related('element')
def get_elements(corpus=None, folder=None):
......@@ -33,7 +35,7 @@ def get_elements(corpus=None, folder=None):
else:
queryset = Element.objects.all()
return queryset.prefetch_related('metadatas', 'transcriptions')
return queryset.select_related('type').prefetch_related('metadatas', 'transcriptions')
def get_entities(corpus=None, folder=None):
......
# Generated by Django 3.1 on 2020-09-01 07:48
from django.db import migrations, models
from arkindex_common.enums import TranscriptionType
def preflight_checks(apps, schema_editor):
ElementType = apps.get_model('documents', 'ElementType')
existing_types = []
for ts_type in TranscriptionType:
if ElementType.objects.filter(slug=f'transcription_{ts_type.value}').exists():
existing_types.append(f'`transcription_{ts_type.value}`')
if existing_types:
raise AssertionError(
'This migration could not be run because one or more element types use the reserved slug(s) '
+ ', '.join(existing_types)
)
FORWARD_SQL = [
'CREATE EXTENSION IF NOT EXISTS "uuid-ossp";',
# Early handling for the edge case of transcriptions already on the correct element
"""
UPDATE documents_transcription transcription
SET zone_id = NULL
FROM documents_element element
WHERE transcription.element_id = element.id
AND transcription.zone_id IS NOT NULL
AND transcription.zone_id = element.zone_id;
""",
# Create element types starting with transcription_* as needed
"""
INSERT INTO documents_elementtype (id, corpus_id, slug, display_name, folder, allowed_transcription)
SELECT
uuid_generate_v4(),
element.corpus_id,
'transcription_' || transcription.type,
initcap(transcription.type) || ' Transcription',
FALSE,
transcription.type
FROM documents_transcription transcription
INNER JOIN documents_element element ON (element.id = transcription.element_id)
WHERE transcription.zone_id IS NOT NULL
GROUP BY element.corpus_id, transcription.type;
""",
# Create new elements
"""
INSERT INTO documents_element (id, corpus_id, type_id, name, zone_id, source_id, worker_version_id, created, updated)
SELECT
transcription.id,
element.corpus_id,
type.id,
(ROW_NUMBER() OVER (
PARTITION BY
transcription.element_id,
transcription.source_id,
transcription.worker_version_id,
transcription.type
ORDER BY
ST_Y(ST_StartPoint(polygon)),
ST_X(ST_StartPoint(polygon))
))::varchar,
transcription.zone_id,
transcription.source_id,
transcription.worker_version_id,
NOW(),
NOW()
FROM
documents_transcription transcription
INNER JOIN documents_element element on (transcription.element_id = element.id)
INNER JOIN documents_elementtype type ON (type.corpus_id = element.corpus_id AND type.slug = 'transcription_' || transcription.type)
INNER JOIN images_zone zone ON (transcription.zone_id = zone.id);
""",
# Create element paths
# Append to existing parent paths of the parent element, or create one new element path with the parent element itself in it
"""
INSERT INTO documents_elementpath (id, element_id, path, ordering)
SELECT
uuid_generate_v4(),
transcription.id,
COALESCE(path.path, ARRAY[]::uuid[]) || transcription.element_id,
ROW_NUMBER() OVER (
PARTITION BY transcription.element_id
ORDER BY
ST_Y(ST_StartPoint(polygon)),
ST_X(ST_StartPoint(polygon))
)
FROM
documents_transcription transcription
INNER JOIN images_zone zone ON (zone.id = transcription.zone_id)
LEFT JOIN documents_elementpath path ON (path.element_id = transcription.element_id);
""",
# Move transcriptions to their new elements
"""
UPDATE documents_transcription
SET element_id = id
WHERE zone_id IS NOT NULL;
""",
# At this point, we could drop the zone column, but doing so would fail due to 'pending trigger events':
# Postgres does not allow editing the schema *after* editing the data in the same transaction.
# This migration is continued in documents.0022 to allow a new database transaction to happen.
]
class Migration(migrations.Migration):
dependencies = [
('documents', '0020_remove_source_xor_version_constraint'),
('images', '0005_polygon_index')
]
operations = [
migrations.AddConstraint(
model_name='transcription',
constraint=models.CheckConstraint(
check=~models.Q(source_id__isnull=False, worker_version_id__isnull=False),
name='transcription_source_not_worker_version',
)
),
migrations.RunPython(
preflight_checks,
reverse_code=migrations.RunPython.noop,
elidable=True,
),
migrations.RunSQL(
FORWARD_SQL,
reverse_sql=migrations.RunSQL.noop,
elidable=True,
),
]
# Generated by Django 3.1 on 2020-09-01 07:48
from django.db import migrations
FORWARD_SQL = [
# Use a temporary table here to iterate over the transcriptions just once before deleting,
# so this migration only takes a few minutes.
# Note the unusual join conditions: either source_id or worker_version_id is NULL,
# which makes a NATURAL JOIN or a JOIN … USING fail since comparing NULLs returns NULL.
"""
CREATE TEMPORARY TABLE duplicate_ids AS
WITH filters AS (
SELECT
sub.*,
FIRST_VALUE(id) OVER (
PARTITION BY
transcription.element_id,
transcription.source_id,
transcription.worker_version_id
) AS keep_id
FROM documents_transcription transcription
INNER JOIN (
SELECT element_id, source_id, worker_version_id
FROM documents_transcription
GROUP BY element_id, source_id, worker_version_id
HAVING COUNT(*) > 1
) sub ON (
sub.element_id = transcription.element_id AND (
sub.source_id = transcription.source_id
OR sub.worker_version_id = transcription.worker_version_id
)
)
)
SELECT id
FROM documents_transcription transcription
INNER JOIN filters ON (
filters.element_id = transcription.element_id AND (
filters.source_id = transcription.source_id
OR filters.worker_version_id = transcription.worker_version_id
)
)
WHERE keep_id != id;
""",
# Remove any TranscriptionEntity that could be linked to the duplicate transcriptions
"""
DELETE FROM documents_transcriptionentity transcriptionentity
USING duplicate_ids
WHERE transcriptionentity.transcription_id = duplicate_ids.id;
""",
# Remove duplicate transcriptions
"""
DELETE FROM documents_transcription transcription
USING duplicate_ids
WHERE transcription.id = duplicate_ids.id;
""",
'DROP TABLE duplicate_ids;',
]
class Migration(migrations.Migration):
dependencies = [
('documents', '0021_move_transcriptions'),
]
operations = [
migrations.RemoveField(
model_name='transcription',
name='zone',
),
# Remove the few remaining transcriptions that would break the unique constraints we will add in documents.0023.
# Those are transcriptions from the same source, on the same element, with the exact same zones.
# This query is rather complex as we want to only remove duplicates, and window functions have their limits,
# but the GROUP BY…HAVING will quickly exclude most of the table so it isn't slow.
migrations.RunSQL(
FORWARD_SQL,
reverse_sql=migrations.RunSQL.noop,
elidable=True,
),
]
......@@ -440,12 +440,6 @@ class Transcription(models.Model):
max_length=50,
db_index=True,
)
zone = models.ForeignKey(
'images.Zone',
on_delete=models.PROTECT,
related_name='transcriptions',
null=True,
)
source = models.ForeignKey(
DataSource,
on_delete=models.CASCADE,
......@@ -468,17 +462,18 @@ class Transcription(models.Model):
related_name='transcriptions',
)
class Meta:
# The following index was attempted with md5(text) in a manual migration
# but it causes too many performance issues.
# unique_together = (
# ('element', 'zone', 'text')
# )
pass
def __str__(self):
return 'Transcription: {}'.format(self.text[:20])
class Meta:
constraints = [
# Require either a source, a worker version, or none (manual), but not both at once
models.CheckConstraint(
check=~Q(source_id__isnull=False, worker_version_id__isnull=False),
name='transcription_source_not_worker_version',
)
]
class TranscriptionEntity(models.Model):
"""
......
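The new Meta above replaces the commented-out unique_together with a check constraint; ~Q(source_id__isnull=False, worker_version_id__isnull=False) only forbids the combination where both fields are set, so manual transcriptions (neither field set) remain valid. A sketch of the behaviour (the element, source and worker version objects are hypothetical):

# Behaviour of the transcription_source_not_worker_version constraint (a sketch):
#   source=None, worker_version=None  -> allowed (manual transcription)
#   source set,  worker_version=None  -> allowed
#   source=None, worker_version set   -> allowed
#   source set,  worker_version set   -> rejected by the database
from django.db.utils import IntegrityError

try:
    Transcription.objects.create(
        element=page,                    # hypothetical existing Element
        type=TranscriptionType.Word,
        text='both provenance fields set',
        score=1.0,
        source=recognizer_source,        # hypothetical DataSource
        worker_version=worker_version,   # hypothetical WorkerVersion
    )
except IntegrityError:
    pass  # the check constraint rejects rows having both a source and a worker version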
from arkindex.documents.models import Transcription, Element, Entity
from itertools import chain
from django.db.models import prefetch_related_objects
from arkindex.documents.models import Transcription, Element, Entity
import uuid
......@@ -12,7 +13,7 @@ def search_transcriptions_post(data):
ts = Transcription.objects \
.filter(id__in=transcription_ids) \
.order_by('-score') \
.prefetch_related('zone__image__server', 'element', 'source')
.prefetch_related('element__zone__image__server', 'source')
element_ids = list(ts.values_list('element_id', flat=True))
all_parent_paths = Element.objects.get_ascendings_paths(*element_ids)
for trans in ts:
......@@ -62,7 +63,7 @@ def search_elements_post(data):
transcriptions = {
t.id: t
for t in Transcription.objects.filter(id__in=tr_ids).prefetch_related('zone__image__server', 'source')
for t in Transcription.objects.filter(id__in=tr_ids).prefetch_related('source')
}
elts_tr_ids = {
......@@ -78,11 +79,15 @@ def search_elements_post(data):
for result in data
}
elts = list(Element.objects.filter(id__in=elt_ids).prefetch_related('corpus', 'type'))
elts = list(Element.objects.filter(id__in=elt_ids).prefetch_related('corpus', 'type', 'zone__image__server'))
# Preserve the ordering given by ElasticSearch
ordered_elts = list(filter(None, map(lambda eid: next((e for e in elts if e.id == eid), None), elt_ids)))
all_paths = Element.objects.get_ascendings_paths(*(e.id for e in ordered_elts))
prefetch_related_objects(
[element for paths in all_paths.values() for path in paths for element in path],
'type',
)
for elt in ordered_elts:
elt.transcriptions_results = list(filter(None, [transcriptions.get(tid) for tid in elts_tr_ids[elt.id]]))
......
from abc import ABC, abstractmethod
from django.conf import settings
from django.db.models import Q
from rest_framework import serializers
from arkindex.documents.models import Element, Transcription
from arkindex.project.tools import build_absolute_url, bounding_box
......@@ -77,7 +78,7 @@ class TranscriptionSearchAnnotationSerializer(TranscriptionAnnotationSerializer)
def get_target(self, ts):
assert isinstance(ts, Transcription)
url = build_absolute_url(ts.element, self.context['request'], 'api:iiif-canvas')
x, y, w, h = bounding_box(ts.zone.polygon)
x, y, w, h = bounding_box(ts.element.zone.polygon)
return f'{url}#xywh={x},{y},{w},{h}'
......@@ -103,8 +104,11 @@ class AnnotationListSerializer(serializers.BaseSerializer):
}
def get_elements(self, element):
return element.transcriptions.all()
"Get a list of elements to serialize as annotations."
return Transcription.objects.filter(
Q(element=element)
| Q(element__in=Element.objects.get_descending(element.id))
)
class ElementAnnotationListSerializer(AnnotationListSerializer):
......
......@@ -10,7 +10,6 @@ from arkindex.documents.models import (
Corpus, Element, ElementType, Transcription, DataSource, MLClass, Classification, ClassificationState
)
from arkindex.project.serializer_fields import EnumField, LinearRingField
from arkindex.images.serializers import ZoneSerializer
from arkindex.documents.serializers.light import ElementZoneSerializer
import uuid
......@@ -232,18 +231,16 @@ class TranscriptionSerializer(serializers.ModelSerializer):
Serialises a Transcription
"""
type = EnumField(TranscriptionType, read_only=True)
zone = ZoneSerializer(read_only=True)
source = DataSourceSerializer(read_only=True)
class Meta:
model = Transcription
read_only_fields = ('id', 'type', 'score', 'zone', 'source')
read_only_fields = ('id', 'type', 'score', 'source')
fields = (
'id',
'type',
'text',
'score',
'zone',
'source',
'worker_version_id',
)
......@@ -428,6 +425,7 @@ class ClassificationsSerializer(serializers.Serializer):
ml_classes = dict(
MLClass
.objects
.using('default')
.filter(corpus_id=parent.corpus_id, name__in=ml_class_names)
.values_list('name', 'id')
)
......