From ea03418bb5697db205dcb33f13ac0f25239dc0fa Mon Sep 17 00:00:00 2001
From: Bastien Abadie <bastien@nextcairn.com>
Date: Fri, 29 Jun 2018 10:54:09 +0200
Subject: [PATCH] Expose bulk_transcriptions through API

---
 arkindex/documents/api.py         | 44 +++++++++++++++++++++++++++++++++++++++++++++---
 arkindex/documents/models.py      |  6 ------
 arkindex/documents/serializers.py | 31 ++++++++++++++++++++++++++++++-
 arkindex/images/importer.py       | 22 +++++++++++++++-------
 arkindex/project/api_v1.py        |  5 +++--
 5 files changed, 89 insertions(+), 19 deletions(-)

diff --git a/arkindex/documents/api.py b/arkindex/documents/api.py
index 395a8352e5..7c638bf723 100644
--- a/arkindex/documents/api.py
+++ b/arkindex/documents/api.py
@@ -14,7 +14,7 @@ from arkindex.documents.serializers import \
     PageAnnotationListSerializer, PageActAnnotationListSerializer, \
     SurfaceAnnotationListSerializer, TranscriptionSearchAnnotationListSerializer, \
     ActSerializer, ElementLinkSerializer, SurfaceSerializer, \
-    TextCreationSerializer
+    TranscriptionCreateSerializer, TranscriptionsSerializer
 from arkindex.documents.models import \
     Element, ElementType, Page, Act, Transcription, ElementLink, Corpus
 from arkindex.documents.search import \
@@ -22,6 +22,7 @@ from arkindex.documents.search import \
 from arkindex.documents.tasks import refresh_db_cache
 from arkindex.documents.indexer import Indexer
 from arkindex.images.models import Zone
+from arkindex.images.importer import bulk_transcriptions
 from arkindex.project.elastic import ESQuerySet
 
 
@@ -141,8 +142,45 @@ class LinkRetrieveDestroy(RetrieveDestroyAPIView):
         refresh_db_cache.delay()
 
 
-class TextElementCreate(CreateAPIView):
-    serializer_class = TextCreationSerializer
+class TranscriptionBulk(CreateAPIView):
+    """
+    Create transcriptions in bulk, all linked to the same image
+    and parent element
+    """
+    serializer_class = TranscriptionsSerializer
+    permission_classes = (IsAuthenticated, )
+
+    def perform_create(self, serializer):
+        parent = serializer.validated_data['parent']
+        image = serializer.validated_data['image']
+        transcriptions = bulk_transcriptions(
+            image=image,
+            parent=parent,
+            items=[
+                {
+                    'polygon': tr['polygon'],
+                    'text': tr['text'],
+                    'score': tr['score'],
+                    'type': tr['type'],
+                }
+                for tr in serializer.validated_data['transcriptions']
+            ],
+            use_polygons=True,
+        )
+
+        # Reindex all created transcriptions
+        if transcriptions:
+            Indexer().run_index(
+                settings.ES_INDEX_TRANSCRIPTIONS,
+                Transcription.INDEX_TYPE,
+                Transcription.objects.filter(
+                    id__in=[t.id for t in transcriptions]
+                ),
+            )
+
+
+class TranscriptionCreate(CreateAPIView):
+    serializer_class = TranscriptionCreateSerializer
     permission_classes = (IsAuthenticated, )
 
     def perform_create(self, serializer):
diff --git a/arkindex/documents/models.py b/arkindex/documents/models.py
index 2a8722674d..413743c95c 100644
--- a/arkindex/documents/models.py
+++ b/arkindex/documents/models.py
@@ -334,12 +334,6 @@ class Transcription(Element):
     def __str__(self):
         return 'Transcription: {}'.format(self.text[:20])
 
-    def save(self, *args, **kwargs):
-        # TODO: move this in Element through introspection
-        if self.type is None:
-            self.type = ElementType.Word
-        super().save(*args, **kwargs)
-
     def build_search_index(self):
         """
         Structure indexed into Elastic Search
diff --git a/arkindex/documents/serializers.py b/arkindex/documents/serializers.py
index 8a29748dda..e4c3424a1d 100644
--- a/arkindex/documents/serializers.py
+++ b/arkindex/documents/serializers.py
@@ -149,7 +149,7 @@ class CorpusSerializer(serializers.ModelSerializer):
         fields = ('id', 'name')
 
 
-class TextCreationSerializer(serializers.Serializer):
+class TranscriptionCreateSerializer(serializers.Serializer):
     """
     Allows for insertion of new transcriptions and zones
     """
@@ -167,6 +167,35 @@ class TextCreationSerializer(serializers.Serializer):
     type = EnumField(ElementType)
 
 
+class TranscriptionBulkSerializer(serializers.Serializer):
+    """
+    Allows for insertion of new transcriptions and zones
+    in bulk (used by TranscriptionsSerializer below).
+    Note: no element field here!
+    """
+    polygon = serializers.ListField(
+        child=serializers.ListField(
+            child=serializers.IntegerField(min_value=0),
+            min_length=2,
+            max_length=2
+        ),
+        min_length=3
+    )
+    text = serializers.CharField()
+    score = serializers.FloatField(min_value=0, max_value=1)
+    type = EnumField(ElementType)
+
+
+class TranscriptionsSerializer(serializers.Serializer):
+    """
+    Allows for insertion of new transcriptions and zones
+    in bulk (uses TranscriptionBulkSerializer above) on a common parent.
+    """
+    transcriptions = TranscriptionBulkSerializer(many=True)
+    parent = serializers.PrimaryKeyRelatedField(queryset=Element.objects.all())
+    image = serializers.PrimaryKeyRelatedField(queryset=Image.objects.all())
+
+
 class TranscriptionSearchResultSerializer(serializers.ModelSerializer):
     """
     Link between objects & their search indexation
diff --git a/arkindex/images/importer.py b/arkindex/images/importer.py
index 988c444995..dda625b2f3 100644
--- a/arkindex/images/importer.py
+++ b/arkindex/images/importer.py
@@ -60,7 +60,7 @@ def import_indexes(image, page, index_path, extension='jpg'):
     logger.info('Created {} transcriptions '.format(len(new_transcriptions)))
 
 
-def bulk_transcriptions(image, parent, items):
+def bulk_transcriptions(image, parent, items, use_polygons=False):
     """
     Create transcriptions and zones in bulk
     """
@@ -68,16 +68,23 @@ def bulk_transcriptions(image, parent, items):
 
     # Link a transcription data with a bounding box
     # This is hashable (box is hashable)
-    TrBox = namedtuple('TrBox', 'box, line, text, score')
+    TrBox = namedtuple('TrBox', 'type, box, line, text, score')
+
+    # TODO: hash raw polygons
+    def _tuple(data):
+        if isinstance(data, list):
+            return tuple(_tuple(x) for x in data)
+        return data
 
     # Build all TrBox from items
     required = {
         TrBox(
-            BoundingBox(
+            i.get('type', ElementType.Word),
+            _tuple(i['polygon']) if use_polygons else BoundingBox(
                 [[i['x'], i['y']],
                  [i['x'] + i['width'], i['y'] + i['height']]]
             ),
-            int(i['line']),
+            int(i.get('line', 0)),
            i['text'],
             float(i['score']),
         )
@@ -93,7 +100,8 @@ def bulk_transcriptions(image, parent, items):
     # Build all TrBox from existing
     existing = {
         TrBox(
-            BoundingBox(tr.zone.polygon),
+            tr.type,
+            _tuple(tr.zone.polygon) if use_polygons else BoundingBox(tr.zone.polygon),
             tr.line,
             tr.text,
             tr.score,
@@ -112,7 +120,7 @@ def bulk_transcriptions(image, parent, items):
     elements = Element.objects.bulk_create(
         Element(
             corpus=parent.corpus,
-            type=ElementType.Word,
+            type=n.type,
             name=n.text,
             zone_id=uuid.uuid4()
         )
@@ -131,7 +139,7 @@ def bulk_transcriptions(image, parent, items):
             Zone(
                 id=elt.zone_id,
                 image=image,
-                polygon=n.box.to_polygon(),
+                polygon=n.box if use_polygons else n.box.to_polygon(),
             )
         )
         for elt, n in zip(elements, needed)
diff --git a/arkindex/project/api_v1.py b/arkindex/project/api_v1.py
index 4acafaa1b6..addbbd8b4f 100644
--- a/arkindex/project/api_v1.py
+++ b/arkindex/project/api_v1.py
@@ -5,7 +5,7 @@ from arkindex.documents.api import \
     VolumeManifest, ActManifest, \
     PageAnnotationList, PageActAnnotationList, SurfaceAnnotationList, \
     TranscriptionSearch, ActSearch, TranscriptionSearchAnnotationList, \
-    ActEdit, LinkCreate, LinkRetrieveDestroy, TextElementCreate, SurfaceDetails
+    ActEdit, LinkCreate, LinkRetrieveDestroy, TranscriptionCreate, TranscriptionBulk, SurfaceDetails
 
 
 api = [
@@ -66,5 +66,6 @@ api = [
     url(r'^link/(?P<pk>[\w\-]+)/?$', LinkRetrieveDestroy.as_view(), name='link-manage'),
 
     # Ingest transcriptions
-    url(r'^element/?$', TextElementCreate.as_view(), name='element'),
+    url(r'^transcription/?$', TranscriptionCreate.as_view(), name='transcription-create'),
+    url(r'^transcription/bulk/?$', TranscriptionBulk.as_view(), name='transcription-bulk'),
 ]
-- 
GitLab
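
Usage sketch (illustration only, not part of the commit): the new endpoint validates its payload with TranscriptionsSerializer, i.e. a parent element id, an image id, and a list of transcriptions where each item carries a polygon of at least three [x, y] integer points, a text, a score between 0 and 1, and an element type. In the client sketch below, the host, the /api/v1/ prefix, the token authentication scheme and the serialized enum value 'word' are assumptions not defined by this patch, and the UUIDs are placeholders:

    import requests

    # Placeholder ids: substitute a real parent Element UUID and Image UUID.
    payload = {
        'parent': '11111111-1111-1111-1111-111111111111',
        'image': '22222222-2222-2222-2222-222222222222',
        'transcriptions': [
            {
                # At least 3 points, each an [x, y] pair of non-negative
                # integers, as enforced by TranscriptionBulkSerializer
                'polygon': [[0, 0], [120, 0], [120, 40], [0, 40]],
                'text': 'Lorem ipsum',
                'score': 0.92,
                'type': 'word',  # assumed serialized form of ElementType.Word
            },
        ],
    }

    resp = requests.post(
        'https://arkindex.example/api/v1/transcription/bulk/',  # assumed host and prefix
        json=payload,
        headers={'Authorization': 'Token <api-token>'},  # endpoint requires an authenticated user
    )
    resp.raise_for_status()

Server-side, perform_create forwards the validated items to bulk_transcriptions with use_polygons=True. Polygons arrive as lists of lists, which are not hashable, so the _tuple helper turns them into nested tuples; the TrBox namedtuples therefore stay hashable, the set comparison between required and existing transcriptions keeps working, and only the missing ones are created and reindexed into ElasticSearch.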