diff --git a/VERSION b/VERSION index 0bfccb08040473231c42ec1ff3b9e773528a43f5..ef52a648073dd38aebdd7505edb3ba36e8bfd230 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.4.5 +0.4.6 diff --git a/arkindex/documents/api.py b/arkindex/documents/api.py index 395a8352e570bfc8a1d72b7fd35788ad71aa4b03..e8413386dc67b463e31a96dc64904a86c7c1f0f1 100644 --- a/arkindex/documents/api.py +++ b/arkindex/documents/api.py @@ -14,7 +14,7 @@ from arkindex.documents.serializers import \ PageAnnotationListSerializer, PageActAnnotationListSerializer, \ SurfaceAnnotationListSerializer, TranscriptionSearchAnnotationListSerializer, \ ActSerializer, ElementLinkSerializer, SurfaceSerializer, \ - TextCreationSerializer + TranscriptionCreateSerializer, TranscriptionsSerializer from arkindex.documents.models import \ Element, ElementType, Page, Act, Transcription, ElementLink, Corpus from arkindex.documents.search import \ @@ -22,7 +22,9 @@ from arkindex.documents.search import \ from arkindex.documents.tasks import refresh_db_cache from arkindex.documents.indexer import Indexer from arkindex.images.models import Zone +from arkindex.images.importer import bulk_transcriptions from arkindex.project.elastic import ESQuerySet +from arkindex.project.polygon import Polygon class ElementsList(ListAPIView): @@ -141,8 +143,45 @@ class LinkRetrieveDestroy(RetrieveDestroyAPIView): refresh_db_cache.delay() -class TextElementCreate(CreateAPIView): - serializer_class = TextCreationSerializer +class TranscriptionBulk(CreateAPIView): + ''' + Create transcriptions in bulk, all linked to the same image + and parent element + ''' + serializer_class = TranscriptionsSerializer + permission_classes = (IsAuthenticated, ) + + def perform_create(self, serializer): + + parent = serializer.validated_data['parent'] + image = serializer.validated_data['image'] + transcriptions = bulk_transcriptions( + image=image, + parent=parent, + items=[ + { + 'polygon': tr['polygon'], + 'text': tr['text'], + 'score': tr['score'], + 'type': tr['type'], + } + for tr in serializer.validated_data['transcriptions'] + ], + ) + + # Reindex all created + if transcriptions: + Indexer().run_index( + settings.ES_INDEX_TRANSCRIPTIONS, + Transcription.INDEX_TYPE, + Transcription.objects.filter( + id__in=[t.id for t in transcriptions] + ), + ) + + +class TranscriptionCreate(CreateAPIView): + serializer_class = TranscriptionCreateSerializer permission_classes = (IsAuthenticated, ) def perform_create(self, serializer): @@ -155,11 +194,22 @@ class TextElementCreate(CreateAPIView): ElementType.Character): raise APIException(detail="This endpoint can only import transcriptions.") element = serializer.validated_data['element'] + # Create a zone on the page's image - ts_zone, _ = Zone.objects.get_or_create( - image=element.zone.image, - polygon=[tuple(x) for x in serializer.validated_data['polygon']], - ) + polygon = Polygon(serializer.validated_data['polygon']) + try: + ts_zone, _ = Zone.objects.get_or_create( + image=element.zone.image, + polygon=polygon, + ) + except Zone.MultipleObjectsReturned: + # This should not happend, but we cannot set a unique + # on polygon PG fields + ts_zone = Zone.objects.filter( + image=element.zone.image, + polygon=polygon, + ).first() + ts, created = Transcription.objects.get_or_create( corpus=element.corpus, type=serializer.validated_data['type'], @@ -172,7 +222,11 @@ class TextElementCreate(CreateAPIView): if not created or ts.score != serializer.validated_data['score']: ts.score = serializer.validated_data['score'] ts.save() + + # Create element paths ts.add_parent(element) + + # Index in ES Indexer().run_index( settings.ES_INDEX_TRANSCRIPTIONS, Transcription.INDEX_TYPE, diff --git a/arkindex/documents/models.py b/arkindex/documents/models.py index 2a8722674de6000fa2568a7f783e60970f2f2c08..413743c95cbbfd046417abcb97fdbcd0edf20d52 100644 --- a/arkindex/documents/models.py +++ b/arkindex/documents/models.py @@ -334,12 +334,6 @@ class Transcription(Element): def __str__(self): return 'Transcription: {}'.format(self.text[:20]) - def save(self, *args, **kwargs): - # TODO: move this in Element through introspection - if self.type is None: - self.type = ElementType.Word - super().save(*args, **kwargs) - def build_search_index(self): """ Structure indexed into Elastic Search diff --git a/arkindex/documents/serializers.py b/arkindex/documents/serializers.py index 8a29748ddac84526b812580ac4286b3f867da11c..14b4e22b173d32beeaf784a2079fc29fc7a6c168 100644 --- a/arkindex/documents/serializers.py +++ b/arkindex/documents/serializers.py @@ -149,14 +149,14 @@ class CorpusSerializer(serializers.ModelSerializer): fields = ('id', 'name') -class TextCreationSerializer(serializers.Serializer): +class TranscriptionCreateSerializer(serializers.Serializer): """ Allows for insertion of new transcriptions and zones """ element = serializers.PrimaryKeyRelatedField(queryset=Element.objects.all()) polygon = serializers.ListField( child=serializers.ListField( - child=serializers.IntegerField(min_value=0), + child=serializers.IntegerField(), min_length=2, max_length=2 ), @@ -167,6 +167,35 @@ class TextCreationSerializer(serializers.Serializer): type = EnumField(ElementType) +class TranscriptionBulkSerializer(serializers.Serializer): + """ + Allows for insertion of new transcriptions and zones + in Bulk (used by serializer below) + Note: no element ! + """ + polygon = serializers.ListField( + child=serializers.ListField( + child=serializers.IntegerField(), + min_length=2, + max_length=2 + ), + min_length=3 + ) + text = serializers.CharField() + score = serializers.FloatField(min_value=0, max_value=1) + type = EnumField(ElementType) + + +class TranscriptionsSerializer(serializers.Serializer): + """ + Allows for insertion of new transcriptions and zones + in Bulk (uses serializer above) on a common parent + """ + transcriptions = TranscriptionBulkSerializer(many=True) + parent = serializers.PrimaryKeyRelatedField(queryset=Element.objects.all()) + image = serializers.PrimaryKeyRelatedField(queryset=Image.objects.all()) + + class TranscriptionSearchResultSerializer(serializers.ModelSerializer): """ Link between objects & their search indexation @@ -265,8 +294,8 @@ class ElementCanvasManifestSerializer(serializers.BaseSerializer): "@id": element.build_absolute_url(self.context['request'], 'api:canvas-manifest'), "@type": "sc:Canvas", "label": element.name, - "height": zone.box.height, - "width": zone.box.width, + "height": zone.polygon.height, + "width": zone.polygon.width, "images": [ { "@type": "oa:Annotation", @@ -453,7 +482,7 @@ class AnnotationSerializer(ABC, serializers.BaseSerializer): """Get the target canvas (`on` property) for a given element.""" return "{0}#xywh={1.x},{1.y},{1.width},{1.height}".format( element.zone.image.get_thumbnail_url(max_width=None, max_height=None), - element.zone.box) + element.zone.polygon) def to_representation(self, element): assert isinstance(element, Element) @@ -475,7 +504,7 @@ class SearchResultAnnotationMixin(object): zone__polygon__contains=element.zone.polygon, zone__image_id=element.zone.image_id ).first().build_absolute_url(self.context['request'], 'api:canvas-manifest'), - element.zone.box) + element.zone.polygon) class TranscriptionAnnotationSerializer(AnnotationSerializer): diff --git a/arkindex/documents/surface_link.py b/arkindex/documents/surface_link.py index 370842042c4ed16dd5b77e2ec5bafc1e9b10e7a3..6f8b9b289e55e96f5cde8c10d6d8db7b74aa619a 100644 --- a/arkindex/documents/surface_link.py +++ b/arkindex/documents/surface_link.py @@ -149,7 +149,7 @@ class SurfaceLinker(object): surface for surface in surfaces if surface.zone.image_id == page.zone.image_id - ], key=lambda s: s.zone.box.center[1]) + ], key=lambda s: s.zone.polygon.center.y) for page in self.pages } diff --git a/arkindex/documents/tests/test_annotation_list.py b/arkindex/documents/tests/test_annotation_list.py index 3fb53971d294d8189dfd9e9248d9e44a82376676..9457bf7aa80911a0719bebf4702f6a4b34fd6b8e 100644 --- a/arkindex/documents/tests/test_annotation_list.py +++ b/arkindex/documents/tests/test_annotation_list.py @@ -19,8 +19,8 @@ class TestPageAnnotationListSerializer(APITestCase): polygon=[(100, 200), (100, 300), (300, 300), (300, 200), (100, 200)], image=self.img) self.z2 = Zone.objects.create( polygon=[(50, 100), (50, 150), (150, 150), (150, 100), (50, 100)], image=self.img) - self.t1 = Transcription.objects.create(corpus=self.corpus, text="AAA", zone=self.z1) - self.t2 = Transcription.objects.create(corpus=self.corpus, text="BBB", zone=self.z2) + self.t1 = Transcription.objects.create(type=ElementType.Word, corpus=self.corpus, text="AAA", zone=self.z1) + self.t2 = Transcription.objects.create(type=ElementType.Word, corpus=self.corpus, text="BBB", zone=self.z2) ElementLink.objects.create(parent=self.page, child=self.t1, order=0) ElementLink.objects.create(parent=self.page, child=self.t2, order=1) refresh_sync_only_for_unit_tests() diff --git a/arkindex/documents/tests/test_search_post.py b/arkindex/documents/tests/test_search_post.py index 6c9dc214c4f255f7752affaad28166c6b8414744..8db37786c713a3e4f60cecb0afcbc1d824745481 100644 --- a/arkindex/documents/tests/test_search_post.py +++ b/arkindex/documents/tests/test_search_post.py @@ -30,14 +30,42 @@ class TestSearchPostProcess(TestCase): ElementLink.objects.create(parent=self.vol2, child=self.p3, order=0) # Create a bunch of transcriptions - self.t1 = Transcription.objects.create(corpus=self.corpus, text="word", zone=Zone.objects.create( - polygon=[(10, 10), (20, 10), (20, 20), (10, 20), (10, 10)], image=self.img1)) - self.t2 = Transcription.objects.create(corpus=self.corpus, text="word", zone=Zone.objects.create( - polygon=[(110, 110), (120, 110), (120, 120), (110, 120), (110, 110)], image=self.img1)) - self.t3 = Transcription.objects.create(corpus=self.corpus, text="word", zone=Zone.objects.create( - polygon=[(210, 210), (220, 210), (220, 220), (210, 220), (210, 210)], image=self.img2)) - self.t4 = Transcription.objects.create(corpus=self.corpus, text="word", zone=Zone.objects.create( - polygon=[(310, 210), (320, 310), (320, 320), (310, 320), (310, 310)], image=self.img3)) + self.t1 = Transcription.objects.create( + corpus=self.corpus, + text="word", + type=ElementType.Word, + zone=Zone.objects.create( + polygon=[(10, 10), (20, 10), (20, 20), (10, 20), (10, 10)], + image=self.img1, + ) + ) + self.t2 = Transcription.objects.create( + corpus=self.corpus, + text="word", + type=ElementType.Word, + zone=Zone.objects.create( + polygon=[(110, 110), (120, 110), (120, 120), (110, 120), (110, 110)], + image=self.img1, + ) + ) + self.t3 = Transcription.objects.create( + corpus=self.corpus, + text="word", + type=ElementType.Word, + zone=Zone.objects.create( + polygon=[(210, 210), (220, 210), (220, 220), (210, 220), (210, 210)], + image=self.img2, + ) + ) + self.t4 = Transcription.objects.create( + corpus=self.corpus, + text="word", + type=ElementType.Word, + zone=Zone.objects.create( + polygon=[(310, 210), (320, 310), (320, 320), (310, 320), (310, 310)], + image=self.img3, + ) + ) ElementLink.objects.create(parent=self.p1, child=self.t1, order=0) ElementLink.objects.create(parent=self.p1, child=self.t2, order=1) ElementLink.objects.create(parent=self.p2, child=self.t3, order=2) diff --git a/arkindex/documents/tests/test_surface_linker.py b/arkindex/documents/tests/test_surface_linker.py index 5f3b8dddae97917b90645c9cb266b2099406182d..36816d379cdf8a41e1ddaaf6c92f1025b750030a 100644 --- a/arkindex/documents/tests/test_surface_linker.py +++ b/arkindex/documents/tests/test_surface_linker.py @@ -1,5 +1,6 @@ from unittest.mock import patch from django.test import TestCase +from arkindex.project.polygon import Polygon from arkindex.documents.surface_link import parse_folios, ParsedFolio, SurfaceLinker from arkindex.documents.models import Corpus, Element, ElementType, \ ElementLink, Page, PageDirection, PageComplement, PageType @@ -113,17 +114,17 @@ class TestSurfaceLinker(TestCase): self.img3 = Image.objects.create(path='img3', width=418, height=404, server=self.imgsrv) # Create zones for pages - self.z1 = Zone.objects.create(polygon=[(0, 0), (1337, 0), (1337, 42), (0, 42), (0, 0)], image=self.img1) - self.z2 = Zone.objects.create(polygon=[(0, 0), (255, 0), (255, 420), (0, 420), (0, 0)], image=self.img2) - self.z3 = Zone.objects.create(polygon=[(0, 0), (418, 0), (418, 404), (0, 404), (0, 0)], image=self.img3) + self.z1 = Zone.objects.create(polygon=Polygon.from_coords(0, 0, 1337, 42), image=self.img1) + self.z2 = Zone.objects.create(polygon=Polygon.from_coords(0, 0, 255, 420), image=self.img2) + self.z3 = Zone.objects.create(polygon=Polygon.from_coords(0, 0, 418, 404), image=self.img3) # Create zones for 6 surfaces, two per page - self.z1a = Zone.objects.create(polygon=[(0, 0), (1337, 0), (1337, 22), (0, 22), (0, 0)], image=self.img1) - self.z1b = Zone.objects.create(polygon=[(0, 22), (1337, 22), (1337, 200), (0, 200), (0, 22)], image=self.img1) - self.z2a = Zone.objects.create(polygon=[(0, 0), (255, 0), (255, 210), (0, 210), (0, 0)], image=self.img2) - self.z2b = Zone.objects.create(polygon=[(0, 210), (255, 210), (255, 420), (0, 210), (0, 210)], image=self.img2) - self.z3a = Zone.objects.create(polygon=[(0, 0), (418, 0), (418, 202), (0, 202), (0, 0)], image=self.img3) - self.z3b = Zone.objects.create(polygon=[(0, 202), (418, 202), (418, 404), (0, 404), (0, 202)], image=self.img3) + self.z1a = Zone.objects.create(polygon=Polygon.from_coords(0, 0, 1337, 22), image=self.img1) + self.z1b = Zone.objects.create(polygon=Polygon.from_coords(0, 22, 1337, 200), image=self.img1) + self.z2a = Zone.objects.create(polygon=Polygon.from_coords(0, 0, 255, 210), image=self.img2) + self.z2b = Zone.objects.create(polygon=Polygon.from_coords(0, 210, 255, 420), image=self.img2) + self.z3a = Zone.objects.create(polygon=Polygon.from_coords(0, 0, 418, 202), image=self.img3) + self.z3b = Zone.objects.create(polygon=Polygon.from_coords(0, 202, 1337, 42), image=self.img3) # Create a volume and 3 pages self.vol = Element.objects.create(corpus=self.corpus, name="Volume Name", type=ElementType.Volume) @@ -290,7 +291,7 @@ class TestSurfaceLinker(TestCase): Test surface matching for an act with 2 folios on same page Greedy test case """ - self.assertTrue(self.z1a.box.center[1] < self.z1b.box.center[1]) + self.assertTrue(self.z1a.polygon.center.y < self.z1b.polygon.center.y) s = SurfaceLinker(self.vol, [('1', '001v'), ('2', '001v')]) s.find_pages() self.assertEqual(len(s.pages), 3) diff --git a/arkindex/documents/tests/test_text_create.py b/arkindex/documents/tests/test_transcription_create.py similarity index 63% rename from arkindex/documents/tests/test_text_create.py rename to arkindex/documents/tests/test_transcription_create.py index 817c7e51345ca72a3e0e96752a81b710fb9b58e6..862279b16faf50e8604475d5a6e949dc625497db 100644 --- a/arkindex/documents/tests/test_text_create.py +++ b/arkindex/documents/tests/test_transcription_create.py @@ -6,10 +6,11 @@ from arkindex.documents.models import Corpus, Page, Transcription, ElementLink, from arkindex.images.models import ImageServer, Image, Zone from arkindex.users.models import User from arkindex.documents.cache import refresh_sync_only_for_unit_tests +from arkindex.project.polygon import Polygon import uuid -class TestTextElementCreate(APITestCase): +class TestTranscriptionCreate(APITestCase): """ Tests for text element creation view """ @@ -25,13 +26,19 @@ class TestTextElementCreate(APITestCase): self.page = Page.objects.create(corpus=self.corpus, name="page", folio="page", zone=pagezone) self.ts_zone = Zone.objects.create( polygon=[(100, 200), (100, 300), (300, 300), (300, 200), (100, 200)], image=self.img) - self.ts = Transcription.objects.create(corpus=self.corpus, text="PAAMAYIM", score=0.5, zone=self.ts_zone) + self.ts = Transcription.objects.create( + corpus=self.corpus, + text="PAAMAYIM", + score=0.5, + zone=self.ts_zone, + type=ElementType.Word, + ) ElementLink.objects.create(parent=self.page, child=self.ts, order=0) self.user = User.objects.create_user(email='user@user.com', password='P45$w0rD') refresh_sync_only_for_unit_tests() def test_require_login(self): - response = self.client.post(reverse('api:element'), format='json') + response = self.client.post(reverse('api:transcription-create'), format='json') self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) @patch('arkindex.documents.api.Indexer') @@ -40,16 +47,16 @@ class TestTextElementCreate(APITestCase): Checks the view creates transcriptions, zones, links, paths and runs ES indexing """ self.client.force_login(self.user) - response = self.client.post(reverse('api:element'), format='json', data={ + response = self.client.post(reverse('api:transcription-create'), format='json', data={ "type": "word", "element": str(self.page.id), - "polygon": [(0, 0), (0, 100), (100, 100), (100, 0), (0, 0)], + "polygon": [(0, 0), (100, 0), (100, 100), (0, 100), (0, 0)], "text": "NEKUDOTAYIM", "score": 0.83, }) self.assertEqual(response.status_code, status.HTTP_201_CREATED) new_ts = Transcription.objects.get(text="NEKUDOTAYIM", type=ElementType.Word) - self.assertListEqual(new_ts.zone.polygon, [(0, 0), (0, 100), (100, 100), (100, 0), (0, 0)]) + self.assertEqual(new_ts.zone.polygon, Polygon.from_coords(0, 0, 100, 100)) self.assertEqual(new_ts.score, 0.83) self.assertTrue(ElementLink.objects.filter(parent=self.page, child=new_ts).exists()) self.assertEqual(ElementPath.objects.filter(element_id=new_ts.id).count(), 1) @@ -61,7 +68,7 @@ class TestTextElementCreate(APITestCase): Checks the view does not let other element types to be created """ self.client.force_login(self.user) - response = self.client.post(reverse('api:element'), format='json', data={ + response = self.client.post(reverse('api:transcription-create'), format='json', data={ "type": "register", "element": str(self.page.id), "polygon": [(0, 0), (0, 100), (100, 100), (100, 0), (0, 0)], @@ -75,7 +82,7 @@ class TestTextElementCreate(APITestCase): Checks the view reuses zones when available """ self.client.force_login(self.user) - response = self.client.post(reverse('api:element'), format='json', data={ + response = self.client.post(reverse('api:transcription-create'), format='json', data={ "type": "word", "element": str(self.page.id), "polygon": self.ts_zone.polygon, @@ -90,7 +97,7 @@ class TestTextElementCreate(APITestCase): Checks the view updates transcriptions when they already exist """ self.client.force_login(self.user) - response = self.client.post(reverse('api:element'), format='json', data={ + response = self.client.post(reverse('api:transcription-create'), format='json', data={ "type": "word", "element": str(self.page.id), "polygon": self.ts.zone.polygon, @@ -119,27 +126,57 @@ class TestTextElementCreate(APITestCase): # Negative score post_data['score'] = -1 - response = self.client.post(reverse('api:element'), format='json', data=post_data) + response = self.client.post(reverse('api:transcription-create'), format='json', data=post_data) self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) # Score over 100% post_data['score'] = 2 - response = self.client.post(reverse('api:element'), format='json', data=post_data) + response = self.client.post(reverse('api:transcription-create'), format='json', data=post_data) self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) # Missing element post_data['score'] = 0.83 post_data['element'] = str(uuid.uuid4()) - response = self.client.post(reverse('api:element'), format='json', data=post_data) - self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) - - # Negative coordinates - post_data['element'] = str(self.page.id) - post_data['polygon'] = [(0, 0), (0, -100), (-100, -100), (-100, 0), (0, 0)] - response = self.client.post(reverse('api:element'), format='json', data=post_data) + response = self.client.post(reverse('api:transcription-create'), format='json', data=post_data) self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) # Not enough polygon points post_data['polygon'] = [(0, 0), (100, 100)] - response = self.client.post(reverse('api:element'), format='json', data=post_data) + response = self.client.post(reverse('api:transcription-create'), format='json', data=post_data) self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + + @patch('arkindex.documents.api.Indexer') + def test_create_bulk_transcription(self, indexer): + """ + Checks the view creates transcriptions, zones, links, paths and runs ES indexing + Using bulk_transcriptions + """ + self.client.force_login(self.user) + response = self.client.post(reverse('api:transcription-bulk'), format='json', data={ + "parent": str(self.page.id), + "image": str(self.img.id), + "transcriptions": [ + { + "type": "word", + "polygon": [(0, 0), (100, 0), (100, 100), (0, 100), (0, 0)], + "text": "NEKUDOTAYIM", + "score": 0.83, + }, + { + "type": "line", + "polygon": [(0, 0), (200, 0), (200, 200), (0, 200), (0, 0)], + "text": "This is a test", + "score": 0.75, + }, + ] + }) + self.assertEqual(response.status_code, status.HTTP_201_CREATED) + new_ts = Transcription.objects.get(text="NEKUDOTAYIM", type=ElementType.Word) + self.assertEqual(new_ts.zone.polygon, Polygon.from_coords(0, 0, 100, 100)) + self.assertEqual(new_ts.score, 0.83) + self.assertTrue(ElementLink.objects.filter(parent=self.page, child=new_ts).exists()) + self.assertEqual(ElementPath.objects.filter(element_id=new_ts.id).count(), 1) + self.assertListEqual(ElementPath.objects.get(element=new_ts).path, [self.page.id]) + + # indexer called + self.assertTrue(indexer.return_value.run_index.called) diff --git a/arkindex/documents/views.py b/arkindex/documents/views.py index 9d77c0f9d1fa25d50d1c7205c89508c4dee03b02..2f24e12300dc63a980a3913636a9822a751899a4 100644 --- a/arkindex/documents/views.py +++ b/arkindex/documents/views.py @@ -68,7 +68,7 @@ class DumpActs(SingleObjectMixin, View): act.folio, surface.name, page.name, - str(surface.zone.box.center[1]), + str(surface.zone.polygon.center.y), str(act.id), str(surface.id), ]) diff --git a/arkindex/images/importer.py b/arkindex/images/importer.py index a91864e16ad9d55194a927d4ab6c282fd2e846b6..cecc02911308c6d4ba5fd3b2d30c78de1097f2cc 100644 --- a/arkindex/images/importer.py +++ b/arkindex/images/importer.py @@ -1,6 +1,6 @@ from arkindex.images.models import Zone, Image -from arkindex.documents.models import Transcription, ElementLink, ElementType, Element, Page -from arkindex.project.tools import BoundingBox +from arkindex.documents.models import Transcription, ElementLink, ElementType, Element, Page, ElementPath +from arkindex.project.polygon import Polygon from collections import namedtuple from abc import ABC, abstractmethod from django.db import transaction @@ -60,22 +60,22 @@ def import_indexes(image, page, index_path, extension='jpg'): logger.info('Created {} transcriptions '.format(len(new_transcriptions))) -def bulk_transcriptions(image, page, items): +def bulk_transcriptions(image, parent, items): """ Create transcriptions and zones in bulk """ - # Link a transcription data with a bounding box + assert isinstance(parent, Element) + + # Link a transcription data with a Polygon # This is hashable (box is hashable) - TrBox = namedtuple('TrBox', 'box, line, text, score') + TrPolygon = namedtuple('TrPolygon', 'type, polygon, line, text, score') - # Build all TrBox from items + # Build all TrPolygon from items required = { - TrBox( - BoundingBox( - [[i['x'], i['y']], - [i['x'] + i['width'], i['y'] + i['height']]] - ), - int(i['line']), + TrPolygon( + i.get('type', ElementType.Word), + Polygon.from_dict(i), + int(i.get('line', 0)), i['text'], float(i['score']), ) @@ -88,10 +88,11 @@ def bulk_transcriptions(image, page, items): for z in image.zones.all() } - # Build all TrBox from existing + # Build all TrPolygon from existing existing = { - TrBox( - BoundingBox(tr.zone.polygon), + TrPolygon( + tr.type, + tr.zone.polygon, tr.line, tr.text, tr.score, @@ -99,7 +100,7 @@ def bulk_transcriptions(image, page, items): for tr in Transcription.objects.filter(zone__image=image).prefetch_related('zone') } - # Calc needed TrBox to build + # Calc needed TrPolygon to build needed = required.difference(existing) if not needed: return [] @@ -109,8 +110,8 @@ def bulk_transcriptions(image, page, items): # Raw elements elements = Element.objects.bulk_create( Element( - corpus=page.corpus, - type=ElementType.Word, + corpus=parent.corpus, + type=n.type, name=n.text, zone_id=uuid.uuid4() ) @@ -129,7 +130,7 @@ def bulk_transcriptions(image, page, items): Zone( id=elt.zone_id, image=image, - polygon=n.box.to_polygon(), + polygon=n.polygon.serialize(), ) ) for elt, n in zip(elements, needed) @@ -151,13 +152,23 @@ def bulk_transcriptions(image, page, items): ) # Create all links between transcription and page - max_order_dl = ElementLink.objects.filter(parent=page).order_by('-order').first() + max_order_dl = ElementLink.objects.filter(parent=parent).order_by('-order').first() max_order = 0 if max_order_dl is None else max_order_dl.order + 1 ElementLink.objects.bulk_create( - ElementLink(parent=page, child=elt, order=i) + ElementLink(parent=parent, child=elt, order=i) for i, elt in enumerate(elements, max_order) ) + # Support ElementPath + paths = ElementPath.objects.filter(element=parent).values_list('path', flat=True) + if not paths: + paths = [[]] # Wonderful hack to handle no parents case + ElementPath.objects.bulk_create( + ElementPath(element=elt, path=[parent.id, ] + path) + for elt in elements + for path in paths + ) + return transcriptions diff --git a/arkindex/images/models.py b/arkindex/images/models.py index 68ca773d9a49ee8da32e6bade6c665b738dfaffa..08ed223285c10e24e6a7a21049eb2b1913e8bde5 100644 --- a/arkindex/images/models.py +++ b/arkindex/images/models.py @@ -1,7 +1,5 @@ from django.db import models -from django.utils.functional import cached_property from arkindex.project.models import IndexableModel -from arkindex.project.tools import BoundingBox from arkindex.project.polygon import PolygonField from enumfields import EnumField, Enum import requests @@ -139,10 +137,6 @@ class Zone(IndexableModel): polygon = PolygonField() - @cached_property - def box(self): - return BoundingBox(self.polygon) - def __str__(self): return 'Zone {}'.format(self.polygon) @@ -154,5 +148,5 @@ class Zone(IndexableModel): return urllib.parse.urljoin( self.image.url + '/', '{},{},{},{}/full/0/default.jpg'.format( - self.box.x, self.box.y, self.box.width, self.box.height), + self.polygon.x, self.polygon.y, self.polygon.width, self.polygon.height), ) diff --git a/arkindex/images/serializers.py b/arkindex/images/serializers.py index f5f4b3e7a800c631bf206544ef9e6970e85b819c..78b07b5029f64a4b6fbc116554dd810ed4c03bfb 100644 --- a/arkindex/images/serializers.py +++ b/arkindex/images/serializers.py @@ -3,10 +3,10 @@ from arkindex.images.models import Image, Zone class ZoneSerializer(serializers.ModelSerializer): - x = serializers.IntegerField(source='box.x') - y = serializers.IntegerField(source='box.y') - width = serializers.IntegerField(source='box.width') - height = serializers.IntegerField(source='box.height') + x = serializers.IntegerField(source='polygon.x') + y = serializers.IntegerField(source='polygon.y') + width = serializers.IntegerField(source='polygon.width') + height = serializers.IntegerField(source='polygon.height') class Meta: model = Zone diff --git a/arkindex/images/tests.py b/arkindex/images/tests.py index 645d88ad59880b4a1815d0b0c4c9f2dadf347abc..3d66dc9ce8e57017f74f277928d8117f80b4e8fa 100644 --- a/arkindex/images/tests.py +++ b/arkindex/images/tests.py @@ -1,5 +1,6 @@ from django.test import TestCase -from arkindex.documents.models import Corpus, Page, Transcription +from arkindex.documents.models import Corpus, Page, Transcription, Element, ElementType +from arkindex.project.polygon import Polygon from arkindex.images.models import ImageServer, Image, Zone from arkindex.images.importer import bulk_transcriptions @@ -54,8 +55,15 @@ class TestBulkTranscriptions(TestCase): self.assertIsNotNone(out[0].zone) self.assertIsNotNone(out[1].zone) - self.assertEqual(out[0].zone.polygon, [(0, 0), (0, 100), (100, 100), (100, 0)]) - self.assertEqual(out[1].zone.polygon, [(20, 20), (20, 120), (120, 120), (120, 20)]) + self.assertEqual(out[0].zone.polygon, Polygon.from_coords(0, 0, 100, 100)) + self.assertEqual(out[1].zone.polygon, Polygon.from_coords(20, 20, 100, 100)) + + # Check path + children = Element.objects.get_descending(self.page.id) + self.assertEqual(children.count(), 2) + ids = children.values_list('id', flat=True) + self.assertIn(out[0].id, ids) + self.assertIn(out[1].id, ids) def test_bulk_transcriptions_unique(self): """Check bulk_transcriptions does not import the same transcriptions twice""" @@ -79,7 +87,53 @@ class TestBulkTranscriptions(TestCase): 'score': 0.2, }, ] - bulk_transcriptions(self.img, self.page, items) out = bulk_transcriptions(self.img, self.page, items) + self.assertEqual(len(out), 2) + out = bulk_transcriptions(self.img, self.page, items) + self.assertEqual(len(out), 0) + def test_bulk_transcriptions_polygons(self): + items = [ + { + 'polygon': [[0, 0], [0, 200], [200, 200], [200, 0], [250, 50], [0, 0]], + 'line': '1', + 'text': 'test 1', + 'score': 0.1, + 'type': ElementType.Word, + }, + { + 'polygon': [[0, 0], [100, 0], [100, 100], [0, 100], [0, 0]], + 'line': '2', + 'text': 'test 2', + 'score': 0.2, + 'type': ElementType.Line, + }, + ] + out = bulk_transcriptions(self.img, self.page, items) + self.assertEqual(len(out), 2) + + trs = Transcription.objects.all().order_by('line') + + self.assertEqual(trs.count(), 2) + self.assertIsInstance(trs[0], Transcription) + self.assertIsInstance(trs[1], Transcription) + + self.assertEqual(trs[0].type, ElementType.Word) + self.assertEqual(trs[0].line, 1) + self.assertEqual(trs[0].text, 'test 1') + self.assertEqual(trs[0].score, 0.1) + + self.assertEqual(trs[1].type, ElementType.Line) + self.assertEqual(trs[1].line, 2) + self.assertEqual(trs[1].text, 'test 2') + self.assertEqual(trs[1].score, 0.2) + + self.assertIsNotNone(trs[0].zone) + self.assertIsNotNone(trs[1].zone) + + self.assertEqual(trs[0].zone.polygon, Polygon([[0, 0], [0, 200], [200, 200], [200, 0], [250, 50], [0, 0]])) + self.assertEqual(trs[1].zone.polygon, Polygon.from_coords(0, 0, 100, 100)) + + # should not recreate + out = bulk_transcriptions(self.img, self.page, items) self.assertEqual(len(out), 0) diff --git a/arkindex/project/api_v1.py b/arkindex/project/api_v1.py index 4acafaa1b68a5d528ce5acf6f1a6924e0bdf8d63..addbbd8b4fdaf1484a6379a1eb92541cec2dd252 100644 --- a/arkindex/project/api_v1.py +++ b/arkindex/project/api_v1.py @@ -5,7 +5,7 @@ from arkindex.documents.api import \ VolumeManifest, ActManifest, \ PageAnnotationList, PageActAnnotationList, SurfaceAnnotationList, \ TranscriptionSearch, ActSearch, TranscriptionSearchAnnotationList, \ - ActEdit, LinkCreate, LinkRetrieveDestroy, TextElementCreate, SurfaceDetails + ActEdit, LinkCreate, LinkRetrieveDestroy, TranscriptionCreate, TranscriptionBulk, SurfaceDetails api = [ @@ -66,5 +66,6 @@ api = [ url(r'^link/(?P<pk>[\w\-]+)/?$', LinkRetrieveDestroy.as_view(), name='link-manage'), # Ingest transcriptions - url(r'^element/?$', TextElementCreate.as_view(), name='element'), + url(r'^transcription/?$', TranscriptionCreate.as_view(), name='transcription-create'), + url(r'^transcription/bulk/?$', TranscriptionBulk.as_view(), name='transcription-bulk'), ] diff --git a/arkindex/project/polygon.py b/arkindex/project/polygon.py index 6566d7c6640c2f88a1852c3203370ded87fb9226..d19fa8b575dd574e84c7c3277405e34f99286d1a 100644 --- a/arkindex/project/polygon.py +++ b/arkindex/project/polygon.py @@ -1,23 +1,144 @@ from django.db import models +from django.utils.functional import cached_property import collections import re REGEX_SPLIT = re.compile(r'\((\d+),\s?(\d+)\)') -def string_to_polygon(values): - - return [ - (int(x), int(y)) - for x, y in REGEX_SPLIT.findall(values) - ] +class Point(object): + """ + Describe a point in the space. + """ + def __init__(self, x, y): + self.x = int(x) + self.y = int(y) + + def __hash__(self): + return hash((self.x, self.y)) + + def __repr__(self): + return 'Point({0.x},{0.y})'.format(self) + + def __str__(self): + return '({0.x},{0.y})'.format(self) + + def __eq__(self, other): + return (isinstance(other, self.__class__) and + self.x == other.x and + self.y == other.y) + + def __ne__(self, other): + return not self.__eq__(other) + + def __lt__(self, other): + return (isinstance(other, self.__class__) and + self.x <= other.x and + self.y <= other.y) + + +class Polygon(object): + ''' + A hashable Polygon in-memory + ''' + def __init__(self, points): + assert len(points) > 0 + if not all(isinstance(point, Point) for point in points): + points = [Point(*p) for p in points] + + # Clone & Close the polygon + self.points = list(points) + if self.points[0] != self.points[-1]: + self.points.append(points[0]) + + @staticmethod + def from_dict(data): + ''' + Helper to build an instance from dict data: + * from a direct polygon as list of tuples + * from x,y + width,height + ''' + assert isinstance(data, dict) + + # Direct usage + if 'polygon' in data and isinstance(data['polygon'], list): + return Polygon(data['polygon']) + + # Build from coords + for k in ('x', 'y', 'width', 'height'): + assert k in data, 'Missing key {}'.format(k) + assert data[k] is not None, 'Value of {} is None'.format(k) + assert isinstance(data[k], int) or isinstance(data[k], float), \ + 'Value of {} must be int or float'.format(k) + + return Polygon([ + (data['x'], data['y']), + (data['x'] + data['width'], data['y']), + (data['x'] + data['width'], data['y'] + data['height']), + (data['x'], data['y'] + data['height']), + (data['x'], data['y']), + ]) + + @staticmethod + def from_str(data): + assert isinstance(data, str) + return Polygon([ + (float(x), float(y)) + for x, y in REGEX_SPLIT.findall(data) + ]) + + @staticmethod + def from_coords(x, y, width, height): + return Polygon.from_dict({'x': x, 'y': y, 'width': width, 'height': height}) + + def __repr__(self): + return "Polygon({})".format(', '.join(map(str, self.points))) + + def __eq__(self, other): + if len(self.points) != len(other.points): + return False + return all( + x == y + for x, y in zip(self.points, other.points) + ) + + def __hash__(self): + return hash(tuple(self.points)) + + def serialize(self): + ''' + Used to save data in DB + ''' + return [(p.x, p.y) for p in self.points] + + @cached_property + def x(self): + return min([point.x for point in self.points]) + + @cached_property + def y(self): + return min([point.y for point in self.points]) + + @cached_property + def width(self): + return max([point.x for point in self.points]) - self.x + + @cached_property + def height(self): + return max([point.y for point in self.points]) - self.y + + @property + def center(self): + return Point( + self.x + self.width / 2.0, + self.y + self.height / 2.0, + ) class PolygonField(models.Field): """ Field to store a polygon; needs at least three set of points """ - def db_type(self, connection): return 'polygon' @@ -26,18 +147,18 @@ class PolygonField(models.Field): return None if isinstance(values, str): - values = string_to_polygon(values) + polygon = Polygon.from_str(values) if not isinstance(values, collections.Iterable): raise TypeError("Value {} is not iterable".format(values)) - return values + return polygon def from_db_value(self, value, expression, connection): if value is None: return - return string_to_polygon(value) + return Polygon.from_str(value) def get_prep_value(self, values): if values is None: @@ -45,12 +166,15 @@ class PolygonField(models.Field): if isinstance(values, list) or isinstance(values, tuple): if len(values) < 3: raise ValueError("Needs at minimum 3 points") + polygon = Polygon(values) elif isinstance(values, str): - values = string_to_polygon(values) + polygon = Polygon.from_str(values) + elif isinstance(values, Polygon): + polygon = values else: raise Exception('Not supported polygon input') - return ', '.join(str(v) for v in values) if values else None + return ', '.join(map(str, polygon.points)) if values else None def get_prep_lookup(self, lookup_type, value): raise NotImplementedError(self) diff --git a/arkindex/project/tests.py b/arkindex/project/tests.py index 5f1cdbca9adf77f8f8e065005e8c41a3ea57ed4e..615fff89873ce9e5404d5490cd1eae4acb6c7b38 100644 --- a/arkindex/project/tests.py +++ b/arkindex/project/tests.py @@ -1,8 +1,12 @@ from django.test import TestCase +from arkindex.project.polygon import Polygon from arkindex.images.models import ImageServer, Image, Zone class TestPolygonField(TestCase): + ''' + Test the DB field & usage + ''' def setUp(self): imgsrv = ImageServer.objects.create(name="Test Server", url="http://server") @@ -21,3 +25,58 @@ class TestPolygonField(TestCase): self.assertSequenceEqual(Zone.objects.filter(polygon__contains=self.z2.polygon), [self.z1, self.z2]) self.assertSequenceEqual(Zone.objects.filter(polygon__contains=self.z3.polygon), [self.z1, self.z3]) self.assertSequenceEqual(Zone.objects.filter(polygon__contains=self.z1.polygon), [self.z1]) + + +class TestPolygon(TestCase): + ''' + Test the low level class + ''' + + def test_base(self): + a = Polygon([ + [0, 0], + [100, 0], + [100, 100], + [0, 100], + ]) + self.assertEqual(str(a), 'Polygon((0,0), (100,0), (100,100), (0,100), (0,0))') + self.assertEqual(a, Polygon.from_coords(0, 0, 100, 100)) + self.assertEqual(a.x, 0) + self.assertEqual(a.y, 0) + self.assertEqual(a.width, 100) + self.assertEqual(a.height, 100) + self.assertEqual(a.center.x, 50) + self.assertEqual(a.center.y, 50) + self.assertEqual(len(a.points), 5) + + def test_dict(self): + a = Polygon.from_dict({ + 'dummy': 'test', + 'polygon': [[0, 0], [0, 1], [1, 1], [1, 0]], + }) + self.assertEqual(a.x, 0) + self.assertEqual(a.y, 0) + self.assertEqual(a.width, 1) + self.assertEqual(a.height, 1) + self.assertEqual(len(a.points), 5) + + a = Polygon.from_dict({ + 'dummy': 'anything', + 'x': 0, + 'y': 0, + 'width': 1, + 'height': 1, + }) + self.assertEqual(a.x, 0) + self.assertEqual(a.y, 0) + self.assertEqual(a.width, 1) + self.assertEqual(a.height, 1) + self.assertEqual(len(a.points), 5) + + def test_str(self): + a = Polygon.from_str('((0, 0), (0,10),(20, 10),(20,0))') + self.assertEqual(len(a.points), 5) + self.assertEqual(a.x, 0) + self.assertEqual(a.y, 0) + self.assertEqual(a.width, 20) + self.assertEqual(a.height, 10) diff --git a/arkindex/project/tools.py b/arkindex/project/tools.py index bc5a53df1ddfbf30acffa64e264d454652643c24..0e8f327a0832c62c9b879bf40b4b8c0df9c16115 100644 --- a/arkindex/project/tools.py +++ b/arkindex/project/tools.py @@ -2,100 +2,6 @@ from urllib.parse import urlsplit, SplitResult from django.conf import settings import random import string -import re - - -class Point(object): - """ - Describe a point in the space. - """ - - _FLOAT_RE = r'[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?' - POINT_RE = r'\(?(?P<x>{0}),\s*(?P<y>{0})\)?'.format(_FLOAT_RE) - - @staticmethod - def from_string(value): - """ - Convert a string describing a point into a `Point` instance. - The representation of a point as a string: - (x, y) - where `x` and `y` can be signed or unsigned integers or floats - """ - match = re.match(Point.POINT_RE, value) - - if not match: - raise ValueError("Value {} is not a valid point".format(value)) - - values = match.groupdict() - - return Point(float(values['x']), float(values['y'])) - - def __init__(self, x=0, y=0): - self.x = x - self.y = y - - def __repr__(self): - return '<Point({0.x},{0.y})>'.format(self) - - def __str__(self): - return '({0.x},{0.y})'.format(self) - - def __eq__(self, other): - return (isinstance(other, self.__class__) and - self.x == other.x and - self.y == other.y) - - def __ne__(self, other): - return not self.__eq__(other) - - def __lt__(self, other): - return (isinstance(other, self.__class__) and - self.x <= other.x and - self.y <= other.y) - - -class BoundingBox(object): - """ - A bounding box for a list of Point instances or a list of coordinates - """ - def __init__(self, points): - assert len(points) > 0 - if not all(isinstance(point, Point) for point in points): - points = [Point(c[0], c[1]) for c in points] - - all_x = [point.x for point in points] - all_y = [point.y for point in points] - - self.x, self.y = min(all_x), min(all_y) - self.width, self.height = max(all_x) - self.x, max(all_y) - self.y - - def __repr__(self): - return "BoundingBox({}, {}, {}, {})".format( - self.x, self.y, self.width, self.height) - - def __eq__(self, other): - return self.x == other.x \ - and self.y == other.y \ - and self.width == other.width \ - and self.height == other.height - - def __hash__(self): - return hash((self.x, self.y, self.width, self.height)) - - @property - def center(self): - return ( - self.x + self.width / 2, - self.y + self.height / 2, - ) - - def to_polygon(self): - return [ - (self.x, self.y), - (self.x, self.y + self.height), - (self.x + self.width, self.y + self.height), - (self.x + self.width, self.y), - ] def sslify_url(url):