diff --git a/arkindex/documents/api/ml.py b/arkindex/documents/api/ml.py index 9980943970ecff969fcd2e7c95bce72312e3854a..132d2604ac57bf6d15fbf7fc979435079c615966 100644 --- a/arkindex/documents/api/ml.py +++ b/arkindex/documents/api/ml.py @@ -11,7 +11,7 @@ from rest_framework.views import APIView from arkindex.documents.models import \ Corpus, Element, DataSource, Classification, ClassificationState, Transcription, Entity, Right, MLClass from arkindex_common.ml_tool import MLToolType -from arkindex_common.enums import TranscriptionType, EntityType +from arkindex_common.enums import EntityType from arkindex.documents.serializers.ml import ( ClassificationsSerializer, ClassificationCreateSerializer, ClassificationSerializer, TranscriptionsSerializer, TranscriptionCreateSerializer, DataSourceStatsSerializer, @@ -40,30 +40,45 @@ class TranscriptionCreate(CreateAPIView): permission_classes = (IsVerified, ) openapi_overrides = { 'operationId': 'CreateTranscription', - 'tags': ['ml'], + 'tags': ['elements'], } + def get_object(self): + if not hasattr(self, 'element'): + self.element = super().get_object() + return self.element + + def get_queryset(self): + if not self.request: + return Element.objects.none() + return Element.objects.filter( + corpus__in=Corpus.objects.writable(self.request.user) + ) + + def get_serializer_context(self): + context = super().get_serializer_context() + if self.request: + context['element'] = self.get_object() + return context + def perform_create(self, serializer): - transcription_type = serializer.validated_data['type'] - if transcription_type not in TranscriptionType: - raise ValidationError("This endpoint can only import transcriptions.") - element = serializer.validated_data['element'] - - ts, created = Transcription.objects.get_or_create( - element=element, - source=serializer.validated_data['source'], - type=transcription_type, - text=serializer.validated_data['text'], + manual_source, _ = DataSource.objects.get_or_create( + type=MLToolType.Recognizer, + slug='manual', defaults={ - 'score': serializer.validated_data['score'] + 'revision': '', + 'internal': False, } ) - if not created or ts.score != serializer.validated_data['score']: - ts.score = serializer.validated_data['score'] - ts.save() + ts = Transcription.objects.create( + element=self.element, + source=manual_source, + type=serializer.validated_data['type'], + text=serializer.validated_data['text'], + ) # Index in ES - reindex_start(element=element, transcriptions=True, elements=True) + reindex_start(element=self.element, transcriptions=True, elements=True) return ts def create(self, request, *args, **kwargs): diff --git a/arkindex/documents/migrations/0015_elementtype_allowed_transcription.py b/arkindex/documents/migrations/0015_elementtype_allowed_transcription.py new file mode 100644 index 0000000000000000000000000000000000000000..afd2a9bb5cc2614f8f4da106eca4e30e00e86094 --- /dev/null +++ b/arkindex/documents/migrations/0015_elementtype_allowed_transcription.py @@ -0,0 +1,25 @@ +# Generated by Django 2.2.9 on 2020-06-03 13:26 + +import arkindex_common.enums +from django.db import migrations +import enumfields.fields + + +class Migration(migrations.Migration): + + dependencies = [ + ('documents', '0014_nullable_transcription_zone'), + ] + + operations = [ + migrations.AddField( + model_name='elementtype', + name='allowed_transcription', + field=enumfields.fields.EnumField( + blank=True, + enum=arkindex_common.enums.TranscriptionType, + max_length=10, + null=True + ), + ), + ] diff --git a/arkindex/documents/models.py b/arkindex/documents/models.py index 0486eb4a37e70f3beffafe9afea2604d8ce1106b..26ca7659d8da55717ab9ddfb5e347ade6f62e039 100644 --- a/arkindex/documents/models.py +++ b/arkindex/documents/models.py @@ -89,6 +89,8 @@ class ElementType(models.Model): display_name = models.CharField(max_length=250) folder = models.BooleanField(default=False) default_view = models.BooleanField(default=False) + # An allowed manual transcription for elements of this type + allowed_transcription = EnumField(TranscriptionType, blank=True, null=True) class Meta: constraints = [ diff --git a/arkindex/documents/serializers/light.py b/arkindex/documents/serializers/light.py index c9d6c55125bf112009e8c3f8819b73040ba9bcee..6d55181d6c6654474a694671db2ad9ca64bda824 100644 --- a/arkindex/documents/serializers/light.py +++ b/arkindex/documents/serializers/light.py @@ -2,7 +2,7 @@ from rest_framework import serializers from rest_framework.exceptions import ValidationError, APIException from django.db.models import Max from arkindex.documents.models import Element, ElementType, Corpus, Entity, MetaData, AllowedMetaData -from arkindex_common.enums import EntityType, MetaType +from arkindex_common.enums import EntityType, MetaType, TranscriptionType from arkindex.documents.dates import DateType from arkindex.documents.serializers.ml import DataSourceSerializer from arkindex.dataimport.serializers.git import RevisionSerializer @@ -44,6 +44,7 @@ class ElementLightSerializer(serializers.ModelSerializer): class ElementTypeSerializer(serializers.ModelSerializer): + allowed_transcription = EnumField(TranscriptionType) class Meta: model = ElementType @@ -52,6 +53,7 @@ class ElementTypeSerializer(serializers.ModelSerializer): 'display_name', 'folder', 'default_view', + 'allowed_transcription', ) diff --git a/arkindex/documents/serializers/ml.py b/arkindex/documents/serializers/ml.py index 277832fcb3c8ab9745274fbb89e25baaa728e58c..04c30af3ef6bdb6dae5ce93702b52bd6d21c5d8f 100644 --- a/arkindex/documents/serializers/ml.py +++ b/arkindex/documents/serializers/ml.py @@ -168,27 +168,32 @@ class TranscriptionSerializer(serializers.ModelSerializer): ) -class TranscriptionCreateSerializer(serializers.Serializer): +class TranscriptionCreateSerializer(serializers.ModelSerializer): """ - Allows for insertion of new transcriptions and zones + Allows the insertion of a manual transcription attached to an element """ - element = serializers.PrimaryKeyRelatedField( - queryset=Element.objects.filter(zone__isnull=False), - ) - source = DataSourceSlugField(tool_type=MLToolType.Recognizer) - polygon = PolygonField(read_only=True) - text = serializers.CharField() - score = serializers.FloatField(min_value=0, max_value=1) type = EnumField(TranscriptionType) + text = serializers.CharField() - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if not self.context.get('request'): # May be None when generating an OpenAPI schema or using from a REPL - return - self.fields['element'].queryset = Element.objects.filter( - zone__isnull=False, - corpus__in=Corpus.objects.writable(self.context['request'].user), - ) + class Meta: + model = Transcription + fields = ('text', 'type') + + def validate(self, data): + data = super().validate(data) + element = self.context.get('element') + # In case of a manual transcription, assert the type is allowed + if not element.zone: + raise ValidationError({'element': ['The element has no zone']}) + allowed_transcription = element.type.allowed_transcription + if not allowed_transcription: + raise ValidationError({'element': ['The element type does not allow creating a manual transcription']}) + + if data['type'] is not allowed_transcription: + raise ValidationError({'type': [ + f"Only transcriptions of type '{allowed_transcription.value}' are allowed for this element" + ]}) + return data class TranscriptionBulkSerializer(serializers.Serializer): diff --git a/arkindex/documents/tests/test_corpus.py b/arkindex/documents/tests/test_corpus.py index e3971cd099d1b9305db5017ed0941cf7b11409ff..a6a030a6667cb20b79df20c37022f340840e3cc6 100644 --- a/arkindex/documents/tests/test_corpus.py +++ b/arkindex/documents/tests/test_corpus.py @@ -44,25 +44,29 @@ class TestCorpus(FixtureAPITestCase): 'slug': 'volume', 'display_name': 'Volume', 'folder': True, - 'default_view': False + 'default_view': False, + 'allowed_transcription': None }, { 'slug': 'page', 'display_name': 'Page', 'folder': False, - 'default_view': False + 'default_view': False, + 'allowed_transcription': None }, { 'slug': 'act', 'display_name': 'Act', 'folder': False, - 'default_view': False + 'default_view': False, + 'allowed_transcription': None }, { 'slug': 'surface', 'display_name': 'Surface', 'folder': False, - 'default_view': False + 'default_view': False, + 'allowed_transcription': None } ]) @@ -95,25 +99,29 @@ class TestCorpus(FixtureAPITestCase): 'slug': 'volume', 'display_name': 'Volume', 'folder': True, - 'default_view': False + 'default_view': False, + 'allowed_transcription': None }, { 'slug': 'page', 'display_name': 'Page', 'folder': False, - 'default_view': False + 'default_view': False, + 'allowed_transcription': None }, { 'slug': 'act', 'display_name': 'Act', 'folder': False, - 'default_view': False + 'default_view': False, + 'allowed_transcription': None }, { 'slug': 'surface', 'display_name': 'Surface', 'folder': False, - 'default_view': False + 'default_view': False, + 'allowed_transcription': None } ]) @@ -156,25 +164,29 @@ class TestCorpus(FixtureAPITestCase): 'slug': 'volume', 'display_name': 'Volume', 'folder': True, - 'default_view': False + 'default_view': False, + 'allowed_transcription': None }, { 'slug': 'page', 'display_name': 'Page', 'folder': False, - 'default_view': False + 'default_view': False, + 'allowed_transcription': None }, { 'slug': 'act', 'display_name': 'Act', 'folder': False, - 'default_view': False + 'default_view': False, + 'allowed_transcription': None }, { 'slug': 'surface', 'display_name': 'Surface', 'folder': False, - 'default_view': False + 'default_view': False, + 'allowed_transcription': None } ]) @@ -272,25 +284,29 @@ class TestCorpus(FixtureAPITestCase): 'slug': 'volume', 'display_name': 'Volume', 'folder': True, - 'default_view': False + 'default_view': False, + 'allowed_transcription': None }, { 'slug': 'page', 'display_name': 'Page', 'folder': False, - 'default_view': False + 'default_view': False, + 'allowed_transcription': None }, { 'slug': 'act', 'display_name': 'Act', 'folder': False, - 'default_view': False + 'default_view': False, + 'allowed_transcription': None }, { 'slug': 'surface', 'display_name': 'Surface', 'folder': False, - 'default_view': False + 'default_view': False, + 'allowed_transcription': None }, ]) self.assertDictEqual(data, { diff --git a/arkindex/documents/tests/test_transcription_create.py b/arkindex/documents/tests/test_transcription_create.py index e6bfc61f9e0281952c2504f792375e07be10664b..2691280a9ef69ee46f615ee2b3203a4f1b5c2fbd 100644 --- a/arkindex/documents/tests/test_transcription_create.py +++ b/arkindex/documents/tests/test_transcription_create.py @@ -6,8 +6,9 @@ from rest_framework import status from arkindex.project.tests import FixtureAPITestCase from arkindex.project.polygon import Polygon from arkindex_common.enums import TranscriptionType -from arkindex.documents.models import Transcription, DataSource -import uuid +from arkindex_common.ml_tool import MLToolType +from arkindex.documents.models import Corpus, Transcription, DataSource +from arkindex.users.models import User class TestTranscriptionCreate(FixtureAPITestCase): @@ -21,32 +22,92 @@ class TestTranscriptionCreate(FixtureAPITestCase): cls.page = cls.corpus.elements.get(name='Volume 1, page 1r') cls.vol = cls.corpus.elements.get(name='Volume 1') cls.src = DataSource.objects.get(slug='test') + cls.page.type.allowed_transcription = TranscriptionType.Word + cls.page.type.save() - def test_require_login(self): - response = self.client.post(reverse('api:transcription-create'), format='json') + def test_create_transcription_require_login(self): + response = self.client.post(reverse('api:transcription-create', kwargs={'pk': self.page.id}), format='json') self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) + self.assertDictEqual(response.json(), { + 'detail': 'Authentication credentials were not provided.' + }) + + def test_create_transcription_write_right(self): + private_corpus = Corpus.objects.create(name='Private') + private_page = private_corpus.elements.create(type=self.page.type) + user = User.objects.create_user('a@bc.de', 'a') + private_corpus.corpus_right.create(user=user) + + self.client.force_login(user) + + response = self.client.post( + reverse('api:transcription-create', kwargs={'pk': private_page.id}), + format='json', + data={ + 'type': 'word', + 'text': 'NEKUDOTAYIM' + } + ) + self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) + self.assertDictEqual(response.json(), { + 'detail': 'You do not have permission to perform this action.' + }) + + @patch('arkindex.project.triggers.get_channel_layer') + def test_create_transcription_no_element(self, get_layer_mock): + from uuid import uuid4 + self.client.force_login(self.user) + response = self.client.post( + reverse('api:transcription-create', kwargs={'pk': uuid4()}), + format='json', + data={ + 'type': 'word', + 'text': 'NEKUDOTAYIM' + } + ) + self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND) + + @patch('arkindex.project.triggers.get_channel_layer') + def test_create_transcription_elt_null_zone(self, get_layer_mock): + null_zone_page = self.corpus.elements.create(type=self.page.type) + + self.client.force_login(self.user) + response = self.client.post( + reverse('api:transcription-create', kwargs={'pk': null_zone_page.id}), + format='json', + data={ + 'type': 'word', + 'text': 'NEKUDOTAYIM' + } + ) + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + self.assertDictEqual(response.json(), { + 'element': ['The element has no zone'] + }) @patch('arkindex.project.triggers.get_channel_layer') def test_create_transcription(self, get_layer_mock): """ - Checks the view creates transcriptions, zones, links, paths and runs ES indexing + Checks the view creates transcriptions and runs ES indexing """ get_layer_mock.return_value.send = AsyncMock() self.client.force_login(self.user) - response = self.client.post(reverse('api:transcription-create'), format='json', data={ - "type": "word", - "element": str(self.page.id), - "source": self.src.slug, - "text": "NEKUDOTAYIM", - "score": 0.83, - }) + with self.assertNumQueries(11): + response = self.client.post( + reverse('api:transcription-create', kwargs={'pk': self.page.id}), + format='json', + data={ + 'type': 'word', + 'text': 'NEKUDOTAYIM' + } + ) self.assertEqual(response.status_code, status.HTTP_201_CREATED) new_ts = Transcription.objects.get(text="NEKUDOTAYIM", type=TranscriptionType.Word) - self.assertEqual(new_ts.zone, None) - self.assertEqual(new_ts.score, 0.83) - self.assertEqual(new_ts.source, self.src) + self.assertIsNone(new_ts.zone) + self.assertIsNone(new_ts.score) + self.assertEqual(new_ts.source.slug, 'manual') self.assertTrue(self.page.transcriptions.filter(pk=new_ts.id).exists()) get_layer_mock().send.assert_called_once_with('reindex', { @@ -68,94 +129,107 @@ class TestTranscriptionCreate(FixtureAPITestCase): get_layer_mock.return_value.send = AsyncMock() self.client.force_login(self.user) - response = self.client.post(reverse('api:transcription-create'), format='json', data={ - "type": "page", - "element": str(self.page.id), - "source": self.src.slug, - "polygon": [(0, 0), (42, 0), (42, 42), (0, 42), (0, 0)], - "text": "SQUARE", - "score": 0.42, - }) + response = self.client.post( + reverse('api:transcription-create', kwargs={'pk': self.page.id}), + format='json', + data={ + 'type': 'word', + 'polygon': [(0, 0), (42, 0), (42, 42), (0, 42), (0, 0)], + 'text': 'SQUARE' + } + ) self.assertEqual(response.status_code, status.HTTP_201_CREATED) - new_ts = Transcription.objects.get(text="SQUARE", type=TranscriptionType.Page) + new_ts = Transcription.objects.get(text='SQUARE', type=TranscriptionType.Word) self.assertEqual(new_ts.zone, None) @patch('arkindex.project.triggers.get_channel_layer') - def test_update_transcription(self, get_layer_mock): + def test_create_duplicated_transcription(self, get_layer_mock): """ - Checks the view updates transcriptions when they already exist + Check the view creates a new transcriptions with a similar manual source """ + manual_source = DataSource.objects.create(type=MLToolType.Recognizer, slug='manual', internal=True) get_layer_mock.return_value.send = AsyncMock() self.client.force_login(self.user) - ts = Transcription.objects.get(zone__image__path='img1', text="PARIS") - self.assertNotEqual(ts.score, 0.99) - response = self.client.post(reverse('api:transcription-create'), format='json', data={ - "type": "word", - "element": str(self.page.id), - "source": ts.source.slug, - "text": ts.text, - "score": 0.99, - }) + ts = self.page.transcriptions.create(text='GLOUBIBOULGA', type=TranscriptionType.Word, source=manual_source) + with self.assertNumQueries(8): + response = self.client.post( + reverse('api:transcription-create', kwargs={'pk': self.page.id}), + format='json', + data={ + 'type': 'word', + 'text': ts.text + } + ) self.assertEqual(response.status_code, status.HTTP_201_CREATED) - self.assertEqual(Transcription.objects.filter(zone__image__path='img1', text=ts.text).count(), 1) + self.assertEqual(Transcription.objects.filter(text=ts.text, source=manual_source).count(), 2) ts.refresh_from_db() - self.assertEqual(ts.score, 0.99) + self.assertNotEqual(ts.score, 0.99) - get_layer_mock().send.assert_called_once_with('reindex', { - 'type': 'reindex.start', - 'element': str(self.page.id), - 'corpus': None, - 'entity': None, - 'transcriptions': True, - 'elements': True, - 'entities': False, - 'drop': False, + @patch('arkindex.project.triggers.get_channel_layer') + def test_create_transcription_wrong_type(self, get_layer_mock): + self.client.force_login(self.user) + response = self.client.post( + reverse('api:transcription-create', kwargs={'pk': self.page.id}), + format='json', + data={ + 'type': 'AAAAA', + 'text': 'NEKUDOTAYIM' + } + ) + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + self.assertDictEqual(response.json(), { + 'type': ['Value is not of type TranscriptionType'] }) + @patch('arkindex.project.triggers.get_channel_layer') @override_settings(ARKINDEX_FEATURES={'search': False}) - def test_invalid_data(self): - """ - Checks the view validates data properly - (score between 0 and 1, non-existent element) - """ + def test_create_transcription_no_search(self, get_layer_mock): self.client.force_login(self.user) - - post_data = { - "type": "word", - "element": str(self.page.id), - "source": self.src.slug, - "text": "NEKUDOTAYIM", - "score": 0.83, - } - - # Assert data is valid - response = self.client.post(reverse('api:transcription-create'), format='json', data=post_data) + response = self.client.post( + reverse('api:transcription-create', kwargs={'pk': self.page.id}), + format='json', + data={ + 'type': 'word', + 'text': 'NEKUDOTAYIM' + } + ) self.assertEqual(response.status_code, status.HTTP_201_CREATED) + self.assertFalse(get_layer_mock().send.called) - # Negative score - data = {**post_data, 'score': -1} - response = self.client.post(reverse('api:transcription-create'), format='json', data=data) - self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) - self.assertDictEqual(response.json(), { - 'score': ['Ensure this value is greater than or equal to 0.'] - }) - - # Score over 100% - data = {**post_data, 'score': 2} - response = self.client.post(reverse('api:transcription-create'), format='json', data=data) + def test_manual_transcription_forbidden_type(self): + """ + Creating a manual transcription with a non allowed type is forbidden + """ + self.client.force_login(self.user) + response = self.client.post( + reverse('api:transcription-create', kwargs={'pk': self.page.id}), + format='json', + data={ + 'type': 'line', + 'text': 'A classy text line' + } + ) self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) self.assertDictEqual(response.json(), { - 'score': ['Ensure this value is less than or equal to 1.'] + 'type': ["Only transcriptions of type 'word' are allowed for this element"] }) - # Missing element - wrong_id = str(uuid.uuid4()) - data = {**post_data, 'element': wrong_id} - response = self.client.post(reverse('api:transcription-create'), format='json', data=data) + def test_manual_transcription_no_allowed_type(self): + self.page.type.allowed_transcription = None + self.page.type.save() + self.client.force_login(self.user) + response = self.client.post( + reverse('api:transcription-create', kwargs={'pk': self.page.id}), + format='json', + data={ + 'type': 'line', + 'text': 'A classy text line' + } + ) self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) self.assertDictEqual(response.json(), { - 'element': [f'Invalid pk "{wrong_id}" - object does not exist.'] + 'element': ['The element type does not allow creating a manual transcription'] }) @patch('arkindex.project.serializer_fields.MLTool.get') @@ -411,7 +485,7 @@ class TestTranscriptionCreate(FixtureAPITestCase): @patch('arkindex.project.serializer_fields.MLTool.get') @patch('arkindex.project.triggers.get_channel_layer') @override_settings(ARKINDEX_FEATURES={'search': False}) - def test_unique_zone(self, get_layer_mock, ml_get_mock): + def test_bulk_unique_zone(self, get_layer_mock, ml_get_mock): """ Checks the bulk view reuses zones when available """ @@ -441,35 +515,3 @@ class TestTranscriptionCreate(FixtureAPITestCase): # A new transcription has been added with the same zone self.assertEqual(self.page.transcriptions.count(), 5) self.assertEqual(Transcription.objects.get(text="PARISH").zone.id, existing_ts.zone.id) - - @patch('arkindex.project.triggers.get_channel_layer') - @override_settings(ARKINDEX_FEATURES={'search': False}) - def test_create_transcription_no_search(self, get_layer_mock): - self.client.force_login(self.user) - response = self.client.post(reverse('api:transcription-create'), format='json', data={ - "type": "word", - "element": str(self.page.id), - "source": self.src.slug, - "polygon": [(0, 0), (100, 0), (100, 100), (0, 100), (0, 0)], - "text": "NEKUDOTAYIM", - "score": 0.83, - }) - self.assertEqual(response.status_code, status.HTTP_201_CREATED) - self.assertFalse(get_layer_mock().send.called) - - @patch('arkindex.project.triggers.get_channel_layer') - @override_settings(ARKINDEX_FEATURES={'search': False}) - def test_update_transcription_no_search(self, get_layer_mock): - self.client.force_login(self.user) - ts = Transcription.objects.get(zone__image__path='img1', text="PARIS") - self.assertNotEqual(ts.score, 0.99) - response = self.client.post(reverse('api:transcription-create'), format='json', data={ - "type": "word", - "element": str(self.page.id), - "source": ts.source.slug, - "polygon": ts.zone.polygon.serialize(), - "text": ts.text, - "score": 0.99, - }) - self.assertEqual(response.status_code, status.HTTP_201_CREATED) - self.assertFalse(get_layer_mock().send.called) diff --git a/arkindex/project/api_v1.py b/arkindex/project/api_v1.py index e36624fac0e2c13f4d14f8aee8c118b663c99674..7b73099fabadfb70b34cdac636f0a314931176b1 100644 --- a/arkindex/project/api_v1.py +++ b/arkindex/project/api_v1.py @@ -51,6 +51,7 @@ api = [ path('element/<uuid:pk>/metadata/', ElementMetadata.as_view(), name='element-metadata'), path('element/<uuid:pk>/entities/', ElementEntities.as_view(), name='element-entities'), path('element/<uuid:pk>/links/', ElementLinks.as_view(), name='element-links'), + path('element/<uuid:pk>/transcription/', TranscriptionCreate.as_view(), name='transcription-create'), path('element/<uuid:pk>/transcriptions/', ElementTranscriptions.as_view(), name='element-transcriptions'), path('element/<uuid:pk>/ml-stats/', ElementMLStats.as_view(), name='element-ml-stats'), path('element/<uuid:child>/parent/<uuid:parent>/', ElementParent.as_view(), name='element-parent'), @@ -102,7 +103,6 @@ api = [ path('entity/search/', EntitySearch.as_view(), name='entity-search'), # Ingest transcriptions - path('transcription/', TranscriptionCreate.as_view(), name='transcription-create'), path('transcription/bulk/', TranscriptionBulk.as_view(), name='transcription-bulk'), path('classification/bulk/', ClassificationBulk.as_view(), name='classification-bulk'),