Skip to content
Snippets Groups Projects
Commit 10d4ad94 authored by ml bonhomme's avatar ml bonhomme :bee: Committed by Erwan Rouchet
Browse files

support worker run ids on transcription entities

parent 74487e31
No related branches found
No related tags found
1 merge request: !1700 "support worker run ids on transcription entities"
......@@ -232,10 +232,6 @@ class TranscriptionEntityCreate(CreateAPIView):
def perform_create(self, serializer):
data = serializer.validated_data
if TranscriptionEntity.objects.filter(**data).exists():
raise serializers.ValidationError({
'__all__': ['Entity is already linked to this transcription at this position']
})
try:
TranscriptionEntity(**data).clean()
except ValidationError as e:
......
# Generated by Django 4.0.2 on 2022-05-17 14:46
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add ``worker_run`` to transcription entities and rework their uniqueness rules.

    The model's ``unique_together`` is cleared, a nullable ``worker_run``
    foreign key is added, and four constraints are created: one check
    constraint tying worker runs to worker versions, and three conditional
    unique constraints (manual, worker version only, worker run).
    """

    dependencies = [
        ('dataimport', '0048_workerrun_model_version'),
        ('documents', '0055_entity_worker_run_and_more'),
    ]

    operations = [
        # Clear unique_together; the conditional unique constraints added below take over.
        migrations.AlterUniqueTogether(
            name='transcriptionentity',
            unique_together=set(),
        ),
        migrations.AddField(
            model_name='transcriptionentity',
            name='worker_run',
            field=models.ForeignKey(
                to='dataimport.workerrun',
                related_name='transcription_entities',
                on_delete=django.db.models.deletion.DO_NOTHING,
                null=True,
                blank=True,
            ),
        ),
        # Either the worker version is set, or the worker run is null.
        migrations.AddConstraint(
            model_name='transcriptionentity',
            constraint=models.CheckConstraint(
                name='transcription_entity_worker_run_requires_worker_version',
                check=(
                    models.Q(worker_version_id__isnull=False)
                    | models.Q(worker_run_id__isnull=True)
                ),
            ),
        ),
        # Uniqueness among rows that have no worker version.
        migrations.AddConstraint(
            model_name='transcriptionentity',
            constraint=models.UniqueConstraint(
                name='transcription_entity_unique_manual',
                fields=('transcription', 'entity', 'offset', 'length'),
                condition=models.Q(worker_version_id__isnull=True),
            ),
        ),
        # Uniqueness per worker version among rows with a worker version but no worker run.
        migrations.AddConstraint(
            model_name='transcriptionentity',
            constraint=models.UniqueConstraint(
                name='transcription_entity_unique_worker_version',
                fields=('transcription', 'entity', 'offset', 'length', 'worker_version'),
                condition=models.Q(
                    worker_run_id__isnull=True,
                    worker_version_id__isnull=False,
                ),
            ),
        ),
        # Uniqueness per worker run among rows that have a worker run.
        migrations.AddConstraint(
            model_name='transcriptionentity',
            constraint=models.UniqueConstraint(
                name='transcription_entity_unique_worker_run',
                fields=('transcription', 'entity', 'offset', 'length', 'worker_run'),
                condition=models.Q(worker_run_id__isnull=False),
            ),
        ),
    ]
......@@ -652,12 +652,53 @@ class TranscriptionEntity(models.Model):
blank=True,
)
worker_run = models.ForeignKey(
    to='dataimport.WorkerRun',
    # Why DO_NOTHING:
    # Users can easily delete WorkerRuns on processes that have not run yet, so that they can
    # customize the worker runs before starting the process.
    # Every worker result is related to a WorkerRun, so with models.CASCADE a deletion could
    # trigger a dozen useless SQL queries looking for related classifications and other worker
    # results, and could fill up the RAM.
    # models.RESTRICT or models.PROTECT would also make Django look for related classifications
    # in order to raise a RestrictedError or ProtectedError, both of which end up as a HTTP 500.
    # models.DO_NOTHING leaves the checks to PostgreSQL, which always performs them anyway;
    # it fails with an IntegrityError, which causes a HTTP 500 as well.
    on_delete=models.DO_NOTHING,
    related_name='transcription_entities',
    blank=True,
    null=True,
)
confidence = models.FloatField(validators=[MinValueValidator(0), MaxValueValidator(1)], null=True, blank=True)
class Meta:
# Model options holding the uniqueness and integrity rules of transcription entities.
# NOTE(review): the accompanying migration resets `unique_together` to an empty set and adds
# the constraints below instead; this tuple looks like the removed side of the diff.
# Confirm it is not meant to coexist with `constraints`.
unique_together = (
('transcription', 'entity', 'offset', 'length', 'worker_version'),
)
constraints = [
# A worker run ID may only be set when a worker version ID is set as well,
# but a worker version ID may exist without a worker run ID (backwards compatibility).
# In other words, either the worker run ID is null, or the worker version ID is not null.
models.CheckConstraint(
check=Q(worker_version_id__isnull=False) | Q(worker_run_id__isnull=True),
name='transcription_entity_worker_run_requires_worker_version',
),
# Enforce uniqueness for both manual and non-manual transcription entities on a transcription.
# Separate conditional constraints are required because NULL values are not compared
# when checking uniqueness.
# Manual entities: rows without a worker version.
models.UniqueConstraint(
fields=['transcription', 'entity', 'offset', 'length'],
name='transcription_entity_unique_manual',
condition=Q(worker_version_id__isnull=True),
),
# Keep the old unique constraint on worker versions for transcription entities
# without worker runs, for backwards compatibility.
models.UniqueConstraint(
fields=['transcription', 'entity', 'offset', 'length', 'worker_version'],
name='transcription_entity_unique_worker_version',
condition=Q(worker_version_id__isnull=False, worker_run_id__isnull=True),
),
# Entities with a worker run: uniqueness is scoped to that worker run.
models.UniqueConstraint(
fields=['transcription', 'entity', 'offset', 'length', 'worker_run'],
name='transcription_entity_unique_worker_run',
condition=Q(worker_run_id__isnull=False),
),
]
def clean(self):
if self.transcription.element.corpus != self.entity.corpus:
......
......@@ -227,11 +227,21 @@ class TranscriptionEntitySerializer(serializers.ModelSerializer):
default=None,
style={'base_template': 'input.html'},
)
# Optional, nullable primary-key reference to a WorkerRun. It is exposed as
# `worker_run_id` and reads from / writes to the `worker_run` attribute.
worker_run_id = serializers.PrimaryKeyRelatedField(
    source='worker_run',
    queryset=WorkerRun.objects.all(),
    allow_null=True,
    required=False,
    style={'base_template': 'input.html'},
)
confidence = serializers.FloatField(min_value=0, max_value=1, default=None)
class Meta:
# Serializer options: target model, exposed fields and serializer-level validators.
model = TranscriptionEntity
# NOTE(review): `fields` is assigned twice in a row; the second assignment, which adds
# `worker_run_id`, is the one that takes effect. The first looks like the removed side
# of the diff — confirm that only one should remain.
fields = ('entity', 'offset', 'length', 'worker_version_id', 'confidence')
fields = ('entity', 'offset', 'length', 'worker_version_id', 'worker_run_id', 'confidence')
# NOTE(review): presumably rejects payloads that set both `worker_version_id` and
# `worker_run_id` — confirm against the WorkerRunOrVersionValidator implementation.
validators = [
WorkerRunOrVersionValidator(),
]
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
......@@ -259,6 +269,21 @@ class TranscriptionEntitySerializer(serializers.ModelSerializer):
'An internal user is required to link an entity to a transcription with a worker version.'
]
worker_run = data.get('worker_run')
if worker_run is not None:
data['worker_version'] = WorkerVersion(id=worker_run.version_id)
existing_transcription_entities = TranscriptionEntity.objects.filter(transcription=data['transcription'], entity=data['entity'], offset=data['offset'], length=data['length'])
if worker_run and data['worker_version']:
if existing_transcription_entities.filter(worker_run=worker_run).exists():
errors['__all__'] = ['This entity is already linked to this transcription by this worker run at this position.']
elif data['worker_version']:
if existing_transcription_entities.filter(worker_version=data['worker_version']).exists():
errors['__all__'] = ['This entity is already linked to this transcription by this worker version at this position.']
else:
if existing_transcription_entities.exists():
errors['__all__'] = ['This entity is already linked to this transcription at this position.']
if errors:
raise serializers.ValidationError(errors)
......
......@@ -92,6 +92,12 @@ class TestEntitiesAPI(FixtureAPITestCase):
'length': len(self.entity.name),
'worker_version_id': str(self.worker_version_1.id)
}
self.tr_entities_run_sample = {
'entity': str(self.entity.id),
'offset': 4,
'length': len(self.entity.name),
'worker_run_id': str(self.worker_run.id)
}
def make_create_entity_request(self, name='entity', ent_type=EntityType.Person.value, **options):
request = {
......@@ -430,11 +436,12 @@ class TestEntitiesAPI(FixtureAPITestCase):
def test_create_transcription_entity(self):
self.client.force_login(self.user)
response = self.client.post(
reverse('api:transcription-entity-create', kwargs={'pk': str(self.transcription.id)}),
data=self.tr_entities_sample,
format='json'
)
with self.assertNumQueries(11):
response = self.client.post(
reverse('api:transcription-entity-create', kwargs={'pk': str(self.transcription.id)}),
data=self.tr_entities_sample,
format='json'
)
self.assertEqual(response.status_code, status.HTTP_201_CREATED)
transcription_entity = TranscriptionEntity.objects.get(
transcription=self.transcription,
......@@ -450,6 +457,7 @@ class TestEntitiesAPI(FixtureAPITestCase):
'offset': transcription_entity.offset,
'length': transcription_entity.length,
'worker_version_id': None,
'worker_run_id': None,
'confidence': None
}
)
......@@ -477,6 +485,7 @@ class TestEntitiesAPI(FixtureAPITestCase):
'offset': transcription_entity.offset,
'length': transcription_entity.length,
'worker_version_id': None,
'worker_run_id': None,
'confidence': transcription_entity.confidence
}
)
......@@ -491,11 +500,12 @@ class TestEntitiesAPI(FixtureAPITestCase):
def test_create_transcription_entity_worker_version(self):
self.client.force_login(self.internal_user)
response = self.client.post(
reverse('api:transcription-entity-create', kwargs={'pk': str(self.transcription.id)}),
data=self.tr_entities_version_sample,
format='json'
)
with self.assertNumQueries(10):
response = self.client.post(
reverse('api:transcription-entity-create', kwargs={'pk': str(self.transcription.id)}),
data=self.tr_entities_version_sample,
format='json'
)
self.assertEqual(response.status_code, status.HTTP_201_CREATED)
transcription_entity = TranscriptionEntity.objects.get(
transcription=self.transcription,
......@@ -517,6 +527,48 @@ class TestEntitiesAPI(FixtureAPITestCase):
"worker_version_id": ['An internal user is required to link an entity to a transcription with a worker version.']
})
def test_create_transcription_entity_worker_run(self):
    """
    Posting only a worker run ID creates the link; the response carries the
    worker run ID and the worker run's version as the worker version ID.
    """
    self.client.force_login(self.user)
    sample = self.tr_entities_run_sample
    url = reverse('api:transcription-entity-create', kwargs={'pk': str(self.transcription.id)})

    with self.assertNumQueries(12):
        response = self.client.post(url, data=sample, format='json')

    self.assertEqual(response.status_code, status.HTTP_201_CREATED)
    expected = {
        **{field: sample[field] for field in ('entity', 'offset', 'length')},
        'worker_version_id': str(self.worker_run.version_id),
        'worker_run_id': str(self.worker_run.id),
        'confidence': None,
    }
    self.assertDictEqual(response.json(), expected)
def test_create_transcription_entity_worker_run_or_version(self):
    """
    A payload holding both a worker run ID and a worker version ID is
    rejected with a HTTP 400 and a non-field error.
    """
    self.client.force_login(self.user)
    body = {
        **self.tr_entities_run_sample,
        'worker_version_id': str(self.worker_version_1.id),
    }
    url = reverse('api:transcription-entity-create', kwargs={'pk': str(self.transcription.id)})

    response = self.client.post(url, data=body, format='json')

    self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
    expected = {"non_field_errors": ["Only one of `worker_version_id` and `worker_run_id` may be set."]}
    self.assertDictEqual(response.json(), expected)
def test_create_transcription_entity_bad_worker_run(self):
    """
    A worker run ID that matches no existing worker run is rejected with a
    HTTP 400 reported on the `worker_run_id` field.
    """
    self.client.force_login(self.user)
    body = {
        **self.tr_entities_run_sample,
        'worker_run_id': "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
    }
    url = reverse('api:transcription-entity-create', kwargs={'pk': str(self.transcription.id)})

    response = self.client.post(url, data=body, format='json')

    self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
    expected = {"worker_run_id": ['Invalid pk "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa" - object does not exist.']}
    self.assertDictEqual(response.json(), expected)
def test_create_transcription_entity_wrong_acl(self):
self.client.force_login(self.user)
self.element.corpus = self.private_corpus
......@@ -605,7 +657,7 @@ class TestEntitiesAPI(FixtureAPITestCase):
{'entity': [f'Invalid pk "{ent.id}" - object does not exist.']}
)
def test_create_transcription_entity_exists(self):
def test_create_transcription_entity_duplicate(self):
self.client.force_login(self.user)
TranscriptionEntity.objects.create(
transcription=self.transcription,
......@@ -613,12 +665,57 @@ class TestEntitiesAPI(FixtureAPITestCase):
offset=4,
length=len(self.entity.name)
)
response = self.client.post(
reverse('api:transcription-entity-create', kwargs={'pk': str(self.transcription.id)}),
data=self.tr_entities_sample,
format='json'
with self.assertNumQueries(7):
response = self.client.post(
reverse('api:transcription-entity-create', kwargs={'pk': str(self.transcription.id)}),
data=self.tr_entities_sample,
format='json'
)
self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
self.assertDictEqual(response.json(), {
'__all__': ['This entity is already linked to this transcription at this position.']
})
def test_create_transcription_entity_duplicate_worker_version(self):
    """
    Re-posting an entity link that already exists for the same worker
    version at the same position yields a HTTP 400.
    """
    self.client.force_login(self.internal_user)
    existing_link = {
        'transcription': self.transcription,
        'entity': self.entity,
        'offset': 4,
        'length': len(self.entity.name),
        'worker_version': self.worker_version_1,
    }
    TranscriptionEntity.objects.create(**existing_link)
    url = reverse('api:transcription-entity-create', kwargs={'pk': str(self.transcription.id)})

    with self.assertNumQueries(6):
        response = self.client.post(url, data=self.tr_entities_version_sample, format='json')

    self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
    expected = {
        '__all__': ['This entity is already linked to this transcription by this worker version at this position.']
    }
    self.assertDictEqual(response.json(), expected)
def test_create_transcription_entity_duplicate_worker_run(self):
    """
    Re-posting an entity link that already exists for the same worker run
    at the same position yields a HTTP 400.
    """
    self.client.force_login(self.internal_user)
    existing_link = {
        'transcription': self.transcription,
        'entity': self.entity,
        'offset': 4,
        'length': len(self.entity.name),
        'worker_run': self.worker_run,
        'worker_version': self.worker_version_1,
    }
    TranscriptionEntity.objects.create(**existing_link)
    url = reverse('api:transcription-entity-create', kwargs={'pk': str(self.transcription.id)})

    with self.assertNumQueries(6):
        response = self.client.post(url, data=self.tr_entities_run_sample, format='json')

    self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
    expected = {
        '__all__': ['This entity is already linked to this transcription by this worker run at this position.']
    }
    self.assertDictEqual(response.json(), expected)
def test_create_transcription_entity_key_missing(self):
self.client.force_login(self.user)
......@@ -671,6 +768,7 @@ class TestEntitiesAPI(FixtureAPITestCase):
'length': self.transcriptionentity.length,
'offset': self.transcriptionentity.offset,
'worker_version_id': None,
'worker_run_id': None,
'confidence': None
}]
)
......@@ -722,6 +820,7 @@ class TestEntitiesAPI(FixtureAPITestCase):
'length': 8,
'offset': 8,
'worker_version_id': None,
'worker_run_id': None,
'confidence': None
}]
)
......@@ -866,6 +965,7 @@ class TestEntitiesAPI(FixtureAPITestCase):
'offset': t.offset,
'length': t.length,
'worker_version_id': None,
'worker_run_id': None,
'confidence': None
} for t in TranscriptionEntity.objects.filter(transcription=self.transcription).order_by('offset')],
'metadata': [{
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment