From 4d48eff8edcb02ed0e11b695a84dff2c8bea49ed Mon Sep 17 00:00:00 2001
From: Bastien Abadie <bastien@nextcairn.com>
Date: Mon, 17 Sep 2018 11:37:10 +0200
Subject: [PATCH] Avoid doublons on transcriptions by adding a unique constraint

---
 .../migrations/0025_avoid_doublons.py         | 82 +++++++++++++++++++
 arkindex/documents/models.py                  |  8 ++
 2 files changed, 90 insertions(+)
 create mode 100644 arkindex/documents/migrations/0025_avoid_doublons.py

diff --git a/arkindex/documents/migrations/0025_avoid_doublons.py b/arkindex/documents/migrations/0025_avoid_doublons.py
new file mode 100644
index 0000000000..fd81bc5add
--- /dev/null
+++ b/arkindex/documents/migrations/0025_avoid_doublons.py
@@ -0,0 +1,82 @@
+# Generated by Django 2.1 on 2018-09-17 08:35
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+# This SQL query deletes all the transcriptions
+# that share the same (element_id, zone_id, text),
+# keeping only the row with the lowest id of each group.
+# The aggregate is ordered explicitly: otherwise the kept row is arbitrary.
+# NOTE(review): GROUP BY puts NULL values in the same group, so rows with
+# a NULL text are de-duplicated too, although the unique index below
+# treats NULLs as distinct - confirm this is intended.
+SQL_REMOVE = '''
+delete from documents_transcription
+where id in (
+    select unnest((array_agg(id order by id))[2:])
+    from documents_transcription
+    group by (element_id, zone_id, text)
+    having count(*) > 1
+);
+'''
+
+# This SQL query creates a unique index
+# on (zone_id, element_id, md5(text)).
+# It cannot be generated by Django through unique_together
+# as it needs an md5 hash of text to fit in the index max size.
+SQL_UNIQUE = '''
+create unique index documents_transcription_uniq_elt_zone_text
+on documents_transcription (
+    zone_id,
+    element_id,
+    md5(text)
+)
+'''
+
+# This SQL query drops the unique index when the migration is reverted.
+# The duplicates deleted by SQL_REMOVE cannot be restored.
+SQL_DROP_UNIQUE = '''
+drop index if exists documents_transcription_uniq_elt_zone_text
+'''
+
+
+def remove_doublons(apps, schema_editor):
+    '''
+    Delete the duplicated transcriptions, then create the unique index.
+
+    The cursor is taken from schema_editor.connection so the queries run
+    on the database being migrated instead of always the default one.
+    '''
+    with schema_editor.connection.cursor() as cursor:
+        cursor.execute(SQL_REMOVE)
+        cursor.execute(SQL_UNIQUE)
+
+
+def drop_unique_index(apps, schema_editor):
+    '''
+    Reverse operation of remove_doublons: drop the unique index.
+    '''
+    with schema_editor.connection.cursor() as cursor:
+        cursor.execute(SQL_DROP_UNIQUE)
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('images', '0005_zone_polygon_tmp'),
+        ('documents', '0024_page_text'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='elementpath',
+            name='element',
+            field=models.ForeignKey(
+                on_delete=django.db.models.deletion.CASCADE,
+                related_name='paths',
+                to='documents.Element',
+            ),
+        ),
+        # The reverse callable keeps this migration reversible.
+        migrations.RunPython(remove_doublons, drop_unique_index),
+    ]
diff --git a/arkindex/documents/models.py b/arkindex/documents/models.py
index 211d9cd44a..23522e15db 100644
--- a/arkindex/documents/models.py
+++ b/arkindex/documents/models.py
@@ -474,6 +474,14 @@ class Transcription(models.Model):
     text = models.TextField(null=True, blank=True)
     score = models.FloatField(null=True, blank=True)
 
+    class Meta:
+        # This index is manually created in a migration to
+        # support the md5(text) operation
+        # unique_together = (
+        #     ('element', 'zone', 'text')
+        # )
+        pass
+
     def __str__(self):
         return 'Transcription: {}'.format(self.text[:20])
 
-- 
GitLab