# arkindex/documents/migrations/0025_avoid_doublons.py
# Generated by Django 2.1 on 2018-09-17 08:35

from django.db import migrations, models
import django.db.models.deletion

# Delete duplicated transcriptions: rows sharing the same
# (element_id, zone_id, text) are grouped together and every id of a group
# except the first one returned by array_agg is removed.
# array_agg has no ORDER BY here, so which row of a group is kept is
# left to PostgreSQL.
SQL_REMOVE = '''
delete from documents_transcription
where id in (
    select unnest((array_agg(id))[2:])
    from documents_transcription
    group by (element_id, zone_id, text)
    having count(*) > 1
);
'''

# Create a unique index on (zone_id, element_id, md5(text)).
# Django cannot generate it automatically: the text has to be hashed
# with md5 so that the entry fits in the maximum index size.
# NOTE: md5(NULL) is NULL and PostgreSQL unique indexes treat NULLs as
# distinct, so rows with a NULL text are not constrained by this index.
SQL_UNIQUE = '''
create unique index documents_transcription_uniq_elt_zone_text
on documents_transcription (
    zone_id,
    element_id,
    md5(text)
)
'''

# Reverse of SQL_UNIQUE, used when the migration is unapplied.
SQL_DROP_UNIQUE = '''
drop index if exists documents_transcription_uniq_elt_zone_text
'''


def remove_doublons(apps, schema_editor):
    '''
    Delete the duplicated transcriptions, then create the unique index.

    The statements are run through schema_editor.connection, i.e. on the
    database this migration is being applied to, instead of the global
    default connection.
    '''
    with schema_editor.connection.cursor() as cursor:
        # Duplicates must be gone before the unique index can be built
        cursor.execute(SQL_REMOVE)
        cursor.execute(SQL_UNIQUE)


def drop_unique_index(apps, schema_editor):
    '''
    Reverse operation of remove_doublons: drop the unique index.

    The duplicated rows deleted by the forward migration are not restored.
    '''
    with schema_editor.connection.cursor() as cursor:
        cursor.execute(SQL_DROP_UNIQUE)


class Migration(migrations.Migration):

    dependencies = [
        ('images', '0005_zone_polygon_tmp'),
        ('documents', '0024_page_text'),
    ]

    operations = [
        migrations.AlterField(
            model_name='elementpath',
            name='element',
            field=models.ForeignKey(
                on_delete=django.db.models.deletion.CASCADE,
                related_name='paths',
                to='documents.Element',
            ),
        ),
        # reverse_code makes the migration reversible (index only)
        migrations.RunPython(remove_doublons, reverse_code=drop_unique_index),
    ]
# arkindex/documents/models.py -- excerpt of the Transcription model
class Transcription(models.Model):
    # NOTE: the fields declared before `text` are outside this excerpt.
    text = models.TextField(null=True, blank=True)
    score = models.FloatField(null=True, blank=True)

    class Meta:
        # Uniqueness of (element, zone, text) is not declared here with
        # unique_together: the index is created manually in a migration
        # (documents_transcription_uniq_elt_zone_text) because it has to be
        # built on md5(text), which unique_together cannot express.
        pass

    def __str__(self):
        # text is nullable: fall back to an empty string so that slicing
        # never raises a TypeError on a transcription without text
        return 'Transcription: {}'.format((self.text or '')[:20])