Skip to content
Snippets Groups Projects
Commit e5af9020 authored by Bastien Abadie's avatar Bastien Abadie
Browse files

Merge branch 'avoid-doublons' into 'master'

Avoid doublons on transcriptions by adding a unique constraint

See merge request !105
parents 777f50bc 08654217
No related branches found
No related tags found
1 merge request!105Avoid doublons on transcriptions by adding a unique constraint
# Generated by Django 2.1 on 2018-09-17 08:35
from django.db import migrations, models
from django.db import connection
import django.db.models.deletion
# Delete the duplicated transcriptions, i.e. the rows sharing the same
# (element_id, zone_id, text) triple.
# The ids of each duplicated group are aggregated in ascending order and
# every id but the first one is deleted, so the row with the smallest id
# is the one that is kept. Without the explicit "order by", array_agg gives
# no ordering guarantee and the surviving row would be picked arbitrarily.
# The array slice and unnest syntax are PostgreSQL specific.
SQL_REMOVE = '''
delete from documents_transcription
where id in (
select unnest((array_agg(id order by id))[2:])
from documents_transcription
group by (element_id, zone_id, text)
having count(*) > 1
);
'''
# Enforce the uniqueness of (zone_id, element_id, text) through a raw
# unique index. Django cannot generate it on its own: the text column has
# to be indexed through its md5 hash so that entries fit within the
# maximum index size.
SQL_UNIQUE = """
create unique index documents_transcription_uniq_elt_zone_text
on documents_transcription (
zone_id,
element_id,
md5(text)
)
"""
def remove_doublons(apps, schema_editor):
    '''
    Remove the duplicated transcriptions, then create the unique index.

    Forward code of a RunPython operation: runs SQL_REMOVE, then SQL_UNIQUE.

    apps is the app registry handed over by RunPython; it is not used here
    since both statements are raw SQL.
    schema_editor is the schema editor of the database being migrated.
    '''
    # Use the connection bound to the schema editor rather than the global
    # default connection, so that the statements run on the database that
    # is actually being migrated (e.g. `migrate --database=<alias>`)
    with schema_editor.connection.cursor() as cursor:
        # Order matters: the unique index cannot be created
        # while duplicated rows are still present
        cursor.execute(SQL_REMOVE)
        cursor.execute(SQL_UNIQUE)
class Migration(migrations.Migration):
    '''
    Alter the ElementPath.element foreign key, then run remove_doublons
    to clean up the transcriptions and create their unique index.
    '''

    dependencies = [
        ('images', '0005_zone_polygon_tmp'),
        ('documents', '0024_page_text'),
    ]

    operations = [
        # ElementPath.element: cascading foreign key towards Element,
        # declared with the "paths" related name
        migrations.AlterField(
            model_name='elementpath',
            name='element',
            field=models.ForeignKey(
                to='documents.Element',
                related_name='paths',
                on_delete=models.CASCADE,
            ),
        ),
        # Raw SQL clean-up followed by the unique index creation
        migrations.RunPython(remove_doublons),
    ]
......@@ -474,6 +474,14 @@ class Transcription(models.Model):
# Both columns are optional: NULL is accepted in database (null=True)
# and empty values pass validation (blank=True)
text = models.TextField(null=True, blank=True)
score = models.FloatField(null=True, blank=True)
class Meta:
# No unique_together on ('element', 'zone', 'text') is declared here,
# on purpose: the matching unique index is created manually in a
# migration, as it needs to support the md5(text) operation
pass
def __str__(self):
    '''
    Short label of the transcription: the first 20 characters of its text.

    The text field is nullable, so a missing text is rendered as an empty
    string instead of raising a TypeError when slicing None.
    '''
    return 'Transcription: {}'.format((self.text or '')[:20])
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment