From efd96ba4f91bc2faa79a2916bb630992256c50e8 Mon Sep 17 00:00:00 2001 From: Erwan Rouchet <rouchet@teklia.com> Date: Tue, 25 Apr 2023 11:56:35 +0000 Subject: [PATCH] Export WorkerRuns --- arkindex/documents/export/__init__.py | 1 + arkindex/documents/export/classification.sql | 3 +- arkindex/documents/export/element.sql | 1 + arkindex/documents/export/entity.sql | 3 +- arkindex/documents/export/indexes.sql | 8 ++ arkindex/documents/export/metadata.sql | 3 +- arkindex/documents/export/structure.sql | 45 ++++++++-- arkindex/documents/export/transcription.sql | 3 +- .../documents/export/transcription_entity.sql | 1 + arkindex/documents/export/worker_run.sql | 46 ++++++++++ .../management/commands/load_export.py | 53 +++++++++++- .../tests/commands/test_load_export.py | 31 +++++-- arkindex/documents/tests/tasks/test_export.py | 85 +++++++++++++++++-- 13 files changed, 255 insertions(+), 28 deletions(-) create mode 100644 arkindex/documents/export/worker_run.sql diff --git a/arkindex/documents/export/__init__.py b/arkindex/documents/export/__init__.py index 89b8cbe2f9..779852419a 100644 --- a/arkindex/documents/export/__init__.py +++ b/arkindex/documents/export/__init__.py @@ -30,6 +30,7 @@ EXPORT_QUERIES = [ 'image_server', 'image', 'worker_version', + 'worker_run', 'element', 'element_path', 'transcription', diff --git a/arkindex/documents/export/classification.sql b/arkindex/documents/export/classification.sql index cb03484685..9cb1acb1cf 100644 --- a/arkindex/documents/export/classification.sql +++ b/arkindex/documents/export/classification.sql @@ -7,7 +7,8 @@ SELECT classification.confidence, -- SQLite has no boolean type, so high_confidence becomes an integer (0 or 1) classification.high_confidence::integer, - classification.worker_version_id + classification.worker_version_id, + classification.worker_run_id FROM documents_classification classification INNER JOIN documents_element element ON (element.id = classification.element_id) INNER JOIN documents_mlclass mlclass ON (mlclass.id = classification.ml_class_id) diff --git a/arkindex/documents/export/element.sql b/arkindex/documents/export/element.sql index c5ffe91505..3c6c1a7d1f 100644 --- a/arkindex/documents/export/element.sql +++ b/arkindex/documents/export/element.sql @@ -11,6 +11,7 @@ SELECT element.rotation_angle, element.mirrored::integer, element.worker_version_id, + element.worker_run_id, element.confidence FROM documents_element element INNER JOIN documents_elementtype type ON (element.type_id = type.id) diff --git a/arkindex/documents/export/entity.sql b/arkindex/documents/export/entity.sql index 3515b63f25..8949691cf6 100644 --- a/arkindex/documents/export/entity.sql +++ b/arkindex/documents/export/entity.sql @@ -5,7 +5,8 @@ SELECT entity.validated::integer, moderator.email, hstore_to_json(entity.metas), - entity.worker_version_id + entity.worker_version_id, + entity.worker_run_id FROM documents_entity entity LEFT JOIN users_user moderator ON (moderator.id = entity.moderator_id) WHERE entity.corpus_id = '{corpus_id}'::uuid diff --git a/arkindex/documents/export/indexes.sql b/arkindex/documents/export/indexes.sql index 84eab47a63..d0b28e3aa3 100644 --- a/arkindex/documents/export/indexes.sql +++ b/arkindex/documents/export/indexes.sql @@ -1,23 +1,30 @@ CREATE INDEX image_server_id ON image (server_id); +CREATE INDEX worker_run_worker_version_id ON worker_run (worker_version_id); + CREATE INDEX element_image_id ON element (image_id); CREATE INDEX element_worker_version_id ON element (worker_version_id); +CREATE INDEX 
element_worker_run_id ON element (worker_run_id); CREATE INDEX element_path_parent_id ON element_path (parent_id); CREATE INDEX element_path_child_id ON element_path (child_id); CREATE INDEX transcription_element_id ON transcription (element_id); CREATE INDEX transcription_worker_version_id ON transcription (worker_version_id); +CREATE INDEX transcription_worker_run_id ON transcription (worker_run_id); CREATE INDEX classification_element_id ON classification (element_id); CREATE INDEX classification_worker_version_id ON classification (worker_version_id); +CREATE INDEX classification_worker_run_id ON classification (worker_run_id); CREATE INDEX entity_worker_version_id ON entity (worker_version_id); +CREATE INDEX entity_worker_run_id ON entity (worker_run_id); CREATE INDEX entity_type_id ON entity (type_id); CREATE INDEX transcription_entity_transcription_id ON transcription_entity (transcription_id); CREATE INDEX transcription_entity_entity_id ON transcription_entity (entity_id); CREATE INDEX transcription_entity_worker_version_id ON transcription_entity (worker_version_id); +CREATE INDEX transcription_entity_worker_run_id ON transcription_entity (worker_run_id); CREATE INDEX entity_link_parent_id ON entity_link (parent_id); CREATE INDEX entity_link_child_id ON entity_link (child_id); @@ -29,3 +36,4 @@ CREATE INDEX entity_role_child_type_id ON entity_role (child_type_id); CREATE INDEX metadata_element_id ON metadata (element_id); CREATE INDEX metadata_entity_id ON metadata (entity_id); CREATE INDEX metadata_worker_version_id ON metadata (worker_version_id); +CREATE INDEX metadata_worker_run_id ON metadata (worker_run_id); diff --git a/arkindex/documents/export/metadata.sql b/arkindex/documents/export/metadata.sql index c4a9fa7021..714b73b208 100644 --- a/arkindex/documents/export/metadata.sql +++ b/arkindex/documents/export/metadata.sql @@ -5,7 +5,8 @@ SELECT metadata.type, metadata.value, metadata.entity_id, - metadata.worker_version_id + metadata.worker_version_id, + metadata.worker_run_id FROM documents_metadata metadata INNER JOIN documents_element element ON (element.id = metadata.element_id) WHERE element.corpus_id = '{corpus_id}'::uuid diff --git a/arkindex/documents/export/structure.sql b/arkindex/documents/export/structure.sql index b82d67c24a..f6d75f580f 100644 --- a/arkindex/documents/export/structure.sql +++ b/arkindex/documents/export/structure.sql @@ -1,6 +1,6 @@ PRAGMA foreign_keys = ON; -CREATE TABLE export_version AS SELECT 5 AS version; +CREATE TABLE export_version AS SELECT 6 AS version; CREATE TABLE image_server ( id VARCHAR(37) NOT NULL, @@ -35,6 +35,20 @@ CREATE TABLE worker_version ( PRIMARY KEY (id) ); +CREATE TABLE worker_run ( + id VARCHAR(37) NOT NULL, + worker_version_id VARCHAR(37) NOT NULL, + model_version_id VARCHAR(37), + model_id VARCHAR(37), + model_name VARCHAR(100), + configuration_id VARCHAR(37), + configuration TEXT, + PRIMARY KEY (id), + FOREIGN KEY (worker_version_id) REFERENCES worker_version (id) ON DELETE CASCADE, + CHECK ((model_version_id IS NULL) = (model_id IS NULL) AND (model_id IS NULL) = (model_name IS NULL)), + CHECK ((configuration_id IS NULL) = (configuration IS NULL)) +); + CREATE TABLE element ( id VARCHAR(37) NOT NULL, created REAL NOT NULL, @@ -46,14 +60,17 @@ CREATE TABLE element ( rotation_angle INTEGER NOT NULL DEFAULT 0, mirrored INTEGER NOT NULL DEFAULT 0, worker_version_id VARCHAR(37), + worker_run_id VARCHAR(37), confidence REAL, PRIMARY KEY (id), FOREIGN KEY (image_id) REFERENCES image (id) ON DELETE CASCADE, FOREIGN KEY 
(worker_version_id) REFERENCES worker_version (id) ON DELETE CASCADE, + FOREIGN KEY (worker_run_id) REFERENCES worker_run (id) ON DELETE CASCADE, CHECK ((image_id IS NULL AND polygon IS NULL) OR (image_id IS NOT NULL AND polygon IS NOT NULL)), CHECK (rotation_angle >= 0 AND rotation_angle <= 359), CHECK (mirrored = 0 OR mirrored = 1), - CHECK (confidence IS NULL OR (confidence >= 0 AND confidence <= 1)) + CHECK (confidence IS NULL OR (confidence >= 0 AND confidence <= 1)), + CHECK (worker_run_id IS NULL OR worker_version_id IS NOT NULL) ); CREATE TABLE element_path ( @@ -75,9 +92,11 @@ CREATE TABLE transcription ( confidence REAL, orientation TEXT NOT NULL DEFAULT 'horizontal-lr', worker_version_id VARCHAR(37), + worker_run_id VARCHAR(37), PRIMARY KEY (id), FOREIGN KEY (element_id) REFERENCES element (id) ON DELETE CASCADE, FOREIGN KEY (worker_version_id) REFERENCES worker_version (id) ON DELETE CASCADE, + FOREIGN KEY (worker_run_id) REFERENCES worker_run (id) ON DELETE CASCADE, CHECK (confidence IS NULL OR (confidence >= 0 AND confidence <= 1)), CHECK (orientation IN ('horizontal-lr', 'horizontal-rl', 'vertical-lr', 'vertical-rl')) ); @@ -91,11 +110,14 @@ CREATE TABLE classification ( confidence REAL, high_confidence INTEGER NOT NULL DEFAULT 0, worker_version_id VARCHAR(37), + worker_run_id VARCHAR(37), PRIMARY KEY (id), FOREIGN KEY (element_id) REFERENCES element (id) ON DELETE CASCADE, FOREIGN KEY (worker_version_id) REFERENCES worker_version (id) ON DELETE CASCADE, + FOREIGN KEY (worker_run_id) REFERENCES worker_run (id) ON DELETE CASCADE, CHECK (confidence IS NULL OR (confidence >= 0 AND confidence <= 1)), - CHECK (high_confidence = 0 OR high_confidence = 1) + CHECK (high_confidence = 0 OR high_confidence = 1), + CHECK (worker_run_id IS NULL OR worker_version_id IS NOT NULL) ); CREATE TABLE entity ( @@ -106,10 +128,13 @@ CREATE TABLE entity ( moderator VARCHAR(255), metas TEXT, worker_version_id VARCHAR(37), + worker_run_id VARCHAR(37), PRIMARY KEY (id), FOREIGN KEY (worker_version_id) REFERENCES worker_version (id) ON DELETE CASCADE, + FOREIGN KEY (worker_run_id) REFERENCES worker_run (id) ON DELETE CASCADE, FOREIGN KEY (type_id) REFERENCES entity_type (id) ON DELETE CASCADE, - CHECK (validated = 0 OR validated = 1) + CHECK (validated = 0 OR validated = 1), + CHECK (worker_run_id IS NULL OR worker_version_id IS NOT NULL) ); CREATE TABLE entity_type ( @@ -127,14 +152,17 @@ CREATE TABLE transcription_entity ( offset INTEGER NOT NULL, length INTEGER NOT NULL, worker_version_id VARCHAR(37), + worker_run_id VARCHAR(37), confidence REAL, PRIMARY KEY (id), FOREIGN KEY (transcription_id) REFERENCES transcription (id) ON DELETE CASCADE, FOREIGN KEY (entity_id) REFERENCES entity (id) ON DELETE CASCADE, FOREIGN KEY (worker_version_id) REFERENCES worker_version (id) ON DELETE CASCADE, - UNIQUE (transcription_id, entity_id, offset, length, worker_version_id), + FOREIGN KEY (worker_run_id) REFERENCES worker_run (id) ON DELETE CASCADE, + UNIQUE (transcription_id, entity_id, offset, length, worker_version_id, worker_run_id), CHECK (offset >= 0 AND length >= 0), - CHECK (confidence IS NULL OR (confidence >= 0 AND confidence <= 1)) + CHECK (confidence IS NULL OR (confidence >= 0 AND confidence <= 1)), + CHECK (worker_run_id IS NULL OR worker_version_id IS NOT NULL) ); CREATE TABLE entity_role ( @@ -168,8 +196,11 @@ CREATE TABLE metadata ( value TEXT NOT NULL, entity_id VARCHAR(37), worker_version_id VARCHAR(37), + worker_run_id VARCHAR(37), PRIMARY KEY (id), FOREIGN KEY (element_id) REFERENCES element 
(id) ON DELETE CASCADE,
     FOREIGN KEY (entity_id) REFERENCES entity (id) ON DELETE SET NULL,
-    FOREIGN KEY (worker_version_id) REFERENCES worker_version (id) ON DELETE CASCADE
+    FOREIGN KEY (worker_version_id) REFERENCES worker_version (id) ON DELETE CASCADE,
+    FOREIGN KEY (worker_run_id) REFERENCES worker_run (id) ON DELETE CASCADE,
+    CHECK (worker_run_id IS NULL OR worker_version_id IS NOT NULL)
 );
diff --git a/arkindex/documents/export/transcription.sql b/arkindex/documents/export/transcription.sql
index 251497e89d..0f71045d41 100644
--- a/arkindex/documents/export/transcription.sql
+++ b/arkindex/documents/export/transcription.sql
@@ -4,7 +4,8 @@ SELECT
     transcription.text,
     transcription.confidence,
     transcription.orientation,
-    transcription.worker_version_id
+    transcription.worker_version_id,
+    transcription.worker_run_id
 FROM documents_transcription transcription
 INNER JOIN documents_element element ON (element.id = transcription.element_id)
 WHERE element.corpus_id = '{corpus_id}'::uuid
diff --git a/arkindex/documents/export/transcription_entity.sql b/arkindex/documents/export/transcription_entity.sql
index 932b73d3f0..52c9984de6 100644
--- a/arkindex/documents/export/transcription_entity.sql
+++ b/arkindex/documents/export/transcription_entity.sql
@@ -7,6 +7,7 @@ SELECT
     te.offset,
     te.length,
     te.worker_version_id,
+    te.worker_run_id,
     te.confidence
 FROM documents_transcriptionentity te
 INNER JOIN documents_entity entity ON (te.entity_id = entity.id)
diff --git a/arkindex/documents/export/worker_run.sql b/arkindex/documents/export/worker_run.sql
new file mode 100644
index 0000000000..fab77a3314
--- /dev/null
+++ b/arkindex/documents/export/worker_run.sql
@@ -0,0 +1,46 @@
+-- This filters worker runs to only include those used in any of the kinds
+-- of ML results we have. Doing it with LEFT JOINs would require 9 joins and
+-- fill up the RAM. Adding DISTINCT to all the SELECT queries of the UNION
+-- slows this query down by ~20%. Using multiple INs instead of a UNION makes
+-- this query twice as slow.
+
+-- Note that exports may fail if an ML result uses a WorkerRun whose
+-- WorkerVersion differs from the result's own worker_version_id, as that
+-- version may not have been exported and the FK constraint may then fail.
+SELECT + run.id, + run.version_id, + run.model_version_id, + model.id, + model.name, + run.configuration_id, + configuration.configuration +FROM process_workerrun run +LEFT JOIN process_workerconfiguration configuration ON configuration.id = run.configuration_id +LEFT JOIN training_modelversion modelversion ON modelversion.id = run.model_version_id +LEFT JOIN training_model model ON model.id = modelversion.model_id +WHERE run.id IN ( + SELECT worker_run_id FROM documents_element WHERE corpus_id = '{corpus_id}'::uuid +UNION ALL + SELECT worker_run_id FROM documents_entity WHERE corpus_id = '{corpus_id}'::uuid +UNION ALL + SELECT classification.worker_run_id + FROM documents_classification classification + INNER JOIN documents_element element ON (element.id = classification.element_id) + WHERE element.corpus_id = '{corpus_id}'::uuid +UNION ALL + SELECT transcription.worker_run_id + FROM documents_transcription transcription + INNER JOIN documents_element element ON (element.id = transcription.element_id) + WHERE element.corpus_id = '{corpus_id}'::uuid +UNION ALL + SELECT te.worker_run_id + FROM documents_transcriptionentity te + INNER JOIN documents_entity entity ON (te.entity_id = entity.id) + WHERE entity.corpus_id = '{corpus_id}'::uuid +UNION ALL + SELECT md.worker_run_id + FROM documents_metadata md + INNER JOIN documents_element element ON (md.element_id = element.id) + WHERE element.corpus_id = '{corpus_id}'::uuid +) diff --git a/arkindex/documents/management/commands/load_export.py b/arkindex/documents/management/commands/load_export.py index 9b325aaeb6..3342c44511 100644 --- a/arkindex/documents/management/commands/load_export.py +++ b/arkindex/documents/management/commands/load_export.py @@ -28,16 +28,27 @@ from arkindex.documents.models import ( TranscriptionEntity, ) from arkindex.images.models import Image, ImageServer -from arkindex.process.models import Repository, Revision, Worker, WorkerType, WorkerVersion +from arkindex.process.models import ( + ProcessMode, + Repository, + Revision, + Worker, + WorkerConfiguration, + WorkerRun, + WorkerType, + WorkerVersion, +) +from arkindex.training.models import Model from arkindex.users.models import Role, User -EXPORT_VERSION = 5 +EXPORT_VERSION = 6 TABLE_NAMES = { 'export_version', 'image_server', 'image', 'worker_version', + 'worker_run', 'element', 'element_path', 'entity', @@ -65,6 +76,7 @@ SQL_REPOSITORY_QUERY = "SELECT DISTINCT repository_url FROM worker_version" SQL_REVISION_QUERY = "SELECT DISTINCT revision, repository_url FROM worker_version" SQL_WORKER_TYPE_QUERY = "SELECT DISTINCT type FROM worker_version" SQL_WORKER_VERSION_QUERY = "SELECT * FROM worker_version" +SQL_WORKER_RUN_QUERY = "SELECT * FROM worker_run" SQL_IMAGE_SERVER_QUERY = "SELECT * FROM image_server" SQL_IMAGE_QUERY = """ @@ -94,6 +106,7 @@ SQL_ELEMENT_QUERY = """ rotation_angle, mirrored, worker_version_id, + worker_run_id, confidence FROM element LEFT JOIN image ON (image.id = element.image_id) @@ -187,6 +200,7 @@ class Command(BaseCommand): rotation_angle=row["rotation_angle"], mirrored=row["mirrored"], worker_version_id=self.worker_version_map[row["worker_version_id"]] if row["worker_version_id"] else None, + worker_run_id=self.worker_run_map[row["worker_run_id"]] if row["worker_run_id"] else None, confidence=row["confidence"], corpus=self.corpus )] @@ -215,6 +229,7 @@ class Command(BaseCommand): moderator=User.objects.get(email=row["moderator"]) if row["moderator"] else None, metas=json.loads(row["metas"]) if row["metas"] else None, 
worker_version_id=self.worker_version_map[row["worker_version_id"]] if row["worker_version_id"] else None, + worker_run_id=self.worker_run_map[row["worker_run_id"]] if row["worker_run_id"] else None, corpus=self.corpus )] @@ -252,6 +267,7 @@ class Command(BaseCommand): confidence=row["confidence"], orientation=row["orientation"], worker_version_id=self.worker_version_map[row["worker_version_id"]] if row["worker_version_id"] else None, + worker_run_id=self.worker_run_map[row["worker_run_id"]] if row["worker_run_id"] else None, )] def convert_transcription_entities(self, row): @@ -262,6 +278,7 @@ class Command(BaseCommand): offset=row["offset"], length=row["length"], worker_version_id=self.worker_version_map[row["worker_version_id"]] if row["worker_version_id"] else None, + worker_run_id=self.worker_run_map[row["worker_run_id"]] if row["worker_run_id"] else None, confidence=row["confidence"], )] @@ -274,6 +291,7 @@ class Command(BaseCommand): value=row["value"], entity_id=row["entity_id"], worker_version_id=self.worker_version_map[row["worker_version_id"]] if row["worker_version_id"] else None, + worker_run_id=self.worker_run_map[row["worker_run_id"]] if row["worker_run_id"] else None, )] def convert_classifications(self, row): @@ -286,6 +304,7 @@ class Command(BaseCommand): confidence=row["confidence"], high_confidence=row["high_confidence"], worker_version_id=self.worker_version_map[row["worker_version_id"]] if row["worker_version_id"] else None, + worker_run_id=self.worker_run_map[row["worker_run_id"]] if row["worker_run_id"] else None, )] def bulk_create_objects(self, ModelClass, convert_method, sql_query, ignore_conflicts=True): @@ -407,6 +426,33 @@ class Command(BaseCommand): } ) + def create_worker_run(self, row): + worker_version_id = self.worker_version_map[row['worker_version_id']] + model_version, configuration = None, None + + if row['model_version_id']: + model, created = Model.objects.get_or_create(name=row['model_name']) + if created: + model.memberships.create( + user=self.user, + level=Role.Admin.value, + ) + + model_version, _ = model.objects.get_or_create(id=row['model_version_id']) + + if row['configuration_id']: + configuration, _ = WorkerConfiguration.objects.get_or_create( + worker=Worker.objects.get(versions__id=worker_version_id), + configuration=json.loads(row['configuration']), + ) + + return self.local_process.worker_runs.get_or_create( + version_id=worker_version_id, + model_version=model_version, + configuration=configuration, + defaults={'parents': []}, + ) + def create_image_server(self, row): return ImageServer.objects.get_or_create( url=row['url'], @@ -479,6 +525,8 @@ class Command(BaseCommand): if not corpus_name: corpus_name = f"Corpus import {date}" + self.local_process, _ = self.user.processes.get_or_create(mode=ProcessMode.Local) + self.stdout.write(f"Creating corpus {corpus_name}") with Timer() as t: # Create corpus @@ -500,6 +548,7 @@ class Command(BaseCommand): self.worker_type_map = self.create_objects(WorkerType, self.create_worker_type, SQL_WORKER_TYPE_QUERY) self.worker_map = self.create_objects(Worker, self.create_worker, SQL_WORKER_VERSION_QUERY) self.worker_version_map = self.create_objects(WorkerVersion, self.create_worker_version, SQL_WORKER_VERSION_QUERY) + self.worker_run_map = self.create_objects(WorkerRun, self.create_worker_run, SQL_WORKER_RUN_QUERY) # Create images and servers self.image_server_map = self.create_objects(ImageServer, self.create_image_server, SQL_IMAGE_SERVER_QUERY) diff --git 
a/arkindex/documents/tests/commands/test_load_export.py b/arkindex/documents/tests/commands/test_load_export.py index 9a3a1cf594..f2056c3b25 100644 --- a/arkindex/documents/tests/commands/test_load_export.py +++ b/arkindex/documents/tests/commands/test_load_export.py @@ -12,7 +12,7 @@ from arkindex.documents.management.commands.load_export import Command from arkindex.documents.models import Corpus, Element, ElementPath, EntityType, Transcription from arkindex.documents.tasks import corpus_delete from arkindex.images.models import Image, ImageServer -from arkindex.process.models import Repository, Worker, WorkerType, WorkerVersion +from arkindex.process.models import ProcessMode, Repository, Worker, WorkerType, WorkerVersion from arkindex.project.tests import FixtureTestCase BASE_DIR = Path(__file__).absolute().parent @@ -37,6 +37,9 @@ class TestLoadExport(FixtureTestCase): 'process.worker': [], 'process.revision': ['message', 'author'], 'process.workerversion': ['configuration', 'state', 'docker_image', 'docker_image_iid'], + # The WorkerRuns lose their parents, use different worker versions that just got recreated, + # are assigned to the user's local process and not the original one + 'process.workerrun': ['parents', 'version', 'process', 'summary'], 'process.workertype': [], 'images.imageserver': ['s3_bucket', 's3_region', 'created', 'updated', 'read_only'], 'images.image': ['created', 'updated', 'hash', 'status'], @@ -123,11 +126,17 @@ class TestLoadExport(FixtureTestCase): element = self.corpus.elements.get(name='Volume 1') transcription = Transcription.objects.first() - version = WorkerVersion.objects.get(worker__slug='reco') + + reco_version = WorkerVersion.objects.get(worker__slug='reco') + reco_run = reco_version.worker_runs.get() + dla_version = WorkerVersion.objects.get(worker__slug='dla') + dla_run = dla_version.worker_runs.get() element.classifications.create( ml_class=self.corpus.ml_classes.create(name='Blah'), confidence=.55555555, + worker_version=dla_version, + worker_run=dla_run, ) person_type = EntityType.objects.get( @@ -161,14 +170,15 @@ class TestLoadExport(FixtureTestCase): entity=entity1, offset=1, length=1, - worker_version=version, + worker_version=reco_version, + worker_run=reco_run, ) transcription.transcription_entities.create( entity=entity2, offset=0, length=1, - worker_version=version, + worker_version=reco_version, confidence=0.42, ) @@ -189,7 +199,7 @@ class TestLoadExport(FixtureTestCase): corpus_delete(self.corpus.id) Image.objects.all().delete() ImageServer.objects.all().delete() - WorkerVersion.objects.filter(id=version.id).delete() + WorkerVersion.objects.filter(id=reco_version.id).delete() call_command('load_export', db_path, '--email', self.user.email, '--corpus-name', 'My corpus') @@ -199,12 +209,21 @@ class TestLoadExport(FixtureTestCase): corpus = Corpus.objects.get(name='My corpus') + local_process = self.user.processes.get(mode=ProcessMode.Local) + data_before = self.clean_dump_data( dump_path_before, # Remap the corpus, imageserver and worker version IDs corpus={str(self.corpus.id): str(corpus.id)}, server={self.imgsrv.id: ImageServer.objects.get().id}, - worker_version={str(version.id): str(WorkerVersion.objects.get(worker__slug='reco').id)}, + worker_version={ + str(reco_version.id): str(WorkerVersion.objects.get(worker__slug='reco').id), + str(dla_version.id): str(WorkerVersion.objects.get(worker__slug='dla').id), + }, + worker_run={ + str(reco_run.id): str(local_process.worker_runs.get(version__worker__slug='reco').id), + str(dla_run.id): 
str(local_process.worker_runs.get(version__worker__slug='dla').id), + }, ) data_after = self.clean_dump_data(dump_path_after) diff --git a/arkindex/documents/tests/tasks/test_export.py b/arkindex/documents/tests/tasks/test_export.py index 3b35f95e56..00763ded34 100644 --- a/arkindex/documents/tests/tasks/test_export.py +++ b/arkindex/documents/tests/tasks/test_export.py @@ -41,6 +41,7 @@ TABLE_NAMES = { 'transcription', 'transcription_entity', 'worker_version', + 'worker_run', } @@ -161,7 +162,7 @@ class TestExport(FixtureTestCase): ) self.assertCountEqual( - db.execute("SELECT version FROM export_version").fetchall(), [(5, )] + db.execute("SELECT version FROM export_version").fetchall(), [(6, )] ) self.assertCountEqual( @@ -225,6 +226,7 @@ class TestExport(FixtureTestCase): mirrored, confidence, worker_version_id, + worker_run_id, image_id, polygon FROM element @@ -256,6 +258,11 @@ class TestExport(FixtureTestCase): else: row.append(None) + if element.worker_run_id: + row.append(str(element.worker_run_id)) + else: + row.append(None) + if element.polygon: row.append(str(element.image_id)) row.append([ @@ -286,7 +293,17 @@ class TestExport(FixtureTestCase): ) self.assertCountEqual( - db.execute("SELECT id, element_id, text, confidence, orientation, worker_version_id FROM transcription").fetchall(), + db.execute(""" + SELECT + id, + element_id, + text, + confidence, + orientation, + worker_version_id, + worker_run_id + FROM transcription + """).fetchall(), [ ( str(transcription.id), @@ -294,14 +311,27 @@ class TestExport(FixtureTestCase): transcription.text, transcription.confidence, transcription.orientation.value, - str(transcription.worker_version_id) if transcription.worker_version_id else None + str(transcription.worker_version_id) if transcription.worker_version_id else None, + str(transcription.worker_run_id) if transcription.worker_run_id else None, ) for transcription in Transcription.objects.filter(element__corpus=self.corpus) ] ) self.assertCountEqual( - db.execute("SELECT id, element_id, class_name, state, moderator, confidence, high_confidence, worker_version_id FROM classification").fetchall(), + db.execute(""" + SELECT + id, + element_id, + class_name, + state, + moderator, + confidence, + high_confidence, + worker_version_id, + worker_run_id + FROM classification + """).fetchall(), [ ( str(classification.id), @@ -311,14 +341,26 @@ class TestExport(FixtureTestCase): classification.moderator.email if classification.moderator else None, classification.confidence, int(classification.high_confidence), - str(classification.worker_version_id) if classification.worker_version_id else None + str(classification.worker_version_id) if classification.worker_version_id else None, + str(classification.worker_run_id) if classification.worker_run_id else None, ) for classification in Classification.objects.filter(element__corpus=self.corpus) ] ) self.assertCountEqual( - db.execute("SELECT id, element_id, name, type, value, entity_id, worker_version_id FROM metadata").fetchall(), + db.execute(""" + SELECT + id, + element_id, + name, + type, + value, + entity_id, + worker_version_id, + worker_run_id + FROM metadata + """).fetchall(), [ ( str(metadata.id), @@ -327,14 +369,26 @@ class TestExport(FixtureTestCase): metadata.type.value, metadata.value, str(metadata.entity_id) if metadata.entity_id else None, - str(metadata.worker_version_id) if metadata.worker_version_id else None + str(metadata.worker_version_id) if metadata.worker_version_id else None, + str(metadata.worker_run_id) if 
metadata.worker_run_id else None, ) for metadata in MetaData.objects.filter(element__corpus=self.corpus) ] ) self.assertCountEqual( - db.execute("SELECT id, name, type_id, validated, moderator, metas, worker_version_id FROM entity").fetchall(), + db.execute(""" + SELECT + id, + name, + type_id, + validated, + moderator, + metas, + worker_version_id, + worker_run_id + FROM entity + """).fetchall(), [ ( str(entity.id), @@ -344,6 +398,7 @@ class TestExport(FixtureTestCase): entity.moderator.email if entity.moderator else None, json.dumps(entity.metas) if entity.metas else None, str(entity.worker_version_id) if entity.worker_version_id else None, + str(entity.worker_run_id) if entity.worker_run_id else None, ) for entity in self.corpus.entities.all() ] @@ -389,7 +444,18 @@ class TestExport(FixtureTestCase): ) self.assertCountEqual( - db.execute("SELECT id, transcription_id, entity_id, offset, length, worker_version_id, confidence FROM transcription_entity").fetchall(), + db.execute(""" + SELECT + id, + transcription_id, + entity_id, + offset, + length, + worker_version_id, + worker_run_id, + confidence + FROM transcription_entity + """).fetchall(), [ ( str(transcription_entity.id), @@ -398,6 +464,7 @@ class TestExport(FixtureTestCase): transcription_entity.offset, transcription_entity.length, str(transcription_entity.worker_version_id) if transcription_entity.worker_version_id else None, + str(transcription_entity.worker_run_id) if transcription_entity.worker_run_id else None, transcription_entity.confidence, ) for transcription_entity in TranscriptionEntity.objects.filter(entity__corpus=self.corpus) -- GitLab
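
Illustrative note (not part of the patch): a minimal sketch of how a consumer of the
SQLite export could read the new worker_run table, assuming a local export file named
corpus_export.sqlite (hypothetical path). It only uses columns defined in structure.sql
above: worker_run.id, worker_run.worker_version_id, worker_run.model_name and
element.worker_run_id.

    import sqlite3

    # Open an Arkindex SQLite export produced with export version 6 (assumed filename)
    db = sqlite3.connect("corpus_export.sqlite")

    query = """
    SELECT
        worker_run.id,
        worker_run.worker_version_id,
        worker_run.model_name,
        COUNT(element.id) AS element_count
    FROM worker_run
    LEFT JOIN element ON element.worker_run_id = worker_run.id
    GROUP BY worker_run.id, worker_run.worker_version_id, worker_run.model_name
    """

    for run_id, version_id, model_name, element_count in db.execute(query):
        # model_name is NULL (None in Python) when the run has no model version attached
        print(run_id, version_id, model_name or "-", element_count)

    db.close()

The LEFT JOIN keeps worker runs that produced no elements (for example runs that only
created transcriptions or entities), which an INNER JOIN would drop; the
element_worker_run_id index added in indexes.sql covers the join condition.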