Commit efd96ba4 authored by Erwan Rouchet, committed by Bastien Abadie

Export WorkerRuns

parent f4a19294
1 merge request: !1964 Export WorkerRuns
Showing 255 additions and 28 deletions
@@ -30,6 +30,7 @@ EXPORT_QUERIES = [
    'image_server',
    'image',
    'worker_version',
+    'worker_run',
    'element',
    'element_path',
    'transcription',
......
@@ -7,7 +7,8 @@ SELECT
    classification.confidence,
    -- SQLite has no boolean type, so high_confidence becomes an integer (0 or 1)
    classification.high_confidence::integer,
-    classification.worker_version_id
+    classification.worker_version_id,
+    classification.worker_run_id
FROM documents_classification classification
INNER JOIN documents_element element ON (element.id = classification.element_id)
INNER JOIN documents_mlclass mlclass ON (mlclass.id = classification.ml_class_id)
......
@@ -11,6 +11,7 @@ SELECT
    element.rotation_angle,
    element.mirrored::integer,
    element.worker_version_id,
+    element.worker_run_id,
    element.confidence
FROM documents_element element
INNER JOIN documents_elementtype type ON (element.type_id = type.id)
......
@@ -5,7 +5,8 @@ SELECT
    entity.validated::integer,
    moderator.email,
    hstore_to_json(entity.metas),
-    entity.worker_version_id
+    entity.worker_version_id,
+    entity.worker_run_id
FROM documents_entity entity
LEFT JOIN users_user moderator ON (moderator.id = entity.moderator_id)
WHERE entity.corpus_id = '{corpus_id}'::uuid
CREATE INDEX image_server_id ON image (server_id);
+CREATE INDEX worker_run_worker_version_id ON worker_run (worker_version_id);
CREATE INDEX element_image_id ON element (image_id);
CREATE INDEX element_worker_version_id ON element (worker_version_id);
+CREATE INDEX element_worker_run_id ON element (worker_run_id);
CREATE INDEX element_path_parent_id ON element_path (parent_id);
CREATE INDEX element_path_child_id ON element_path (child_id);
CREATE INDEX transcription_element_id ON transcription (element_id);
CREATE INDEX transcription_worker_version_id ON transcription (worker_version_id);
+CREATE INDEX transcription_worker_run_id ON transcription (worker_run_id);
CREATE INDEX classification_element_id ON classification (element_id);
CREATE INDEX classification_worker_version_id ON classification (worker_version_id);
+CREATE INDEX classification_worker_run_id ON classification (worker_run_id);
CREATE INDEX entity_worker_version_id ON entity (worker_version_id);
+CREATE INDEX entity_worker_run_id ON entity (worker_run_id);
CREATE INDEX entity_type_id ON entity (type_id);
CREATE INDEX transcription_entity_transcription_id ON transcription_entity (transcription_id);
CREATE INDEX transcription_entity_entity_id ON transcription_entity (entity_id);
CREATE INDEX transcription_entity_worker_version_id ON transcription_entity (worker_version_id);
+CREATE INDEX transcription_entity_worker_run_id ON transcription_entity (worker_run_id);
CREATE INDEX entity_link_parent_id ON entity_link (parent_id);
CREATE INDEX entity_link_child_id ON entity_link (child_id);
@@ -29,3 +36,4 @@ CREATE INDEX entity_role_child_type_id ON entity_role (child_type_id);
CREATE INDEX metadata_element_id ON metadata (element_id);
CREATE INDEX metadata_entity_id ON metadata (entity_id);
CREATE INDEX metadata_worker_version_id ON metadata (worker_version_id);
+CREATE INDEX metadata_worker_run_id ON metadata (worker_run_id);
@@ -5,7 +5,8 @@ SELECT
    metadata.type,
    metadata.value,
    metadata.entity_id,
-    metadata.worker_version_id
+    metadata.worker_version_id,
+    metadata.worker_run_id
FROM documents_metadata metadata
INNER JOIN documents_element element ON (element.id = metadata.element_id)
WHERE element.corpus_id = '{corpus_id}'::uuid
PRAGMA foreign_keys = ON;

-CREATE TABLE export_version AS SELECT 5 AS version;
+CREATE TABLE export_version AS SELECT 6 AS version;

CREATE TABLE image_server (
    id VARCHAR(37) NOT NULL,
@@ -35,6 +35,20 @@ CREATE TABLE worker_version (
    PRIMARY KEY (id)
);

+CREATE TABLE worker_run (
+    id VARCHAR(37) NOT NULL,
+    worker_version_id VARCHAR(37) NOT NULL,
+    model_version_id VARCHAR(37),
+    model_id VARCHAR(37),
+    model_name VARCHAR(100),
+    configuration_id VARCHAR(37),
+    configuration TEXT,
+    PRIMARY KEY (id),
+    FOREIGN KEY (worker_version_id) REFERENCES worker_version (id) ON DELETE CASCADE,
+    CHECK ((model_version_id IS NULL) = (model_id IS NULL) AND (model_id IS NULL) = (model_name IS NULL)),
+    CHECK ((configuration_id IS NULL) = (configuration IS NULL))
+);

CREATE TABLE element (
    id VARCHAR(37) NOT NULL,
    created REAL NOT NULL,
@@ -46,14 +60,17 @@ CREATE TABLE element (
    rotation_angle INTEGER NOT NULL DEFAULT 0,
    mirrored INTEGER NOT NULL DEFAULT 0,
    worker_version_id VARCHAR(37),
+    worker_run_id VARCHAR(37),
    confidence REAL,
    PRIMARY KEY (id),
    FOREIGN KEY (image_id) REFERENCES image (id) ON DELETE CASCADE,
    FOREIGN KEY (worker_version_id) REFERENCES worker_version (id) ON DELETE CASCADE,
+    FOREIGN KEY (worker_run_id) REFERENCES worker_run (id) ON DELETE CASCADE,
    CHECK ((image_id IS NULL AND polygon IS NULL) OR (image_id IS NOT NULL AND polygon IS NOT NULL)),
    CHECK (rotation_angle >= 0 AND rotation_angle <= 359),
    CHECK (mirrored = 0 OR mirrored = 1),
-    CHECK (confidence IS NULL OR (confidence >= 0 AND confidence <= 1))
+    CHECK (confidence IS NULL OR (confidence >= 0 AND confidence <= 1)),
+    CHECK (worker_run_id IS NULL OR worker_version_id IS NOT NULL)
);

CREATE TABLE element_path (
@@ -75,9 +92,11 @@ CREATE TABLE transcription (
    confidence REAL,
    orientation TEXT NOT NULL DEFAULT 'horizontal-lr',
    worker_version_id VARCHAR(37),
+    worker_run_id VARCHAR(37),
    PRIMARY KEY (id),
    FOREIGN KEY (element_id) REFERENCES element (id) ON DELETE CASCADE,
    FOREIGN KEY (worker_version_id) REFERENCES worker_version (id) ON DELETE CASCADE,
+    FOREIGN KEY (worker_run_id) REFERENCES worker_run (id) ON DELETE CASCADE,
    CHECK (confidence IS NULL OR (confidence >= 0 AND confidence <= 1)),
    CHECK (orientation IN ('horizontal-lr', 'horizontal-rl', 'vertical-lr', 'vertical-rl'))
);
@@ -91,11 +110,14 @@ CREATE TABLE classification (
    confidence REAL,
    high_confidence INTEGER NOT NULL DEFAULT 0,
    worker_version_id VARCHAR(37),
+    worker_run_id VARCHAR(37),
    PRIMARY KEY (id),
    FOREIGN KEY (element_id) REFERENCES element (id) ON DELETE CASCADE,
    FOREIGN KEY (worker_version_id) REFERENCES worker_version (id) ON DELETE CASCADE,
+    FOREIGN KEY (worker_run_id) REFERENCES worker_run (id) ON DELETE CASCADE,
    CHECK (confidence IS NULL OR (confidence >= 0 AND confidence <= 1)),
-    CHECK (high_confidence = 0 OR high_confidence = 1)
+    CHECK (high_confidence = 0 OR high_confidence = 1),
+    CHECK (worker_run_id IS NULL OR worker_version_id IS NOT NULL)
);

CREATE TABLE entity (
@@ -106,10 +128,13 @@ CREATE TABLE entity (
    moderator VARCHAR(255),
    metas TEXT,
    worker_version_id VARCHAR(37),
+    worker_run_id VARCHAR(37),
    PRIMARY KEY (id),
    FOREIGN KEY (worker_version_id) REFERENCES worker_version (id) ON DELETE CASCADE,
+    FOREIGN KEY (worker_run_id) REFERENCES worker_run (id) ON DELETE CASCADE,
    FOREIGN KEY (type_id) REFERENCES entity_type (id) ON DELETE CASCADE,
-    CHECK (validated = 0 OR validated = 1)
+    CHECK (validated = 0 OR validated = 1),
+    CHECK (worker_run_id IS NULL OR worker_version_id IS NOT NULL)
);

CREATE TABLE entity_type (
@@ -127,14 +152,17 @@ CREATE TABLE transcription_entity (
    offset INTEGER NOT NULL,
    length INTEGER NOT NULL,
    worker_version_id VARCHAR(37),
+    worker_run_id VARCHAR(37),
    confidence REAL,
    PRIMARY KEY (id),
    FOREIGN KEY (transcription_id) REFERENCES transcription (id) ON DELETE CASCADE,
    FOREIGN KEY (entity_id) REFERENCES entity (id) ON DELETE CASCADE,
    FOREIGN KEY (worker_version_id) REFERENCES worker_version (id) ON DELETE CASCADE,
-    UNIQUE (transcription_id, entity_id, offset, length, worker_version_id),
+    FOREIGN KEY (worker_run_id) REFERENCES worker_run (id) ON DELETE CASCADE,
+    UNIQUE (transcription_id, entity_id, offset, length, worker_version_id, worker_run_id),
    CHECK (offset >= 0 AND length >= 0),
-    CHECK (confidence IS NULL OR (confidence >= 0 AND confidence <= 1))
+    CHECK (confidence IS NULL OR (confidence >= 0 AND confidence <= 1)),
+    CHECK (worker_run_id IS NULL OR worker_version_id IS NOT NULL)
);

CREATE TABLE entity_role (
@@ -168,8 +196,11 @@ CREATE TABLE metadata (
    value TEXT NOT NULL,
    entity_id VARCHAR(37),
    worker_version_id VARCHAR(37),
+    worker_run_id VARCHAR(37),
    PRIMARY KEY (id),
    FOREIGN KEY (element_id) REFERENCES element (id) ON DELETE CASCADE,
    FOREIGN KEY (entity_id) REFERENCES entity (id) ON DELETE SET NULL,
-    FOREIGN KEY (worker_version_id) REFERENCES worker_version (id) ON DELETE CASCADE
+    FOREIGN KEY (worker_version_id) REFERENCES worker_version (id) ON DELETE CASCADE,
+    FOREIGN KEY (worker_run_id) REFERENCES worker_run (id) ON DELETE CASCADE,
+    CHECK (worker_run_id IS NULL OR worker_version_id IS NOT NULL)
);
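
For illustration only (not part of this commit): once an export at schema version 6 has been produced, the new worker_run table can be read back together with its worker version. A minimal sketch in Python, assuming the export file is named corpus_export.sqlite:

import sqlite3

db = sqlite3.connect("corpus_export.sqlite")
db.row_factory = sqlite3.Row

# Each worker run references the worker version it ran with; the model_* and
# configuration columns stay NULL when the run used no model version or configuration.
rows = db.execute("""
    SELECT run.id, run.model_name, run.configuration, version.repository_url, version.revision
    FROM worker_run run
    INNER JOIN worker_version version ON (version.id = run.worker_version_id)
""").fetchall()

for row in rows:
    print(row["id"], row["repository_url"], row["revision"], row["model_name"])
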
@@ -4,7 +4,8 @@ SELECT
    transcription.text,
    transcription.confidence,
    transcription.orientation,
-    transcription.worker_version_id
+    transcription.worker_version_id,
+    transcription.worker_run_id
FROM documents_transcription transcription
INNER JOIN documents_element element ON (element.id = transcription.element_id)
WHERE element.corpus_id = '{corpus_id}'::uuid
@@ -7,6 +7,7 @@ SELECT
    te.offset,
    te.length,
    te.worker_version_id,
+    te.worker_run_id,
    te.confidence
FROM documents_transcriptionentity te
INNER JOIN documents_entity entity ON (te.entity_id = entity.id)
......
-- This filters worker runs to only include those used in any of the kinds
-- of ML results we have. Doing it with LEFT JOINs would require 9 joins and
-- fill up the RAM. Adding DISTINCT to all the SELECT queries of the UNION
-- slows this query down by ~20%, and using multiple INs instead of a UNION
-- makes it twice as slow.
-- Note that an export may fail if an ML result uses a WorkerRun whose
-- WorkerVersion is not the result's own worker_version_id, as that version
-- may not have been exported and the FK constraint may then fail.
SELECT
run.id,
run.version_id,
run.model_version_id,
model.id,
model.name,
run.configuration_id,
configuration.configuration
FROM process_workerrun run
LEFT JOIN process_workerconfiguration configuration ON configuration.id = run.configuration_id
LEFT JOIN training_modelversion modelversion ON modelversion.id = run.model_version_id
LEFT JOIN training_model model ON model.id = modelversion.model_id
WHERE run.id IN (
SELECT worker_run_id FROM documents_element WHERE corpus_id = '{corpus_id}'::uuid
UNION ALL
SELECT worker_run_id FROM documents_entity WHERE corpus_id = '{corpus_id}'::uuid
UNION ALL
SELECT classification.worker_run_id
FROM documents_classification classification
INNER JOIN documents_element element ON (element.id = classification.element_id)
WHERE element.corpus_id = '{corpus_id}'::uuid
UNION ALL
SELECT transcription.worker_run_id
FROM documents_transcription transcription
INNER JOIN documents_element element ON (element.id = transcription.element_id)
WHERE element.corpus_id = '{corpus_id}'::uuid
UNION ALL
SELECT te.worker_run_id
FROM documents_transcriptionentity te
INNER JOIN documents_entity entity ON (te.entity_id = entity.id)
WHERE entity.corpus_id = '{corpus_id}'::uuid
UNION ALL
SELECT md.worker_run_id
FROM documents_metadata md
INNER JOIN documents_element element ON (md.element_id = element.id)
WHERE element.corpus_id = '{corpus_id}'::uuid
)
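
As a side note (not part of the diff), the worker_run_id column added to each result table is what makes per-run provenance queries possible on the SQLite side; a rough sketch, again assuming a corpus_export.sqlite file:

import sqlite3

db = sqlite3.connect("corpus_export.sqlite")

# Count exported elements per worker run; results created without a run keep a NULL worker_run_id.
for run_id, total in db.execute("""
    SELECT worker_run_id, COUNT(*)
    FROM element
    WHERE worker_run_id IS NOT NULL
    GROUP BY worker_run_id
"""):
    print(run_id, total)
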
@@ -28,16 +28,27 @@ from arkindex.documents.models import (
    TranscriptionEntity,
)
from arkindex.images.models import Image, ImageServer
-from arkindex.process.models import Repository, Revision, Worker, WorkerType, WorkerVersion
+from arkindex.process.models import (
+    ProcessMode,
+    Repository,
+    Revision,
+    Worker,
+    WorkerConfiguration,
+    WorkerRun,
+    WorkerType,
+    WorkerVersion,
+)
+from arkindex.training.models import Model
from arkindex.users.models import Role, User

-EXPORT_VERSION = 5
+EXPORT_VERSION = 6

TABLE_NAMES = {
    'export_version',
    'image_server',
    'image',
    'worker_version',
+    'worker_run',
    'element',
    'element_path',
    'entity',
@@ -65,6 +76,7 @@ SQL_REPOSITORY_QUERY = "SELECT DISTINCT repository_url FROM worker_version"
SQL_REVISION_QUERY = "SELECT DISTINCT revision, repository_url FROM worker_version"
SQL_WORKER_TYPE_QUERY = "SELECT DISTINCT type FROM worker_version"
SQL_WORKER_VERSION_QUERY = "SELECT * FROM worker_version"
+SQL_WORKER_RUN_QUERY = "SELECT * FROM worker_run"
SQL_IMAGE_SERVER_QUERY = "SELECT * FROM image_server"
SQL_IMAGE_QUERY = """
@@ -94,6 +106,7 @@ SQL_ELEMENT_QUERY = """
    rotation_angle,
    mirrored,
    worker_version_id,
+    worker_run_id,
    confidence
FROM element
LEFT JOIN image ON (image.id = element.image_id)
@@ -187,6 +200,7 @@ class Command(BaseCommand):
            rotation_angle=row["rotation_angle"],
            mirrored=row["mirrored"],
            worker_version_id=self.worker_version_map[row["worker_version_id"]] if row["worker_version_id"] else None,
+            worker_run_id=self.worker_run_map[row["worker_run_id"]] if row["worker_run_id"] else None,
            confidence=row["confidence"],
            corpus=self.corpus
        )]
@@ -215,6 +229,7 @@ class Command(BaseCommand):
            moderator=User.objects.get(email=row["moderator"]) if row["moderator"] else None,
            metas=json.loads(row["metas"]) if row["metas"] else None,
            worker_version_id=self.worker_version_map[row["worker_version_id"]] if row["worker_version_id"] else None,
+            worker_run_id=self.worker_run_map[row["worker_run_id"]] if row["worker_run_id"] else None,
            corpus=self.corpus
        )]
@@ -252,6 +267,7 @@ class Command(BaseCommand):
            confidence=row["confidence"],
            orientation=row["orientation"],
            worker_version_id=self.worker_version_map[row["worker_version_id"]] if row["worker_version_id"] else None,
+            worker_run_id=self.worker_run_map[row["worker_run_id"]] if row["worker_run_id"] else None,
        )]

    def convert_transcription_entities(self, row):
@@ -262,6 +278,7 @@ class Command(BaseCommand):
            offset=row["offset"],
            length=row["length"],
            worker_version_id=self.worker_version_map[row["worker_version_id"]] if row["worker_version_id"] else None,
+            worker_run_id=self.worker_run_map[row["worker_run_id"]] if row["worker_run_id"] else None,
            confidence=row["confidence"],
        )]
@@ -274,6 +291,7 @@ class Command(BaseCommand):
            value=row["value"],
            entity_id=row["entity_id"],
            worker_version_id=self.worker_version_map[row["worker_version_id"]] if row["worker_version_id"] else None,
+            worker_run_id=self.worker_run_map[row["worker_run_id"]] if row["worker_run_id"] else None,
        )]

    def convert_classifications(self, row):
@@ -286,6 +304,7 @@ class Command(BaseCommand):
            confidence=row["confidence"],
            high_confidence=row["high_confidence"],
            worker_version_id=self.worker_version_map[row["worker_version_id"]] if row["worker_version_id"] else None,
+            worker_run_id=self.worker_run_map[row["worker_run_id"]] if row["worker_run_id"] else None,
        )]

    def bulk_create_objects(self, ModelClass, convert_method, sql_query, ignore_conflicts=True):
@@ -407,6 +426,33 @@ class Command(BaseCommand):
            }
        )

+    def create_worker_run(self, row):
+        worker_version_id = self.worker_version_map[row['worker_version_id']]
+        model_version, configuration = None, None
+        if row['model_version_id']:
+            model, created = Model.objects.get_or_create(name=row['model_name'])
+            if created:
+                model.memberships.create(
+                    user=self.user,
+                    level=Role.Admin.value,
+                )
+            model_version, _ = model.versions.get_or_create(id=row['model_version_id'])
+        if row['configuration_id']:
+            configuration, _ = WorkerConfiguration.objects.get_or_create(
+                worker=Worker.objects.get(versions__id=worker_version_id),
+                configuration=json.loads(row['configuration']),
+            )
+        return self.local_process.worker_runs.get_or_create(
+            version_id=worker_version_id,
+            model_version=model_version,
+            configuration=configuration,
+            defaults={'parents': []},
+        )

    def create_image_server(self, row):
        return ImageServer.objects.get_or_create(
            url=row['url'],
@@ -479,6 +525,8 @@ class Command(BaseCommand):
        if not corpus_name:
            corpus_name = f"Corpus import {date}"

+        self.local_process, _ = self.user.processes.get_or_create(mode=ProcessMode.Local)

        self.stdout.write(f"Creating corpus {corpus_name}")
        with Timer() as t:
            # Create corpus
@@ -500,6 +548,7 @@ class Command(BaseCommand):
            self.worker_type_map = self.create_objects(WorkerType, self.create_worker_type, SQL_WORKER_TYPE_QUERY)
            self.worker_map = self.create_objects(Worker, self.create_worker, SQL_WORKER_VERSION_QUERY)
            self.worker_version_map = self.create_objects(WorkerVersion, self.create_worker_version, SQL_WORKER_VERSION_QUERY)
+            self.worker_run_map = self.create_objects(WorkerRun, self.create_worker_run, SQL_WORKER_RUN_QUERY)

            # Create images and servers
            self.image_server_map = self.create_objects(ImageServer, self.create_image_server, SQL_IMAGE_SERVER_QUERY)
......
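
For reference (not part of the diff), the import side is driven by the same management command as before; a usage sketch based on the test below, where the export path and email are placeholder values:

from django.core.management import call_command

# Re-creates the corpus, its worker versions and the new worker runs from an export.
# With this commit, the imported WorkerRuns are attached to the user's local process.
call_command(
    'load_export',
    '/tmp/corpus_export.sqlite',
    '--email', 'user@example.com',
    '--corpus-name', 'My corpus',
)
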
@@ -12,7 +12,7 @@ from arkindex.documents.management.commands.load_export import Command
from arkindex.documents.models import Corpus, Element, ElementPath, EntityType, Transcription
from arkindex.documents.tasks import corpus_delete
from arkindex.images.models import Image, ImageServer
-from arkindex.process.models import Repository, Worker, WorkerType, WorkerVersion
+from arkindex.process.models import ProcessMode, Repository, Worker, WorkerType, WorkerVersion
from arkindex.project.tests import FixtureTestCase

BASE_DIR = Path(__file__).absolute().parent
@@ -37,6 +37,9 @@ class TestLoadExport(FixtureTestCase):
        'process.worker': [],
        'process.revision': ['message', 'author'],
        'process.workerversion': ['configuration', 'state', 'docker_image', 'docker_image_iid'],
+        # The WorkerRuns lose their parents, use different worker versions that just got recreated,
+        # and are assigned to the user's local process instead of the original one
+        'process.workerrun': ['parents', 'version', 'process', 'summary'],
        'process.workertype': [],
        'images.imageserver': ['s3_bucket', 's3_region', 'created', 'updated', 'read_only'],
        'images.image': ['created', 'updated', 'hash', 'status'],
@@ -123,11 +126,17 @@
        element = self.corpus.elements.get(name='Volume 1')
        transcription = Transcription.objects.first()

-        version = WorkerVersion.objects.get(worker__slug='reco')
+        reco_version = WorkerVersion.objects.get(worker__slug='reco')
+        reco_run = reco_version.worker_runs.get()
+        dla_version = WorkerVersion.objects.get(worker__slug='dla')
+        dla_run = dla_version.worker_runs.get()

        element.classifications.create(
            ml_class=self.corpus.ml_classes.create(name='Blah'),
            confidence=.55555555,
+            worker_version=dla_version,
+            worker_run=dla_run,
        )

        person_type = EntityType.objects.get(
@@ -161,14 +170,15 @@
            entity=entity1,
            offset=1,
            length=1,
-            worker_version=version,
+            worker_version=reco_version,
+            worker_run=reco_run,
        )

        transcription.transcription_entities.create(
            entity=entity2,
            offset=0,
            length=1,
-            worker_version=version,
+            worker_version=reco_version,
            confidence=0.42,
        )
@@ -189,7 +199,7 @@
        corpus_delete(self.corpus.id)
        Image.objects.all().delete()
        ImageServer.objects.all().delete()
-        WorkerVersion.objects.filter(id=version.id).delete()
+        WorkerVersion.objects.filter(id=reco_version.id).delete()

        call_command('load_export', db_path, '--email', self.user.email, '--corpus-name', 'My corpus')
@@ -199,12 +209,21 @@
        corpus = Corpus.objects.get(name='My corpus')
+        local_process = self.user.processes.get(mode=ProcessMode.Local)

        data_before = self.clean_dump_data(
            dump_path_before,
            # Remap the corpus, imageserver and worker version IDs
            corpus={str(self.corpus.id): str(corpus.id)},
            server={self.imgsrv.id: ImageServer.objects.get().id},
-            worker_version={str(version.id): str(WorkerVersion.objects.get(worker__slug='reco').id)},
+            worker_version={
+                str(reco_version.id): str(WorkerVersion.objects.get(worker__slug='reco').id),
+                str(dla_version.id): str(WorkerVersion.objects.get(worker__slug='dla').id),
+            },
+            worker_run={
+                str(reco_run.id): str(local_process.worker_runs.get(version__worker__slug='reco').id),
+                str(dla_run.id): str(local_process.worker_runs.get(version__worker__slug='dla').id),
+            },
        )

        data_after = self.clean_dump_data(dump_path_after)
......
@@ -41,6 +41,7 @@ TABLE_NAMES = {
    'transcription',
    'transcription_entity',
    'worker_version',
+    'worker_run',
}
@@ -161,7 +162,7 @@
        )

        self.assertCountEqual(
-            db.execute("SELECT version FROM export_version").fetchall(), [(5, )]
+            db.execute("SELECT version FROM export_version").fetchall(), [(6, )]
        )

        self.assertCountEqual(
@@ -225,6 +226,7 @@
                    mirrored,
                    confidence,
                    worker_version_id,
+                    worker_run_id,
                    image_id,
                    polygon
                FROM element
@@ -256,6 +258,11 @@
            else:
                row.append(None)

+            if element.worker_run_id:
+                row.append(str(element.worker_run_id))
+            else:
+                row.append(None)

            if element.polygon:
                row.append(str(element.image_id))
                row.append([
@@ -286,7 +293,17 @@
        )

        self.assertCountEqual(
-            db.execute("SELECT id, element_id, text, confidence, orientation, worker_version_id FROM transcription").fetchall(),
+            db.execute("""
+                SELECT
+                    id,
+                    element_id,
+                    text,
+                    confidence,
+                    orientation,
+                    worker_version_id,
+                    worker_run_id
+                FROM transcription
+            """).fetchall(),
            [
                (
                    str(transcription.id),
@@ -294,14 +311,27 @@
                    transcription.text,
                    transcription.confidence,
                    transcription.orientation.value,
-                    str(transcription.worker_version_id) if transcription.worker_version_id else None
+                    str(transcription.worker_version_id) if transcription.worker_version_id else None,
+                    str(transcription.worker_run_id) if transcription.worker_run_id else None,
                )
                for transcription in Transcription.objects.filter(element__corpus=self.corpus)
            ]
        )

        self.assertCountEqual(
db.execute("SELECT id, element_id, class_name, state, moderator, confidence, high_confidence, worker_version_id FROM classification").fetchall(), db.execute("""
SELECT
id,
element_id,
class_name,
state,
moderator,
confidence,
high_confidence,
worker_version_id,
worker_run_id
FROM classification
""").fetchall(),
[ [
( (
str(classification.id), str(classification.id),
...@@ -311,14 +341,26 @@ class TestExport(FixtureTestCase): ...@@ -311,14 +341,26 @@ class TestExport(FixtureTestCase):
classification.moderator.email if classification.moderator else None, classification.moderator.email if classification.moderator else None,
classification.confidence, classification.confidence,
int(classification.high_confidence), int(classification.high_confidence),
str(classification.worker_version_id) if classification.worker_version_id else None str(classification.worker_version_id) if classification.worker_version_id else None,
str(classification.worker_run_id) if classification.worker_run_id else None,
) )
for classification in Classification.objects.filter(element__corpus=self.corpus) for classification in Classification.objects.filter(element__corpus=self.corpus)
] ]
) )
self.assertCountEqual( self.assertCountEqual(
db.execute("SELECT id, element_id, name, type, value, entity_id, worker_version_id FROM metadata").fetchall(), db.execute("""
SELECT
id,
element_id,
name,
type,
value,
entity_id,
worker_version_id,
worker_run_id
FROM metadata
""").fetchall(),
[ [
( (
str(metadata.id), str(metadata.id),
...@@ -327,14 +369,26 @@ class TestExport(FixtureTestCase): ...@@ -327,14 +369,26 @@ class TestExport(FixtureTestCase):
metadata.type.value, metadata.type.value,
metadata.value, metadata.value,
str(metadata.entity_id) if metadata.entity_id else None, str(metadata.entity_id) if metadata.entity_id else None,
str(metadata.worker_version_id) if metadata.worker_version_id else None str(metadata.worker_version_id) if metadata.worker_version_id else None,
str(metadata.worker_run_id) if metadata.worker_run_id else None,
) )
for metadata in MetaData.objects.filter(element__corpus=self.corpus) for metadata in MetaData.objects.filter(element__corpus=self.corpus)
] ]
) )
self.assertCountEqual( self.assertCountEqual(
db.execute("SELECT id, name, type_id, validated, moderator, metas, worker_version_id FROM entity").fetchall(), db.execute("""
SELECT
id,
name,
type_id,
validated,
moderator,
metas,
worker_version_id,
worker_run_id
FROM entity
""").fetchall(),
[ [
( (
str(entity.id), str(entity.id),
...@@ -344,6 +398,7 @@ class TestExport(FixtureTestCase): ...@@ -344,6 +398,7 @@ class TestExport(FixtureTestCase):
entity.moderator.email if entity.moderator else None, entity.moderator.email if entity.moderator else None,
json.dumps(entity.metas) if entity.metas else None, json.dumps(entity.metas) if entity.metas else None,
str(entity.worker_version_id) if entity.worker_version_id else None, str(entity.worker_version_id) if entity.worker_version_id else None,
str(entity.worker_run_id) if entity.worker_run_id else None,
) )
for entity in self.corpus.entities.all() for entity in self.corpus.entities.all()
] ]
@@ -389,7 +444,18 @@
        )

        self.assertCountEqual(
-            db.execute("SELECT id, transcription_id, entity_id, offset, length, worker_version_id, confidence FROM transcription_entity").fetchall(),
+            db.execute("""
+                SELECT
+                    id,
+                    transcription_id,
+                    entity_id,
+                    offset,
+                    length,
+                    worker_version_id,
+                    worker_run_id,
+                    confidence
+                FROM transcription_entity
+            """).fetchall(),
            [
                (
                    str(transcription_entity.id),
@@ -398,6 +464,7 @@
                    transcription_entity.offset,
                    transcription_entity.length,
                    str(transcription_entity.worker_version_id) if transcription_entity.worker_version_id else None,
+                    str(transcription_entity.worker_run_id) if transcription_entity.worker_run_id else None,
                    transcription_entity.confidence,
                )
                for transcription_entity in TranscriptionEntity.objects.filter(entity__corpus=self.corpus)
......