From 41f2493c75c95f3defb85268b6d477dd11bcd558 Mon Sep 17 00:00:00 2001 From: Erwan Rouchet <rouchet@teklia.com> Date: Fri, 21 May 2021 11:01:11 +0000 Subject: [PATCH] Add SQLite export task --- Dockerfile.binary | 1 + MANIFEST.in | 1 + arkindex/documents/export/__init__.py | 102 ++++++++ arkindex/documents/export/classification.sql | 15 ++ arkindex/documents/export/element.sql | 15 ++ arkindex/documents/export/element_path.sql | 19 ++ arkindex/documents/export/entity.sql | 11 + arkindex/documents/export/entity_link.sql | 4 + arkindex/documents/export/entity_role.sql | 3 + arkindex/documents/export/image.sql | 6 + arkindex/documents/export/metadata.sql | 11 + arkindex/documents/export/structure.sql | 137 ++++++++++ arkindex/documents/export/transcription.sql | 9 + .../documents/export/transcription_entity.sql | 12 + arkindex/documents/export/worker_version.sql | 29 +++ arkindex/documents/tests/tasks/test_export.py | 246 ++++++++++++++++++ arkindex/project/triggers.py | 13 +- 17 files changed, 633 insertions(+), 1 deletion(-) create mode 100644 arkindex/documents/export/__init__.py create mode 100644 arkindex/documents/export/classification.sql create mode 100644 arkindex/documents/export/element.sql create mode 100644 arkindex/documents/export/element_path.sql create mode 100644 arkindex/documents/export/entity.sql create mode 100644 arkindex/documents/export/entity_link.sql create mode 100644 arkindex/documents/export/entity_role.sql create mode 100644 arkindex/documents/export/image.sql create mode 100644 arkindex/documents/export/metadata.sql create mode 100644 arkindex/documents/export/structure.sql create mode 100644 arkindex/documents/export/transcription.sql create mode 100644 arkindex/documents/export/transcription_entity.sql create mode 100644 arkindex/documents/export/worker_version.sql create mode 100644 arkindex/documents/tests/tasks/test_export.py diff --git a/Dockerfile.binary b/Dockerfile.binary index e7048d19f3..a69693f421 100644 --- a/Dockerfile.binary +++ b/Dockerfile.binary @@ -74,6 +74,7 @@ COPY VERSION /etc/arkindex.version # Copy templates in base dir for binary ENV BASE_DIR=/usr/share/arkindex COPY arkindex/templates /usr/share/arkindex/templates +COPY arkindex/documents/export/*.sql /usr/share/arkindex/documents/export/ # Touch python files for needed management commands # Otherwise Django will not load the compiled module diff --git a/MANIFEST.in b/MANIFEST.in index d24fcd1182..d3ad7bfbd1 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -5,3 +5,4 @@ include tests-requirements.txt recursive-include arkindex/templates *.html recursive-include arkindex/templates *.json recursive-include arkindex/templates *.txt +include arkindex/documents/export/*.sql diff --git a/arkindex/documents/export/__init__.py b/arkindex/documents/export/__init__.py new file mode 100644 index 0000000000..33a83a7198 --- /dev/null +++ b/arkindex/documents/export/__init__.py @@ -0,0 +1,102 @@ +import json +import logging +import sqlite3 +import tempfile +import uuid +from pathlib import Path + +from django.db import connections +from django_rq import job +from rq import get_current_job + +BASE_DIR = Path(__file__).absolute().parent + +# Maximum number of rows that will be stored in memory at once before being sent to SQLite. 
+BATCH_SIZE = 10000 + +logger = logging.getLogger(__name__) + +# Map SQLite table names to PostgreSQL query files +EXPORT_QUERIES = [ + 'image', + 'worker_version', + 'element', + 'element_path', + 'transcription', + 'classification', + 'entity', + 'transcription_entity', + 'entity_role', + 'entity_link', + 'metadata', +] + + +def run_pg_query(query): + """ + Run a single Postgresql query and split the results into chunks. + When a name is given to a cursor, psycopg2 uses a server-side cursor; we just use a random string as a name. + """ + with connections['default'].create_cursor(name=str(uuid.uuid4())) as pg_cursor: + pg_cursor.itersize = BATCH_SIZE + pg_cursor.execute(query) + + while True: + out = pg_cursor.fetchmany(BATCH_SIZE) + if not out: + return + yield out + + +def save_sqlite(rows, table, cursor): + """ + Write a chunk of rows into an SQLite table + """ + def _serialize(value): + # Serialize UUID as string + if isinstance(value, uuid.UUID): + return str(value) + + # Serialize list and dicts as json + if isinstance(value, (list, dict)): + return json.dumps(value) + + return value + + rows = [ + list(map(_serialize, row)) + for row in rows + ] + + # Build the parameterized query by counting the columns in a CSV row and repeating '?' parameters + insert_args = ",".join("?" for _ in range(len(rows[0]))) + query = f"INSERT INTO {table} VALUES ({insert_args})" + cursor.executemany(query, rows) + + +@job +def export_corpus(corpus_id: str) -> None: + rq_job = get_current_job() + + _, db_path = tempfile.mkstemp(suffix='db') + logger.info(f"Exporting corpus {corpus_id} into {db_path}") + db = sqlite3.connect(db_path) + cursor = db.cursor() + + # Initialize all the tables + cursor.executescript((BASE_DIR / 'structure.sql').read_text()) + + for i, name in enumerate(EXPORT_QUERIES): + query = (BASE_DIR / f'{name}.sql').read_text() + logger.info(f"Running query {i+1}/{len(EXPORT_QUERIES)} {name}") + + if rq_job: + rq_job.set_progress(i / len(EXPORT_QUERIES)) + for chunk in run_pg_query(query.format(corpus_id=corpus_id)): + save_sqlite(chunk, name, cursor) + + db.commit() + + db.close() + + return db_path diff --git a/arkindex/documents/export/classification.sql b/arkindex/documents/export/classification.sql new file mode 100644 index 0000000000..cb03484685 --- /dev/null +++ b/arkindex/documents/export/classification.sql @@ -0,0 +1,15 @@ +SELECT + classification.id, + classification.element_id, + mlclass.name, + classification.state, + moderator.email, + classification.confidence, + -- SQLite has no boolean type, so high_confidence becomes an integer (0 or 1) + classification.high_confidence::integer, + classification.worker_version_id +FROM documents_classification classification +INNER JOIN documents_element element ON (element.id = classification.element_id) +INNER JOIN documents_mlclass mlclass ON (mlclass.id = classification.ml_class_id) +LEFT JOIN users_user moderator ON (moderator.id = classification.moderator_id) +WHERE element.corpus_id = '{corpus_id}'::uuid diff --git a/arkindex/documents/export/element.sql b/arkindex/documents/export/element.sql new file mode 100644 index 0000000000..ba135c2c84 --- /dev/null +++ b/arkindex/documents/export/element.sql @@ -0,0 +1,15 @@ +-- Dates are exported as UNIX timestamps because SQLite does not have a datetime type, +-- but has date and time functions that take UNIX timestamps as arguments. 
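+-- For example, the SQLite built-in date functions can turn these back into readable
+-- dates (illustrative query, to be run against the exported database, not here):
+--   SELECT name, datetime(created, 'unixepoch') FROM element;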
+SELECT + element.id, + EXTRACT(EPOCH FROM element.created), + EXTRACT(EPOCH FROM element.updated), + element.name, + type.slug, + ST_AsGeoJSON(zone.polygon)::json->'coordinates', + zone.image_id, + element.worker_version_id +FROM documents_element element +INNER JOIN documents_elementtype type ON (element.type_id = type.id) +LEFT JOIN images_zone zone ON (element.zone_id = zone.id) +WHERE element.corpus_id = '{corpus_id}'::uuid diff --git a/arkindex/documents/export/element_path.sql b/arkindex/documents/export/element_path.sql new file mode 100644 index 0000000000..ac7116b982 --- /dev/null +++ b/arkindex/documents/export/element_path.sql @@ -0,0 +1,19 @@ +-- The destination table has id, parent_id, child_id, ordering. +-- In an ElementPath, the child ID is the element ID, and its immediate parent +-- is the last item in the array (path[array_length(path, 1)]). +-- We can safely ditch the rest of the array as the parent will have its own paths +-- with its own parents as last items. +-- This uses DISTINCT ON because when the parent has two paths, the child will have +-- two paths with the same direct parent. These two paths should have the same +-- ordering, but this is not actually enforced by any constraint. ORDER BY is used +-- because DISTINCT ON requires it. +SELECT + DISTINCT ON (path.path[array_length(path.path, 1)], path.element_id) + path.id, + path.path[array_length(path.path, 1)], + path.element_id, + path.ordering +FROM documents_elementpath path +INNER JOIN documents_element element ON (path.element_id = element.id) +WHERE element.corpus_id = '{corpus_id}'::uuid +ORDER BY path.path[array_length(path.path, 1)], path.element_id diff --git a/arkindex/documents/export/entity.sql b/arkindex/documents/export/entity.sql new file mode 100644 index 0000000000..73d549953f --- /dev/null +++ b/arkindex/documents/export/entity.sql @@ -0,0 +1,11 @@ +SELECT + entity.id, + entity.name, + entity.type, + entity.validated::integer, + moderator.email, + hstore_to_json(entity.metas), + entity.worker_version_id +FROM documents_entity entity +LEFT JOIN users_user moderator ON (moderator.id = entity.moderator_id) +WHERE entity.corpus_id = '{corpus_id}'::uuid \ No newline at end of file diff --git a/arkindex/documents/export/entity_link.sql b/arkindex/documents/export/entity_link.sql new file mode 100644 index 0000000000..58b113c51b --- /dev/null +++ b/arkindex/documents/export/entity_link.sql @@ -0,0 +1,4 @@ +SELECT link.id, link.parent_id, link.child_id, link.role_id +FROM documents_entitylink link +INNER JOIN documents_entityrole role ON (link.role_id = role.id) +WHERE role.corpus_id = '{corpus_id}'::uuid \ No newline at end of file diff --git a/arkindex/documents/export/entity_role.sql b/arkindex/documents/export/entity_role.sql new file mode 100644 index 0000000000..00c5128ed1 --- /dev/null +++ b/arkindex/documents/export/entity_role.sql @@ -0,0 +1,3 @@ +SELECT id, parent_name, child_name, parent_type, child_type +FROM documents_entityrole +WHERE corpus_id = '{corpus_id}'::uuid \ No newline at end of file diff --git a/arkindex/documents/export/image.sql b/arkindex/documents/export/image.sql new file mode 100644 index 0000000000..a6ed7601bf --- /dev/null +++ b/arkindex/documents/export/image.sql @@ -0,0 +1,6 @@ +SELECT DISTINCT image.id, CONCAT(TRIM(TRAILING '/' FROM server.url), '/', image.path), image.width, image.height +FROM images_image image +INNER JOIN images_zone zone ON (zone.image_id = image.id) +INNER JOIN documents_element element ON (element.zone_id = zone.id) +INNER JOIN 
images_imageserver server ON (server.id = image.server_id) +WHERE element.corpus_id = '{corpus_id}'::uuid \ No newline at end of file diff --git a/arkindex/documents/export/metadata.sql b/arkindex/documents/export/metadata.sql new file mode 100644 index 0000000000..606ca2d995 --- /dev/null +++ b/arkindex/documents/export/metadata.sql @@ -0,0 +1,11 @@ +SELECT + metadata.id, + metadata.element_id, + metadata.name, + metadata.type, + metadata.value, + metadata.entity_id, + metadata.worker_version_id +FROM documents_metadata metadata +INNER JOIN documents_element element ON (element.id = metadata.element_id) +WHERE element.corpus_id = '{corpus_id}'::uuid \ No newline at end of file diff --git a/arkindex/documents/export/structure.sql b/arkindex/documents/export/structure.sql new file mode 100644 index 0000000000..264d09705a --- /dev/null +++ b/arkindex/documents/export/structure.sql @@ -0,0 +1,137 @@ +PRAGMA foreign_keys = ON; + +CREATE TABLE image ( + id VARCHAR(37) NOT NULL, + url TEXT NOT NULL, + width INTEGER NOT NULL, + height INTEGER NOT NULL, + PRIMARY KEY (id), + UNIQUE (url), + CHECK (width >= 0 AND height >= 0) +); + +CREATE TABLE worker_version ( + id VARCHAR(37) NOT NULL, + name VARCHAR(100) NOT NULL, + slug VARCHAR(100) NOT NULL, + type VARCHAR(50) NOT NULL, + revision VARCHAR(50) NOT NULL, + PRIMARY KEY (id) +); + +CREATE TABLE element ( + id VARCHAR(37) NOT NULL, + created REAL NOT NULL, + updated REAL NOT NULL, + name VARCHAR(250) NOT NULL, + type VARCHAR(50) NOT NULL, + polygon TEXT, + image_id VARCHAR(37), + worker_version_id VARCHAR(37), + PRIMARY KEY (id), + FOREIGN KEY (image_id) REFERENCES image (id) ON DELETE CASCADE, + FOREIGN KEY (worker_version_id) REFERENCES worker_version (id) ON DELETE CASCADE +); + +CREATE TABLE element_path ( + id VARCHAR(37) NOT NULL, + parent_id VARCHAR(37) NOT NULL, + child_id VARCHAR(37) NOT NULL, + ordering INTEGER NOT NULL DEFAULT 0, + PRIMARY KEY (id), + FOREIGN KEY (parent_id) REFERENCES element (id) ON DELETE CASCADE, + FOREIGN KEY (child_id) REFERENCES element (id) ON DELETE CASCADE, + UNIQUE (parent_id, child_id), + CHECK (ordering >= 0) +); + +CREATE TABLE transcription ( + id VARCHAR(37) NOT NULL, + element_id VARCHAR(37) NOT NULL, + text TEXT NOT NULL, + confidence REAL, + worker_version_id VARCHAR(37), + PRIMARY KEY (id), + FOREIGN KEY (element_id) REFERENCES element (id) ON DELETE CASCADE, + FOREIGN KEY (worker_version_id) REFERENCES worker_version (id) ON DELETE CASCADE, + CHECK (confidence IS NULL OR (confidence >= 0 AND confidence <= 1)) +); + +CREATE TABLE classification ( + id VARCHAR(37) NOT NULL, + element_id VARCHAR(37) NOT NULL, + class_name VARCHAR(1024) NOT NULL, + state VARCHAR(16) NOT NULL DEFAULT 'pending', + moderator VARCHAR(255), + confidence REAL, + high_confidence INTEGER NOT NULL DEFAULT 0, + worker_version_id VARCHAR(37), + PRIMARY KEY (id), + FOREIGN KEY (element_id) REFERENCES element (id) ON DELETE CASCADE, + FOREIGN KEY (worker_version_id) REFERENCES worker_version (id) ON DELETE CASCADE, + CHECK (confidence IS NULL OR (confidence >= 0 AND confidence <= 1)), + CHECK (high_confidence = 0 OR high_confidence = 1) +); + +CREATE TABLE entity ( + id VARCHAR(37) NOT NULL, + name TEXT NOT NULL, + type VARCHAR(50) NOT NULL, + validated INTEGER NOT NULL DEFAULT 0, + moderator VARCHAR(255), + metas TEXT, + worker_version_id VARCHAR(37), + PRIMARY KEY (id), + FOREIGN KEY (worker_version_id) REFERENCES worker_version (id) ON DELETE CASCADE, + CHECK (validated = 0 OR validated = 1) +); + +CREATE TABLE 
transcription_entity ( + id VARCHAR(37) NOT NULL, + transcription_id VARCHAR(37) NOT NULL, + entity_id VARCHAR(37) NOT NULL, + offset INTEGER NOT NULL, + length INTEGER NOT NULL, + worker_version_id VARCHAR(37), + PRIMARY KEY (id), + FOREIGN KEY (transcription_id) REFERENCES transcription (id) ON DELETE CASCADE, + FOREIGN KEY (entity_id) REFERENCES entity (id) ON DELETE CASCADE, + FOREIGN KEY (worker_version_id) REFERENCES worker_version (id) ON DELETE CASCADE, + UNIQUE (transcription_id, entity_id, offset, length, worker_version_id), + CHECK (offset >= 0 AND length >= 0) +); + +CREATE TABLE entity_role ( + id VARCHAR(37) NOT NULL, + parent_name VARCHAR(250) NOT NULL, + child_name VARCHAR(250) NOT NULL, + parent_type VARCHAR(50) NOT NULL, + child_type VARCHAR(50) NOT NULL, + PRIMARY KEY (id), + UNIQUE (parent_name, child_name, parent_type, child_type) +); + +CREATE TABLE entity_link ( + id VARCHAR(37) NOT NULL, + parent_id VARCHAR(37) NOT NULL, + child_id VARCHAR(37) NOT NULL, + role_id VARCHAR(37) NOT NULL, + PRIMARY KEY (id), + FOREIGN KEY (parent_id) REFERENCES entity (id), + FOREIGN KEY (child_id) REFERENCES entity (id), + FOREIGN KEY (role_id) REFERENCES entity_role (id) +); + +CREATE TABLE metadata ( + id VARCHAR(37) NOT NULL, + element_id VARCHAR(37) NOT NULL, + name VARCHAR(250) NOT NULL, + type VARCHAR(50) NOT NULL, + value TEXT NOT NULL, + entity_id VARCHAR(37), + worker_version_id VARCHAR(37), + PRIMARY KEY (id), + FOREIGN KEY (element_id) REFERENCES element (id) ON DELETE CASCADE, + FOREIGN KEY (entity_id) REFERENCES entity (id) ON DELETE SET NULL, + FOREIGN KEY (worker_version_id) REFERENCES worker_version (id) ON DELETE CASCADE +); diff --git a/arkindex/documents/export/transcription.sql b/arkindex/documents/export/transcription.sql new file mode 100644 index 0000000000..c1bb2c76b4 --- /dev/null +++ b/arkindex/documents/export/transcription.sql @@ -0,0 +1,9 @@ +SELECT + transcription.id, + transcription.element_id, + transcription.text, + transcription.confidence, + transcription.worker_version_id +FROM documents_transcription transcription +INNER JOIN documents_element element ON (element.id = transcription.element_id) +WHERE element.corpus_id = '{corpus_id}'::uuid \ No newline at end of file diff --git a/arkindex/documents/export/transcription_entity.sql b/arkindex/documents/export/transcription_entity.sql new file mode 100644 index 0000000000..eac7954aa3 --- /dev/null +++ b/arkindex/documents/export/transcription_entity.sql @@ -0,0 +1,12 @@ +-- This joins with the entity table to filter by corpus, as going via transcriptions +-- requires two joins on larger tables (transcriptions then elements) +SELECT + te.id, + te.transcription_id, + te.entity_id, + te.offset, + te.length, + te.worker_version_id +FROM documents_transcriptionentity te +INNER JOIN documents_entity entity ON (te.entity_id = entity.id) +WHERE entity.corpus_id = '{corpus_id}'::uuid diff --git a/arkindex/documents/export/worker_version.sql b/arkindex/documents/export/worker_version.sql new file mode 100644 index 0000000000..35a13547fa --- /dev/null +++ b/arkindex/documents/export/worker_version.sql @@ -0,0 +1,29 @@ +-- This filters worker versions to only include those used in any of the kinds +-- of ML results we have. Doing it using LEFT JOIN would require 10 joins and +-- fills up the RAM. Adding DISTINCT to all the SELECT queries of the UNION +-- slows this query down by ~20%. Using multiple INs instead of a UNION makes +-- this query twice as slow. 
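+-- In the exported file this table can then be joined back from any result table,
+-- e.g. (illustrative query against the exported database):
+--   SELECT t.text, w.name FROM transcription t
+--   LEFT JOIN worker_version w ON (w.id = t.worker_version_id);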
+SELECT version.id, worker.name, worker.slug, worker.type, revision.hash +FROM dataimport_workerversion version +INNER JOIN dataimport_worker worker ON (version.worker_id = worker.id) +INNER JOIN dataimport_revision revision ON (version.revision_id = revision.id) +WHERE version.id IN ( + SELECT worker_version_id FROM documents_element WHERE corpus_id = '{corpus_id}'::uuid +UNION + SELECT worker_version_id FROM documents_entity WHERE corpus_id = '{corpus_id}'::uuid +UNION + SELECT classification.worker_version_id + FROM documents_classification classification + INNER JOIN documents_element element ON (element.id = classification.element_id) + WHERE element.corpus_id = '{corpus_id}'::uuid +UNION + SELECT transcription.worker_version_id + FROM documents_transcription transcription + INNER JOIN documents_element element ON (element.id = transcription.element_id) + WHERE element.corpus_id = '{corpus_id}'::uuid +UNION + SELECT te.worker_version_id + FROM documents_transcriptionentity te + INNER JOIN documents_entity entity ON (te.entity_id = entity.id) + WHERE entity.corpus_id = '{corpus_id}'::uuid +) diff --git a/arkindex/documents/tests/tasks/test_export.py b/arkindex/documents/tests/tasks/test_export.py new file mode 100644 index 0000000000..62428f7582 --- /dev/null +++ b/arkindex/documents/tests/tasks/test_export.py @@ -0,0 +1,246 @@ +import json +import os +import sqlite3 + +from arkindex.dataimport.models import WorkerVersion +from arkindex.documents.export import export_corpus +from arkindex.documents.models import ( + Classification, + ElementPath, + EntityLink, + EntityType, + MetaData, + Transcription, + TranscriptionEntity, +) +from arkindex.images.models import Image +from arkindex.project.tests import FixtureTestCase + + +class TestExport(FixtureTestCase): + + def test_export(self): + element = self.corpus.elements.get(name='Volume 1') + transcription = Transcription.objects.first() + version = WorkerVersion.objects.get(worker__slug='reco') + + element.classifications.create( + ml_class=self.corpus.ml_classes.create(name='Blah'), + confidence=.55555555, + ) + + entity1 = self.corpus.entities.create( + name='Arrokuda', + type=EntityType.Location, + metas={'subtype': 'pokemon'}, + ) + entity2 = self.corpus.entities.create( + name='Stonjourner', + type=EntityType.Person, + validated=True, + moderator=self.superuser, + ) + role = self.corpus.roles.create( + parent_name='parent', + child_name='child', + parent_type=EntityType.Location, + child_type=EntityType.Person, + ) + role.links.create(parent=entity1, child=entity2) + + transcription.transcription_entities.create( + entity=entity1, + offset=1, + length=1, + worker_version=version, + ) + + db_path = export_corpus(self.corpus.id) + db = sqlite3.connect(db_path) + + self.assertCountEqual( + db.execute('SELECT id, url, width, height FROM image').fetchall(), + [ + ( + str(image.id), + image.url, + image.width, + image.height + ) + for image in Image.objects.all() + ] + ) + + self.assertCountEqual( + db.execute('SELECT id, name, slug, type, revision FROM worker_version').fetchall(), + [ + ( + str(version.id), + version.worker.name, + version.worker.slug, + version.worker.type, + version.revision.hash + ) + ] + ) + + actual_rows = db.execute("SELECT id, created, updated, name, type, worker_version_id, image_id, polygon FROM element").fetchall() + + for i in range(len(actual_rows)): + # Convert the row from a tuple to a list because we'll change it + row = list(actual_rows[i]) + if row[-1] is not None: + # Parse the polygons as JSON for 
easier comparison + row[-1] = json.loads(row[-1]) + actual_rows[i] = row + + expected_rows = [] + for element in self.corpus.elements.all(): + row = [ + str(element.id), + element.created.timestamp(), + element.updated.timestamp(), + element.name, + element.type.slug, + ] + + if element.worker_version_id: + row.append(str(element.worker_version_id)) + else: + row.append(None) + + if element.zone: + row.append(str(element.zone.image_id)) + row.append([ + # coords returns a list of tuples of floats, we turn it into a list of lists of ints + [int(x), int(y)] for x, y in element.zone.polygon.coords + ]) + else: + row.extend([None, None]) + + expected_rows.append(row) + + self.assertCountEqual(actual_rows, expected_rows) + + self.assertCountEqual( + db.execute("SELECT id, parent_id, child_id, ordering FROM element_path").fetchall(), + [ + ( + str(id), + str(parent_id), + str(child_id), + ordering + ) + for id, parent_id, child_id, ordering + in ElementPath.objects + .filter(element__corpus=self.corpus) + .values_list('id', 'path__last', 'element_id', 'ordering') + ] + ) + + self.assertCountEqual( + db.execute("SELECT id, element_id, text, confidence, worker_version_id FROM transcription").fetchall(), + [ + ( + str(transcription.id), + str(transcription.element_id), + transcription.text, + transcription.confidence, + str(transcription.worker_version_id) if transcription.worker_version_id else None + ) + for transcription in Transcription.objects.filter(element__corpus=self.corpus) + ] + ) + + self.assertCountEqual( + db.execute("SELECT id, element_id, class_name, state, moderator, confidence, high_confidence, worker_version_id FROM classification").fetchall(), + [ + ( + str(classification.id), + str(classification.element_id), + classification.ml_class.name, + classification.state.value, + classification.moderator.email if classification.moderator else None, + classification.confidence, + int(classification.high_confidence), + str(classification.worker_version_id) if classification.worker_version_id else None + ) + for classification in Classification.objects.filter(element__corpus=self.corpus) + ] + ) + + self.assertCountEqual( + db.execute("SELECT id, element_id, name, type, value, entity_id, worker_version_id FROM metadata").fetchall(), + [ + ( + str(metadata.id), + str(metadata.element_id), + metadata.name, + metadata.type.value, + metadata.value, + str(metadata.entity_id) if metadata.entity_id else None, + str(metadata.worker_version_id) if metadata.worker_version_id else None + ) + for metadata in MetaData.objects.filter(element__corpus=self.corpus) + ] + ) + + self.assertCountEqual( + db.execute("SELECT id, name, type, validated, moderator, metas, worker_version_id FROM entity").fetchall(), + [ + ( + str(entity.id), + entity.name, + entity.type.value, + int(entity.validated), + entity.moderator.email if entity.moderator else None, + json.dumps(entity.metas) if entity.metas else None, + str(entity.worker_version_id) if entity.worker_version_id else None, + ) + for entity in self.corpus.entities.all() + ] + ) + + self.assertCountEqual( + db.execute("SELECT id, parent_name, child_name, parent_type, child_type FROM entity_role").fetchall(), + [ + ( + str(role.id), + role.parent_name, + role.child_name, + role.parent_type.value, + role.child_type.value, + ) + for role in self.corpus.roles.all() + ] + ) + + self.assertCountEqual( + db.execute("SELECT id, parent_id, child_id, role_id FROM entity_link").fetchall(), + [ + ( + str(link.id), + str(link.parent_id), + str(link.child_id), + 
str(link.role_id) + ) + for link in EntityLink.objects.filter(role__corpus=self.corpus) + ] + ) + + self.assertCountEqual( + db.execute("SELECT id, transcription_id, entity_id, offset, length, worker_version_id FROM transcription_entity").fetchall(), + [ + ( + str(transcription_entity.id), + str(transcription_entity.transcription_id), + str(transcription_entity.entity_id), + transcription_entity.offset, + transcription_entity.length, + str(transcription_entity.worker_version_id) if transcription_entity.worker_version_id else None + ) + for transcription_entity in TranscriptionEntity.objects.filter(entity__corpus=self.corpus) + ] + ) + + os.unlink(db_path) diff --git a/arkindex/project/triggers.py b/arkindex/project/triggers.py index 79f0686ba4..967b38941a 100644 --- a/arkindex/project/triggers.py +++ b/arkindex/project/triggers.py @@ -7,7 +7,7 @@ from uuid import UUID from django.conf import settings from arkindex.dataimport.models import DataImport, WorkerVersion -from arkindex.documents import tasks +from arkindex.documents import export, tasks from arkindex.documents.managers import ElementQuerySet from arkindex.documents.models import Corpus, Element, Entity @@ -142,3 +142,14 @@ def initialize_activity(process: DataImport): Initialize activity on every process elements for worker versions that are part of its workflow """ tasks.initialize_activity.delay(process) + + +def export_corpus(corpus: Corpus, user_id: Optional[int] = None) -> None: + """ + Export a corpus to a SQLite database + """ + export.export_corpus.delay( + corpus_id=corpus.id, + user_id=user_id, + description=f'Export of corpus {corpus.name}' + ) -- GitLab
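
A minimal sketch of reading the resulting export, assuming the schema from structure.sql
above; the database path below is a placeholder for the temporary file path returned by
the export_corpus task:

    import sqlite3

    # Placeholder path; export_corpus() returns the actual temporary file path.
    db = sqlite3.connect('/tmp/arkindex-export.db')

    # Count exported elements per type, using the element table defined in structure.sql.
    for type_slug, count in db.execute(
        "SELECT type, COUNT(*) FROM element GROUP BY type ORDER BY COUNT(*) DESC"
    ):
        print(type_slug, count)

    # List a few transcriptions together with the name of the element they belong to.
    rows = db.execute(
        "SELECT element.name, transcription.text "
        "FROM transcription "
        "INNER JOIN element ON element.id = transcription.element_id "
        "LIMIT 10"
    ).fetchall()

    db.close()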