Split everything into SQL files

449d0891 · Erwan Rouchet · Bastien Abadie · e13e7c7b · 449d0891 · 449d0891
Commit 449d0891 authored 3 years ago by Erwan Rouchet Committed by Bastien Abadie 3 years ago
--- a/arkindex/documents/export/__init__.py
+++ b/arkindex/documents/export/__init__.py
@@ -14,181 +14,20 @@ BASE_DIR = Path(__file__).absolute().parent
 # Maximum number of rows that will be stored in memory at once before being sent to SQLite.
 CSV_BATCH_SIZE = 10000

-# Map SQLite table names to PostgreSQL queries
-EXPORT_QUERIES = [(
+# Map SQLite table names to PostgreSQL query files
+EXPORT_QUERIES = [
    'image',
-    """
-    SELECT DISTINCT image.id, CONCAT(TRIM(TRAILING '/' FROM server.url), '/', image.path), image.width, image.height
-    FROM images_image image
-    INNER JOIN images_zone zone ON (zone.image_id = image.id)
-    INNER JOIN documents_element element ON (element.zone_id = zone.id)
-    INNER JOIN images_imageserver server ON (server.id = image.server_id)
-    WHERE element.corpus_id = '{corpus_id}'::uuid
-    """
-), (
    'worker_version',
-    """
-    SELECT version.id, worker.name, worker.slug, worker.type, revision.hash
-    FROM dataimport_workerversion version
-    INNER JOIN dataimport_worker worker ON (version.worker_id = worker.id)
-    INNER JOIN dataimport_revision revision ON (version.revision_id = revision.id)
-    WHERE version.id IN (
-        SELECT worker_version_id FROM documents_element WHERE corpus_id = '{corpus_id}'::uuid
-    UNION
-        SELECT worker_version_id FROM documents_entity WHERE corpus_id = '{corpus_id}'::uuid
-    UNION
-        SELECT classification.worker_version_id
-        FROM documents_classification classification
-        INNER JOIN documents_element element ON (element.id = classification.element_id)
-        WHERE element.corpus_id = '{corpus_id}'::uuid
-    UNION
-        SELECT transcription.worker_version_id
-        FROM documents_transcription transcription
-        INNER JOIN documents_element element ON (element.id = transcription.element_id)
-        WHERE element.corpus_id = '{corpus_id}'::uuid
-    UNION
-        SELECT te.worker_version_id
-        FROM documents_transcriptionentity te
-        INNER JOIN documents_entity entity ON (te.entity_id = entity.id)
-        WHERE entity.corpus_id = '{corpus_id}'::uuid
-    )
-    """
-), (
    'element',
-    # Dates are exported as UNIX timestamps because SQLite does not have a datetime type,
-    # but has date and time functions that take UNIX timestamps as arguments.
-    """
-    SELECT
-        element.id,
-        EXTRACT(EPOCH FROM element.created),
-        EXTRACT(EPOCH FROM element.updated),
-        element.name,
-        type.slug,
-        ST_AsGeoJSON(zone.polygon)::json->'coordinates',
-        zone.image_id,
-        element.worker_version_id
-    FROM documents_element element
-    INNER JOIN documents_elementtype type ON (element.type_id = type.id)
-    LEFT JOIN images_zone zone ON (element.zone_id = zone.id)
-    WHERE element.corpus_id = '{corpus_id}'::uuid
-    """
-), (
    'element_path',
-    # The destination table has id, parent_id, child_id, ordering.
-    # In an ElementPath, the child ID is the element ID, and its immediate parent
-    # is the last item in the array (path[array_length(path, 1)]).
-    # We can safely ditch the rest of the array as the parent will have its own paths
-    # with its own parents as last items.
-    # This uses DISTINCT ON because when the parent has two paths, the child will have
-    # two paths with the same direct parent.  These two paths should have the same
-    # ordering, but this is not actually enforced by any constraint.  ORDER BY is used
-    # because DISTINCT ON requires it.
-    """
-    SELECT
-        DISTINCT ON (path.path[array_length(path.path, 1)], path.element_id)
-        path.id,
-        path.path[array_length(path.path, 1)],
-        path.element_id,
-        path.ordering
-    FROM documents_elementpath path
-    INNER JOIN documents_element element ON (path.element_id = element.id)
-    WHERE element.corpus_id = '{corpus_id}'::uuid
-    ORDER BY path.path[array_length(path.path, 1)], path.element_id
-    """
-), (
    'transcription',
-    """
-    SELECT
-        transcription.id,
-        transcription.element_id,
-        transcription.text,
-        transcription.confidence,
-        transcription.worker_version_id
-    FROM documents_transcription transcription
-    INNER JOIN documents_element element ON (element.id = transcription.element_id)
-    WHERE element.corpus_id = '{corpus_id}'::uuid
-    """
-), (
    'classification',
-    # SQLite has no boolean type, so high_confidence becomes an integer (0 or 1)
-    """
-    SELECT
-        classification.id,
-        classification.element_id,
-        mlclass.name,
-        classification.state,
-        moderator.email,
-        classification.confidence,
-        classification.high_confidence::integer,
-        classification.worker_version_id
-    FROM documents_classification classification
-    INNER JOIN documents_element element ON (element.id = classification.element_id)
-    INNER JOIN documents_mlclass mlclass ON (mlclass.id = classification.ml_class_id)
-    LEFT JOIN users_user moderator ON (moderator.id = classification.moderator_id)
-    WHERE element.corpus_id = '{corpus_id}'::uuid
-    """
-), (
    'entity',
-    """
-    SELECT
-        entity.id,
-        entity.name,
-        entity.type,
-        entity.validated::integer,
-        moderator.email,
-        hstore_to_json(entity.metas),
-        entity.worker_version_id
-    FROM documents_entity entity
-    LEFT JOIN users_user moderator ON (moderator.id = entity.moderator_id)
-    WHERE entity.corpus_id = '{corpus_id}'::uuid
-    """
-), (
    'transcription_entity',
-    # This joins with the entity table to filter by corpus, as going via transcriptions
-    # requires two joins on larger tables (transcriptions then elements)
-    """
-    SELECT
-        te.id,
-        te.transcription_id,
-        te.entity_id,
-        te.offset,
-        te.length,
-        te.worker_version_id
-    FROM documents_transcriptionentity te
-    INNER JOIN documents_entity entity ON (te.entity_id = entity.id)
-    WHERE entity.corpus_id = '{corpus_id}'::uuid
-    """
-), (
    'entity_role',
-    """
-    SELECT id, parent_name, child_name, parent_type, child_type
-    FROM documents_entityrole
-    WHERE corpus_id = '{corpus_id}'::uuid
-    """
-), (
    'entity_link',
-    """
-    SELECT link.id, link.parent_id, link.child_id, link.role_id
-    FROM documents_entitylink link
-    INNER JOIN documents_entityrole role ON (link.role_id = role.id)
-    WHERE role.corpus_id = '{corpus_id}'::uuid
-    """
-), (
    'metadata',
-    """
-    SELECT
-        metadata.id,
-        metadata.element_id,
-        metadata.name,
-        metadata.type,
-        metadata.value,
-        metadata.entity_id,
-        metadata.worker_version_id
-    FROM documents_metadata metadata
-    INNER JOIN documents_element element ON (element.id = metadata.element_id)
-    WHERE element.corpus_id = '{corpus_id}'::uuid
-    """
-)]
+]


 def pg_to_csv(query):
@@ -227,16 +66,18 @@ def export_corpus(corpus_id: str) -> None:
    db = sqlite3.connect(db_path)
    cursor = db.cursor()
    # Initialize all the tables
-    cursor.executescript((BASE_DIR / 'tables.sql').read_text())
+    cursor.executescript((BASE_DIR / 'structure.sql').read_text())
+
+    for i, name in enumerate(EXPORT_QUERIES):
+        query = (BASE_DIR / f'{name}.sql').read_text()

-    for i, (table_name, query) in enumerate(EXPORT_QUERIES):
        if rq_job:
            rq_job.set_progress(i / len(EXPORT_QUERIES))
        csv_file = pg_to_csv(query.format(corpus_id=corpus_id))

        if rq_job:
            rq_job.set_progress((i + 0.5) / len(EXPORT_QUERIES))
-        csv_to_sqlite(csv_file, table_name, cursor)
+        csv_to_sqlite(csv_file, name, cursor)
        db.commit()

    db.close()

--- a/arkindex/documents/export/classification.sql
+++ b/arkindex/documents/export/classification.sql
+SELECT
+    classification.id,
+    classification.element_id,
+    mlclass.name,
+    classification.state,
+    moderator.email,
+    classification.confidence,
+    -- SQLite has no boolean type, so high_confidence becomes an integer (0 or 1)
+    classification.high_confidence::integer,
+    classification.worker_version_id
+FROM documents_classification classification
+INNER JOIN documents_element element ON (element.id = classification.element_id)
+INNER JOIN documents_mlclass mlclass ON (mlclass.id = classification.ml_class_id)
+LEFT JOIN users_user moderator ON (moderator.id = classification.moderator_id)
+WHERE element.corpus_id = '{corpus_id}'::uuid
--- a/arkindex/documents/export/element.sql
+++ b/arkindex/documents/export/element.sql
+-- Dates are exported as UNIX timestamps because SQLite does not have a datetime type,
+-- but has date and time functions that take UNIX timestamps as arguments.
+SELECT
+    element.id,
+    EXTRACT(EPOCH FROM element.created),
+    EXTRACT(EPOCH FROM element.updated),
+    element.name,
+    type.slug,
+    ST_AsGeoJSON(zone.polygon)::json->'coordinates',
+    zone.image_id,
+    element.worker_version_id
+FROM documents_element element
+INNER JOIN documents_elementtype type ON (element.type_id = type.id)
+LEFT JOIN images_zone zone ON (element.zone_id = zone.id)
+WHERE element.corpus_id = '{corpus_id}'::uuid
--- a/arkindex/documents/export/element_path.sql
+++ b/arkindex/documents/export/element_path.sql
+-- The destination table has id, parent_id, child_id, ordering.
+-- In an ElementPath, the child ID is the element ID, and its immediate parent
+-- is the last item in the array (path[array_length(path, 1)]).
+-- We can safely ditch the rest of the array as the parent will have its own paths
+-- with its own parents as last items.
+-- This uses DISTINCT ON because when the parent has two paths, the child will have
+-- two paths with the same direct parent.  These two paths should have the same
+-- ordering, but this is not actually enforced by any constraint.  ORDER BY is used
+-- because DISTINCT ON requires it.
+SELECT
+    DISTINCT ON (path.path[array_length(path.path, 1)], path.element_id)
+    path.id,
+    path.path[array_length(path.path, 1)],
+    path.element_id,
+    path.ordering
+FROM documents_elementpath path
+INNER JOIN documents_element element ON (path.element_id = element.id)
+WHERE element.corpus_id = '{corpus_id}'::uuid
+ORDER BY path.path[array_length(path.path, 1)], path.element_id
--- a/arkindex/documents/export/entity.sql
+++ b/arkindex/documents/export/entity.sql
+SELECT
+    entity.id,
+    entity.name,
+    entity.type,
+    entity.validated::integer,
+    moderator.email,
+    hstore_to_json(entity.metas),
+    entity.worker_version_id
+FROM documents_entity entity
+LEFT JOIN users_user moderator ON (moderator.id = entity.moderator_id)
+WHERE entity.corpus_id = '{corpus_id}'::uuid
\ No newline at end of file
--- a/arkindex/documents/export/entity_link.sql
+++ b/arkindex/documents/export/entity_link.sql
+SELECT link.id, link.parent_id, link.child_id, link.role_id
+FROM documents_entitylink link
+INNER JOIN documents_entityrole role ON (link.role_id = role.id)
+WHERE role.corpus_id = '{corpus_id}'::uuid
\ No newline at end of file
--- a/arkindex/documents/export/entity_role.sql
+++ b/arkindex/documents/export/entity_role.sql
+SELECT id, parent_name, child_name, parent_type, child_type
+FROM documents_entityrole
+WHERE corpus_id = '{corpus_id}'::uuid
\ No newline at end of file
--- a/arkindex/documents/export/image.sql
+++ b/arkindex/documents/export/image.sql
+SELECT DISTINCT image.id, CONCAT(TRIM(TRAILING '/' FROM server.url), '/', image.path), image.width, image.height
+FROM images_image image
+INNER JOIN images_zone zone ON (zone.image_id = image.id)
+INNER JOIN documents_element element ON (element.zone_id = zone.id)
+INNER JOIN images_imageserver server ON (server.id = image.server_id)
+WHERE element.corpus_id = '{corpus_id}'::uuid
\ No newline at end of file
--- a/arkindex/documents/export/metadata.sql
+++ b/arkindex/documents/export/metadata.sql
+SELECT
+    metadata.id,
+    metadata.element_id,
+    metadata.name,
+    metadata.type,
+    metadata.value,
+    metadata.entity_id,
+    metadata.worker_version_id
+FROM documents_metadata metadata
+INNER JOIN documents_element element ON (element.id = metadata.element_id)
+WHERE element.corpus_id = '{corpus_id}'::uuid
\ No newline at end of file
--- a/arkindex/documents/export/tables.sql
+++ b/arkindex/documents/export/tables.sql
--- a/arkindex/documents/export/transcription.sql
+++ b/arkindex/documents/export/transcription.sql
+SELECT
+    transcription.id,
+    transcription.element_id,
+    transcription.text,
+    transcription.confidence,
+    transcription.worker_version_id
+FROM documents_transcription transcription
+INNER JOIN documents_element element ON (element.id = transcription.element_id)
+WHERE element.corpus_id = '{corpus_id}'::uuid
\ No newline at end of file
--- a/arkindex/documents/export/transcription_entity.sql
+++ b/arkindex/documents/export/transcription_entity.sql
+-- This joins with the entity table to filter by corpus, as going via transcriptions
+-- requires two joins on larger tables (transcriptions then elements)
+SELECT
+    te.id,
+    te.transcription_id,
+    te.entity_id,
+    te.offset,
+    te.length,
+    te.worker_version_id
+FROM documents_transcriptionentity te
+INNER JOIN documents_entity entity ON (te.entity_id = entity.id)
+WHERE entity.corpus_id = '{corpus_id}'::uuid
--- a/arkindex/documents/export/worker_version.sql
+++ b/arkindex/documents/export/worker_version.sql
+-- This filters worker versions to only include those used in any of the kinds
+-- of ML results we have.  Doing it using LEFT JOIN would require 10 joins and
+-- fills up the RAM.  Adding DISTINCT to all the SELECT queries of the UNION
+-- slows this query down by ~20%.  Using multiple INs instead of a UNION makes
+-- this query twice as slow.
+SELECT version.id, worker.name, worker.slug, worker.type, revision.hash
+FROM dataimport_workerversion version
+INNER JOIN dataimport_worker worker ON (version.worker_id = worker.id)
+INNER JOIN dataimport_revision revision ON (version.revision_id = revision.id)
+WHERE version.id IN (
+    SELECT worker_version_id FROM documents_element WHERE corpus_id = '{corpus_id}'::uuid
+UNION
+    SELECT worker_version_id FROM documents_entity WHERE corpus_id = '{corpus_id}'::uuid
+UNION
+    SELECT classification.worker_version_id
+    FROM documents_classification classification
+    INNER JOIN documents_element element ON (element.id = classification.element_id)
+    WHERE element.corpus_id = '{corpus_id}'::uuid
+UNION
+    SELECT transcription.worker_version_id
+    FROM documents_transcription transcription
+    INNER JOIN documents_element element ON (element.id = transcription.element_id)
+    WHERE element.corpus_id = '{corpus_id}'::uuid
+UNION
+    SELECT te.worker_version_id
+    FROM documents_transcriptionentity te
+    INNER JOIN documents_entity entity ON (te.entity_id = entity.id)
+    WHERE entity.corpus_id = '{corpus_id}'::uuid
+)