Skip to content
Snippets Groups Projects
Commit 449d0891 authored by Erwan Rouchet's avatar Erwan Rouchet Committed by Bastien Abadie
Browse files

Split everything into SQL files

parent e13e7c7b
No related branches found
No related tags found
No related merge requests found
Showing
with 142 additions and 167 deletions
......@@ -14,181 +14,20 @@ BASE_DIR = Path(__file__).absolute().parent
# Maximum number of rows that will be stored in memory at once before being sent to SQLite.
CSV_BATCH_SIZE = 10000
# Map SQLite table names to PostgreSQL queries
EXPORT_QUERIES = [(
# Map SQLite table names to PostgreSQL query files
EXPORT_QUERIES = [
'image',
"""
SELECT DISTINCT image.id, CONCAT(TRIM(TRAILING '/' FROM server.url), '/', image.path), image.width, image.height
FROM images_image image
INNER JOIN images_zone zone ON (zone.image_id = image.id)
INNER JOIN documents_element element ON (element.zone_id = zone.id)
INNER JOIN images_imageserver server ON (server.id = image.server_id)
WHERE element.corpus_id = '{corpus_id}'::uuid
"""
), (
'worker_version',
"""
SELECT version.id, worker.name, worker.slug, worker.type, revision.hash
FROM dataimport_workerversion version
INNER JOIN dataimport_worker worker ON (version.worker_id = worker.id)
INNER JOIN dataimport_revision revision ON (version.revision_id = revision.id)
WHERE version.id IN (
SELECT worker_version_id FROM documents_element WHERE corpus_id = '{corpus_id}'::uuid
UNION
SELECT worker_version_id FROM documents_entity WHERE corpus_id = '{corpus_id}'::uuid
UNION
SELECT classification.worker_version_id
FROM documents_classification classification
INNER JOIN documents_element element ON (element.id = classification.element_id)
WHERE element.corpus_id = '{corpus_id}'::uuid
UNION
SELECT transcription.worker_version_id
FROM documents_transcription transcription
INNER JOIN documents_element element ON (element.id = transcription.element_id)
WHERE element.corpus_id = '{corpus_id}'::uuid
UNION
SELECT te.worker_version_id
FROM documents_transcriptionentity te
INNER JOIN documents_entity entity ON (te.entity_id = entity.id)
WHERE entity.corpus_id = '{corpus_id}'::uuid
)
"""
), (
'element',
# Dates are exported as UNIX timestamps because SQLite does not have a datetime type,
# but has date and time functions that take UNIX timestamps as arguments.
"""
SELECT
element.id,
EXTRACT(EPOCH FROM element.created),
EXTRACT(EPOCH FROM element.updated),
element.name,
type.slug,
ST_AsGeoJSON(zone.polygon)::json->'coordinates',
zone.image_id,
element.worker_version_id
FROM documents_element element
INNER JOIN documents_elementtype type ON (element.type_id = type.id)
LEFT JOIN images_zone zone ON (element.zone_id = zone.id)
WHERE element.corpus_id = '{corpus_id}'::uuid
"""
), (
'element_path',
# The destination table has id, parent_id, child_id, ordering.
# In an ElementPath, the child ID is the element ID, and its immediate parent
# is the last item in the array (path[array_length(path, 1)]).
# We can safely ditch the rest of the array as the parent will have its own paths
# with its own parents as last items.
# This uses DISTINCT ON because when the parent has two paths, the child will have
# two paths with the same direct parent. These two paths should have the same
# ordering, but this is not actually enforced by any constraint. ORDER BY is used
# because DISTINCT ON requires it.
"""
SELECT
DISTINCT ON (path.path[array_length(path.path, 1)], path.element_id)
path.id,
path.path[array_length(path.path, 1)],
path.element_id,
path.ordering
FROM documents_elementpath path
INNER JOIN documents_element element ON (path.element_id = element.id)
WHERE element.corpus_id = '{corpus_id}'::uuid
ORDER BY path.path[array_length(path.path, 1)], path.element_id
"""
), (
'transcription',
"""
SELECT
transcription.id,
transcription.element_id,
transcription.text,
transcription.confidence,
transcription.worker_version_id
FROM documents_transcription transcription
INNER JOIN documents_element element ON (element.id = transcription.element_id)
WHERE element.corpus_id = '{corpus_id}'::uuid
"""
), (
'classification',
# SQLite has no boolean type, so high_confidence becomes an integer (0 or 1)
"""
SELECT
classification.id,
classification.element_id,
mlclass.name,
classification.state,
moderator.email,
classification.confidence,
classification.high_confidence::integer,
classification.worker_version_id
FROM documents_classification classification
INNER JOIN documents_element element ON (element.id = classification.element_id)
INNER JOIN documents_mlclass mlclass ON (mlclass.id = classification.ml_class_id)
LEFT JOIN users_user moderator ON (moderator.id = classification.moderator_id)
WHERE element.corpus_id = '{corpus_id}'::uuid
"""
), (
'entity',
"""
SELECT
entity.id,
entity.name,
entity.type,
entity.validated::integer,
moderator.email,
hstore_to_json(entity.metas),
entity.worker_version_id
FROM documents_entity entity
LEFT JOIN users_user moderator ON (moderator.id = entity.moderator_id)
WHERE entity.corpus_id = '{corpus_id}'::uuid
"""
), (
'transcription_entity',
# This joins with the entity table to filter by corpus, as going via transcriptions
# requires two joins on larger tables (transcriptions then elements)
"""
SELECT
te.id,
te.transcription_id,
te.entity_id,
te.offset,
te.length,
te.worker_version_id
FROM documents_transcriptionentity te
INNER JOIN documents_entity entity ON (te.entity_id = entity.id)
WHERE entity.corpus_id = '{corpus_id}'::uuid
"""
), (
'entity_role',
"""
SELECT id, parent_name, child_name, parent_type, child_type
FROM documents_entityrole
WHERE corpus_id = '{corpus_id}'::uuid
"""
), (
'entity_link',
"""
SELECT link.id, link.parent_id, link.child_id, link.role_id
FROM documents_entitylink link
INNER JOIN documents_entityrole role ON (link.role_id = role.id)
WHERE role.corpus_id = '{corpus_id}'::uuid
"""
), (
'metadata',
"""
SELECT
metadata.id,
metadata.element_id,
metadata.name,
metadata.type,
metadata.value,
metadata.entity_id,
metadata.worker_version_id
FROM documents_metadata metadata
INNER JOIN documents_element element ON (element.id = metadata.element_id)
WHERE element.corpus_id = '{corpus_id}'::uuid
"""
)]
]
def pg_to_csv(query):
......@@ -227,16 +66,18 @@ def export_corpus(corpus_id: str) -> None:
db = sqlite3.connect(db_path)
cursor = db.cursor()
# Initialize all the tables
cursor.executescript((BASE_DIR / 'tables.sql').read_text())
cursor.executescript((BASE_DIR / 'structure.sql').read_text())
for i, name in enumerate(EXPORT_QUERIES):
query = (BASE_DIR / f'{name}.sql').read_text()
for i, (table_name, query) in enumerate(EXPORT_QUERIES):
if rq_job:
rq_job.set_progress(i / len(EXPORT_QUERIES))
csv_file = pg_to_csv(query.format(corpus_id=corpus_id))
if rq_job:
rq_job.set_progress((i + 0.5) / len(EXPORT_QUERIES))
csv_to_sqlite(csv_file, table_name, cursor)
csv_to_sqlite(csv_file, name, cursor)
db.commit()
db.close()
......
SELECT
classification.id,
classification.element_id,
mlclass.name,
classification.state,
moderator.email,
classification.confidence,
-- SQLite has no boolean type, so high_confidence becomes an integer (0 or 1)
classification.high_confidence::integer,
classification.worker_version_id
FROM documents_classification classification
INNER JOIN documents_element element ON (element.id = classification.element_id)
INNER JOIN documents_mlclass mlclass ON (mlclass.id = classification.ml_class_id)
LEFT JOIN users_user moderator ON (moderator.id = classification.moderator_id)
WHERE element.corpus_id = '{corpus_id}'::uuid
-- Dates are exported as UNIX timestamps because SQLite does not have a datetime type,
-- but has date and time functions that take UNIX timestamps as arguments.
SELECT
element.id,
EXTRACT(EPOCH FROM element.created),
EXTRACT(EPOCH FROM element.updated),
element.name,
type.slug,
ST_AsGeoJSON(zone.polygon)::json->'coordinates',
zone.image_id,
element.worker_version_id
FROM documents_element element
INNER JOIN documents_elementtype type ON (element.type_id = type.id)
LEFT JOIN images_zone zone ON (element.zone_id = zone.id)
WHERE element.corpus_id = '{corpus_id}'::uuid
-- The destination table has id, parent_id, child_id, ordering.
-- In an ElementPath, the child ID is the element ID, and its immediate parent
-- is the last item in the array (path[array_length(path, 1)]).
-- We can safely ditch the rest of the array as the parent will have its own paths
-- with its own parents as last items.
-- This uses DISTINCT ON because when the parent has two paths, the child will have
-- two paths with the same direct parent. These two paths should have the same
-- ordering, but this is not actually enforced by any constraint. ORDER BY is used
-- because DISTINCT ON requires it.
SELECT
DISTINCT ON (path.path[array_length(path.path, 1)], path.element_id)
path.id,
path.path[array_length(path.path, 1)],
path.element_id,
path.ordering
FROM documents_elementpath path
INNER JOIN documents_element element ON (path.element_id = element.id)
WHERE element.corpus_id = '{corpus_id}'::uuid
ORDER BY path.path[array_length(path.path, 1)], path.element_id
SELECT
entity.id,
entity.name,
entity.type,
entity.validated::integer,
moderator.email,
hstore_to_json(entity.metas),
entity.worker_version_id
FROM documents_entity entity
LEFT JOIN users_user moderator ON (moderator.id = entity.moderator_id)
WHERE entity.corpus_id = '{corpus_id}'::uuid
\ No newline at end of file
SELECT link.id, link.parent_id, link.child_id, link.role_id
FROM documents_entitylink link
INNER JOIN documents_entityrole role ON (link.role_id = role.id)
WHERE role.corpus_id = '{corpus_id}'::uuid
\ No newline at end of file
SELECT id, parent_name, child_name, parent_type, child_type
FROM documents_entityrole
WHERE corpus_id = '{corpus_id}'::uuid
\ No newline at end of file
SELECT DISTINCT image.id, CONCAT(TRIM(TRAILING '/' FROM server.url), '/', image.path), image.width, image.height
FROM images_image image
INNER JOIN images_zone zone ON (zone.image_id = image.id)
INNER JOIN documents_element element ON (element.zone_id = zone.id)
INNER JOIN images_imageserver server ON (server.id = image.server_id)
WHERE element.corpus_id = '{corpus_id}'::uuid
\ No newline at end of file
SELECT
metadata.id,
metadata.element_id,
metadata.name,
metadata.type,
metadata.value,
metadata.entity_id,
metadata.worker_version_id
FROM documents_metadata metadata
INNER JOIN documents_element element ON (element.id = metadata.element_id)
WHERE element.corpus_id = '{corpus_id}'::uuid
\ No newline at end of file
SELECT
transcription.id,
transcription.element_id,
transcription.text,
transcription.confidence,
transcription.worker_version_id
FROM documents_transcription transcription
INNER JOIN documents_element element ON (element.id = transcription.element_id)
WHERE element.corpus_id = '{corpus_id}'::uuid
\ No newline at end of file
-- This joins with the entity table to filter by corpus, as going via transcriptions
-- requires two joins on larger tables (transcriptions then elements)
SELECT
te.id,
te.transcription_id,
te.entity_id,
te.offset,
te.length,
te.worker_version_id
FROM documents_transcriptionentity te
INNER JOIN documents_entity entity ON (te.entity_id = entity.id)
WHERE entity.corpus_id = '{corpus_id}'::uuid
-- This filters worker versions to only include those used in any of the kinds
-- of ML results we have. Doing it using LEFT JOIN would require 10 joins and
-- fills up the RAM. Adding DISTINCT to all the SELECT queries of the UNION
-- slows this query down by ~20%. Using multiple INs instead of a UNION makes
-- this query twice as slow.
SELECT version.id, worker.name, worker.slug, worker.type, revision.hash
FROM dataimport_workerversion version
INNER JOIN dataimport_worker worker ON (version.worker_id = worker.id)
INNER JOIN dataimport_revision revision ON (version.revision_id = revision.id)
WHERE version.id IN (
SELECT worker_version_id FROM documents_element WHERE corpus_id = '{corpus_id}'::uuid
UNION
SELECT worker_version_id FROM documents_entity WHERE corpus_id = '{corpus_id}'::uuid
UNION
SELECT classification.worker_version_id
FROM documents_classification classification
INNER JOIN documents_element element ON (element.id = classification.element_id)
WHERE element.corpus_id = '{corpus_id}'::uuid
UNION
SELECT transcription.worker_version_id
FROM documents_transcription transcription
INNER JOIN documents_element element ON (element.id = transcription.element_id)
WHERE element.corpus_id = '{corpus_id}'::uuid
UNION
SELECT te.worker_version_id
FROM documents_transcriptionentity te
INNER JOIN documents_entity entity ON (te.entity_id = entity.id)
WHERE entity.corpus_id = '{corpus_id}'::uuid
)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment