Skip to content
Snippets Groups Projects
Verified Commit 712662f3 authored by Erwan Rouchet's avatar Erwan Rouchet
Browse files

First working export

parent b6ec014f
No related branches found
No related tags found
No related merge requests found
import csv
import sqlite3
import tempfile
from io import StringIO
from itertools import islice
from pathlib import Path
......@@ -16,12 +17,12 @@ CSV_BATCH_SIZE = 10000
# Map SQLite table names to PostgreSQL queries
EXPORT_QUERIES = [(
'image',
# TODO: Build URLs
"""
SELECT image.id, '', image.width, image.height
SELECT DISTINCT image.id, CONCAT(TRIM(TRAILING '/' FROM server.url), '/', image.path), image.width, image.height
FROM images_image image
INNER JOIN images_zone zone ON (zone.image_id = image.id)
INNER JOIN documents_element element ON (element.zone_id = zone.id)
INNER JOIN images_imageserver server ON (server.id = image.server_id)
WHERE element.corpus_id = '{corpus_id}'::uuid
"""
), (
......@@ -97,14 +98,14 @@ EXPORT_QUERIES = [(
classification.element_id,
mlclass.name,
classification.state,
user.email,
moderator.email,
classification.confidence,
classification.high_confidence::integer,
classification.worker_version_id
FROM documents_classification classification
INNER JOIN documents_element element ON (element.id = classification.element_id)
INNER JOIN documents_mlclass mlclass ON (mlclass.id = classification.ml_class_id)
LEFT JOIN users_user user ON (user.id = classification.moderator_id)
LEFT JOIN users_user moderator ON (moderator.id = classification.moderator_id)
WHERE element.corpus_id = '{corpus_id}'::uuid
"""
), (
......@@ -115,11 +116,11 @@ EXPORT_QUERIES = [(
entity.name,
entity.type,
entity.validated::integer,
user.email,
moderator.email,
hstore_to_json(entity.metas),
entity.worker_version_id
FROM documents_entity entity
INNER JOIN users_user user ON (user.id = entity.moderator_id)
LEFT JOIN users_user moderator ON (moderator.id = entity.moderator_id)
WHERE entity.corpus_id = '{corpus_id}'::uuid
"""
), (
......@@ -171,10 +172,11 @@ EXPORT_QUERIES = [(
)]
def pg_to_csv(query):
    """Run a PostgreSQL query and return its result as an in-memory CSV buffer.

    Uses ``COPY (...) TO STDOUT`` so PostgreSQL streams CSV directly into the
    buffer. NULL values are serialized as the ``__null__`` sentinel so the
    SQLite loader can restore them as ``None``.

    :param query: A SQL SELECT statement. It is interpolated verbatim into the
        COPY statement, so it must only come from the trusted EXPORT_QUERIES
        list — never from user input.
    :returns: A ``StringIO`` rewound to position 0, ready for reading.
    """
    output = StringIO()
    with connections['default'].cursor() as pg_cursor:
        pg_cursor.copy_expert(
            f"COPY ({query}) TO STDOUT WITH (FORMAT CSV, HEADER OFF, NULL '__null__')",
            output,
        )
    # copy_expert leaves the buffer positioned at EOF; rewind it so the
    # caller (csv_to_sqlite) reads the CSV from the beginning instead of
    # seeing an empty stream.
    output.seek(0)
    return output
def csv_to_sqlite(csv_file, table, cursor):
......@@ -186,6 +188,12 @@ def csv_to_sqlite(csv_file, table, cursor):
if not len(rows):
return
# Replace null strings with None
for row in rows:
for i in range(len(row)):
if row[i] == "__null__":
row[i] = None
# Build the parameterized query by counting the columns in a CSV row and repeating '?' parameters
insert_args = ",".join("?" for _ in range(len(rows[0])))
query = f"INSERT INTO {table} VALUES ({insert_args})"
......@@ -202,17 +210,16 @@ def export_corpus(corpus_id: str) -> None:
# Initialize all the tables
cursor.executescript((BASE_DIR / 'tables.sql').read_text())
with tempfile.TemporaryFile() as csv_file:
for i, (table_name, query) in enumerate(EXPORT_QUERIES):
if rq_job:
rq_job.set_progress(i / len(EXPORT_QUERIES))
pg_to_csv(csv_file, query.format(corpus_id=corpus_id))
for i, (table_name, query) in enumerate(EXPORT_QUERIES):
if rq_job:
rq_job.set_progress(i / len(EXPORT_QUERIES))
csv_file = pg_to_csv(query.format(corpus_id=corpus_id))
if rq_job:
rq_job.set_progress((i + 0.5) / len(EXPORT_QUERIES))
csv_to_sqlite(csv_file, table_name, cursor)
if rq_job:
rq_job.set_progress((i + 0.5) / len(EXPORT_QUERIES))
csv_to_sqlite(csv_file, table_name, cursor)
db.commit()
db.commit()
db.close()
return db_path
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment