Commit 712662f3 (Verified)
Authored 3 years ago by Erwan Rouchet
Parent: b6ec014f

First working export

No related branches, tags or merge requests found.
Showing 1 changed file with 25 additions and 18 deletions:

arkindex/documents/export/__init__.py (+25 −18)
 import csv
 import sqlite3
 import tempfile
+from io import StringIO
 from itertools import islice
 from pathlib import Path
@@ -16,12 +17,12 @@ CSV_BATCH_SIZE = 10000
 # Map SQLite table names to PostgreSQL queries
 EXPORT_QUERIES = [(
     'image',
-    # TODO: Build URLs
     """
-    SELECT image.id, '', image.width, image.height
+    SELECT DISTINCT image.id, CONCAT(TRIM(TRAILING '/' FROM server.url), '/', image.path), image.width, image.height
     FROM images_image image
     INNER JOIN images_zone zone ON (zone.image_id = image.id)
     INNER JOIN documents_element element ON (element.zone_id = zone.id)
+    INNER JOIN images_imageserver server ON (server.id = image.server_id)
     WHERE element.corpus_id = '{corpus_id}'::uuid
     """
 ),
 (
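The rewritten `image` query is the heart of this commit: where the old query exported an empty string as a placeholder (hence the removed `# TODO: Build URLs`), the new one joins `images_imageserver` and assembles the full image URL in SQL. `SELECT DISTINCT` is presumably needed because an image referenced by several elements would otherwise be exported once per element, and `TRIM(TRAILING '/' FROM server.url)` keeps a double slash out of the URL when the server URL already ends with one. A rough Python equivalent of the CONCAT expression, with hypothetical values, just to illustrate:

def build_image_url(server_url: str, image_path: str) -> str:
    # Mirrors CONCAT(TRIM(TRAILING '/' FROM server.url), '/', image.path):
    # strip trailing slashes from the server URL, then join with a single '/'.
    return server_url.rstrip('/') + '/' + image_path

# Hypothetical server URL; both spellings yield the same export value:
assert build_image_url('https://iiif.example.com/', 'books/page1.jpg') == 'https://iiif.example.com/books/page1.jpg'
assert build_image_url('https://iiif.example.com', 'books/page1.jpg') == 'https://iiif.example.com/books/page1.jpg'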
@@ -97,14 +98,14 @@ EXPORT_QUERIES = [(
         classification.element_id,
         mlclass.name,
         classification.state,
-        user.email,
+        moderator.email,
         classification.confidence,
         classification.high_confidence::integer,
         classification.worker_version_id
     FROM documents_classification classification
     INNER JOIN documents_element element ON (element.id = classification.element_id)
     INNER JOIN documents_mlclass mlclass ON (mlclass.id = classification.ml_class_id)
-    LEFT JOIN users_user user ON (user.id = classification.moderator_id)
+    LEFT JOIN users_user moderator ON (moderator.id = classification.moderator_id)
     WHERE element.corpus_id = '{corpus_id}'::uuid
     """
 ),
 (
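Renaming the `users_user` alias from `user` to `moderator` is more than cosmetic: `user` is a reserved keyword in PostgreSQL (a bare `user` evaluates to the current database role), so it cannot serve as an unquoted table alias. `moderator` avoids the keyword and also describes the join better, since the foreign key is `moderator_id`. This is likely part of why the commit message calls this the first *working* export.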
@@ -115,11 +116,11 @@ EXPORT_QUERIES = [(
         entity.name,
         entity.type,
         entity.validated::integer,
-        user.email,
+        moderator.email,
         hstore_to_json(entity.metas),
         entity.worker_version_id
     FROM documents_entity entity
-    INNER JOIN users_user user ON (user.id = entity.moderator_id)
+    LEFT JOIN users_user moderator ON (moderator.id = entity.moderator_id)
     WHERE entity.corpus_id = '{corpus_id}'::uuid
     """
 ),
 (
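Beyond the same alias rename, the entity query switches from `INNER JOIN` to `LEFT JOIN`, which matters for correctness: with an inner join, every entity whose `moderator_id` is NULL (i.e. one that was never moderated) would be silently dropped from the export. The left join keeps those rows and exports a NULL email instead.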
@@ -171,10 +172,11 @@ EXPORT_QUERIES = [(
 )]


-def pg_to_csv(csv_file, query):
-    csv_file.seek(0)
+def pg_to_csv(query):
+    output = StringIO()
     with connections['default'].cursor() as pg_cursor:
-        pg_cursor.copy_expert(f"COPY ({query}) TO STDOUT WITH FORMAT CSV, HEADER OFF", csv_file)
+        pg_cursor.copy_expert(f"COPY ({query}) TO STDOUT WITH (FORMAT CSV, HEADER OFF, NULL '__null__')", output)
+    return output


 def csv_to_sqlite(csv_file, table, cursor):
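Two fixes land in `pg_to_csv` at once. First, the old `WITH FORMAT CSV, HEADER OFF` is not valid COPY syntax: PostgreSQL's option-list form requires parentheses, i.e. `WITH (FORMAT CSV, ...)`. Second, the new `NULL '__null__'` option writes a sentinel for SQL NULLs, which plain CSV cannot otherwise distinguish from empty strings. A minimal standalone sketch of the same round-trip using psycopg2 directly (the connection string and the query are hypothetical):

import csv
from io import StringIO

import psycopg2

conn = psycopg2.connect('dbname=arkindex')  # hypothetical connection string

buffer = StringIO()
with conn.cursor() as pg_cursor:
    # The option list must be parenthesized; `WITH FORMAT CSV` raises a syntax error.
    pg_cursor.copy_expert(
        "COPY (SELECT id, email FROM users_user) TO STDOUT "
        "WITH (FORMAT CSV, HEADER OFF, NULL '__null__')",
        buffer,
    )

buffer.seek(0)  # copy_expert leaves the buffer positioned at its end
rows = list(csv.reader(buffer))

Note that the returned `StringIO` still needs rewinding before it can be read; in this commit that presumably happens inside `csv_to_sqlite`, whose body is mostly outside the diff.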
@@ -186,6 +188,12 @@ def csv_to_sqlite(csv_file, table, cursor):
     if not len(rows):
         return

+    # Replace null strings with None
+    for row in rows:
+        for i in range(len(row)):
+            if row[i] == "__null__":
+                row[i] = None
+
     # Build the parameterized query by counting the columns in a CSV row and repeating '?' parameters
     insert_args = ",".join("?" for _ in range(len(rows[0])))
     query = f"INSERT INTO {table} VALUES ({insert_args})"
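Only a few lines of `csv_to_sqlite` are visible in this diff, but together with the imports (`csv`, `islice`) and `CSV_BATCH_SIZE = 10000` they suggest its overall shape: read the CSV in batches, map the `__null__` sentinel back to `None` (which sqlite3 stores as NULL), and insert each batch with a parameterized query. A sketch of that shape, assuming everything outside the visible lines:

import csv
from itertools import islice

CSV_BATCH_SIZE = 10000

def csv_to_sqlite(csv_file, table, cursor):
    csv_file.seek(0)  # assumed: rewind the buffer returned by pg_to_csv
    reader = csv.reader(csv_file)
    while True:
        # Assumed batching: read up to CSV_BATCH_SIZE rows at a time
        rows = list(islice(reader, CSV_BATCH_SIZE))
        if not len(rows):
            return
        # Replace null strings with None (these lines are in the diff)
        for row in rows:
            for i in range(len(row)):
                if row[i] == "__null__":
                    row[i] = None
        # Build the parameterized query by counting the columns in a CSV row
        insert_args = ",".join("?" for _ in range(len(rows[0])))
        cursor.executemany(f"INSERT INTO {table} VALUES ({insert_args})", rows)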
@@ -202,17 +210,16 @@ def export_corpus(corpus_id: str) -> None:
     # Initialize all the tables
     cursor.executescript((BASE_DIR / 'tables.sql').read_text())

-    with tempfile.TemporaryFile() as csv_file:
-        for i, (table_name, query) in enumerate(EXPORT_QUERIES):
-            if rq_job:
-                rq_job.set_progress(i / len(EXPORT_QUERIES))
-            pg_to_csv(csv_file, query.format(corpus_id=corpus_id))
+    for i, (table_name, query) in enumerate(EXPORT_QUERIES):
+        if rq_job:
+            rq_job.set_progress(i / len(EXPORT_QUERIES))
+        csv_file = pg_to_csv(query.format(corpus_id=corpus_id))

-            if rq_job:
-                rq_job.set_progress((i + 0.5) / len(EXPORT_QUERIES))
-            csv_to_sqlite(csv_file, table_name, cursor)
+        if rq_job:
+            rq_job.set_progress((i + 0.5) / len(EXPORT_QUERIES))
+        csv_to_sqlite(csv_file, table_name, cursor)

-            db.commit()
+        db.commit()
     db.close()

     return db_path
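Two smaller consequences of the `export_corpus` rewrite are worth noting. Moving from a shared `tempfile.TemporaryFile` to a fresh `StringIO` per table keeps each table's CSV in memory rather than on disk, which removes the shared-file rewinding from the loop at the cost of RAM on large corpora; it also appears to leave `import tempfile` unused, unless it is still needed elsewhere in the file. Progress now advances in half-steps per table: `i / len(EXPORT_QUERIES)` before dumping from PostgreSQL, `(i + 0.5) / len(EXPORT_QUERIES)` before loading into SQLite. Finally, the hunk header still shows `def export_corpus(corpus_id: str) -> None:` while the function ends with `return db_path`, so the return annotation no longer matches the behaviour.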