Commit 712662f3 (Verified)
Authored 3 years ago by Erwan Rouchet
Parent: b6ec014f

First working export

No related branches, tags or merge requests found.
Showing 1 changed file with 25 additions and 18 deletions:

arkindex/documents/export/__init__.py (+25 −18)
 import csv
 import sqlite3
 import tempfile
+from io import StringIO
 from itertools import islice
 from pathlib import Path
@@ -16,12 +17,12 @@ CSV_BATCH_SIZE = 10000
 # Map SQLite table names to PostgreSQL queries
 EXPORT_QUERIES = [(
     'image',
-    # TODO: Build URLs
     """
-    SELECT image.id, '', image.width, image.height
+    SELECT DISTINCT image.id, CONCAT(TRIM(TRAILING '/' FROM server.url), '/', image.path), image.width, image.height
     FROM images_image image
     INNER JOIN images_zone zone ON (zone.image_id = image.id)
     INNER JOIN documents_element element ON (element.zone_id = zone.id)
+    INNER JOIN images_imageserver server ON (server.id = image.server_id)
     WHERE element.corpus_id = '{corpus_id}'::uuid
     """
 ),
 (
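The rewritten `image` query is the heart of this commit: where the old query exported an empty string as a placeholder (hence the removed `# TODO: Build URLs`), the new one joins `images_imageserver` and assembles the full image URL in SQL. `SELECT DISTINCT` is presumably needed because an image referenced by several elements would otherwise be exported once per element, and `TRIM(TRAILING '/' FROM server.url)` keeps a double slash out of the URL when the server URL already ends with one. A rough Python equivalent of the CONCAT expression, with hypothetical values, just to illustrate:

def build_image_url(server_url: str, image_path: str) -> str:
    # Mirrors CONCAT(TRIM(TRAILING '/' FROM server.url), '/', image.path):
    # strip trailing slashes from the server URL, then join with a single '/'.
    return server_url.rstrip('/') + '/' + image_path

# Hypothetical server URL; both spellings yield the same export value:
assert build_image_url('https://iiif.example.com/', 'books/page1.jpg') == 'https://iiif.example.com/books/page1.jpg'
assert build_image_url('https://iiif.example.com', 'books/page1.jpg') == 'https://iiif.example.com/books/page1.jpg'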
@@ -97,14 +98,14 @@ EXPORT_QUERIES = [(
         classification.element_id,
         mlclass.name,
         classification.state,
-        user.email,
+        moderator.email,
         classification.confidence,
         classification.high_confidence::integer,
         classification.worker_version_id
     FROM documents_classification classification
     INNER JOIN documents_element element ON (element.id = classification.element_id)
     INNER JOIN documents_mlclass mlclass ON (mlclass.id = classification.ml_class_id)
-    LEFT JOIN users_user user ON (user.id = classification.moderator_id)
+    LEFT JOIN users_user moderator ON (moderator.id = classification.moderator_id)
     WHERE element.corpus_id = '{corpus_id}'::uuid
     """
 ),
 (
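Renaming the `users_user` alias from `user` to `moderator` is more than cosmetic: `user` is a reserved keyword in PostgreSQL (a bare `user` evaluates to the current database role), so it cannot serve as an unquoted table alias. `moderator` avoids the keyword and also describes the join better, since the foreign key is `moderator_id`. This is likely part of why the commit message calls this the first *working* export.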
@@ -115,11 +116,11 @@ EXPORT_QUERIES = [(
         entity.name,
         entity.type,
         entity.validated::integer,
-        user.email,
+        moderator.email,
         hstore_to_json(entity.metas),
         entity.worker_version_id
     FROM documents_entity entity
-    INNER JOIN users_user user ON (user.id = entity.moderator_id)
+    LEFT JOIN users_user moderator ON (moderator.id = entity.moderator_id)
     WHERE entity.corpus_id = '{corpus_id}'::uuid
     """
 ),
 (
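Beyond the same alias rename, the entity query switches from `INNER JOIN` to `LEFT JOIN`, which matters for correctness: with an inner join, every entity whose `moderator_id` is NULL (i.e. one that was never moderated) would be silently dropped from the export. The left join keeps those rows and exports a NULL email instead.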
@@ -171,10 +172,11 @@ EXPORT_QUERIES = [(
 )]


-def pg_to_csv(csv_file, query):
-    csv_file.seek(0)
+def pg_to_csv(query):
+    output = StringIO()
     with connections['default'].cursor() as pg_cursor:
-        pg_cursor.copy_expert(f"COPY ({query}) TO STDOUT WITH FORMAT CSV, HEADER OFF", csv_file)
+        pg_cursor.copy_expert(f"COPY ({query}) TO STDOUT WITH (FORMAT CSV, HEADER OFF, NULL '__null__')", output)
+    return output


 def csv_to_sqlite(csv_file, table, cursor):
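Two fixes land in `pg_to_csv` at once. First, the old `WITH FORMAT CSV, HEADER OFF` is not valid COPY syntax: PostgreSQL's option-list form requires parentheses, i.e. `WITH (FORMAT CSV, ...)`. Second, the new `NULL '__null__'` option writes a sentinel for SQL NULLs, which plain CSV cannot otherwise distinguish from empty strings. A minimal standalone sketch of the same round-trip using psycopg2 directly (the connection string and the query are hypothetical):

import csv
from io import StringIO

import psycopg2

conn = psycopg2.connect('dbname=arkindex')  # hypothetical connection string

buffer = StringIO()
with conn.cursor() as pg_cursor:
    # The option list must be parenthesized; `WITH FORMAT CSV` raises a syntax error.
    pg_cursor.copy_expert(
        "COPY (SELECT id, email FROM users_user) TO STDOUT "
        "WITH (FORMAT CSV, HEADER OFF, NULL '__null__')",
        buffer,
    )

buffer.seek(0)  # copy_expert leaves the buffer positioned at its end
rows = list(csv.reader(buffer))

Note that the returned `StringIO` still needs rewinding before it can be read; in this commit that presumably happens inside `csv_to_sqlite`, whose body is mostly outside the diff.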
@@ -186,6 +188,12 @@ def csv_to_sqlite(csv_file, table, cursor):
     if not len(rows):
         return

+    # Replace null strings with None
+    for row in rows:
+        for i in range(len(row)):
+            if row[i] == "__null__":
+                row[i] = None
+
     # Build the parameterized query by counting the columns in a CSV row and repeating '?' parameters
     insert_args = ",".join("?" for _ in range(len(rows[0])))
     query = f"INSERT INTO {table} VALUES ({insert_args})"
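Only a few lines of `csv_to_sqlite` are visible in this diff, but together with the imports (`csv`, `islice`) and `CSV_BATCH_SIZE = 10000` they suggest its overall shape: read the CSV in batches, map the `__null__` sentinel back to `None` (which sqlite3 stores as NULL), and insert each batch with a parameterized query. A sketch of that shape, assuming everything outside the visible lines:

import csv
from itertools import islice

CSV_BATCH_SIZE = 10000

def csv_to_sqlite(csv_file, table, cursor):
    csv_file.seek(0)  # assumed: rewind the buffer returned by pg_to_csv
    reader = csv.reader(csv_file)
    while True:
        # Assumed batching: read up to CSV_BATCH_SIZE rows at a time
        rows = list(islice(reader, CSV_BATCH_SIZE))
        if not len(rows):
            return
        # Replace null strings with None (these lines are in the diff)
        for row in rows:
            for i in range(len(row)):
                if row[i] == "__null__":
                    row[i] = None
        # Build the parameterized query by counting the columns in a CSV row
        insert_args = ",".join("?" for _ in range(len(rows[0])))
        cursor.executemany(f"INSERT INTO {table} VALUES ({insert_args})", rows)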
@@ -202,17 +210,16 @@ def export_corpus(corpus_id: str) -> None:
     # Initialize all the tables
     cursor.executescript((BASE_DIR / 'tables.sql').read_text())

-    with tempfile.TemporaryFile() as csv_file:
-        for i, (table_name, query) in enumerate(EXPORT_QUERIES):
-            if rq_job:
-                rq_job.set_progress(i / len(EXPORT_QUERIES))
-            pg_to_csv(csv_file, query.format(corpus_id=corpus_id))
+    for i, (table_name, query) in enumerate(EXPORT_QUERIES):
+        if rq_job:
+            rq_job.set_progress(i / len(EXPORT_QUERIES))
+        csv_file = pg_to_csv(query.format(corpus_id=corpus_id))

-            if rq_job:
-                rq_job.set_progress((i + 0.5) / len(EXPORT_QUERIES))
-            csv_to_sqlite(csv_file, table_name, cursor)
+        if rq_job:
+            rq_job.set_progress((i + 0.5) / len(EXPORT_QUERIES))
+        csv_to_sqlite(csv_file, table_name, cursor)

-            db.commit()
+        db.commit()
     db.close()

     return db_path
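Two smaller consequences of the `export_corpus` rewrite are worth noting. Moving from a shared `tempfile.TemporaryFile` to a fresh `StringIO` per table keeps each table's CSV in memory rather than on disk, which removes the shared-file rewinding from the loop at the cost of RAM on large corpora; it also appears to leave `import tempfile` unused, unless it is still needed elsewhere in the file. Progress now advances in half-steps per table: `i / len(EXPORT_QUERIES)` before dumping from PostgreSQL, `(i + 0.5) / len(EXPORT_QUERIES)` before loading into SQLite. Finally, the hunk header still shows `def export_corpus(corpus_id: str) -> None:` while the function ends with `return db_path`, so the return annotation no longer matches the behaviour.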