Arkindex / Backend · Merge request !1349: Add SQLite export task
Merged · Erwan Rouchet requested to merge sqlite-export into master 3 years ago
Overview 0 · Commits 11 · Pipelines 0 · Changes 17
Closes #743 (closed)
Edited 3 years ago by Erwan Rouchet
Compare master (base) with latest version 27e39218 · 11 commits, 3 years ago · 17 files · +633 −1
arkindex/documents/export/__init__.py · new file (0 → 100644) · +102 −0
import json
import logging
import sqlite3
import tempfile
import uuid
from pathlib import Path

from django.db import connections
from django_rq import job
from rq import get_current_job

BASE_DIR = Path(__file__).absolute().parent

# Maximum number of rows that will be stored in memory at once before being sent to SQLite.
BATCH_SIZE = 10000

logger = logging.getLogger(__name__)

# SQLite table names; each name also identifies the PostgreSQL query file ({name}.sql) that fills it.
EXPORT_QUERIES = [
    'image',
    'worker_version',
    'element',
    'element_path',
    'transcription',
    'classification',
    'entity',
    'transcription_entity',
    'entity_role',
    'entity_link',
    'metadata',
]


def run_pg_query(query):
    """
    Run a single PostgreSQL query and split the results into chunks.
    When a name is given to a cursor, psycopg2 uses a server-side cursor; we just use a random string as a name.
    """
    with connections['default'].create_cursor(name=str(uuid.uuid4())) as pg_cursor:
        pg_cursor.itersize = BATCH_SIZE
        pg_cursor.execute(query)
        while True:
            out = pg_cursor.fetchmany(BATCH_SIZE)
            if not out:
                return
            yield out


def save_sqlite(rows, table, cursor):
    """
    Write a chunk of rows into an SQLite table.
    """
    def _serialize(value):
        # Serialize UUIDs as strings
        if isinstance(value, uuid.UUID):
            return str(value)
        # Serialize lists and dicts as JSON
        if isinstance(value, (list, dict)):
            return json.dumps(value)
        return value

    rows = [list(map(_serialize, row)) for row in rows]

    # Build the parameterized query by counting the columns in the first row and repeating '?' placeholders
    insert_args = ",".join("?" for _ in range(len(rows[0])))
    query = f"INSERT INTO {table} VALUES ({insert_args})"
    cursor.executemany(query, rows)


@job
def export_corpus(corpus_id: str) -> str:
    """
    Export a corpus and its related data into a temporary SQLite database and return the database path.
    """
    rq_job = get_current_job()
    _, db_path = tempfile.mkstemp(suffix='db')
    logger.info(f"Exporting corpus {corpus_id} into {db_path}")
    db = sqlite3.connect(db_path)
    cursor = db.cursor()

    # Initialize all the tables
    cursor.executescript((BASE_DIR / 'structure.sql').read_text())

    for i, name in enumerate(EXPORT_QUERIES):
        query = (BASE_DIR / f'{name}.sql').read_text()
        logger.info(f"Running query {i + 1}/{len(EXPORT_QUERIES)} {name}")
        if rq_job:
            rq_job.set_progress(i / len(EXPORT_QUERIES))
        for chunk in run_pg_query(query.format(corpus_id=corpus_id)):
            save_sqlite(chunk, name, cursor)

    db.commit()
    db.close()
    return db_path
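
Since export_corpus is decorated with django_rq's @job, it is meant to be enqueued rather than called directly. The following is a minimal sketch of how the task might be enqueued and its result read back, assuming a default django_rq queue and a running worker; the corpus id and the polling loop are illustrative assumptions, not part of this merge request.

import time

from arkindex.documents.export import export_corpus

# The @job decorator adds a .delay() helper that enqueues the function on the default RQ queue.
# The corpus id below is a placeholder value.
rq_job = export_corpus.delay('11111111-1111-1111-1111-111111111111')

# Wait for a worker to process the job; export_corpus returns the path of the generated SQLite file.
while not (rq_job.is_finished or rq_job.is_failed):
    time.sleep(1)

if rq_job.is_finished:
    print('SQLite export written to', rq_job.result)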
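
Once the task has finished, the result is an ordinary SQLite database whose table names match the entries of EXPORT_QUERIES. A minimal sketch of inspecting it, assuming db_path is the path returned by the task:

import sqlite3

def count_rows(db_path):
    # Open the exported database and count the rows in a few of the exported tables.
    db = sqlite3.connect(db_path)
    cursor = db.cursor()
    for table in ('element', 'transcription', 'metadata'):
        count, = cursor.execute(f'SELECT COUNT(*) FROM {table}').fetchone()
        print(f'{table}: {count} rows')
    db.close()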