Skip to content
Snippets Groups Projects

Merge the parents' caches into the current task's cache

Merged Eva Bardou requested to merge merge-parents-cache into master
2 files
+ 51
38
Compare changes
  • Side-by-side
  • Inline
Files
2
+ 43
15
# -*- coding: utf-8 -*-
import json
import logging
import os
import sqlite3
from peewee import (
BooleanField,
@@ -33,21 +35,6 @@ class JSONField(Field):
return json.loads(value)
def merge_parents_caches(self, parents_cache_paths):
    """
    Merge the given parent cache databases into the local cache database.

    Each parent database is attached to the local connection, its
    ``elements`` and ``transcriptions`` rows are copied with ``REPLACE INTO``
    (a row that violates a uniqueness constraint replaces the existing one,
    so later parents win over earlier ones), and the result is committed.

    Assumes ``self.cursor`` is a SQLite cursor and ``self.db`` the connection
    it belongs to -- TODO confirm against the enclosing class.

    :param parents_cache_paths: Iterable of paths to parent SQLite databases.
    """
    for idx, parent_cache in enumerate(parents_cache_paths):
        schema = f"source{idx}"

        # NOTE(review): SQLite only accepts power-of-two page sizes up to
        # 65536, so this value is presumably ignored -- confirm the intent.
        self.cursor.execute("PRAGMA page_size=80000;")
        self.cursor.execute("PRAGMA synchronous=OFF;")

        # Bind the path as a parameter so that a quote in the file name
        # cannot break (or alter) the statement.
        self.cursor.execute(f"ATTACH DATABASE ? AS {schema};", (parent_cache,))
        self.cursor.execute(f"REPLACE INTO elements SELECT * FROM {schema}.elements;")
        self.cursor.execute(
            f"REPLACE INTO transcriptions SELECT * FROM {schema}.transcriptions;"
        )

        # Commit per parent: the synchronous PRAGMA of the next iteration is
        # rejected while a transaction is still open.
        self.db.commit()

        # Release the source once merged; SQLite caps the number of
        # simultaneously attached databases (10 by default).
        self.cursor.execute(f"DETACH DATABASE {schema};")
class CachedElement(Model):
id = UUIDField(primary_key=True)
parent_id = UUIDField(null=True)
@@ -91,3 +78,44 @@ def create_tables():
Creates the tables in the cache DB only if they do not already exist.
"""
db.create_tables([CachedElement, CachedTranscription])
def merge_parents_caches(parent_ids, current_database, data_dir="/data"):
    """
    Merge the cache databases of the parent tasks into the local one.

    For every ID in ``parent_ids`` the file ``<data_dir>/<parent_id>/db.sqlite``
    is looked up; parents without such a file are skipped. The ``elements``
    and ``transcriptions`` rows of each database found are copied into
    ``current_database`` with ``REPLACE INTO``: a row that violates a
    uniqueness constraint replaces the existing one, so parents override
    local rows and later parents win over earlier ones.

    :param parent_ids: List of parent task identifiers, used as directory names.
    :param current_database: Path to the existing local SQLite database.
    :param data_dir: Directory holding one sub-directory per parent task.
    :raises AssertionError: If ``parent_ids`` is not a list, ``data_dir`` is
        not a directory or ``current_database`` does not exist.
    :raises sqlite3.Error: If one of the SQL statements fails.
    """
    assert isinstance(parent_ids, list)
    assert os.path.isdir(data_dir)
    assert os.path.exists(current_database)

    # TODO: handle chunk

    # Find all the paths for these databases.
    # A real list is required: a lazy ``filter`` object is always truthy,
    # which would make the emptiness check below unreachable.
    candidates = (
        os.path.join(data_dir, parent, "db.sqlite") for parent in parent_ids
    )
    paths = [path for path in candidates if os.path.isfile(path)]

    if not paths:
        logger.info("No parents cache to use")
        return

    # Open a connection on current database
    connection = sqlite3.connect(current_database)
    try:
        # sqlite3 cursors do not implement the context manager protocol,
        # so the cursor is used directly and the connection closed below.
        cursor = connection.cursor()

        # Connection-wide settings, applied once before any transaction.
        # NOTE(review): SQLite only accepts power-of-two page sizes up to
        # 65536, so this value is presumably ignored -- confirm the intent.
        cursor.execute("PRAGMA page_size=80000;")
        cursor.execute("PRAGMA synchronous=OFF;")

        # Merge each table into the local database
        for idx, path in enumerate(paths):
            schema = f"source_{idx}"

            # Bind the path as a parameter so that a quote in the file name
            # cannot break (or alter) the statement.
            cursor.execute(f"ATTACH DATABASE ? AS {schema};", (path,))
            cursor.execute(
                f"REPLACE INTO elements SELECT * FROM {schema}.elements;"
            )
            cursor.execute(
                f"REPLACE INTO transcriptions SELECT * FROM {schema}.transcriptions;"
            )
            connection.commit()

            # Release the source once merged; SQLite caps the number of
            # simultaneously attached databases (10 by default).
            cursor.execute(f"DETACH DATABASE {schema};")
    finally:
        connection.close()

    # TODO: maybe reopen peewee connection ?
Loading