Commit 316f067b authored by Eva Bardou

Save the cache in the archive too

parent 2ce6237a
Merge request !8: New DatasetExtractor using a DatasetWorker
Pipeline #138669 passed
@@ -19,11 +19,13 @@ def test_process_split(tmp_path, downloaded_images):
     worker = DatasetExtractor()
     # Parse some arguments
     worker.args = Namespace(database=None)
+    worker.data_folder = tmp_path
     worker.configure_cache()
     worker.cached_images = dict()

     # Where to save the downloaded images
-    worker.image_folder = tmp_path
+    worker.images_folder = tmp_path / "images"
+    worker.images_folder.mkdir(parents=True)

     first_page_id = UUID("e26e6803-18da-4768-be30-a0a68132107c")
     second_page_id = UUID("c673bd94-96b1-4a2e-8662-a4d806940b5f")
@@ -80,11 +82,6 @@ def test_process_split(tmp_path, downloaded_images):
         == f"https://europe-gamma.iiif.teklia.com/iiif/2/public%2Fiam%2F{page_name}.png"
     )

-    assert sorted(tmp_path.rglob("*")) == [
-        tmp_path / f"{first_image_id}.jpg",
-        tmp_path / f"{second_image_id}.jpg",
-    ]

     # Should have created 17 transcriptions
     assert CachedTranscription.select().count() == 17
     # Check transcription of first line on first page
@@ -127,3 +124,11 @@ def test_process_split(tmp_path, downloaded_images):
     assert tr_entity.length == 23
     assert tr_entity.confidence == 1.0
     assert tr_entity.worker_run_id is None
+
+    # Full structure of the archive
+    assert sorted(tmp_path.rglob("*")) == [
+        tmp_path / "db.sqlite",
+        tmp_path / "images",
+        tmp_path / "images" / f"{first_image_id}.jpg",
+        tmp_path / "images" / f"{second_image_id}.jpg",
+    ]
@@ -61,6 +61,9 @@ class DatasetExtractor(DatasetWorker):
         # Download corpus
         self.download_latest_export()

+    def configure_storage(self) -> None:
+        self.data_folder = Path(tempfile.mkdtemp(suffix="-arkindex-data"))
+
         # Initialize db that will be written
         self.configure_cache()
@@ -68,17 +71,17 @@ class DatasetExtractor(DatasetWorker):
         self.cached_images = dict()

         # Where to save the downloaded images
-        self.image_folder = Path(tempfile.mkdtemp(suffix="-arkindex-data"))
-        logger.info(f"Images will be saved at `{self.image_folder}`.")
+        self.images_folder = self.data_folder / "images"
+        self.images_folder.mkdir(parents=True)
+        logger.info(f"Images will be saved at `{self.images_folder}`.")

     def configure_cache(self) -> None:
         """
         Create an SQLite database compatible with base-worker cache and initialize it.
         """
         self.use_cache = True
-        self.cache_path: Path = self.args.database or self.work_dir / "db.sqlite"
-        # Remove previous execution result if present
-        self.cache_path.unlink(missing_ok=True)
+        self.cache_path: Path = self.data_folder / "db.sqlite"
+        logger.info(f"Cached database will be saved at `{self.data_folder}`.")

         init_cache_db(self.cache_path)
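With this change, configure_storage and configure_cache lay everything out under a single temporary folder, which is exactly what the test's final assertion checks. A minimal sketch of reading that cache back, assuming init_cache_db and the Cached* peewee models are importable from arkindex_worker.cache as in base-worker (the "db.sqlite" path is illustrative):

    from arkindex_worker.cache import CachedTranscription, init_cache_db

    # data_folder layout after both configure steps have run:
    #   <tmpdir>-arkindex-data/
    #   ├── db.sqlite   <- written through the base-worker cache models
    #   └── images/     <- one <image_id>.jpg per downloaded page image
    init_cache_db("db.sqlite")  # bind the models to the SQLite file
    print(CachedTranscription.select().count())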
@@ -242,7 +245,7 @@ class DatasetExtractor(DatasetWorker):
         # Download image
         logger.info("Downloading image")
         download_image(url=build_image_url(element)).save(
-            self.image_folder / f"{element.image.id}.jpg"
+            self.images_folder / f"{element.image.id}.jpg"
         )

         # Insert image
         logger.info("Inserting image")
@@ -304,15 +307,18 @@ class DatasetExtractor(DatasetWorker):
             self.insert_element(child, parent_id=element.id)

     def process_dataset(self, dataset: Dataset):
+        # Configure temporary storage for the dataset data (cache + images)
+        self.configure_storage()
+
         # Iterate over given splits
         for split_name, elements in self.list_dataset_elements_per_split(dataset):
             casted_elements = list(map(_format_element, elements))
             self.process_split(split_name, casted_elements)

-        # TAR + ZSTD Image folder and store as task artifact
+        # TAR + ZSTD the cache and the images folder, and store as task artifact
         zstd_archive_path: Path = self.work_dir / f"{dataset.id}.zstd"
         logger.info(f"Compressing the images to {zstd_archive_path}")
-        create_tar_zst_archive(source=self.image_folder, destination=zstd_archive_path)
+        create_tar_zst_archive(source=self.data_folder, destination=zstd_archive_path)


 def main():
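The {dataset.id}.zstd artifact is a tar stream compressed with zstd, so a downstream task could restore db.sqlite and the images/ folder using the zstandard package; a minimal sketch, with the function name and paths being illustrative:

    import tarfile
    from pathlib import Path

    import zstandard

    def extract_dataset_artifact(archive: Path, destination: Path) -> None:
        # Stream-decompress the .zstd artifact and unpack the tar inside,
        # restoring db.sqlite and images/ under `destination`.
        with archive.open("rb") as source:
            reader = zstandard.ZstdDecompressor().stream_reader(source)
            with tarfile.open(fileobj=reader, mode="r|") as tar:
                tar.extractall(destination)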