Commit 316f067b authored by Eva Bardou

Save the cache in the archive too

parent 2ce6237a
1 merge request: !8 New DatasetExtractor using a DatasetWorker
Pipeline #138669 passed
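
With this change, the temporary data folder that gets compressed into the `{dataset.id}.zstd` task artifact holds the SQLite cache (`db.sqlite`) next to the downloaded images (`images/*.jpg`), instead of the images alone. Below is a minimal sketch of how the resulting artifact could be inspected locally; it is illustrative and not part of the commit, and it assumes the third-party `zstandard` package plus a placeholder file name for a local copy of the artifact.

import tarfile
from pathlib import Path

import zstandard  # third-party package, assumed installed: pip install zstandard

# Placeholder path: a local copy of the {dataset.id}.zstd artifact produced by process_dataset()
artifact = Path("dataset.zstd")

dctx = zstandard.ZstdDecompressor()
with artifact.open("rb") as compressed, dctx.stream_reader(compressed) as reader:
    # Streaming mode ("r|") because the zstd stream reader is not seekable
    with tarfile.open(fileobj=reader, mode="r|") as tar:
        for member in tar:
            # Expected entries: db.sqlite, images/, images/<image_id>.jpg, ...
            # (paths may be prefixed by the data folder name, depending on how
            # create_tar_zst_archive roots the archive)
            print(member.name)
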
@@ -19,11 +19,13 @@ def test_process_split(tmp_path, downloaded_images):
     worker = DatasetExtractor()
     # Parse some arguments
     worker.args = Namespace(database=None)
+    worker.data_folder = tmp_path
     worker.configure_cache()
     worker.cached_images = dict()
 
     # Where to save the downloaded images
-    worker.image_folder = tmp_path
+    worker.images_folder = tmp_path / "images"
+    worker.images_folder.mkdir(parents=True)
 
     first_page_id = UUID("e26e6803-18da-4768-be30-a0a68132107c")
     second_page_id = UUID("c673bd94-96b1-4a2e-8662-a4d806940b5f")
@@ -80,11 +82,6 @@ def test_process_split(tmp_path, downloaded_images):
         == f"https://europe-gamma.iiif.teklia.com/iiif/2/public%2Fiam%2F{page_name}.png"
     )
 
-    assert sorted(tmp_path.rglob("*")) == [
-        tmp_path / f"{first_image_id}.jpg",
-        tmp_path / f"{second_image_id}.jpg",
-    ]
-
     # Should have created 17 transcriptions
     assert CachedTranscription.select().count() == 17
     # Check transcription of first line on first page
@@ -127,3 +124,11 @@ def test_process_split(tmp_path, downloaded_images):
     assert tr_entity.length == 23
     assert tr_entity.confidence == 1.0
     assert tr_entity.worker_run_id is None
+
+    # Full structure of the archive
+    assert sorted(tmp_path.rglob("*")) == [
+        tmp_path / "db.sqlite",
+        tmp_path / "images",
+        tmp_path / "images" / f"{first_image_id}.jpg",
+        tmp_path / "images" / f"{second_image_id}.jpg",
+    ]
@@ -61,6 +61,9 @@ class DatasetExtractor(DatasetWorker):
         # Download corpus
         self.download_latest_export()
 
+    def configure_storage(self) -> None:
+        self.data_folder = Path(tempfile.mkdtemp(suffix="-arkindex-data"))
+
         # Initialize db that will be written
         self.configure_cache()
@@ -68,17 +71,17 @@ class DatasetExtractor(DatasetWorker):
         self.cached_images = dict()
 
         # Where to save the downloaded images
-        self.image_folder = Path(tempfile.mkdtemp(suffix="-arkindex-data"))
-        logger.info(f"Images will be saved at `{self.image_folder}`.")
+        self.images_folder = self.data_folder / "images"
+        self.images_folder.mkdir(parents=True)
+        logger.info(f"Images will be saved at `{self.images_folder}`.")
 
     def configure_cache(self) -> None:
         """
        Create an SQLite database compatible with base-worker cache and initialize it.
         """
         self.use_cache = True
-        self.cache_path: Path = self.args.database or self.work_dir / "db.sqlite"
-        # Remove previous execution result if present
-        self.cache_path.unlink(missing_ok=True)
+        self.cache_path: Path = self.data_folder / "db.sqlite"
+        logger.info(f"Cached database will be saved at `{self.data_folder}`.")
         init_cache_db(self.cache_path)
@@ -242,7 +245,7 @@ class DatasetExtractor(DatasetWorker):
         # Download image
         logger.info("Downloading image")
         download_image(url=build_image_url(element)).save(
-            self.image_folder / f"{element.image.id}.jpg"
+            self.images_folder / f"{element.image.id}.jpg"
         )
         # Insert image
         logger.info("Inserting image")
@@ -304,15 +307,18 @@ class DatasetExtractor(DatasetWorker):
             self.insert_element(child, parent_id=element.id)
 
     def process_dataset(self, dataset: Dataset):
+        # Configure temporary storage for the dataset data (cache + images)
+        self.configure_storage()
+
         # Iterate over given splits
         for split_name, elements in self.list_dataset_elements_per_split(dataset):
             casted_elements = list(map(_format_element, elements))
             self.process_split(split_name, casted_elements)
 
-        # TAR + ZSTD Image folder and store as task artifact
+        # TAR + ZSTD the cache and the images folder, and store as task artifact
         zstd_archive_path: Path = self.work_dir / f"{dataset.id}.zstd"
         logger.info(f"Compressing the images to {zstd_archive_path}")
-        create_tar_zst_archive(source=self.image_folder, destination=zstd_archive_path)
+        create_tar_zst_archive(source=self.data_folder, destination=zstd_archive_path)
 
 
 def main():
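
Since the cache database now ships inside the artifact, a downstream consumer can unpack the archive and open `db.sqlite` directly. A minimal sketch under the same assumptions as above (`zstandard` installed, placeholder artifact path); table names are read from `sqlite_master` rather than hard-coded, since the cache schema comes from the base-worker cache module.

import sqlite3
import tarfile
import tempfile
from pathlib import Path

import zstandard  # third-party package, assumed installed

artifact = Path("dataset.zstd")  # placeholder: local copy of the task artifact
workdir = Path(tempfile.mkdtemp(suffix="-dataset-check"))

# Decompress and unpack the tar.zst archive
dctx = zstandard.ZstdDecompressor()
with artifact.open("rb") as compressed, dctx.stream_reader(compressed) as reader:
    with tarfile.open(fileobj=reader, mode="r|") as tar:
        tar.extractall(workdir)

# The cache database sits next to the images in the extracted tree
db = sqlite3.connect(str(workdir / "db.sqlite"))
tables = [name for (name,) in db.execute("SELECT name FROM sqlite_master WHERE type = 'table'")]
print(tables)
print(sorted(path.name for path in (workdir / "images").glob("*.jpg")))
db.close()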