Skip to content
Snippets Groups Projects
Commit c7bfc99d authored by Manon Blanco's avatar Manon Blanco Committed by Yoann Schneider
Browse files

Implement function to extract content of a tar+zstd archive

parent 0f187e68
No related branches found
No related tags found
1 merge request: !343 Implement function to extract content of a tar+zstd archive
Pipeline #80308 passed
# -*- coding: utf-8 -*-
import logging
import os
import tarfile
import tempfile
from pathlib import Path
from typing import Tuple
import zstandard
# Module-level logger, named after this module, used by the archive helpers below
logger = logging.getLogger(__name__)
def decompress_zst_archive(compressed_archive: Path) -> Tuple[int, Path]:
    """
    Decompress a ZST-compressed tar archive into a temporary ``.tar`` file.
    The tar archive itself is not extracted.

    This returns the file descriptor and the path of the uncompressed archive.
    Beware of closing the file descriptor explicitly or the main
    process will keep the memory held even if the file is deleted.

    When decompression fails, nothing is returned to the caller, so the
    temporary file is closed and deleted here before the error propagates.

    :param compressed_archive: Path to the target ZST-compressed archive
    :return: File descriptor and path to the uncompressed tar archive
    :raises Exception: When zstandard cannot decompress the archive; the
        original ``zstandard.ZstdError`` is attached as the cause
    """
    dctx = zstandard.ZstdDecompressor()
    # mkstemp hands back an already-open descriptor together with the path
    archive_fd, archive_path = tempfile.mkstemp(suffix=".tar")
    logger.debug(f"Uncompressing file to {archive_path}")

    try:
        with open(compressed_archive, "rb") as compressed, open(
            archive_path, "wb"
        ) as decompressed:
            dctx.copy_stream(compressed, decompressed)
    except BaseException as e:
        # The caller never receives the descriptor/path on failure, so they
        # must be released here or they would leak
        close_delete_file(archive_fd, Path(archive_path))
        if isinstance(e, zstandard.ZstdError):
            raise Exception(f"Couldn't uncompressed archive: {e}") from e
        raise

    logger.debug(f"Successfully uncompressed archive {compressed_archive}")
    return archive_fd, Path(archive_path)
def extract_tar_archive(archive_path: Path, destination: Path):
    """
    Extract the tar archive's content to a specific destination

    :param archive_path: Path to the archive
    :param destination: Path where the archive's data will be extracted
    :raises Exception: When tarfile cannot read the archive; the original
        ``tarfile.ReadError`` is attached as the cause
    """
    try:
        with tarfile.open(archive_path) as tar_archive:
            # NOTE(review): no extraction filter is passed, so the behaviour
            # depends on the interpreter's default; confirm that archives
            # always come from a trusted source
            tar_archive.extractall(destination)
    except tarfile.ReadError as e:
        # Chain explicitly so the tarfile error stays available as __cause__
        raise Exception(f"Couldn't handle the decompressed Tar archive: {e}") from e
def extract_tar_zst_archive(
    compressed_archive: Path, destination: Path
) -> Tuple[int, Path]:
    """
    Extract a ZST-compressed tar archive's content to a specific destination

    The archive is first decompressed to a temporary tar file, which is then
    extracted. On success the temporary tar file is left in place and its
    descriptor is left open: the caller is responsible for releasing both.
    If extraction fails, the temporary tar file is closed and deleted before
    the error propagates.

    :param compressed_archive: Path to the target ZST-compressed archive
    :param destination: Path where the archive's data will be extracted
    :return: File descriptor and path to the uncompressed tar archive
    """
    archive_fd, archive_path = decompress_zst_archive(compressed_archive)
    try:
        extract_tar_archive(archive_path, destination)
    except BaseException:
        # The caller never gets the descriptor/path on failure, so release
        # the temporary archive here instead of leaking it
        close_delete_file(archive_fd, archive_path)
        raise
    return archive_fd, archive_path
def close_delete_file(file_descriptor: int, file_path: Path):
    """
    Close the file descriptor of the file and delete the file

    Both steps are best-effort and independent: an ``OSError`` from either
    one is logged as a warning and never raised, and a failure to close the
    descriptor does not prevent the file from being deleted.

    :param file_descriptor: File descriptor of the archive
    :param file_path: Path to the archive
    """
    try:
        os.close(file_descriptor)
    except OSError as e:
        logger.warning(f"Unable to close file descriptor {file_descriptor}: {e}")

    # Attempt the deletion even when closing failed, so the file is not left behind
    try:
        file_path.unlink()
    except OSError as e:
        logger.warning(f"Unable to delete file {file_path}: {e}")
# Utils
::: arkindex_worker.utils
......@@ -90,6 +90,7 @@ nav:
- Image utilities: ref/image.md
- Reporting: ref/reporting.md
- Cache: ref/cache.md
- Utils: ref/utils.md
- Releases: releases.md
- Documentation development: dev/build_docs.md
......
File added
# -*- coding: utf-8 -*-
from pathlib import Path
from arkindex_worker.utils import close_delete_file, extract_tar_zst_archive
# Directory named "data" located next to this test module
FIXTURES = Path(__file__).absolute().parent / "data"
# ZST-compressed tar archive used as input by the tests below
ARCHIVE = FIXTURES / "archive.tar.zst"
def test_extract_tar_zst_archive(mocker, tmp_path):
    """
    Extracting the fixture archive yields a temporary ``.tar`` file and the
    expected tree of files under the destination directory.
    """
    destination = tmp_path / "destination"
    archive_fd, archive_path = extract_tar_zst_archive(ARCHIVE, destination)

    try:
        assert archive_path.is_file()
        assert archive_path.suffix == ".tar"
        assert sorted(destination.rglob("*")) == [
            destination / "archive.tar.zst",
            destination / "cache",
            destination / "cache/tables.sqlite",
            destination / "line_transcriptions_small.json",
            destination / "mirrored_image.jpg",
            destination / "page_element.json",
            destination / "rotated_image.jpg",
            destination / "rotated_mirrored_image.jpg",
            destination / "test_image.jpg",
            destination / "tiled_image.jpg",
            destination / "ufcn_line_historical_worker_version.json",
        ]
    finally:
        # Release the temporary archive so the test leaks neither an open
        # descriptor nor a file outside of tmp_path
        close_delete_file(archive_fd, archive_path)
def test_delete_tar_archive(mocker, tmp_path):
    """
    After closing and deleting the temporary tar archive returned by the
    extraction, its path must no longer exist.
    """
    descriptor, tar_path = extract_tar_zst_archive(
        ARCHIVE, tmp_path / "destination"
    )

    close_delete_file(descriptor, tar_path)

    assert not tar_path.exists()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment