Commit e59f6108 authored by ml bonhomme, committed by Bastien Abadie

Support ZIP files import from bucket

parent fe874ce7
Merge request !349: Support ZIP files import from bucket
Pipeline #21500 passed
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from collections.abc import Iterator
PATH_DELIMITER = "/"
@@ -73,6 +74,13 @@ class Node(object):
"""
return self.is_final and self.key.lower().endswith(".pdf")
@property
def is_zip(self):
"""
This node is a zip archive, which needs to be unzipped before its elements are imported.
"""
return self.is_final and self.key.lower().endswith(".zip")
def __iter__(self):
return iter(self.children.values())
@@ -80,3 +88,8 @@ class Node(object):
if not self.children:
return 1
return sum(len(child) for child in self.children.values()) + 1
def recurse(self) -> Iterator["Node"]:
yield from self
for child in self:
yield from child.recurse()
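As an aside, recurse() yields the node's direct children first, then recursively the contents of each child's subtree, so every descendant is yielded exactly once and the node it is called on is never yielded. A minimal sketch of the traversal, using a hypothetical MiniNode stand-in (Node's constructor is outside this diff):

from collections.abc import Iterator


class MiniNode:
    """Hypothetical stand-in for graph.Node, just enough to show recurse()."""

    def __init__(self, name, children=()):
        self.name = name
        self.children = {child.name: child for child in children}

    def __iter__(self):
        return iter(self.children.values())

    def recurse(self) -> Iterator["MiniNode"]:
        # Same logic as Node.recurse: all direct children first, then each child's subtree
        yield from self
        for child in self:
            yield from child.recurse()


root = MiniNode(".", [
    MiniNode("test_archive.zip"),
    MiniNode("test_folder", [MiniNode("600x600.png")]),
])
print([node.name for node in root.recurse()])
# ['test_archive.zip', 'test_folder', '600x600.png']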
@@ -3,6 +3,7 @@
import logging
import os
import tempfile
import zipfile
from pathlib import Path
from urllib.parse import quote_plus, urljoin
@@ -14,6 +15,7 @@ from arkindex_tasks.import_files.pdf import count_pdf_pages, upload_pdf_text
from arkindex_tasks.import_s3.boto import get_client_from_env
from arkindex_tasks.import_s3.graph import PATH_DELIMITER, Node
from arkindex_tasks.utils import download_file, retried_request
from botocore.exceptions import ClientError
logging.basicConfig(format="[%(levelname)s] %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)
@@ -287,8 +289,12 @@ class S3Import(object):
finally:
pdf_path.unlink(missing_ok=True)
def build_elements(self, node):
def build_elements(self, node: Node) -> None:
"""Creates elements on the Arkindex corpus from a hierarchical node on the S3 bucket"""
if node.is_zip:
# Skip ZIP nodes, as those have been extracted separately.
return
# Continuously log progress
elt_count = (
self.progress["completed"]
@@ -354,6 +360,60 @@ class S3Import(object):
for child_node in node:
self.build_elements(child_node)
def handle_zip_nodes(self) -> int:
"""
        Extract all .zip archives found in the bucket, returning the count of extracted archives.
        If any archive was extracted, the S3 object graph is rebuilt.
"""
extracted = 0
for node in self.root_node.recurse():
if node.is_zip:
self.extract_zip_node(node)
extracted += 1
if extracted:
logger.info("Rebuilding graph after archive extraction")
self.build_graph()
return extracted
def extract_zip_node(self, node: Node) -> None:
assert node.is_zip, "Only ZIP nodes are supported"
assert node.parent is not None, "The root node cannot be extracted"
logger.info(f"Extracting files from archive {node.key}")
        fd, file_path = tempfile.mkstemp(prefix="tasks-", suffix=".zip")
        # Close the descriptor returned by mkstemp; boto3 re-opens the file by path
        os.close(fd)
        file_path = Path(file_path)
try:
self.boto_resource.meta.client.download_file(
self.bucket, node.key, file_path
)
with zipfile.ZipFile(file_path, "r") as zip_file:
for info in zip_file.infolist():
# Ignore directories
if info.is_dir():
continue
key = info.filename
                    # When the parent is the root node, its name is empty: skip the
                    # join so that files extracted from an archive at the root level
                    # do not get keys starting with a /
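                    # e.g. in the test below, test_archive.zip sits at the bucket
                    # root, so its entry test_folder/600x600.png is uploaded under
                    # that same key.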
if node.parent.name:
key = PATH_DELIMITER.join((node.parent.name, key))
with zip_file.open(info) as f:
try:
self.boto_resource.meta.client.upload_fileobj(
f, self.bucket, key
)
except ClientError as e:
                            logger.error(e)
finally:
file_path.unlink(missing_ok=True)
def run(self):
assert WORKER_RUN_ID, "A WorkerRun ID is required"
@@ -365,8 +425,11 @@
)
self.build_graph()
zip_count = self.handle_zip_nodes()
        # Build arkindex elements from the first level (i.e. skip the root node)
self.progress["total"] = len(self.root_node) - 1
        # Subtract the ZIP archive count, since we know we won't be importing those
self.progress["total"] = len(self.root_node) - 1 - zip_count
logger.info(
f"Creating {self.progress['total']} elements in corpus '{self.corpus['name']}'"
)
......
coverage==4.5.3
discover==0.4.0
moto==4.1.9
requests-mock==1.6.0
File added
@@ -7,8 +7,10 @@ from unittest.mock import MagicMock, patch
import requests_mock
import boto3
from arkindex_tasks.import_s3.graph import Node
from arkindex_tasks.import_s3.worker import S3Import
from moto import mock_s3
SAMPLES = Path(__file__).absolute().parent.parent / "import_files" / "samples"
@@ -30,6 +32,7 @@ class TestS3Import(TestCase):
status_code=200,
json={
"id": "corpus_id",
"name": "Corpus Name",
"types": [
{"id": "page_id", "slug": "page", "folder": False},
{"id": "folder_id", "slug": "folder", "folder": True},
@@ -613,3 +616,170 @@ class TestS3Import(TestCase):
"total": 2,
},
)
@mock_s3
@requests_mock.Mocker()
@patch(
"arkindex_tasks.import_s3.worker.WORKER_RUN_ID",
"aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
)
def test_zip_archives_upload(self, requests_mock):
s3 = boto3.resource("s3", region_name="us-east-1")
bucket = s3.create_bucket(Bucket="testbucket")
with open(SAMPLES / "test_archive.zip", "rb") as f:
bucket.upload_fileobj(f, "test_archive.zip")
self.assertListEqual(
[obj.key for obj in bucket.objects.all()],
# Only the archive is in the bucket
["test_archive.zip"],
)
requests_mock.get(
"https://arkindex.teklia.com/api/v1/corpus/corpus_id/",
status_code=200,
json={
"id": "corpus_id",
"name": "Corpus Name",
"types": [
{"id": "page_id", "slug": "page", "folder": False},
{"id": "folder_id", "slug": "folder", "folder": True},
],
},
)
requests_mock.get(
"https://arkindex.teklia.com/api/v1/corpus/corpus_id/elements/?top_level=True&type=page&with_corpus=False&with_zone=False",
json={},
)
requests_mock.get(
"https://arkindex.teklia.com/api/v1/corpus/corpus_id/elements/?type=folder&with_corpus=False&with_zone=False",
json={},
)
requests_mock.post(
"https://arkindex.teklia.com/api/v1/image/iiif/url/",
[
{"json": {"id": "200x200_img_id"}},
{"json": {"id": "600x600_img_id"}},
],
)
requests_mock.post(
"https://arkindex.teklia.com/api/v1/elements/create/?slim_output=True",
[
{"json": {"id": "200x200_id"}},
{"json": {"id": "testfolder_id"}},
{"json": {"id": "600x600_id"}},
],
)
with patch(
"arkindex_tasks.import_s3.worker.get_client_from_env", return_value=s3
):
s3_import = S3Import(
corpus="corpus_id",
element=None,
bucket="testbucket",
prefix="",
iiif_base_url="https://server.test/iiif/2",
folder_type="folder",
page_type="page",
verbose=False,
)
s3_import.run()
self.assertDictEqual(
s3_import.progress,
{
"completed": 3,
"existing": 0,
"errors": 0,
"total": 3,
},
)
self.assertEqual(
draw_tree(s3_import.root_node),
dedent(
"""
.
├─ 200x200.jpg
├─ test_archive.zip
├─ test_folder
│ ├─ 600x600.png
"""
).strip(),
)
self.assertListEqual(
[obj.key for obj in bucket.objects.all()],
# Both the archive and its contents are in the bucket
[
"200x200.jpg",
"test_archive.zip",
"test_folder/600x600.png",
],
)
self.assertListEqual(
[
(req.method, req.url, json.loads(req.body) if req.body else None)
for req in requests_mock.request_history
],
[
("GET", "https://arkindex.teklia.com/api/v1/corpus/corpus_id/", None),
(
"GET",
"https://arkindex.teklia.com/api/v1/corpus/corpus_id/elements/?top_level=True&type=page&with_corpus=False&with_zone=False",
None,
),
(
"GET",
"https://arkindex.teklia.com/api/v1/corpus/corpus_id/elements/?type=folder&with_corpus=False&with_zone=False",
None,
),
(
"POST",
"https://arkindex.teklia.com/api/v1/image/iiif/url/",
{"url": "https://server.test/iiif/2/testbucket%2F200x200.jpg"},
),
(
"POST",
"https://arkindex.teklia.com/api/v1/elements/create/?slim_output=True",
{
"type": "page",
"name": "200x200.jpg",
"corpus": "corpus_id",
"image": "200x200_img_id",
"worker_run_id": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
},
),
(
"POST",
"https://arkindex.teklia.com/api/v1/elements/create/?slim_output=True",
{
"type": "folder",
"name": "test_folder",
"corpus": "corpus_id",
"worker_run_id": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
},
),
(
"POST",
"https://arkindex.teklia.com/api/v1/image/iiif/url/",
{
"url": "https://server.test/iiif/2/testbucket%2Ftest_folder%2F600x600.png"
},
),
(
"POST",
"https://arkindex.teklia.com/api/v1/elements/create/?slim_output=True",
{
"type": "page",
"name": "600x600.png",
"corpus": "corpus_id",
"image": "600x600_img_id",
"parent": "testfolder_id",
"worker_run_id": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
},
),
],
)