diff --git a/arkindex_tasks/import_files/pdf.py b/arkindex_tasks/import_files/pdf.py
index 5dc02f26fd64fd85a9417b787399575a6cd8ba2e..dab86ed1202f55a32ac9f18fa6e5cb812d029161 100644
--- a/arkindex_tasks/import_files/pdf.py
+++ b/arkindex_tasks/import_files/pdf.py
@@ -92,15 +92,20 @@ def build_transcription(pdf_element, pdf_page, ark_page):
     }
 
 
-def extract_pdf_text(path, ark_pages):
+def extract_pdf_text(path, ark_pages, existing_pages=None):
     # Load all pages and children
     pdf_pages = list(extract_pages(path))
     assert len(pdf_pages) == len(
         ark_pages
     ), f"Invalid nb of pages: pdf has {len(pdf_pages)}, ark has {len(ark_pages)}"
 
+    # Do not upload transcriptions for pages that already existed on Arkindex (retried imports)
+    if not existing_pages:
+        existing_pages = []
     out = {}
     for ark_page, pdf_page in zip(ark_pages, pdf_pages):
+        if ark_page["id"] in existing_pages:
+            continue
         logger.debug(
             f"PDF text extraction on arkindex element {ark_page['id']} and pdf page {pdf_page}"
         )
@@ -150,7 +155,7 @@ def save_pdf_transcriptions(parent_id, transcriptions) -> None:
             raise
 
 
-def upload_pdf_text(pdf_path, ark_pages) -> None:
+def upload_pdf_text(pdf_path, ark_pages, existing_pages=None) -> None:
     """
     Upload transcriptions from the text found in a PDF file to existing Arkindex elements.
 
@@ -158,6 +163,11 @@ def upload_pdf_text(pdf_path, ark_pages) -> None:
     :type pdf_path: str or pathlib.Path
     :param ark_pages list: List of existing Arkindex elements matching each page of the PDF,
         as they would be returned by the `ListElements` or `RetrieveElement` API endpoints.
+    :param existing_pages: List of IDs of Arkindex elements that should be skipped,
+        as they already have transcriptions.
+    :type existing_pages: list or None
     """
-    for page_id, transcriptions in extract_pdf_text(pdf_path, ark_pages).items():
+    for page_id, transcriptions in extract_pdf_text(
+        pdf_path, ark_pages, existing_pages=existing_pages
+    ).items():
         save_pdf_transcriptions(page_id, transcriptions)
diff --git a/arkindex_tasks/import_s3/worker.py b/arkindex_tasks/import_s3/worker.py
index 287096ca691e37daffc24cd25b04b421e3f4ffb9..ef56d37804e890b9af6354f9750aad52e44d7a9c 100644
--- a/arkindex_tasks/import_s3/worker.py
+++ b/arkindex_tasks/import_s3/worker.py
@@ -8,13 +8,14 @@ from pathlib import Path
 from urllib.parse import quote_plus, urljoin
 
 from apistar.exceptions import ErrorResponse
+from pdf2image import convert_from_path
 
 from arkindex_tasks import default_client
 from arkindex_tasks.base import WORKER_RUN_ID
-from arkindex_tasks.import_files.pdf import count_pdf_pages, upload_pdf_text
+from arkindex_tasks.import_files.pdf import upload_pdf_text
 from arkindex_tasks.import_s3.boto import get_client_from_env
 from arkindex_tasks.import_s3.graph import PATH_DELIMITER, Node
-from arkindex_tasks.utils import download_file, retried_request
+from arkindex_tasks.utils import retried_request
 from botocore.exceptions import ClientError
 
 logging.basicConfig(format="[%(levelname)s] %(message)s", level=logging.INFO)
@@ -50,6 +51,9 @@ class S3Import(object):
             logger.setLevel(logging.DEBUG)
         # Store progress statistics
         self.progress = {"completed": 0, "existing": 0, "errors": 0, "total": 0}
+        # Maps S3 key prefixes to paths of temporary files for extracted PDF files,
+        # so that transcriptions can be added once page elements are created for each image.
+        self.pdf_paths = {}
 
         # Ensure all the parameters are valid before starting a full import
         try:
@@ -116,7 +120,7 @@ class S3Import(object):
         """Retrieves elements hierarchy in the Arkindex corpus
 
         Stores a global mapping between element's path and its ID
         """
-        # Do never serialize corpus nor zone on listed elements
+        # Never serialize corpus nor zone on listed elements
         api_params = {"with_corpus": False, "with_zone": False}
 
         paths_prefix = (self.prefix,) if self.prefix else ()
@@ -230,74 +234,46 @@ class S3Import(object):
                 return retried_request("RetrieveImage", id=image_id)
             raise e
 
-    def build_pdf_pages(self, node):
-        """
-        For PDF files, we download the file to extract the transcriptions for each page.
-        We use Cantaloupe's meta-identifiers to create one child element per page,
-        then upload all the transcriptions.
-
-        https://cantaloupe-project.github.io/manual/5.0/images.html#MetaIdentifiers
-        """
+    def extract_pdf(self, node):
         assert node.is_pdf, "Only PDF nodes are supported"
-        assert node.arkindex_id, "Missing parent folder ID"
-
-        assert WORKER_RUN_ID, "A WorkerRun ID is required to upload PDF transcriptions"
-
-        pdf_url = self.boto_resource.meta.client.generate_presigned_url(
-            "get_object", Params={"Bucket": self.bucket, "Key": node.key}
-        )
 
         _, pdf_path = tempfile.mkstemp(prefix="tasks-", suffix=".pdf")
-        pdf_path = Path(pdf_path)
-        try:
-            download_file(pdf_url, pdf_path)
-
-            # Extracting PDF transcriptions requires that we first create all pages, as it needs access
-            # to the resulting JPEG image width/height to scale the polygon coordinates to them.
-            # Since we rely on Cantaloupe's PDF processing, only Cantaloupe can tell us the image's size.
-            # To create all pages, we need a page count, which Cantaloupe does not provide and pdfminer
-            # does not explicitly provide or document an API for, so we use undocumented methods that
-            # pdfminer.high_level.extract_pages uses to list all pages.
-            page_count = count_pdf_pages(pdf_path)
-
-            # extract_pdf_text will require the pages' zone.image.width/height attributes,
-            # which would force us to create each element one by one with slim_output=False.
-            # We build fake elements instead that we will assign the element IDs to afterwards.
-            # We still won't be able to use any bulk endpoint since none support elements on multiple images.
-            pages = []
-            for i in range(1, page_count + 1):
-                # Add the ;<number> suffix to build the meta-identifier for this page
-                image = self.create_image(node, f";{i}")
-
-                page = retried_request(
-                    "CreateElement",
-                    body={
-                        "corpus": self.corpus_id,
-                        "parent": node.arkindex_id,
-                        "name": str(i),
-                        "type": self.page_type,
-                        "image": image["id"],
-                        "worker_run_id": WORKER_RUN_ID,
-                    },
-                    slim_output=True,
-                )
+        # If the PDF suffix is not removed, the "parent" PDF node that is created
+        # has the same path as the existing PDF object, which breaks the bucket
+        key_stripped = node.key.replace(".pdf", "")
+        self.pdf_paths[key_stripped] = pdf_path
 
-                pages.append(
-                    {
-                        "id": page["id"],
-                        "zone": {"image": image},
-                    }
-                )
+        pdf_path = Path(pdf_path)
+        self.boto_resource.meta.client.download_file(self.bucket, node.key, pdf_path)
+
+        self.upload_pdf_pages(pdf_path, key_stripped)
+
+    def upload_pdf_pages(self, pdf_path, key):
+        with tempfile.TemporaryDirectory() as base_path:
+            images = convert_from_path(
+                pdf_path,
+                output_folder=base_path,
+                output_file="pdf-",  # prefix image names
+                dpi=300,
+                fmt="jpg",
+            )
 
-            upload_pdf_text(pdf_path, pages)
-        finally:
-            pdf_path.unlink(missing_ok=True)
+            for image in images:
+                local_path = image.filename
+                bucket_path = f"{key}/{Path(local_path).name}"
+                try:
+                    self.boto_resource.meta.client.upload_file(
+                        local_path, self.bucket, bucket_path
+                    )
+                    self.root_node.add_descendant(bucket_path)
+                except ClientError as e:
+                    logging.error(e)
 
     def build_elements(self, node: Node) -> None:
         """Creates elements on the Arkindex corpus from a hierarchical node on the S3 bucket"""
 
-        if node.is_zip:
-            # Skip ZIP nodes, as those have been extracted separately.
+        if node.is_zip or node.is_pdf:
+            # Skip ZIP and PDF nodes, as those have been extracted separately.
             return
 
         # Continuously log progress
@@ -315,14 +291,16 @@ class S3Import(object):
         elt_id = self.arkindex_elements.get(node.lineage)
         if elt_id:
             logger.debug(f"Using existing element {node.name} ({elt_id})")
-            self.progress["existing"] += 1
             node.arkindex_id = elt_id
-            if node.is_pdf:
+            self.progress["existing"] += 1
+            # Handle PDF files separately: they only have "final" children, the pages, and we need
+            # to store these pages created on Arkindex in order to upload the corresponding text
+            if node.key in self.pdf_paths:
                 self.build_pdf_pages(node)
-            else:
-                # Recursively handle node's children
-                for child_node in node:
-                    self.build_elements(child_node)
+                return
+            # Recursively handle node's children
+            for child_node in node:
+                self.build_elements(child_node)
             return
 
         try:
@@ -332,7 +310,7 @@ class S3Import(object):
                 "corpus": str(self.corpus_id),
                 "worker_run_id": WORKER_RUN_ID,
             }
-            if node.is_final and not node.is_pdf:
+            if node.is_final:
                 # This element should be created with its image
                 image = self.create_image(node)
                 body.update({"type": self.page_type, "image": image["id"]})
@@ -343,7 +321,7 @@ class S3Import(object):
                 body.update({"parent": self.top_folder_id})
 
             # Create the element and save its ID to the current node
-            element = retried_request("CreateElement", slim_output=True, body=body)
+            element = retried_request("CreateElement", body=body)
             node.arkindex_id = element["id"]
 
         except Exception as e:
@@ -358,12 +336,99 @@ class S3Import(object):
         else:
             self.progress["completed"] += 1
 
-            if node.is_pdf:
+            # Handle PDF files separately: they only have "final" children, the pages, and we need
+            # to store these pages created on Arkindex in order to upload the corresponding text.
+            if node.key in self.pdf_paths:
                 self.build_pdf_pages(node)
+                return
+            # Recursively handle node's children
+            for child_node in node:
+                self.build_elements(child_node)
+
+    def build_pdf_pages(self, node):
+        assert node.arkindex_id, "Missing parent folder ID"
+
+        arkindex_pages = []
+        # List of IDs of pages that already exist on Arkindex, to skip when importing transcriptions as well
+        existing_pages = []
+        for child_node in node:
+            child_id = self.arkindex_elements.get(child_node.lineage)
+            if not child_id:
+                try:
+                    body = {
+                        "type": self.page_type,
+                        "name": child_node.name,
+                        "corpus": str(self.corpus_id),
+                        "worker_run_id": WORKER_RUN_ID,
+                    }
+                    if child_node.is_final:
+                        # This element should be created with its image
+                        image = self.create_image(child_node)
+                        body.update({"image": image["id"]})
+                    else:
+                        # Children nodes of a PDF file should always be final
+                        logger.error(
+                            f"An error occurred processing PDF node '{node.key}': non-final child node found."
+                        )
+                        skip_count = len(node)
+                        self.progress["errors"] += skip_count
+                        logger.warning(
+                            f"Skipping object {node.key} and its descendants"
+                        )
+                        break
+
+                    if child_node.parent and child_node.parent.arkindex_id:
+                        body.update({"parent": child_node.parent.arkindex_id})
+
+                    # Create the element and save its ID to the current node
+                    element = retried_request("CreateElement", body=body)
+                    child_node.arkindex_id = element["id"]
+                    self.progress["completed"] += 1
+                    arkindex_pages.append(element)
+
+                except Exception as e:
+                    skip_count = len(child_node)
+                    self.progress["errors"] += skip_count
+                    # Log information about the error
+                    error = getattr(e, "content", e)
+                    logger.error(
+                        f"An error occurred processing object '{child_node.key}': {error}"
+                    )
+                    if skip_count > 1:
+                        logger.warning(
+                            f"{skip_count} descendant objects will be skipped"
+                        )
+                    return
             else:
-                # Recursively handle node's children
-                for child_node in node:
-                    self.build_elements(child_node)
+                # Skip creating page element if it already exists
+                logger.debug(f"Using existing element {child_node.name} ({child_id})")
+                child_node.arkindex_id = child_id
+                self.progress["existing"] += 1
+                arkindex_pages.append({"id": child_id})
+                existing_pages.append(child_id)
+
+        # Create transcriptions
+        upload_pdf_text(
+            self.pdf_paths[node.key], arkindex_pages, existing_pages=existing_pages
+        )
+
+        # Remove temporary PDF file
+        Path(self.pdf_paths[node.key]).unlink(missing_ok=True)
+
+    def handle_pdf_nodes(self) -> int:
+        """
+        Extract and save images from all PDF files on the bucket, returning the PDF count.
+        The S3 objects graph is updated with the extracted page images.
+        """
+        extracted = 0
+        # Copy the node list, as the graph will change while we extract pages from the PDF files
+        nodes = list(self.root_node.recurse())
+        for node in nodes:
+            if node.is_pdf:
+                self.extract_pdf(node)
+                extracted += 1
+
+        return extracted
 
     def handle_zip_nodes(self) -> int:
         """
@@ -371,15 +436,13 @@ class S3Import(object):
         If any archive is extracted, the S3 objects graph is rebuilt.
""" extracted = 0 - for node in self.root_node.recurse(): + # Copy the root_node as as files are extracted from the archive the graph will change + nodes = list(self.root_node.recurse()) + for node in nodes: if node.is_zip: self.extract_zip_node(node) extracted += 1 - if extracted: - logger.info("Rebuilding graph after archive extraction") - self.build_graph() - return extracted def extract_zip_node(self, node: Node) -> None: @@ -409,12 +472,19 @@ class S3Import(object): key = PATH_DELIMITER.join((node.parent.name, key)) with zip_file.open(info) as f: - try: - self.boto_resource.meta.client.upload_fileobj( - f, self.bucket, key - ) - except ClientError as e: - logging.error(e) + if key.lower().endswith(".pdf"): + temp_dir = tempfile.mkdtemp() + pdf_path = zip_file.extract(info, path=temp_dir) + self.upload_pdf_pages(pdf_path, key.replace(".pdf", "")) + self.pdf_paths[key.replace(".pdf", "")] = pdf_path + else: + try: + self.boto_resource.meta.client.upload_fileobj( + f, self.bucket, key + ) + self.root_node.add_descendant(key) + except ClientError as e: + logging.error(e) finally: file_path.unlink(missing_ok=True) @@ -430,11 +500,12 @@ class S3Import(object): ) self.build_graph() + pdf_count = self.handle_pdf_nodes() zip_count = self.handle_zip_nodes() # Build arkindex elements from the first level (e.g. skip the root node) # Subtract the ZIP archive count since we know won't be importing those - self.progress["total"] = len(self.root_node) - 1 - zip_count + self.progress["total"] = len(self.root_node) - 1 - zip_count - pdf_count logger.info( f"Creating {self.progress['total']} elements in corpus '{self.corpus['name']}'" ) diff --git a/tests/import_files/test_base.py b/tests/import_files/test_base.py index 148ccd32f733dfd266e3674e4aee9217f96b0417..3b409b8e27998840bd0c733e584f82f306865f11 100644 --- a/tests/import_files/test_base.py +++ b/tests/import_files/test_base.py @@ -494,6 +494,7 @@ class TestFileImport(TestCase): ) def test_run_pdf(self, mock): # Process info + self.maxDiff = None mock.get( "/api/v1/process/processid/", json={ diff --git a/tests/import_s3/test_worker.py b/tests/import_s3/test_worker.py index 27f1e97a780c12d051353aea2dde8651c55efc72..5c60b155db43582c3b7914f119f4e77f1788024c 100644 --- a/tests/import_s3/test_worker.py +++ b/tests/import_s3/test_worker.py @@ -105,7 +105,7 @@ class TestS3Import(TestCase): ).strip(), ) mock.post( - "https://arkindex.teklia.com/api/v1/elements/create/?slim_output=True", + "https://arkindex.teklia.com/api/v1/elements/create/", [ {"json": {"id": "folder_1_id"}}, {"json": {"id": "elt_1_id"}}, @@ -131,7 +131,7 @@ class TestS3Import(TestCase): [ ( "POST", - "https://arkindex.teklia.com/api/v1/elements/create/?slim_output=True", + "https://arkindex.teklia.com/api/v1/elements/create/", { "type": "folder", "name": "folder_1", @@ -148,7 +148,7 @@ class TestS3Import(TestCase): ), ( "POST", - "https://arkindex.teklia.com/api/v1/elements/create/?slim_output=True", + "https://arkindex.teklia.com/api/v1/elements/create/", { "type": "page", "name": "img_1.jpg", @@ -160,7 +160,7 @@ class TestS3Import(TestCase): ), ( "POST", - "https://arkindex.teklia.com/api/v1/elements/create/?slim_output=True", + "https://arkindex.teklia.com/api/v1/elements/create/", { "type": "folder", "name": "folder_2", @@ -178,7 +178,7 @@ class TestS3Import(TestCase): ), ( "POST", - "https://arkindex.teklia.com/api/v1/elements/create/?slim_output=True", + "https://arkindex.teklia.com/api/v1/elements/create/", { "type": "page", "name": "img_2.jpg", @@ -219,7 +219,7 @@ class 
             ).strip(),
         )
         mock.post(
-            "https://arkindex.teklia.com/api/v1/elements/create/?slim_output=True",
+            "https://arkindex.teklia.com/api/v1/elements/create/",
             [
                 {"json": {"id": "folder_1_id"}},
                 {"json": {"id": "elt_1_id"}},
@@ -242,7 +242,7 @@ class TestS3Import(TestCase):
             [
                 (
                     "POST",
-                    "https://arkindex.teklia.com/api/v1/elements/create/?slim_output=True",
+                    "https://arkindex.teklia.com/api/v1/elements/create/",
                     {
                         "type": "folder",
                         "name": "folder_1",
@@ -257,7 +257,7 @@ class TestS3Import(TestCase):
                 ),
                 (
                     "POST",
-                    "https://arkindex.teklia.com/api/v1/elements/create/?slim_output=True",
+                    "https://arkindex.teklia.com/api/v1/elements/create/",
                     {
                         "type": "page",
                         "name": "img_1.jpg",
@@ -308,7 +308,7 @@ class TestS3Import(TestCase):
             ): "folder_1_id",
         }
         mock.post(
-            "https://arkindex.teklia.com/api/v1/elements/create/?slim_output=True",
+            "https://arkindex.teklia.com/api/v1/elements/create/",
             json={"id": "elt_1_id"},
         )
         mock.post(
@@ -336,7 +336,7 @@ class TestS3Import(TestCase):
                 ),
                 (
                     "POST",
-                    "https://arkindex.teklia.com/api/v1/elements/create/?slim_output=True",
+                    "https://arkindex.teklia.com/api/v1/elements/create/",
                     {
                         "type": "page",
                         "name": "img_1.jpg",
@@ -358,115 +358,354 @@ class TestS3Import(TestCase):
             },
         )
 
+    @mock_s3
     @requests_mock.Mocker()
-    @patch(
-        "arkindex_tasks.import_files.pdf.WORKER_RUN_ID",
-        "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
-    )
     @patch(
         "arkindex_tasks.import_s3.worker.WORKER_RUN_ID",
         "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
     )
-    def test_build_pdf(self, mock):
-        """
-        The S3 import should support PDFs, creating a folder containing one element
-        for each page of the PDF, with extracted transcriptions
-        """
-        node = Node(name="folder_1")
-        node.add_descendant("folder_1/file_1.pdf")
+    def test_zip_archives_upload(self, requests_mock):
+        s3 = boto3.resource("s3", region_name="us-east-1")
+        bucket = s3.create_bucket(Bucket="testbucket")
+        with open(SAMPLES / "test_archive.zip", "rb") as f:
+            bucket.upload_fileobj(f, "test_archive.zip")
+
+        self.assertListEqual(
+            [obj.key for obj in bucket.objects.all()],
+            # Only the archive is in the bucket
+            ["test_archive.zip"],
+        )
+
+        requests_mock.get(
+            "https://arkindex.teklia.com/api/v1/corpus/corpus_id/",
+            status_code=200,
+            json={
+                "id": "corpus_id",
+                "name": "Corpus Name",
+                "types": [
+                    {"id": "page_id", "slug": "page", "folder": False},
+                    {"id": "folder_id", "slug": "folder", "folder": True},
+                ],
+            },
+        )
+        requests_mock.get(
+            "https://arkindex.teklia.com/api/v1/corpus/corpus_id/elements/?top_level=True&type=page&with_corpus=False&with_zone=False",
+            json={},
+        )
+        requests_mock.get(
+            "https://arkindex.teklia.com/api/v1/corpus/corpus_id/elements/?type=folder&with_corpus=False&with_zone=False",
+            json={},
+        )
+        requests_mock.post(
+            "https://arkindex.teklia.com/api/v1/image/iiif/url/",
+            [
+                {"json": {"id": "200x200_img_id"}},
+                {"json": {"id": "600x600_img_id"}},
+            ],
+        )
+        requests_mock.post(
+            "https://arkindex.teklia.com/api/v1/elements/create/",
+            [
+                {"json": {"id": "200x200_id"}},
+                {"json": {"id": "testfolder_id"}},
+                {"json": {"id": "600x600_id"}},
+            ],
+        )
+
+        with patch(
+            "arkindex_tasks.import_s3.worker.get_client_from_env", return_value=s3
+        ):
+            s3_import = S3Import(
+                corpus="corpus_id",
+                element=None,
+                bucket="testbucket",
+                prefix="",
+                iiif_base_url="https://server.test/iiif/2",
+                bucket_prefix=True,
+                folder_type="folder",
+                page_type="page",
+                verbose=False,
+            )
+            s3_import.run()
+
+        self.assertDictEqual(
+            s3_import.progress,
+            {
+                "completed": 3,
+                "existing": 0,
+                "errors": 0,
+                "total": 3,
+            },
+        )
+
         self.assertEqual(
-            draw_tree(node),
+            draw_tree(s3_import.root_node),
             dedent(
                 """
-                folder_1
-                ├─ file_1.pdf
+                .
+                ├─ test_archive.zip
+                ├─ 200x200.jpg
+                ├─ test_folder
+                │ ├─ 600x600.png
                 """
             ).strip(),
         )
 
-        self.s3_import.boto_resource.meta.client.generate_presigned_url.return_value = (
-            "http://s3/file.pdf"
+        self.assertListEqual(
+            [obj.key for obj in bucket.objects.all()],
+            # Both the archive and its contents are in the bucket
+            [
+                "200x200.jpg",
+                "test_archive.zip",
+                "test_folder/600x600.png",
+            ],
         )
-        mock.get("http://s3/file.pdf", body=(SAMPLES / "file.pdf").open("rb"))
 
-        mock.post(
-            "https://arkindex.teklia.com/api/v1/elements/create/?slim_output=True",
+        self.assertListEqual(
             [
-                {"json": {"id": "folder_1_id"}},
-                {"json": {"id": "file_1_id"}},
-                {"json": {"id": "page_1_id"}},
-                {"json": {"id": "page_2_id"}},
+                (req.method, req.url, json.loads(req.body) if req.body else None)
+                for req in requests_mock.request_history
+            ],
+            [
+                ("GET", "https://arkindex.teklia.com/api/v1/corpus/corpus_id/", None),
+                (
+                    "GET",
+                    "https://arkindex.teklia.com/api/v1/corpus/corpus_id/elements/?top_level=True&type=page&with_corpus=False&with_zone=False",
+                    None,
+                ),
+                (
+                    "GET",
+                    "https://arkindex.teklia.com/api/v1/corpus/corpus_id/elements/?type=folder&with_corpus=False&with_zone=False",
+                    None,
+                ),
+                (
+                    "POST",
+                    "https://arkindex.teklia.com/api/v1/image/iiif/url/",
+                    {"url": "https://server.test/iiif/2/testbucket%2F200x200.jpg"},
+                ),
+                (
+                    "POST",
+                    "https://arkindex.teklia.com/api/v1/elements/create/",
+                    {
+                        "type": "page",
+                        "name": "200x200.jpg",
+                        "corpus": "corpus_id",
+                        "image": "200x200_img_id",
+                        "worker_run_id": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
+                    },
+                ),
+                (
+                    "POST",
+                    "https://arkindex.teklia.com/api/v1/elements/create/",
+                    {
+                        "type": "folder",
+                        "name": "test_folder",
+                        "corpus": "corpus_id",
+                        "worker_run_id": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
+                    },
+                ),
+                (
+                    "POST",
+                    "https://arkindex.teklia.com/api/v1/image/iiif/url/",
+                    {
+                        "url": "https://server.test/iiif/2/testbucket%2Ftest_folder%2F600x600.png"
+                    },
+                ),
+                (
+                    "POST",
+                    "https://arkindex.teklia.com/api/v1/elements/create/",
+                    {
+                        "type": "page",
+                        "name": "600x600.png",
+                        "corpus": "corpus_id",
+                        "image": "600x600_img_id",
+                        "parent": "testfolder_id",
+                        "worker_run_id": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
+                    },
+                ),
             ],
         )
-        mock.post(
-            "https://arkindex.teklia.com/api/v1/image/iiif/url/",
+
+    @mock_s3
+    @requests_mock.Mocker()
+    @patch(
+        "arkindex_tasks.import_s3.worker.WORKER_RUN_ID",
+        "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
+    )
+    @patch(
+        "arkindex_tasks.import_files.pdf.WORKER_RUN_ID",
+        "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
+    )
+    def test_pdf_upload(self, requests_mock):
+        s3 = boto3.resource("s3", region_name="us-east-1")
+        bucket = s3.create_bucket(Bucket="testbucket")
+        with open(SAMPLES / "file.pdf", "rb") as f:
+            bucket.upload_fileobj(f, "file.pdf")
+
+        self.assertListEqual(
+            [obj.key for obj in bucket.objects.all()],
+            ["file.pdf"],
+        )
+
+        requests_mock.get(
+            "https://arkindex.teklia.com/api/v1/corpus/corpus_id/",
+            status_code=200,
+            json={
+                "id": "corpus_id",
+                "name": "Corpus Name",
+                "types": [
+                    {"id": "page_id", "slug": "page", "folder": False},
+                    {"id": "folder_id", "slug": "folder", "folder": True},
+                ],
+            },
+        )
+        requests_mock.get(
+            "https://arkindex.teklia.com/api/v1/corpus/corpus_id/elements/?top_level=True&type=page&with_corpus=False&with_zone=False",
+            json={},
+        )
+        requests_mock.get(
"https://arkindex.teklia.com/api/v1/corpus/corpus_id/elements/?type=folder&with_corpus=False&with_zone=False", + json={}, + ) + requests_mock.post( + "https://arkindex.teklia.com/api/v1/elements/create/", [ + {"json": {"id": "file_id"}}, { "json": { - "id": "img_1_id", - "width": 1200, - "height": 3000, + "id": "pdf_page_1_id", + "name": "pdf-0001-1.jpg", + "zone": { + "image": { + "id": "pdf_img_1_id", + "width": 1200, + "height": 3000, + }, + "polygon": [ + [0, 0], + [1200, 0], + [1200, 3000], + [0, 3000], + [0, 0], + ], + }, } }, { "json": { - "id": "img_2_id", - "width": 1200, - "height": 3000, + "id": "pdf_page_2_id", + "name": "pdf-0001-2.jpg", + "zone": { + "image": { + "id": "pdf_img_2_id", + "width": 1200, + "height": 3000, + }, + "polygon": [ + [0, 0], + [1200, 0], + [1200, 3000], + [0, 3000], + [0, 0], + ], + }, } }, ], ) - mock.post("/api/v1/element/page_1_id/transcriptions/bulk/") - mock.post("/api/v1/element/page_2_id/transcriptions/bulk/") + requests_mock.post( + "https://arkindex.teklia.com/api/v1/image/iiif/url/", + [ + {"json": {"id": "pdf_img_1_id"}}, + {"json": {"id": "pdf_img_2_id"}}, + ], + ) + requests_mock.post("/api/v1/element/pdf_page_1_id/transcriptions/bulk/") + requests_mock.post("/api/v1/element/pdf_page_2_id/transcriptions/bulk/") - self.s3_import.progress["total"] = len(node) - self.s3_import.build_elements(node) + with patch( + "arkindex_tasks.import_s3.worker.get_client_from_env", return_value=s3 + ): + s3_import = S3Import( + corpus="corpus_id", + element=None, + bucket="testbucket", + prefix="", + iiif_base_url="https://server.test/iiif/2", + bucket_prefix=True, + folder_type="folder", + page_type="page", + verbose=False, + ) + s3_import.run() + + self.assertEqual( + draw_tree(s3_import.root_node), + dedent( + """ + . 
+                ├─ file.pdf
+                ├─ file
+                │ ├─ pdf-0001-1.jpg
+                │ ├─ pdf-0001-2.jpg
+                """
+            ).strip(),
+        )
+
+        self.assertListEqual(
+            [obj.key for obj in bucket.objects.all()],
+            [
+                "file.pdf",
+                "file/pdf-0001-1.jpg",
+                "file/pdf-0001-2.jpg",
+            ],
+        )
+
+        self.assertIn("file", s3_import.pdf_paths)
 
         self.assertListEqual(
             [
                 (req.method, req.url, json.loads(req.body) if req.body else None)
-                for req in mock.request_history
+                for req in requests_mock.request_history
             ],
             [
+                ("GET", "https://arkindex.teklia.com/api/v1/corpus/corpus_id/", None),
                 (
-                    "POST",
-                    "https://arkindex.teklia.com/api/v1/elements/create/?slim_output=True",
-                    {
-                        "type": "folder",
-                        "name": "folder_1",
-                        "corpus": "corpus_id",
-                        "worker_run_id": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
-                    },
+                    "GET",
+                    "https://arkindex.teklia.com/api/v1/corpus/corpus_id/elements/?top_level=True&type=page&with_corpus=False&with_zone=False",
+                    None,
+                ),
+                (
+                    "GET",
+                    "https://arkindex.teklia.com/api/v1/corpus/corpus_id/elements/?type=folder&with_corpus=False&with_zone=False",
+                    None,
                 ),
                 (
                     "POST",
-                    "https://arkindex.teklia.com/api/v1/elements/create/?slim_output=True",
+                    "https://arkindex.teklia.com/api/v1/elements/create/",
                     {
                         "type": "folder",
-                        "name": "file_1.pdf",
+                        "name": "file",
                         "corpus": "corpus_id",
-                        "parent": "folder_1_id",
                         "worker_run_id": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
                     },
                 ),
-                ("GET", "http://s3/file.pdf", None),
                 (
                     "POST",
                     "https://arkindex.teklia.com/api/v1/image/iiif/url/",
                     {
-                        "url": "https://server.test/iiif/2/s3_bucket%2Ffolder_1%2Ffile_1.pdf;1"
+                        "url": "https://server.test/iiif/2/testbucket%2Ffile%2Fpdf-0001-1.jpg"
                     },
                 ),
                 (
                     "POST",
-                    "https://arkindex.teklia.com/api/v1/elements/create/?slim_output=True",
+                    "https://arkindex.teklia.com/api/v1/elements/create/",
                     {
                         "type": "page",
-                        "name": "1",
+                        "name": "pdf-0001-1.jpg",
                         "corpus": "corpus_id",
-                        "image": "img_1_id",
-                        "parent": "file_1_id",
+                        "image": "pdf_img_1_id",
+                        "parent": "file_id",
                         "worker_run_id": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
                     },
                 ),
@@ -474,24 +713,24 @@ class TestS3Import(TestCase):
                     "POST",
                     "https://arkindex.teklia.com/api/v1/image/iiif/url/",
                     {
-                        "url": "https://server.test/iiif/2/s3_bucket%2Ffolder_1%2Ffile_1.pdf;2"
+                        "url": "https://server.test/iiif/2/testbucket%2Ffile%2Fpdf-0001-2.jpg"
                     },
                 ),
                 (
                     "POST",
-                    "https://arkindex.teklia.com/api/v1/elements/create/?slim_output=True",
+                    "https://arkindex.teklia.com/api/v1/elements/create/",
                     {
                         "type": "page",
-                        "name": "2",
+                        "name": "pdf-0001-2.jpg",
                         "corpus": "corpus_id",
-                        "image": "img_2_id",
-                        "parent": "file_1_id",
+                        "image": "pdf_img_2_id",
+                        "parent": "file_id",
                         "worker_run_id": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
                     },
                 ),
                 (
                     "POST",
-                    "https://arkindex.teklia.com/api/v1/element/page_1_id/transcriptions/bulk/",
+                    "https://arkindex.teklia.com/api/v1/element/pdf_page_1_id/transcriptions/bulk/",
                     {
                         "element_type": "text_line",
                         "transcriptions": [
@@ -611,7 +850,7 @@ class TestS3Import(TestCase):
                 ),
                 (
                     "POST",
-                    "https://arkindex.teklia.com/api/v1/element/page_2_id/transcriptions/bulk/",
+                    "https://arkindex.teklia.com/api/v1/element/pdf_page_2_id/transcriptions/bulk/",
                    {
                         "element_type": "text_line",
                         "transcriptions": [
@@ -687,180 +926,3 @@ class TestS3Import(TestCase):
                 ),
             ],
         )
-        self.assertDictEqual(
-            self.s3_import.progress,
-            {
-                "completed": 2,
-                "existing": 0,
-                "errors": 0,
-                "total": 2,
-            },
-        )
-
-    @mock_s3
-    @requests_mock.Mocker()
-    @patch(
-        "arkindex_tasks.import_s3.worker.WORKER_RUN_ID",
-        "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
-    )
-    def test_zip_archives_upload(self, requests_mock):
-        s3 = boto3.resource("s3", region_name="us-east-1")
-        bucket = s3.create_bucket(Bucket="testbucket")
-        with open(SAMPLES / "test_archive.zip", "rb") as f:
-            bucket.upload_fileobj(f, "test_archive.zip")
-
-        self.assertListEqual(
-            [obj.key for obj in bucket.objects.all()],
-            # Only the archive is in the bucket
-            ["test_archive.zip"],
-        )
-
-        requests_mock.get(
-            "https://arkindex.teklia.com/api/v1/corpus/corpus_id/",
-            status_code=200,
-            json={
-                "id": "corpus_id",
-                "name": "Corpus Name",
-                "types": [
-                    {"id": "page_id", "slug": "page", "folder": False},
-                    {"id": "folder_id", "slug": "folder", "folder": True},
-                ],
-            },
-        )
-        requests_mock.get(
-            "https://arkindex.teklia.com/api/v1/corpus/corpus_id/elements/?top_level=True&type=page&with_corpus=False&with_zone=False",
-            json={},
-        )
-        requests_mock.get(
-            "https://arkindex.teklia.com/api/v1/corpus/corpus_id/elements/?type=folder&with_corpus=False&with_zone=False",
-            json={},
-        )
-        requests_mock.post(
-            "https://arkindex.teklia.com/api/v1/image/iiif/url/",
-            [
-                {"json": {"id": "200x200_img_id"}},
-                {"json": {"id": "600x600_img_id"}},
-            ],
-        )
-        requests_mock.post(
-            "https://arkindex.teklia.com/api/v1/elements/create/?slim_output=True",
-            [
-                {"json": {"id": "200x200_id"}},
-                {"json": {"id": "testfolder_id"}},
-                {"json": {"id": "600x600_id"}},
-            ],
-        )
-
-        with patch(
-            "arkindex_tasks.import_s3.worker.get_client_from_env", return_value=s3
-        ):
-            s3_import = S3Import(
-                corpus="corpus_id",
-                element=None,
-                bucket="testbucket",
-                prefix="",
-                iiif_base_url="https://server.test/iiif/2",
-                bucket_prefix=True,
-                folder_type="folder",
-                page_type="page",
-                verbose=False,
-            )
-            s3_import.run()
-
-        self.assertDictEqual(
-            s3_import.progress,
-            {
-                "completed": 3,
-                "existing": 0,
-                "errors": 0,
-                "total": 3,
-            },
-        )
-
-        self.assertEqual(
-            draw_tree(s3_import.root_node),
-            dedent(
-                """
-                .
-                ├─ 200x200.jpg
-                ├─ test_archive.zip
-                ├─ test_folder
-                │ ├─ 600x600.png
-                """
-            ).strip(),
-        )
-
-        self.assertListEqual(
-            [obj.key for obj in bucket.objects.all()],
-            # Both the archive and its contents are in the bucket
-            [
-                "200x200.jpg",
-                "test_archive.zip",
-                "test_folder/600x600.png",
-            ],
-        )
-
-        self.assertListEqual(
-            [
-                (req.method, req.url, json.loads(req.body) if req.body else None)
-                for req in requests_mock.request_history
-            ],
-            [
-                ("GET", "https://arkindex.teklia.com/api/v1/corpus/corpus_id/", None),
-                (
-                    "GET",
-                    "https://arkindex.teklia.com/api/v1/corpus/corpus_id/elements/?top_level=True&type=page&with_corpus=False&with_zone=False",
-                    None,
-                ),
-                (
-                    "GET",
-                    "https://arkindex.teklia.com/api/v1/corpus/corpus_id/elements/?type=folder&with_corpus=False&with_zone=False",
-                    None,
-                ),
-                (
-                    "POST",
-                    "https://arkindex.teklia.com/api/v1/image/iiif/url/",
-                    {"url": "https://server.test/iiif/2/testbucket%2F200x200.jpg"},
-                ),
-                (
-                    "POST",
-                    "https://arkindex.teklia.com/api/v1/elements/create/?slim_output=True",
-                    {
-                        "type": "page",
-                        "name": "200x200.jpg",
-                        "corpus": "corpus_id",
-                        "image": "200x200_img_id",
-                        "worker_run_id": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
-                    },
-                ),
-                (
-                    "POST",
-                    "https://arkindex.teklia.com/api/v1/elements/create/?slim_output=True",
-                    {
-                        "type": "folder",
-                        "name": "test_folder",
-                        "corpus": "corpus_id",
-                        "worker_run_id": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
-                    },
-                ),
-                (
-                    "POST",
-                    "https://arkindex.teklia.com/api/v1/image/iiif/url/",
-                    {
-                        "url": "https://server.test/iiif/2/testbucket%2Ftest_folder%2F600x600.png"
-                    },
-                ),
-                (
-                    "POST",
-                    "https://arkindex.teklia.com/api/v1/elements/create/?slim_output=True",
-                    {
-                        "type": "page",
-                        "name": "600x600.png",
-                        "corpus": "corpus_id",
-                        "image": "600x600_img_id",
-                        "parent": "testfolder_id",
-                        "worker_run_id": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
-                    },
-                ),
-            ],
-        )