diff --git a/README.md b/README.md
index 10ac7b18278a11b0edaf6873afae411f9f6609aa..3eaed9e501b6d0282bc58ee924a849b041294e77 100644
--- a/README.md
+++ b/README.md
@@ -72,14 +72,3 @@ To run the unit tests with `tox`, you will have to use an argument to switch its
 ```console
 tox -e local
 ```
-
-## Tasks
-
-### Import a transkribus collection
-
-```
-export TRANSKRIBUS_EMAIL=someone@teklia.com TRANSKRIBUS_PASSWORD=abc123
-export ARKINDEX_WORKER_RUN_ID=1234
-export ARKINDEX_API_URL=https://arkindex.dev.teklia.com ARKINDEX_API_TOKEN=xxxxxxxxx
-python3 -m arkindex_tasks.import_balsac <ID collection> --corpus <UUID Corpus Arkindex>
-```
diff --git a/arkindex_tasks/base.py b/arkindex_tasks/base.py
index e8d74d86a16cf9a5d07f1976dda38b3dbf21b0ae..bb9caf85164f38d71068af004725255d5459a6e7 100644
--- a/arkindex_tasks/base.py
+++ b/arkindex_tasks/base.py
@@ -48,25 +48,6 @@ def dump_json(elements, path=None, filename="elements.json"):
         json.dump(elements, f, indent=4)
 
 
-def dump_elements(path=None, **elements_lists):
-    """
-    Helper method to write a list of elements in task artifacts directory
-    """
-    elements = []
-    for elt_type in elements_lists:
-        elements_ids = list(map(_get_id, elements_lists[elt_type]))
-        elements.extend([{"type": elt_type, "id": elt_id} for elt_id in elements_ids])
-    dump_json(elements=elements, path=path)
-
-
-def dump_transcriptions(transcriptions, path=None, filename="transcriptions.json"):
-    file_path = path and Path(path, filename)
-    if not file_path:
-        file_path = get_working_dir() / filename
-    with file_path.open("w") as f:
-        json.dump(transcriptions, f, indent=4)
-
-
 def split_chunks(items, n):
     """
     Yield n number of elements from a given list with a balanced distribution
diff --git a/arkindex_tasks/enums.py b/arkindex_tasks/enums.py
index 3c170c37717d9529df910af16bba3934729122a3..75be6db9c556eacdc76fe9912575ab2f909ce42e 100644
--- a/arkindex_tasks/enums.py
+++ b/arkindex_tasks/enums.py
@@ -19,6 +19,7 @@ class ProcessMode(Enum):
     IIIF = "iiif"
     Workers = "workers"
     Template = "template"
-    Transkribus = "transkribus"
     S3 = "s3"
     Training = "training"
+    Local = "local"
+    Dataset = "dataset"
diff --git a/arkindex_tasks/export_transkribus.py b/arkindex_tasks/export_transkribus.py
deleted file mode 100644
index 104360c17b2605189372b754bef6d92e6d1e07c4..0000000000000000000000000000000000000000
--- a/arkindex_tasks/export_transkribus.py
+++ /dev/null
@@ -1,75 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-import argparse
-import json
-import logging
-import os
-
-from transkribus import TranskribusAPI, options_from_env
-from transkribus.models import Collection
-
-from arkindex_tasks.base import get_working_dir
-
-logging.basicConfig(format="[%(levelname)s] %(message)s", level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-transkribus_client = TranskribusAPI(**options_from_env())
-
-COLLECTION_EXPORT_PARAMS = {
-    "commonPars": {
-        "doWriteImages": True,
-        "doExportPageXml": True,
-        "doExportAltoXml": False,
-        # Not needed by this task, but setting this to False causes the archive to be empty
-        "doWriteMets": True,
-        "doWriteStructureInMets": False,
-        # Get the highest available image quality: the original images
-        "remoteImgQuality": "orig",
-        "fileNamePattern": "${pageNr}_${pageId}",
-        # No subdirectory for images
-        "useOcrMasterDir": False,
-        # No subdirectory for PAGE XML files
-        "pageDirName": "",
-        # Do not export document and page metadata to a separate metadata.xml
-        "doExportDocMetadata": False,
-        # Add the document and page metadata to each PAGE XML file directly
-        "exportTranscriptMetadata": True,
-    }
-}
-
-
-class TranskribusExporter(object):
-    def __init__(
-        self,
-        collection_id=None,
-    ):
-        self.collection = Collection(collection_id)
-
-    def run(self):
-        logger.info("Starting collection export…")
-        job = self.collection.export(
-            transkribus_client, export_params=COLLECTION_EXPORT_PARAMS
-        )
-
-        timeout = int(os.environ.get("TRANSKRIBUS_JOB_TIMEOUT", 7200))
-        job.wait_for_result(transkribus_client, timeout=timeout)
-
-        (get_working_dir() / "transkribus_export_job.json").write_text(
-            json.dumps(job.data)
-        )
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Export a Transkribus collection to a ZIP"
-    )
-    parser.add_argument(
-        "collection_id", help="ID of a Transkribus collection to import from", type=int
-    )
-    args = vars(parser.parse_args())
-
-    TranskribusExporter(**args).run()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/arkindex_tasks/import_files/base.py b/arkindex_tasks/import_files/base.py
index 426f17e645a95ef72c132478ece70097bb6bf859..220e9b18066f3d904ab08ba633b2e270bd314499 100644
--- a/arkindex_tasks/import_files/base.py
+++ b/arkindex_tasks/import_files/base.py
@@ -3,6 +3,7 @@ import hashlib
 import logging
 import sys
 from datetime import datetime
+from zipfile import BadZipFile, ZipFile
 
 import requests
 from apistar.exceptions import ErrorResponse
@@ -13,6 +14,7 @@ from arkindex_tasks import default_client
 from arkindex_tasks.base import WORKER_RUN_ID, ProcessTask, dump_json
 from arkindex_tasks.import_files.image import check_image
 from arkindex_tasks.import_files.pdf import extract_pdf_images, upload_pdf_text
+from arkindex_tasks.import_files.transkribus import TranskribusImporter
 
 logger = logging.getLogger(__name__)
 
@@ -164,6 +166,28 @@
         )
         return elements
 
+    def is_transkribus_export(self, datafile) -> bool:
+        try:
+            # Opening the file with ZipFile will raise a BadZipFile exception
+            # when the file is not a valid ZIP file.
+            with ZipFile(datafile["local_path"]) as archive:
+                # To detect that an archive is a Transkribus export, we look for a `mets.xml` file,
+                # which should be present in every export, under the subdirectories for each document.
+                if any(
+                    filename.endswith("/mets.xml") for filename in archive.namelist()
+                ):
+                    return True
+
+                logger.error(
+                    f"Archive {datafile['name']} does not appear to be a Transkribus export."
+                )
+                return False
+        except BadZipFile:
+            logger.error(
+                f"File {datafile['name']} does not appear to be a valid ZIP archive."
+            )
+            return False
+
     def run(self):
         assert WORKER_RUN_ID, "A WorkerRun ID is required"
 
@@ -197,12 +221,35 @@
 
         successful_datafiles = []
         for df in datafiles:
-            logger.info("Fetching images for {}".format(df["name"]))
-            images = self.get_images(df)
-            if not images:
-                continue
-            logger.info("Creating {} elements".format(len(images)))
-            new_elements = self.save_elements(df, images)
+            # application/zip is the standard, but all browsers on Windows
+            # upload files with application/x-zip-compressed instead.
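+            # Checking the MIME type alone cannot tell a Transkribus export
+            # apart from any other ZIP upload, so the archive contents are
+            # inspected below before an import is started.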
+            if df["content_type"] in (
+                "application/zip",
+                "application/x-zip-compressed",
+            ):
+                if not self.is_transkribus_export(df):
+                    continue
+
+                logger.info(f'Starting Transkribus import for {df["name"]}')
+
+                importer = TranskribusImporter(
+                    df["local_path"],
+                    corpus=self.process["corpus"],
+                    parent_id=self.process["element"]["id"]
+                    if self.process["element"]
+                    else None,
+                    folder_type=self.process["folder_type"],
+                    element_type=self.process["element_type"],
+                )
+                new_elements = importer.run()
+
+            else:
+                logger.info("Fetching images for {}".format(df["name"]))
+                images = self.get_images(df)
+                if not images:
+                    continue
+                logger.info("Creating {} elements".format(len(images)))
+                new_elements = self.save_elements(df, images)
 
             # Add text extraction on PDF files
             if df["content_type"] == "application/pdf":
diff --git a/arkindex_tasks/pagexml.py b/arkindex_tasks/import_files/pagexml.py
similarity index 100%
rename from arkindex_tasks/pagexml.py
rename to arkindex_tasks/import_files/pagexml.py
diff --git a/arkindex_tasks/import_transkribus.py b/arkindex_tasks/import_files/transkribus.py
similarity index 78%
rename from arkindex_tasks/import_transkribus.py
rename to arkindex_tasks/import_files/transkribus.py
index 723d4398f600f81a971ef33710022fdb48dff3f3..a519d3b1f86f52ca4af37dac90080d9caaea7553 100644
--- a/arkindex_tasks/import_transkribus.py
+++ b/arkindex_tasks/import_files/transkribus.py
@@ -2,9 +2,9 @@
 # -*- coding: utf-8 -*-
 import argparse
 import hashlib
-import json
 import logging
 import os
+import shutil
 import tempfile
 import uuid
 from itertools import groupby
@@ -15,10 +15,9 @@ import requests
 from apistar.exceptions import ErrorResponse
 from lxml import etree
 from PIL import Image, ImageOps
-from transkribus.models import Job
 
-from arkindex_tasks.base import WORKER_RUN_ID, dump_elements
-from arkindex_tasks.pagexml import PageXmlParser
+from arkindex_tasks.base import WORKER_RUN_ID, dump_json
+from arkindex_tasks.import_files.pagexml import PageXmlParser
 from arkindex_tasks.utils import default_client, retried_request
 
 logging.basicConfig(format="[%(levelname)s] %(message)s", level=logging.INFO)
@@ -60,8 +59,17 @@
     def save_image(self):
         logger.info(f"Extracting image {self.image_filename}")
-        with self.archive.open(self.image_filename) as f:
-            img = Image.open(f)
+
+        # Extract the image first before editing it.
+        # Using Image.open(ZipFile.open()) is 15 times slower than extracting first!
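+        # (presumably because PIL performs many small reads and seeks, which
+        # are expensive through the decompression stream of ZipFile.open())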
+        # Do not save in get_working_dir() here as it would become an artifact
+        _, img_path = tempfile.mkstemp()
+        with self.archive.open(self.image_filename) as source, open(
+            img_path, "wb"
+        ) as destination:
+            shutil.copyfileobj(source, destination)
+
+        with Image.open(img_path) as img:
             if img.format not in VALID_IMAGE_FORMATS:
                 # Convert to RGB if needed
                 if img.mode != "RGB":
@@ -69,12 +77,9 @@
                     img = img.convert("RGB")
                 # Apply the rotation/transposition from the EXIF orientation tag if it is used
                 img = ImageOps.exif_transpose(img)
-
-                # Save to a temporary file
-                # Do not save in get_working_dir() here as it would become an artifact
-                _, img_path = tempfile.mkstemp()
                 img.save(img_path, format="JPEG")
-        return img_path
+
+        return img_path
 
     def upload_image(self):
         image_path = self.save_image()
@@ -103,14 +108,13 @@
         )
 
     def upload_transcriptions(self):
-        data = []
-
+        element = None
         logger.info(f"Importing transcript {self.xml_filename}")
         try:
             # Read and parse the xml file
             xml_text = self.archive.read(self.xml_filename)
             parser = PageXmlParser(etree.fromstring(xml_text), self.image)
-            element, transcriptions = parser.save(
+            element, _ = parser.save(
                 corpus_id=self.corpus_id,
                 parent_id=self.folder_id,
                 image_id=self.image["id"],
@@ -120,20 +124,6 @@
             )
 
             if element is not None:
-                # Update data with:
-                # xml_file: to retrieve the xml file
-                # page_id: to reindex
-                # corpus_id: to create roles, a role is linked to a corpus
-                # transcriptions: to create entities, an entity is linked to a transcription
-                data.append(
-                    {
-                        "xml_file": self.xml_filename,
-                        "corpus_id": element["corpus"]["id"],
-                        "page_id": element["id"],
-                        "transcriptions": transcriptions,
-                    }
-                )
-
                 retried_request(
                     "CreateMetaData",
                     id=element["id"],
@@ -154,20 +144,20 @@
             )
         except requests.exceptions.RequestException as e:
             logger.error(f"Failed importing transcript {self.xml_filename}: {e}")
-        return data
+        return element
 
     def run(self):
         try:
             self.upload_image()
             return self.upload_transcriptions()
-        except Exception as e:
-            logger.error(
-                f"Failed importing {self.image_filename} and {self.xml_filename}: {e}"
-            )
         except ErrorResponse as e:
             logger.error(
                 f"Failed importing {self.image_filename} and {self.xml_filename}: {e.status_code} - {e.content}"
             )
+        except Exception as e:
+            logger.error(
+                f"Failed importing {self.image_filename} and {self.xml_filename}: {e}"
+            )
         return []
 
 
@@ -182,11 +172,13 @@
         element_type,
         paragraph_type,
         line_type,
+        parent_id=None,
     ):
         self.archive = archive
         self.filenames = filenames
         self.title = title
         self.corpus_id = corpus_id
+        self.parent_id = parent_id
         self.folder_type = folder_type
         self.element_type = element_type
         self.paragraph_type = paragraph_type
@@ -194,12 +186,22 @@
         self.folder = None
 
     def get_or_create_folder(self):
-        search = default_client.paginate(
-            "ListElements",
-            corpus=self.corpus_id,
-            name=self.title,
-            type=self.folder_type["slug"],
-        )
+        if self.parent_id is None:
+            search = default_client.paginate(
+                "ListElements",
+                corpus=self.corpus_id,
+                name=self.title,
+                type=self.folder_type["slug"],
+            )
+        else:
+            search = default_client.paginate(
+                "ListElementChildren",
+                id=self.parent_id,
+                name=self.title,
+                type=self.folder_type["slug"],
+                recursive=True,
+            )
+
         for folder in search:
             # The name filter on ListElements will match if the name of any element contains self.title (case insensitive)
            # so we need to check again for strict equality.
@@ -212,6 +214,7 @@
                 "CreateElement",
                 body={
                     "corpus": self.corpus_id,
+                    "parent": self.parent_id,
                     "type": self.folder_type["slug"],
                     "name": self.title,
                     "worker_run_id": WORKER_RUN_ID,
@@ -220,16 +223,17 @@
         return True
 
     def run(self):
-        transcriptions = []
+        elements = []
         try:
             created = self.get_or_create_folder()
+            elements.append(self.folder)
             if not created:
                 logger.info(
                     "Skipping existing {} {}".format(
                         self.folder_type["display_name"], self.folder["name"]
                     )
                 )
-                return self.folder, transcriptions
+                return elements
 
             # Group files by name without extension: each group will hold the PAGE XML file and the image
             for name, filenames in groupby(
@@ -250,22 +254,20 @@
             else:
                 image_filename, xml_filename = filenames
 
-                transcriptions += TranskribusElement(
-                    archive=self.archive,
-                    image_filename=str(image_filename),
-                    xml_filename=str(xml_filename),
-                    corpus_id=self.corpus_id,
-                    folder_id=self.folder["id"],
-                    element_type=self.element_type,
-                    paragraph_type=self.paragraph_type,
-                    line_type=self.line_type,
-                ).run()
-        except (AssertionError, Exception) as e:
-            logger.error(
-                "Failed importing {} {}: {}".format(
-                    self.folder_type["display_name"], self.title, e
-                )
-            )
+                element = TranskribusElement(
+                    archive=self.archive,
+                    image_filename=str(image_filename),
+                    xml_filename=str(xml_filename),
+                    corpus_id=self.corpus_id,
+                    folder_id=self.folder["id"],
+                    element_type=self.element_type,
+                    paragraph_type=self.paragraph_type,
+                    line_type=self.line_type,
+                ).run()
+                # run() returns a falsy value when the page could not be
+                # imported, so only keep actual elements
+                if element:
+                    elements.append(element)
         except ErrorResponse as e:
             logger.error(
                 "Failed importing {} {}: {} - {}".format(
@@ -275,32 +275,30 @@
                     e.content,
                 )
             )
-        return self.folder, transcriptions
+        except Exception as e:
+            logger.error(
+                "Failed importing {} {}: {}".format(
+                    self.folder_type["display_name"], self.title, e
+                )
+            )
+        return elements
 
 
 class TranskribusImporter(object):
     def __init__(
         self,
-        job_path=None,
-        archive_path=None,
-        corpus=None,
-        folder_type=None,
-        element_type=None,
-        paragraph_type=None,
-        line_type=None,
+        archive_path,
+        *,
+        corpus,
+        folder_type,
+        element_type,
+        parent_id=None,
+        paragraph_type="paragraph",
+        line_type="text_line",
     ):
-        assert (job_path is not None) ^ (
-            archive_path is not None
-        ), "Either specify job path or archive path"
-
-        if job_path is not None:
-            assert job_path.is_file(), f"JSON file at {job_path} not found"
-            self.job_path = job_path
-            self.archive_path = None
-        else:
-            assert archive_path.is_file(), f"Archive at {archive_path} not found"
-            self.archive_path = archive_path
-            self.job_path = None
+
+        assert archive_path.is_file(), f"Archive at {archive_path} not found"
+        self.archive_path = archive_path
 
         assert WORKER_RUN_ID, "A WorkerRun ID is required"
 
@@ -312,7 +310,23 @@
             logger.error("Corpus {} not found".format(corpus))
             raise
 
-        # Check folder
+        # Check parent element
+        if parent_id:
+            try:
+                self.parent = retried_request("RetrieveElement", id=parent_id)
+            except ErrorResponse as e:
+                if e.status_code == 404:
+                    logger.error(f"Parent element {parent_id} not found")
+                raise
+
+            if self.parent["corpus"]["id"] != self.corpus["id"]:
+                raise ValueError(
+                    f"Parent element {parent_id} is not in corpus {self.corpus['id']}"
+                )
+        else:
+            self.parent = None
+
+        # Check folder type
         try:
             self.folder_type = next(
                 t for t in self.corpus["types"] if t["slug"] == folder_type
             )
@@ -373,29 +387,10 @@
             raise ValueError("Type {} should not be a folder type".format(line_type))
 
     def run(self):
-
-        if self.job_path is not None:
-            # Download a Transkribus export and import it
-            job = Job(json.loads(self.job_path.read_text()))
-
-            _, archive_path = tempfile.mkstemp()
-            try:
-                job.download_result(archive_path)
-                self._import(archive_path)
-            finally:
-                os.unlink(archive_path)
-        elif self.archive_path is not None:
-            # Directly import a locally available Transkribus export
-            self._import(self.archive_path)
-        else:
-            raise NotImplementedError
-
-    def _import(self, archive_path):
         elements = []
-        transcriptions = []
         # Archive structure: <document ID>/<document title>/[<optional subdir>/]<page number>_<page ID>.[xml,jpg,tif…]
         # Also includes a log.txt file at the root and a mets.xml in each document folder.
-        with ZipFile(archive_path) as archive:
+        with ZipFile(self.archive_path) as archive:
 
             paths = sorted(map(Path, archive.namelist()))
 
@@ -419,37 +414,30 @@
                     continue
 
                 try:
-                    folder, trs = TranskribusFolder(
+                    elements += TranskribusFolder(
                         archive,
                         filenames,
                         # `parent` here is <doc ID>/<doc title>, so we extract the title
                         title=parent.name,
                         corpus_id=self.corpus["id"],
+                        parent_id=self.parent["id"] if self.parent else None,
                         folder_type=self.folder_type,
                         element_type=self.element_type,
                         paragraph_type=self.paragraph_type,
                         line_type=self.line_type,
                     ).run()
-                    elements.append(folder)
-                    transcriptions += trs
                 except Exception as e:
                     logger.error(f"Failed importing folder {parent}: {e}")
 
-        dump_elements(elements=elements)
+        return elements
 
 
 def main():
     parser = argparse.ArgumentParser(
         description="Import a Transkribus ZIP export into Arkindex"
     )
-    paths = parser.add_mutually_exclusive_group()
-    paths.add_argument(
-        "--job-path",
-        help="Path to a JSON file holding a Transkribus import job",
-        type=Path,
-    )
-    paths.add_argument(
-        "--archive-path",
+    parser.add_argument(
+        "archive_path",
         help="Path to a ZIP file holding a Transkribus export",
         type=Path,
     )
@@ -459,6 +447,11 @@
         type=uuid.UUID,
         required=True,
     )
+    parser.add_argument(
+        "--parent-id",
+        help="UUID of an existing parent element in the corpus to import into",
+        type=uuid.UUID,
+    )
     parser.add_argument(
         "--folder-type",
         help="Slug of an element type to use for documents",
@@ -489,7 +482,8 @@
 
     default_client.sleep_duration = args.pop("sleep")
 
-    TranskribusImporter(**args).run()
+    elements = TranskribusImporter(**args).run()
+    dump_json(elements)
 
 
 if __name__ == "__main__":
diff --git a/tests/import_files/test_base.py b/tests/import_files/test_base.py
index dd59bd2a2ca85caef1e6bce14c2984bbc8f97a7e..894cb683d41f80d1b1083ff9d741ca0afbffc06c 100644
--- a/tests/import_files/test_base.py
+++ b/tests/import_files/test_base.py
@@ -2,8 +2,10 @@
 import json
 from datetime import datetime
 from pathlib import Path
+from tempfile import TemporaryFile
 from unittest import TestCase
 from unittest.mock import call, patch
+from zipfile import ZipFile
 
 import requests_mock
 from apistar.exceptions import ErrorResponse
@@ -1091,3 +1093,167 @@
             {"id": "pageid", "type": "page"},
         ],
     )
+
+    @patch(
+        "arkindex_tasks.import_files.base.WORKER_RUN_ID",
+        "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
+    )
+    @patch("arkindex_tasks.import_files.base.TranskribusImporter")
+    @patch("arkindex_tasks.import_files.base.logger.error")
+    def test_run_zip(self, mock, logger_mock, importer_mock):
+        # Process info
+        mock.get(
+            "/api/v1/process/processid/",
+            json={
+                "id": "processid",
+                "corpus": "corpusid",
"mode": "files", + "element": {"id": "folderid", "name": "Untitled", "type": "book"}, + "files": ["file1", "file2", "file3", "file4"], + "element_type": "page", + "folder_type": "book", + }, + ) + # DataFile info + mock.get( + "/api/v1/process/file/file1/", + json={ + "id": "file1", + "name": "file1.zip", + "content_type": "application/zip", + "s3_url": "http://s3/file1.zip", + }, + ) + mock.get( + "/api/v1/process/file/file2/", + json={ + "id": "file2", + "name": "file2.zip", + "content_type": "application/zip", + "s3_url": "http://s3/file2.zip", + }, + ) + mock.get( + "/api/v1/process/file/file3/", + json={ + "id": "file3", + "name": "file3.zip", + # Use the Windows MIME type + "content_type": "application/x-zip-compressed", + "s3_url": "http://s3/file3.zip", + }, + ) + mock.get( + "/api/v1/process/file/file4/", + json={ + "id": "file4", + "name": "file4.zip", + "content_type": "application/zip", + "s3_url": "http://s3/file4.zip", + }, + ) + + # Create an archive that looks like a Transkribus export + transkribus_zip = TemporaryFile() + with ZipFile(transkribus_zip, "a") as archive: + archive.writestr("1234/document_title/mets.xml", "lol") + transkribus_zip.flush() + transkribus_zip.seek(0) + mock.get("http://s3/file1.zip", body=transkribus_zip) + + # An archive that isn't a Transkribus export + dummy_zip = TemporaryFile() + with ZipFile(dummy_zip, "a") as archive: + archive.writestr("lol.txt", "lol") + dummy_zip.flush() + dummy_zip.seek(0) + mock.get("http://s3/file2.zip", body=dummy_zip) + + # Windows MIME type test: we just use a dummy archive. + # We cannot re-use the dummy_zip defined above since it will be consumed by the mocked request + windows_zip = TemporaryFile() + with ZipFile(windows_zip, "a") as archive: + archive.writestr("lol.txt", "lol") + windows_zip.flush() + windows_zip.seek(0) + mock.get("http://s3/file3.zip", body=windows_zip) + + # Not actually a zip file + mock.get("http://s3/file4.zip", body=open(SAMPLES / "file2.pdf", "rb")) + + mock.post( + "/api/v1/elements/create/", + [ + {"json": {"id": "box1", "type": "book"}}, + {"json": {"id": "box2", "type": "book"}}, + ], + ) + # Process update + mock.patch("/api/v1/process/processid/", status_code=200) + # Only file1 gets deleted, since the other files will not be imported successfully + mock.delete("/api/v1/process/file/file1/", status_code=204) + + importer_mock.run.return_value = [{"id": "imported", "type": "thing"}] + + fi = FileImport("processid") + fi.run() + + # file1.zip triggers a Transkribus import + importer_mock.assert_called_once() + + # The only positional argument is a path to the temporary file + # that the file import uses to download the file. 
+        self.assertEqual(len(importer_mock.call_args.args), 1)
+        # To check that this is the Transkribus export, we open the archive and look for the XML file we put in there
+        with ZipFile(importer_mock.call_args.args[0]) as archive:
+            self.assertSequenceEqual(
+                archive.namelist(), ["1234/document_title/mets.xml"]
+            )
+
+        self.assertDictEqual(
+            importer_mock.call_args.kwargs,
+            {
+                "corpus": "corpusid",
+                "parent_id": "folderid",
+                "folder_type": "book",
+                "element_type": "page",
+            },
+        )
+
+        self.assertListEqual(
+            logger_mock.call_args_list,
+            [
+                # Dummy ZIP file
+                call("Archive file2.zip does not appear to be a Transkribus export."),
+                # Dummy ZIP file with Windows MIME type
+                call("Archive file3.zip does not appear to be a Transkribus export."),
+                # Not a ZIP file
+                call("File file4.zip does not appear to be a valid ZIP archive."),
+            ],
+        )
+
+        self.assertListEqual(
+            [(req.method, req.url) for req in mock.request_history],
+            [
+                ("GET", "https://arkindex.teklia.com/api/v1/process/processid/"),
+                ("GET", "https://arkindex.teklia.com/api/v1/process/file/file1/"),
+                ("GET", "http://s3/file1.zip"),
+                ("GET", "https://arkindex.teklia.com/api/v1/process/file/file2/"),
+                ("GET", "http://s3/file2.zip"),
+                ("GET", "https://arkindex.teklia.com/api/v1/process/file/file3/"),
+                ("GET", "http://s3/file3.zip"),
+                ("GET", "https://arkindex.teklia.com/api/v1/process/file/file4/"),
+                ("GET", "http://s3/file4.zip"),
+                ("PATCH", "https://arkindex.teklia.com/api/v1/process/processid/"),
+                ("DELETE", "https://arkindex.teklia.com/api/v1/process/file/file1/"),
+            ],
+        )
+
+        with (get_working_dir() / "elements.json").open() as f:
+            elements = json.load(f)
+        self.assertListEqual(
+            elements,
+            [
+                {"id": "folderid", "type": "book"},
+                {"id": "imported", "type": "thing"},
+            ],
+        )
diff --git a/tests/test_pagexml.py b/tests/import_files/test_pagexml.py
similarity index 96%
rename from tests/test_pagexml.py
rename to tests/import_files/test_pagexml.py
index 2a737fc94c45182cff81c57eb907da6dbe32b5e0..b7dd23a313fd374b073b9c2beb57f3fc614a8b9c 100644
--- a/tests/test_pagexml.py
+++ b/tests/import_files/test_pagexml.py
@@ -8,13 +8,13 @@ from unittest.mock import patch
 import requests_mock
 from lxml import etree
 
-from arkindex_tasks.pagexml import PageXmlParser
+from arkindex_tasks.import_files.pagexml import PageXmlParser
 
 EMPTY_XML = etree.fromstring(
     '<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"><Page></Page></PcGts>'
 )
 
-SAMPLES = Path(__file__).absolute().parent / "samples"
+SAMPLES = Path(__file__).absolute().parent.parent / "samples"
 
 fake_region = namedtuple(
     "fake_region",
@@ -155,7 +155,8 @@
     @requests_mock.Mocker()
     @patch(
-        "arkindex_tasks.pagexml.WORKER_RUN_ID", "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"
+        "arkindex_tasks.import_files.pagexml.WORKER_RUN_ID",
+        "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
     )
     def test_save(self, mock):
         xml_file = (SAMPLES / "transcript.xml").read_bytes()
@@ -263,7 +264,8 @@
     @requests_mock.Mocker()
     @patch(
-        "arkindex_tasks.pagexml.WORKER_RUN_ID", "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"
+        "arkindex_tasks.import_files.pagexml.WORKER_RUN_ID",
+        "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
    )
     def test_save_no_lines(self, mock):
         xml_file = (SAMPLES / "no_lines.xml").read_bytes()
@@ -320,7 +322,8 @@
     @requests_mock.Mocker()
     @patch(
-        "arkindex_tasks.pagexml.WORKER_RUN_ID", "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"
+        "arkindex_tasks.import_files.pagexml.WORKER_RUN_ID",
+        "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
     )
     def test_save_no_valid_regions(self, mock):
         bad_image = {
diff --git a/tests/import_transkribus/__init__.py b/tests/import_files/transkribus/__init__.py
similarity index 100%
rename from tests/import_transkribus/__init__.py
rename to tests/import_files/transkribus/__init__.py
diff --git a/tests/import_transkribus/test_element.py b/tests/import_files/transkribus/test_element.py
similarity index 97%
rename from tests/import_transkribus/test_element.py
rename to tests/import_files/transkribus/test_element.py
index 620fd4173757dd9d9dd7b63843d94efaaf711639..4a468554adc8c312226d8413591dcc9fff33b183 100644
--- a/tests/import_transkribus/test_element.py
+++ b/tests/import_files/transkribus/test_element.py
@@ -10,13 +10,13 @@ import requests_mock
 from apistar.exceptions import ErrorResponse
 
 from arkindex_tasks.base import get_working_dir
-from arkindex_tasks.import_transkribus import TranskribusElement
+from arkindex_tasks.import_files.transkribus import TranskribusElement
 
-SAMPLES = Path(__file__).absolute().parent.parent / "samples"
+SAMPLES = Path(__file__).absolute().parent.parent.parent / "samples"
 
 
 @patch(
-    "arkindex_tasks.import_transkribus.retried_request.retry.wait",
+    "arkindex_tasks.import_files.transkribus.retried_request.retry.wait",
     0,
 )
 class TestTranskribusElement(TestCase):
@@ -148,11 +148,11 @@
         )
 
     @patch(
-        "arkindex_tasks.pagexml.WORKER_RUN_ID",
+        "arkindex_tasks.import_files.pagexml.WORKER_RUN_ID",
         "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
     )
     @patch(
-        "arkindex_tasks.import_transkribus.WORKER_RUN_ID",
+        "arkindex_tasks.import_files.transkribus.WORKER_RUN_ID",
         "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
     )
     @requests_mock.Mocker()
@@ -316,7 +316,7 @@
             ],
         )
 
-    @patch("arkindex_tasks.pagexml.WORKER_RUN_ID", str(uuid4()))
+    @patch("arkindex_tasks.import_files.pagexml.WORKER_RUN_ID", str(uuid4()))
     @requests_mock.Mocker()
     def test_run(self, mock):
         mock.post(
diff --git a/tests/import_transkribus/test_folder.py b/tests/import_files/transkribus/test_folder.py
similarity index 94%
rename from tests/import_transkribus/test_folder.py
rename to tests/import_files/transkribus/test_folder.py
index 232c907d87b125ce129e4cd8dc1a4fcc198659cf..5d4631352dfe29c21dd7f40e8657abe14d1966aa 100644
--- a/tests/import_transkribus/test_folder.py
+++ b/tests/import_files/transkribus/test_folder.py
@@ -4,7 +4,7 @@ from unittest.mock import patch
 
 import requests_mock
 
-from arkindex_tasks.import_transkribus import TranskribusFolder
+from arkindex_tasks.import_files.transkribus import TranskribusFolder
 
 
 class TestTranskribusFolder(TestCase):
@@ -51,7 +51,7 @@
 
     @requests_mock.Mocker()
     @patch(
-        "arkindex_tasks.import_transkribus.WORKER_RUN_ID",
+        "arkindex_tasks.import_files.transkribus.WORKER_RUN_ID",
         "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
     )
     def test_create_folder(self, mock):
@@ -82,6 +82,7 @@
             {
                 "corpus": "corpusid",
                 "type": "book",
+                "parent": None,
                 "name": "Untitled (3)",
                 "worker_run_id": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
             },
diff --git a/tests/import_transkribus/test_importer.py b/tests/import_files/transkribus/test_importer.py
similarity index 65%
rename from tests/import_transkribus/test_importer.py
rename to tests/import_files/transkribus/test_importer.py
index e75d4735be07a9f204577f9109f46c93d084fe32..fae57006fc099b5e85ef224e2efca68bcfbfc520 100644
--- a/tests/import_transkribus/test_importer.py
+++ b/tests/import_files/transkribus/test_importer.py
@@ -1,5 +1,4 @@
 # -*- coding: utf-8 -*-
-import json
 import tempfile
 from pathlib import Path
 from unittest import TestCase
@@ -10,10 +9,9 @@ from zipfile import ZipFile
 import requests_mock
 from apistar.exceptions import ErrorResponse
 
-from arkindex_tasks.base import get_working_dir
-from arkindex_tasks.import_transkribus import TranskribusImporter
+from arkindex_tasks.import_files.transkribus import TranskribusImporter
 
-SAMPLES = Path(__file__).absolute().parent.parent / "samples"
+SAMPLES = Path(__file__).absolute().parent.parent.parent / "samples"
 EXPECTED = SAMPLES / "expected"
 
 CORPUS_INFO = {
@@ -41,12 +39,6 @@
 class TestTranskribusImporter(TestCase):
     @classmethod
     def setUpClass(cls):
-        _, cls.job_path = tempfile.mkstemp()
-        cls.job_path = Path(cls.job_path)
-        cls.job_path.write_text(
-            json.dumps({"jobId": 999999, "result": "https://transkribus.eu/export.zip"})
-        )
-
         _, cls.archive_path = tempfile.mkstemp()
         cls.archive_path = Path(cls.archive_path)
         with ZipFile(cls.archive_path, mode="w") as archive:
@@ -71,14 +63,13 @@
 
     @classmethod
     def tearDownClass(cls):
-        cls.job_path.unlink()
         cls.archive_path.unlink()
 
     def setUp(self):
         self.maxDiff = None
 
     @requests_mock.Mocker()
-    @patch("arkindex_tasks.import_transkribus.WORKER_RUN_ID", str(uuid4()))
+    @patch("arkindex_tasks.import_files.transkribus.WORKER_RUN_ID", str(uuid4()))
     def test_init(self, mock):
         mock.get("https://arkindex.teklia.com/api/v1/corpus/notfound/", status_code=404)
         mock.get(
@@ -87,7 +78,7 @@
 
         with self.assertRaises(ErrorResponse):
             TranskribusImporter(
-                self.job_path,
+                self.archive_path,
                 corpus="notfound",
                 folder_type="book",
                 element_type="page",
@@ -97,7 +88,7 @@
 
         with self.assertRaises(ValueError, msg="Type hook not found in corpus Korpus"):
             TranskribusImporter(
-                self.job_path,
+                self.archive_path,
                 corpus="corpusid",
                 folder_type="hook",
                 element_type="page",
@@ -107,7 +98,7 @@
 
         with self.assertRaises(ValueError, msg="Type mage not found in corpus Korpus"):
             TranskribusImporter(
-                self.job_path,
+                self.archive_path,
                 corpus="corpusid",
                 folder_type="book",
                 element_type="mage",
@@ -117,7 +108,7 @@
 
         with self.assertRaises(ValueError, msg="Type page is not a folder type"):
             TranskribusImporter(
-                self.job_path,
+                self.archive_path,
                 corpus="corpusid",
                 folder_type="page",
                 element_type="page",
@@ -127,7 +118,7 @@
 
         with self.assertRaises(ValueError, msg="Type book should not be a folder type"):
             TranskribusImporter(
-                self.job_path,
+                self.archive_path,
                 corpus="corpusid",
                 folder_type="book",
                 element_type="book",
@@ -139,7 +130,7 @@
             ValueError, msg="Type paragraf not found in corpus Korpus"
         ):
             TranskribusImporter(
-                self.job_path,
+                self.archive_path,
                 corpus="corpusid",
                 folder_type="book",
                 element_type="page",
@@ -149,7 +140,7 @@
 
         with self.assertRaises(ValueError, msg="Type book should not be a folder type"):
             TranskribusImporter(
-                self.job_path,
+                self.archive_path,
                 corpus="corpusid",
                 folder_type="book",
                 element_type="page",
@@ -159,7 +150,7 @@
 
         with self.assertRaises(ValueError, msg="Type lin not found in corpus Korpus"):
             TranskribusImporter(
-                self.job_path,
+                self.archive_path,
                 corpus="corpusid",
                 folder_type="book",
                 element_type="page",
@@ -169,7 +160,7 @@ class TestTranskribusImporter(TestCase):
 
         with self.assertRaises(ValueError, msg="Type book should not be a folder type"):
             TranskribusImporter(
-                self.job_path,
+                self.archive_path,
                 corpus="corpusid",
                 folder_type="book",
                 element_type="page",
@@ -178,7 +169,7 @@ class TestTranskribusImporter(TestCase):
             )
 
         importer = TranskribusImporter(
-            self.job_path,
+            self.archive_path,
             corpus="corpusid",
             folder_type="book",
             element_type="page",
@@ -186,7 +177,7 @@ class TestTranskribusImporter(TestCase):
             line_type="line",
         )
 
-        self.assertEqual(importer.job_path, self.job_path)
+        self.assertEqual(importer.archive_path, self.archive_path)
         self.assertDictEqual(importer.corpus, CORPUS_INFO)
         self.assertDictEqual(
             importer.folder_type,
@@ -215,15 +206,10 @@
             },
         )
 
-    @patch("arkindex_tasks.pagexml.WORKER_RUN_ID", str(uuid4()))
-    @patch("arkindex_tasks.import_transkribus.WORKER_RUN_ID", str(uuid4()))
+    @patch("arkindex_tasks.import_files.pagexml.WORKER_RUN_ID", str(uuid4()))
+    @patch("arkindex_tasks.import_files.transkribus.WORKER_RUN_ID", str(uuid4()))
     @requests_mock.Mocker()
     def test_run(self, mock):
-        mock.get(
-            "https://transkribus.eu/export.zip",
-            headers={"Content-Length": str(self.archive_path.stat().st_size)},
-            body=self.archive_path.open("rb"),
-        )
         mock.get(
             "https://arkindex.teklia.com/api/v1/corpus/corpusid/", json=CORPUS_INFO
         )
@@ -305,20 +291,31 @@
         )
 
         importer = TranskribusImporter(
-            self.job_path,
+            self.archive_path,
             corpus="corpusid",
             folder_type="book",
             element_type="page",
             paragraph_type="paragraph",
             line_type="line",
         )
-        importer.run()
+
+        self.assertListEqual(
+            importer.run(),
+            [
+                {"id": "folderid"},
+                {
+                    "id": "elementid",
+                    "name": "new element",
+                    "corpus": {"id": "corpusid"},
+                    "zone": {"image": {"id": "imageid"}},
+                },
+            ],
+        )
 
         self.assertListEqual(
             [(req.method, req.url) for req in mock.request_history],
             [
                 ("GET", "https://arkindex.teklia.com/api/v1/corpus/corpusid/"),
-                ("GET", "https://transkribus.eu/export.zip"),
                 (
                     "GET",
                     "https://arkindex.teklia.com/api/v1/corpus/corpusid/elements/?name=document&type=book",
@@ -384,22 +381,10 @@
             ],
         )
 
-        with (get_working_dir() / "elements.json").open() as f:
-            elements = json.load(f)
-        self.assertListEqual(
-            elements,
-            [{"type": "elements", "id": "folderid"}],
-        )
-
-    @patch("arkindex_tasks.pagexml.WORKER_RUN_ID", str(uuid4()))
-    @patch("arkindex_tasks.import_transkribus.WORKER_RUN_ID", str(uuid4()))
+    @patch("arkindex_tasks.import_files.pagexml.WORKER_RUN_ID", str(uuid4()))
+    @patch("arkindex_tasks.import_files.transkribus.WORKER_RUN_ID", str(uuid4()))
     @requests_mock.Mocker()
     def test_run_subdir(self, mock):
-        mock.get(
-            "https://transkribus.eu/export.zip",
-            headers={"Content-Length": str(self.subdir_archive_path.stat().st_size)},
-            body=self.subdir_archive_path.open("rb"),
-        )
         mock.get(
             "https://arkindex.teklia.com/api/v1/corpus/corpusid/", json=CORPUS_INFO
         )
@@ -481,20 +466,31 @@
         )
 
         importer = TranskribusImporter(
-            self.job_path,
+            self.subdir_archive_path,
             corpus="corpusid",
             folder_type="book",
             element_type="page",
             paragraph_type="paragraph",
             line_type="line",
         )
-        importer.run()
+
+        self.assertListEqual(
+            importer.run(),
+            [
+                {"id": "folderid"},
+                {
+                    "id": "elementid",
+                    "name": "new element",
+                    "corpus": {"id": "corpusid"},
+                    "zone": {"image": {"id": "imageid"}},
+                },
+            ],
+        )
 
         self.assertListEqual(
             [(req.method, req.url) for req in mock.request_history],
             [
                 ("GET", "https://arkindex.teklia.com/api/v1/corpus/corpusid/"),
"https://arkindex.teklia.com/api/v1/corpus/corpusid/"), - ("GET", "https://transkribus.eu/export.zip"), ( "GET", "https://arkindex.teklia.com/api/v1/corpus/corpusid/elements/?name=document&type=book", @@ -560,45 +556,248 @@ class TestTranskribusImporter(TestCase): ], ) - with (get_working_dir() / "elements.json").open() as f: - elements = json.load(f) + @patch("arkindex_tasks.import_files.pagexml.WORKER_RUN_ID", str(uuid4())) + @patch("arkindex_tasks.import_files.transkribus.WORKER_RUN_ID", str(uuid4())) + @requests_mock.Mocker() + def test_run_parent(self, mock): + mock.get( + "https://arkindex.teklia.com/api/v1/corpus/corpusid/", json=CORPUS_INFO + ) + mock.get( + "https://arkindex.teklia.com/api/v1/element/parentid/", + json={ + "id": "parentid", + "corpus": {"id": "corpusid"}, + }, + ) + mock.get( + "https://arkindex.teklia.com/api/v1/elements/parentid/children/?name=document&type=book&recursive=true", + json={"count": 0, "number": 1, "results": []}, + ) + mock.post( + "https://arkindex.teklia.com/api/v1/image/", + json={ + "id": "imageid", + "s3_put_url": "http://s3/put", + "width": 0, + "height": 0, + }, + ) + mock.put("http://s3/put", status_code=200) + mock.patch( + "https://arkindex.teklia.com/api/v1/image/imageid/", + json={ + "id": "imageid", + "s3_put_url": None, + "width": 12558699, + "height": 12558699, + }, + ) + mock.post( + "https://arkindex.teklia.com/api/v1/elements/create/", + [ + {"json": {"id": "folderid"}}, + { + "json": { + "id": "elementid", + "name": "new element", + "corpus": {"id": "corpusid"}, + "zone": {"image": {"id": "imageid"}}, + } + }, + ], + ) + mock.post("/api/v1/element/elementid/metadata/") + mock.post( + "https://arkindex.teklia.com/api/v1/element/elementid/children/bulk/", + status_code=201, + json=[{"id": "region1"}, {"id": "region2"}], + ) + mock.post( + "https://arkindex.teklia.com/api/v1/element/region1/children/bulk/", + status_code=201, + json=[{"id": "line1"}, {"id": "line2"}, {"id": "line3"}, {"id": "line4"}], + ) + mock.post("/api/v1/element/line1/metadata/") + mock.post("/api/v1/element/line2/metadata/") + mock.post("/api/v1/element/line3/metadata/") + mock.post("/api/v1/element/line4/metadata/") + mock.post( + "https://arkindex.teklia.com/api/v1/element/region2/children/bulk/", + status_code=201, + json=[ + {"id": "line1"}, + {"id": "line2"}, + ], + ) + mock.post( + "https://arkindex.teklia.com/api/v1/transcription/bulk/", + status_code=201, + json={ + "transcriptions": [ + {"id": "transcription1"}, + {"id": "transcription2"}, + {"id": "transcription3"}, + {"id": "transcription4"}, + {"id": "transcription5"}, + {"id": "transcription6"}, + {"id": "transcription7"}, + {"id": "transcription8"}, + ] + }, + ) + + importer = TranskribusImporter( + self.archive_path, + corpus="corpusid", + parent_id="parentid", + folder_type="book", + element_type="page", + paragraph_type="paragraph", + line_type="line", + ) + self.assertListEqual( - elements, - [{"type": "elements", "id": "folderid"}], + importer.run(), + [ + {"id": "folderid"}, + { + "id": "elementid", + "name": "new element", + "corpus": {"id": "corpusid"}, + "zone": {"image": {"id": "imageid"}}, + }, + ], + ) + + self.assertListEqual( + [(req.method, req.url) for req in mock.request_history], + [ + ("GET", "https://arkindex.teklia.com/api/v1/corpus/corpusid/"), + ("GET", "https://arkindex.teklia.com/api/v1/element/parentid/"), + ( + "GET", + "https://arkindex.teklia.com/api/v1/elements/parentid/children/?name=document&recursive=True&type=book", + ), + ("POST", 
"https://arkindex.teklia.com/api/v1/elements/create/"), + ("POST", "https://arkindex.teklia.com/api/v1/image/"), + ("PUT", "http://s3/put"), + ("PATCH", "https://arkindex.teklia.com/api/v1/image/imageid/"), + ("POST", "https://arkindex.teklia.com/api/v1/elements/create/"), + ( + "POST", + "https://arkindex.teklia.com/api/v1/element/elementid/metadata/", + ), + ( + "POST", + "https://arkindex.teklia.com/api/v1/element/elementid/metadata/", + ), + ( + "POST", + "https://arkindex.teklia.com/api/v1/element/elementid/metadata/", + ), + ( + "POST", + "https://arkindex.teklia.com/api/v1/element/elementid/children/bulk/", + ), + ( + "POST", + "https://arkindex.teklia.com/api/v1/element/region1/children/bulk/", + ), + ( + "POST", + "https://arkindex.teklia.com/api/v1/element/line1/metadata/", + ), + ( + "POST", + "https://arkindex.teklia.com/api/v1/element/line2/metadata/", + ), + ( + "POST", + "https://arkindex.teklia.com/api/v1/element/line3/metadata/", + ), + ( + "POST", + "https://arkindex.teklia.com/api/v1/element/line4/metadata/", + ), + ( + "POST", + "https://arkindex.teklia.com/api/v1/element/region2/children/bulk/", + ), + ( + "POST", + "https://arkindex.teklia.com/api/v1/element/line1/metadata/", + ), + ( + "POST", + "https://arkindex.teklia.com/api/v1/element/line2/metadata/", + ), + ("POST", "https://arkindex.teklia.com/api/v1/transcription/bulk/"), + ( + "POST", + "https://arkindex.teklia.com/api/v1/element/elementid/metadata/", + ), + ], ) @requests_mock.Mocker() - @patch("arkindex_tasks.import_transkribus.WORKER_RUN_ID", str(uuid4())) - @patch("arkindex_tasks.import_transkribus.TranskribusFolder.run") + @patch("arkindex_tasks.import_files.transkribus.WORKER_RUN_ID", str(uuid4())) + @patch("arkindex_tasks.import_files.transkribus.TranskribusFolder.run") def test_run_folder_error(self, req_mock, run_mock): - req_mock.get( - "https://transkribus.eu/export.zip", - headers={"Content-Length": str(self.archive_path.stat().st_size)}, - body=self.archive_path.open("rb"), - ) req_mock.get( "https://arkindex.teklia.com/api/v1/corpus/corpusid/", json=CORPUS_INFO ) run_mock.side_effect = TypeError("Oh snap!") importer = TranskribusImporter( - self.job_path, + self.archive_path, corpus="corpusid", folder_type="book", element_type="page", paragraph_type="paragraph", line_type="line", ) - # An error occurs after the import finishes, since there are zero elements in the whole import, - # but the folder's TypeError does not cause a crash - with self.assertRaisesRegex(AssertionError, "No elements could be written"): - importer.run() + # Nothing is imported + self.assertListEqual(importer.run(), []) self.assertListEqual( [(req.method, req.url) for req in req_mock.request_history], [ ("GET", "https://arkindex.teklia.com/api/v1/corpus/corpusid/"), - ("GET", "https://transkribus.eu/export.zip"), ], ) self.assertEqual(run_mock.call_count, 1) + + @requests_mock.Mocker() + @patch("arkindex_tasks.import_files.transkribus.WORKER_RUN_ID", str(uuid4())) + def test_run_parent_wrong_corpus(self, req_mock): + req_mock.get( + "https://arkindex.teklia.com/api/v1/corpus/corpusid/", json=CORPUS_INFO + ) + req_mock.get( + "https://arkindex.teklia.com/api/v1/element/parentid/", + json={"id": "parentid", "corpus": {"id": "not_corpusid"}}, + ) + + with self.assertRaises(ValueError) as ctx: + TranskribusImporter( + self.archive_path, + corpus="corpusid", + parent_id="parentid", + folder_type="book", + element_type="page", + paragraph_type="paragraph", + line_type="line", + ) + + self.assertEqual( + ctx.exception.args, 
("Parent element parentid is not in corpus corpusid",) + ) + + self.assertListEqual( + [(req.method, req.url) for req in req_mock.request_history], + [ + ("GET", "https://arkindex.teklia.com/api/v1/corpus/corpusid/"), + ("GET", "https://arkindex.teklia.com/api/v1/element/parentid/"), + ], + ) diff --git a/tests/test_base.py b/tests/test_base.py index 869e3ef1d66262764304cc0db5547ee8a1811253..59699f048bf800a314a5e1c55575285fbd04f674 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -1,9 +1,7 @@ # -*- coding: utf-8 -*- -import json -from io import StringIO from pathlib import Path from unittest import TestCase -from unittest.mock import MagicMock, call, patch +from unittest.mock import patch import requests_mock @@ -27,32 +25,6 @@ class TestBase(TestCase): finally: arkindex_tasks.base._working_dir = None - @patch("arkindex_tasks.base.Path") - def test_dump_elements(self, path_mock): - file_mock = StringIO() - file_mock.close = MagicMock - path_mock.return_value.open.return_value = file_mock - elements = {"dog": ["Milo", "Archie", "Ollie"], "cat": ["Kung", "Fury"]} - arkindex_tasks.base.dump_elements(path="/garden/", **elements) - self.assertListEqual( - path_mock.call_args_list, [call("/garden/", "elements.json")] - ) - self.assertListEqual(path_mock.return_value.open.call_args_list, [call("w")]) - self.assertListEqual( - json.loads(file_mock.getvalue()), - [{"type": "dog", "id": dog_id} for dog_id in elements["dog"]] - + [{"type": "cat", "id": cat_id} for cat_id in elements["cat"]], - ) - - @patch("arkindex_tasks.base.Path") - def test_dump_empty_list(self, path_mock): - file_mock = StringIO() - file_mock.close = MagicMock - path_mock.return_value.open.return_value = file_mock - elements = {} - with self.assertRaisesRegex(AssertionError, "No elements could be written"): - arkindex_tasks.base.dump_elements(**elements) - def test_chunks(self): self.assertListEqual( list(arkindex_tasks.base.split_chunks([i for i in range(1, 6)], 2)), diff --git a/tests/test_export_transkribus.py b/tests/test_export_transkribus.py deleted file mode 100644 index 00233e5a537a25d1a21350e548e366eda3f0fb03..0000000000000000000000000000000000000000 --- a/tests/test_export_transkribus.py +++ /dev/null @@ -1,83 +0,0 @@ -# -*- coding: utf-8 -*- -import json -from unittest import TestCase -from unittest.mock import call, patch - -import requests_mock - -from arkindex_tasks.base import get_working_dir -from arkindex_tasks.export_transkribus import TranskribusExporter, transkribus_client - - -class TestExporter(TestCase): - @patch("transkribus.models.time.sleep", lambda duration: None) - @requests_mock.Mocker() - def test_exporter(self, mock): - job_data = { - "jobId": "999999", - "status": "FINISHED", - "description": "Done", - "progress": 1, - "totalWork": 0, - "nrOfErrors": 0, - "result": "https://transkribus.eu/export.zip", - } - mock.post( - "https://transkribus.eu/TrpServer/rest/collections/1234/export", - headers={"Content-Type": "text/plain"}, - text="999999", - ) - mock.get( - "https://transkribus.eu/TrpServer/rest/jobs/999999", - # requests_mock does not return the content-type even when using the json argument, - # but the Transkribus client uses this header to return JSON instead of bytes - headers={"Content-Type": "application/json"}, - json=job_data, - ) - - TranskribusExporter(1234).run() - - self.assertListEqual( - [(req.method, req.url) for req in mock.request_history], - [ - ( - "POST", - "https://transkribus.eu/TrpServer/rest/collections/1234/export", - ), - ("GET", 
"https://transkribus.eu/TrpServer/rest/jobs/999999"), - ], - ) - - self.assertEqual( - json.dumps(job_data), - (get_working_dir() / "transkribus_export_job.json").read_text(), - ) - - @requests_mock.Mocker() - @patch("transkribus.models.Job.wait_for_result") - def test_exporter_timeout_default(self, mock, result_mock): - mock.post( - "https://transkribus.eu/TrpServer/rest/collections/1234/export", - headers={"Content-Type": "text/plain"}, - text="999999", - ) - - TranskribusExporter(1234).run() - - self.assertEqual(result_mock.call_count, 1) - self.assertEqual(result_mock.call_args, call(transkribus_client, timeout=7200)) - - @requests_mock.Mocker() - @patch("transkribus.models.Job.wait_for_result") - @patch.dict("os.environ", {"TRANSKRIBUS_JOB_TIMEOUT": "9999"}) - def test_exporter_timeout_from_env(self, mock, result_mock): - mock.post( - "https://transkribus.eu/TrpServer/rest/collections/1234/export", - headers={"Content-Type": "text/plain"}, - text="999999", - ) - - TranskribusExporter(1234).run() - - self.assertEqual(result_mock.call_count, 1) - self.assertEqual(result_mock.call_args, call(transkribus_client, timeout=9999))