diff --git a/arkindex_tasks/generate_thumbnails.py b/arkindex_tasks/generate_thumbnails.py deleted file mode 100644 index 4a0650744817718f5daa10afadcc922367202a96..0000000000000000000000000000000000000000 --- a/arkindex_tasks/generate_thumbnails.py +++ /dev/null @@ -1,249 +0,0 @@ -# -*- coding: utf-8 -*- -import argparse -import json -import logging -import sys -from io import BytesIO -from pathlib import Path - -import requests -from apistar.exceptions import ErrorResponse -from PIL import Image -from requests.exceptions import RequestException -from teklia_toolbox.requests import should_verify_cert -from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed - -from arkindex_tasks import default_client - -logger = logging.getLogger(__name__) - - -class ThumbnailGenerator(object): - def __init__(self, path): - assert path.is_file(), "Path does not exist" - with path.open() as f: - data = json.load(f) - assert isinstance(data, list), "File should hold a list of elements" - self.elements = data - - def get_folder(self, element): - """ - Retrieve an element from an item of elements.json and only return the element if it is a folder. - """ - assert "id" in element, "Missing element ID" - - try: - element = default_client.request("RetrieveElement", id=element["id"]) - except ErrorResponse as e: - if e.status_code == 404: - logger.warning("Element {!s} not found".format(element["id"])) - return - raise - - # Find the element's type attributes - corpus = next( - corpus for corpus in self.corpora if corpus["id"] == element["corpus"]["id"] - ) - element_type = next(t for t in corpus["types"] if t["slug"] == element["type"]) - - if not element_type["folder"]: - return - return element - - def open_image(self, base_url, image_width, max_width=300): - """ - Open a IIIF image with Pillow, resized to a maximum width, using a base URL - """ - max_width = min(max_width, image_width) - - url = f"{base_url}/full/{max_width},/0/default.jpg" - - # PIL.Image.open requires the seek(int) method that the urllib responses do not provide - # We therefore get the whole response content and put it in a BytesIO - resp = requests.get(url, timeout=(30, 60), verify=should_verify_cert(url)) - resp.raise_for_status() - return Image.open(BytesIO(resp.content)) - - def _list_folder_elements(self, folder_id): - """ - List all elements in a folder that could have images and could be used as thumbnails. - """ - # First try with the first page of results returned by a non-recursive list, much faster than recursive - first_page = default_client.request( - "ListElementChildren", id=folder_id, folder=False, with_zone=True - )["results"] - yield from first_page - - # Fallback to going through all of the non-folder elements recursively. - # If there were no results at all, we do not even try to make the other query, - # since we already know there will be no results on this one too. - if first_page: - yield from default_client.paginate( - "ListElementChildren", - id=folder_id, - folder=False, - with_zone=True, - recursive=True, - ) - - def get_first_images(self, folder_id, n=3, width=900, height=400): - """ - Retrieve the images corresponding to the n first elements with - a zone inside a folder - """ - elements = self._list_folder_elements(folder_id) - stored_urls = [] - images = [] - while len(images) < n: - try: - element = next(elements) - except StopIteration: - break - - zone = element.get("zone") - if not zone or zone["image"]["url"] in stored_urls: - # Skip elements with no image or that were already retrieved - continue - try: - image = self.open_image( - zone["image"]["url"], zone["image"]["width"], max_width=width - ) - except RequestException as e: - # Skip elements with a non valid image - logger.warning( - f"""Image with URL "{zone['image']['url']}" returned an error: {e}""" - ) - # Store the URL anyway to not retry this image again, even if it is found on another element - stored_urls.append(zone["image"]["url"]) - continue - - stored_urls.append(zone["image"]["url"]) - images.append(image) - return images - - def generate_thumbnail(self, images, width=900, height=400): - """ - Generate a Pillow image holding a thumbnail from a list of Pillow images. - The thumbnail will be divided in 'columns', one for each image. - Each image is then resized and cropped, relative to its center, to fill the whole column. - """ - thumbnail = Image.new("RGB", (width, height)) - # Width of a single image in the thumbnail - single_width = int(width / len(images)) - - for i, image in enumerate(images): - - # Resize - image_width, image_height = image.size - ratio = max(single_width / image_width, height / image_height) - newsize = int(image_width * ratio), int(image_height * ratio) - image = image.resize(newsize, Image.BICUBIC) - - # Crop - image_width, image_height = image.size - left = int((image_width - single_width) / 2) - top = int((image_height - height) / 2) - image = image.crop((left, top, left + single_width, top + height)) - - # Assemble - thumbnail.paste(image, (i * single_width, 0)) - - return thumbnail - - @retry( - reraise=True, - retry=retry_if_exception_type(requests.RequestException), - stop=stop_after_attempt(3), - wait=wait_fixed(5), - ) - def upload_thumbnail(self, thumbnail, url): - """ - Upload a Pillow image to a S3 PUT URL - """ - b = BytesIO() - thumbnail.save(b, format="jpeg") - b.seek(0) - resp = requests.put(url, data=b, verify=should_verify_cert(url)) - resp.raise_for_status() - - def run(self): - self.corpora = default_client.request("ListCorpus") - - processed_count, error_count = 0, 0 - for element in filter(None, map(self.get_folder, self.elements)): - processed_count += 1 - - logger.info( - "Downloading images of the first elements in folder {} ({})".format( - element["name"], element["id"] - ) - ) - images = self.get_first_images(element["id"]) - if len(images) < 1: - logger.warning( - "No elements with a valid image were found for folder {} ({}) - skipping generation".format( - element["name"], element["id"] - ) - ) - error_count += 1 - continue - - logger.info( - "Generating thumbnail for folder {} ({})".format( - element["name"], element["id"] - ) - ) - try: - thumbnail = self.generate_thumbnail(images) - except Exception as e: - logger.warning( - "Thumbnail generation failed for folder {} ({}): {!s}".format( - element["name"], element["id"], e - ) - ) - error_count += 1 - continue - - logger.info( - "Uploading thumbnail for folder {} ({})".format( - element["name"], element["id"] - ) - ) - try: - self.upload_thumbnail(thumbnail, element["thumbnail_put_url"]) - except Exception as e: - logger.warning( - "Thumbnail upload failed for folder {} ({}): {!s}".format( - element["name"], element["id"], e - ) - ) - error_count += 1 - - logger.info( - "Ran on {} folders ({} failed)".format(processed_count, error_count) - ) - if processed_count and processed_count <= error_count: - # All folders have failed - sys.exit(1) - - -def main(): - parser = argparse.ArgumentParser( - description="Generate thumbnails for one or more folders" - ) - parser.add_argument( - "path", help="Path to a JSON file holding a list of elements", type=Path - ) - parser.add_argument( - "--sleep", - help="Throttle API requests by waiting for a given number of seconds", - type=float, - default=0, - ) - args = vars(parser.parse_args()) - default_client.sleep_duration = args.pop("sleep") - ThumbnailGenerator(**args).run() - - -if __name__ == "__main__": - main() diff --git a/tests/image_helpers.py b/tests/image_helpers.py deleted file mode 100644 index fafade74db75ec10f5be67a5f606ace6ebb12c2d..0000000000000000000000000000000000000000 --- a/tests/image_helpers.py +++ /dev/null @@ -1,17 +0,0 @@ -# -*- coding: utf-8 -*- -import math - -from PIL import ImageChops - - -def root_mean_square(img_a, img_b): - """ - Get the root-mean-square difference between two images - See https://effbot.org/zone/pil-comparing-images.htm - """ - - h = ImageChops.difference(img_a, img_b).histogram() - return math.sqrt( - sum((value * ((idx % 256) ** 2) for idx, value in enumerate(h))) - / float(img_a.size[0] * img_a.size[1]) - ) diff --git a/tests/samples/img2.jpg b/tests/samples/img2.jpg deleted file mode 100644 index 000c8ccf3e4c15d93daa63d947b1d03322b30e2e..0000000000000000000000000000000000000000 Binary files a/tests/samples/img2.jpg and /dev/null differ diff --git a/tests/samples/img3.jpg b/tests/samples/img3.jpg deleted file mode 100644 index 454e11cceaa8b309825bef6ae86e8cda21741e8c..0000000000000000000000000000000000000000 Binary files a/tests/samples/img3.jpg and /dev/null differ diff --git a/tests/samples/thumb.jpg b/tests/samples/thumb.jpg deleted file mode 100644 index e3e968877722c315a9280bf117f05659f5d18101..0000000000000000000000000000000000000000 Binary files a/tests/samples/thumb.jpg and /dev/null differ diff --git a/tests/test_generate_thumbnails.py b/tests/test_generate_thumbnails.py deleted file mode 100644 index 57ffde5b5d3eb219d79ee9f59acf0a608ab7bdd5..0000000000000000000000000000000000000000 --- a/tests/test_generate_thumbnails.py +++ /dev/null @@ -1,406 +0,0 @@ -# -*- coding: utf-8 -*- -import json -import tempfile -from pathlib import Path -from unittest import TestCase -from unittest.mock import patch - -import requests_mock -from PIL import Image, ImageChops - -from arkindex_tasks.generate_thumbnails import ThumbnailGenerator - -from .image_helpers import root_mean_square - -SAMPLES = Path(__file__).absolute().parent / "samples" - - -@patch( - "arkindex_tasks.generate_thumbnails.ThumbnailGenerator.upload_thumbnail.retry.wait.wait_fixed", - new=0, -) -class TestThumbnailGenerator(TestCase): - def setUp(self): - self.path = None - - def _write_file(self, data): - self.path = Path(tempfile.mkstemp()[1]) - with self.path.open("w") as f: - json.dump(data, f) - return self.path - - def tearDown(self): - if self.path: - self.path.unlink() - - def test_file_exists(self): - with self.assertRaisesRegex(AssertionError, "does not exist"): - ThumbnailGenerator(Path("/dev/null")) - - def test_file_is_list(self): - self._write_file({"not": "a list"}) - with self.assertRaisesRegex(AssertionError, "should hold a list"): - ThumbnailGenerator(self.path) - - def test_get_folder_requires_id(self): - self._write_file([{"id": "volumeid"}]) - with self.assertRaisesRegex(AssertionError, "Missing element ID"): - ThumbnailGenerator(self.path).get_folder({"without": "an ID"}) - - @requests_mock.Mocker() - def test_get_folder_not_found(self, mock): - mock.get("/api/v1/element/notfound/", status_code=404) - self._write_file([{"id": "notfound"}]) - self.assertIsNone(ThumbnailGenerator(self.path).get_folder({"id": "notfound"})) - - @requests_mock.Mocker() - def test_get_folder_not_a_folder(self, mock): - self._write_file([{"id": "pageid"}]) - mock.get( - "/api/v1/element/pageid/", - json={"id": "pageid", "type": "page", "corpus": {"id": "corpusid"}}, - ) - gen = ThumbnailGenerator(self.path) - gen.corpora = [ - { - "id": "corpusid", - "types": [{"slug": "page", "display_name": "Page", "folder": False}], - } - ] - self.assertIsNone(gen.get_folder({"id": "pageid"})) - - @requests_mock.Mocker() - def test_get_folder(self, mock): - self._write_file([{"id": "volumeid"}]) - volume_data = {"id": "volumeid", "type": "volume", "corpus": {"id": "corpusid"}} - mock.get("/api/v1/element/volumeid/", json=volume_data) - gen = ThumbnailGenerator(self.path) - gen.corpora = [ - { - "id": "corpusid", - "types": [{"slug": "volume", "display_name": "Volume", "folder": True}], - } - ] - self.assertDictEqual(gen.get_folder({"id": "volumeid"}), volume_data) - - @requests_mock.Mocker() - def test_get_first_images_max(self, mock): - self._write_file([{"id": "volumeid"}]) - mock.get( - "/api/v1/elements/volumeid/children/?folder=False&with_zone=True", - # Require an exact match of the entire query string, not just a portion of it - complete_qs=True, - json={ - "count": 3, - "number": 1, - "results": [ - {"zone": {"image": {"url": "https://url1", "width": 1000}}}, - {"zone": {"image": {"url": "https://url2", "width": 1000}}}, - {"zone": {"image": {"url": "https://url3", "width": 1000}}}, - ], - }, - ) - mock.get( - "https://url1/full/50,/0/default.jpg", - [{"body": open(SAMPLES / "img1.jpg", "rb")}], - ) - mock.get( - "https://url2/full/50,/0/default.jpg", - [{"body": open(SAMPLES / "img2.jpg", "rb")}], - ) - gen = ThumbnailGenerator(self.path) - self.assertListEqual( - gen.get_first_images("volumeid", n=2, width=50, height=40), - [Image.open(SAMPLES / "img1.jpg"), Image.open(SAMPLES / "img2.jpg")], - ) - self.assertEqual(mock.call_count, 3) - - @requests_mock.Mocker() - def test_get_first_images_deduplicates_url(self, mock): - self._write_file([{"id": "volumeid"}]) - mock.get( - "/api/v1/elements/volumeid/children/?folder=False&with_zone=True", - complete_qs=True, - json={ - "count": 3, - "number": 1, - "results": [ - {"zone": {"image": {"url": "https://url1", "width": 1000}}}, - {"zone": {"image": {"url": "https://url1", "width": 1000}}}, - ], - }, - ) - mock.get( - "/api/v1/elements/volumeid/children/?folder=False&with_zone=True&recursive=True", - complete_qs=True, - json={ - "count": 3, - "number": 1, - "results": [ - {"zone": {"image": {"url": "https://url1", "width": 1000}}}, - {"zone": {"image": {"url": "https://url1", "width": 1000}}}, - ], - }, - ) - mock.get( - "https://url1/full/50,/0/default.jpg", - [{"body": open(SAMPLES / "img1.jpg", "rb")}], - ) - gen = ThumbnailGenerator(self.path) - self.assertListEqual( - gen.get_first_images("volumeid", n=2, width=50, height=40), - [Image.open(SAMPLES / "img1.jpg")], - ) - self.assertEqual(mock.call_count, 3) - - @requests_mock.Mocker() - def test_get_first_images_handles_errors(self, mock): - """ - In case of errors, element images are skipped until we got n images or no element are left - """ - self._write_file([{"id": "volumeid"}]) - mock.get( - "/api/v1/elements/volumeid/children/?folder=False&with_zone=True", - complete_qs=True, - json={ - "count": 3, - "number": 1, - # This should not be called, since only the first non-recursive page should be retrieved - "next": "/nowhere", - "results": [ - {"zone": {"image": {"url": "https://url1", "width": 1000}}}, - {"zone": {"image": {"url": "https://url2", "width": 1000}}}, - {"zone": {"image": {"url": "https://url3", "width": 1000}}}, - ], - }, - ) - mock.get( - "/api/v1/elements/volumeid/children/?folder=False&with_zone=True&recursive=True", - complete_qs=True, - json={ - "count": 3, - "number": 1, - "next": "/api/v1/elements/volumeid/children/?folder=False&with_zone=True&recursive=True&page=2", - "results": [ - {"zone": {"image": {"url": "https://url1", "width": 1000}}}, - ], - }, - ) - mock.get( - "/api/v1/elements/volumeid/children/?folder=False&with_zone=True&recursive=True&page=2", - complete_qs=True, - json={ - "count": 3, - "number": 1, - "next": "/api/v1/elements/volumeid/children/?folder=False&with_zone=True&recursive=True&page=3", - "results": [ - {"zone": {"image": {"url": "https://url2", "width": 1000}}}, - ], - }, - ) - mock.get( - "/api/v1/elements/volumeid/children/?folder=False&with_zone=True&recursive=True&page=3", - complete_qs=True, - json={ - "count": 3, - "number": 1, - "next": None, - "results": [ - {"zone": {"image": {"url": "https://url3", "width": 1000}}}, - ], - }, - ) - mock.get("https://url1/full/50,/0/default.jpg", status_code=404) - mock.get("https://url2/full/50,/0/default.jpg", status_code=502) - mock.get( - "https://url3/full/50,/0/default.jpg", - [{"body": open(SAMPLES / "img3.jpg", "rb")}], - ) - gen = ThumbnailGenerator(self.path) - self.assertListEqual( - gen.get_first_images("volumeid", n=2, width=50, height=40), - [Image.open(SAMPLES / "img3.jpg")], - ) - self.assertEqual(mock.call_count, 7) - - @requests_mock.Mocker() - def test_open_image(self, mock): - self._write_file([{"id": "volumeid"}]) - mock.get( - "http://someimage/full/50,/0/default.jpg", - body=open(SAMPLES / "img1.jpg", "rb"), - ) - gen = ThumbnailGenerator(self.path) - - expected_img = Image.open(str(SAMPLES / "img1.jpg")) - actual_img = gen.open_image( - "http://someimage", - image_width=50, - max_width=100, - ) - - # See https://effbot.org/zone/pil-comparing-images.htm - self.assertIsNone(ImageChops.difference(expected_img, actual_img).getbbox()) - self.assertEqual(mock.call_count, 1) - - @requests_mock.Mocker() - def test_open_image_max_width(self, mock): - self._write_file([{"id": "volumeid"}]) - mock.get( - "http://someimage/full/100,/0/default.jpg", - body=open(SAMPLES / "img1.jpg", "rb"), - ) - gen = ThumbnailGenerator(self.path) - gen.open_image( - "http://someimage", - image_width=1000, - max_width=100, - ) - self.assertEqual(mock.call_count, 1) - - @requests_mock.Mocker() - def test_generate_thumbnail(self, mock): - self._write_file([{"id": "volumeid"}]) - images = [Image.open(SAMPLES / f"img{n}.jpg") for n in range(1, 4)] - - gen = ThumbnailGenerator(self.path) - - expected_thumbnail = Image.open(str(SAMPLES / "thumb.jpg")) - actual_thumbnail = gen.generate_thumbnail(images) - - self.assertLessEqual( - root_mean_square(expected_thumbnail, actual_thumbnail), 10.0 - ) - - @requests_mock.Mocker() - def test_upload_thumbnail(self, mock): - self._write_file([{"id": "volumeid"}]) - mock.put( - "http://s3/somewhere", - [{"status_code": 502}, {"status_code": 503}, {"status_code": 200}], - ) - gen = ThumbnailGenerator(self.path) - gen.upload_thumbnail( - thumbnail=Image.open(str(SAMPLES / "thumb.jpg")), url="http://s3/somewhere" - ) - self.assertEqual(mock.call_count, 3) - - @requests_mock.Mocker() - def test_run(self, mock): - self._write_file([{"id": "notfound"}, {"id": "pageid"}, {"id": "volumeid"}]) - mock.get( - "/api/v1/corpus/", - json=[ - { - "id": "corpusid", - "types": [ - {"slug": "volume", "display_name": "Volume", "folder": True}, - {"slug": "page", "display_name": "Page", "folder": False}, - ], - } - ], - ) - mock.get("/api/v1/element/notfound/", status_code=404) - mock.get( - "/api/v1/element/pageid/", - json={"id": "pageid", "type": "page", "corpus": {"id": "corpusid"}}, - ) - mock.get( - "/api/v1/element/volumeid/", - json={ - "id": "volumeid", - "name": "Some volume", - "type": "volume", - "corpus": {"id": "corpusid"}, - "thumbnail_put_url": "http://s3/somewhere", - }, - ) - mock.get( - "/api/v1/elements/volumeid/children/?folder=False&with_zone=True", - complete_qs=True, - json={ - "count": 2, - "number": 1, - "results": [ - { - "zone": { - "image": {"url": "http://url1", "width": 500}, - } - }, - { - "zone": { - "image": {"url": "http://url2", "width": 750}, - } - }, - ], - }, - ) - mock.get( - "/api/v1/elements/volumeid/children/?folder=False&with_zone=True&recursive=True", - complete_qs=True, - json={ - "count": 4, - "number": 1, - "results": [ - { - "zone": { - "image": {"url": "http://url1", "width": 500}, - } - }, - { - "zone": { - "image": {"url": "http://url2", "width": 750}, - } - }, - { - "zone": { - "image": {"url": "http://url3", "width": 1000}, - } - }, - { - "zone": { - "image": {"url": "http://url4", "width": 1000}, - } - }, - ], - }, - ) - mock.get( - "http://url1/full/500,/0/default.jpg", - body=open(SAMPLES / "img1.jpg", "rb"), - ) - mock.get( - "http://url2/full/750,/0/default.jpg", - body=open(SAMPLES / "img2.jpg", "rb"), - ) - mock.get( - "http://url3/full/900,/0/default.jpg", - body=open(SAMPLES / "img3.jpg", "rb"), - ) - mock.put("http://s3/somewhere", [{"status_code": 502}, {"status_code": 200}]) - - gen = ThumbnailGenerator(self.path) - gen.run() - - self.assertListEqual( - [(req.method, req.url) for req in mock.request_history], - [ - ("GET", "https://arkindex.teklia.com/api/v1/corpus/"), - ("GET", "https://arkindex.teklia.com/api/v1/element/notfound/"), - ("GET", "https://arkindex.teklia.com/api/v1/element/pageid/"), - ("GET", "https://arkindex.teklia.com/api/v1/element/volumeid/"), - ( - "GET", - "https://arkindex.teklia.com/api/v1/elements/volumeid/children/?folder=False&with_zone=True", - ), - ("GET", "http://url1/full/500,/0/default.jpg"), - ("GET", "http://url2/full/750,/0/default.jpg"), - ( - "GET", - "https://arkindex.teklia.com/api/v1/elements/volumeid/children/?folder=False&recursive=True&with_zone=True", - ), - ("GET", "http://url3/full/900,/0/default.jpg"), - ("PUT", "http://s3/somewhere"), - ("PUT", "http://s3/somewhere"), - ], - )