Skip to content
Snippets Groups Projects

Remove thumbnail generation task

Merged Erwan Rouchet requested to merge nuke-thumbnails into master
6 files
+ 0
672
Compare changes
  • Side-by-side
  • Inline
Files
6
+ 0
249
# -*- coding: utf-8 -*-
import argparse
import json
import logging
import sys
from io import BytesIO
from pathlib import Path
import requests
from apistar.exceptions import ErrorResponse
from PIL import Image
from requests.exceptions import RequestException
from teklia_toolbox.requests import should_verify_cert
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed
from arkindex_tasks import default_client
# Module-level logger, named after this module
logger = logging.getLogger(__name__)
class ThumbnailGenerator(object):
    """
    Generate thumbnails for the folder elements listed in a JSON file.

    For each listed element that is a folder, a few images are downloaded from the
    elements it contains, assembled side by side into a single picture, and uploaded
    to the folder's ``thumbnail_put_url``.
    """

    def __init__(self, path):
        """
        Load the list of elements to process.

        :param path: A pathlib.Path to a JSON file holding a list of elements.
        :raises AssertionError: When the path is not a file or the JSON root is not a list.
        """
        assert path.is_file(), "Path does not exist"
        # Read as UTF-8 explicitly instead of relying on the locale's default encoding
        with path.open(encoding="utf-8") as f:
            data = json.load(f)
        assert isinstance(data, list), "File should hold a list of elements"
        self.elements = data

    def get_folder(self, element):
        """
        Retrieve an element from an item of elements.json and only return the element if it is a folder.

        Requires ``self.corpora`` to be set, which ``run`` does before calling this method.

        :param element: A dict with at least an ``id`` key.
        :returns: The element as returned by the API, or None when the element was not found (HTTP 404),
           when its corpus or type cannot be found in ``self.corpora``, or when its type is not a folder type.
        :raises ErrorResponse: For any API error other than a 404.
        """
        assert "id" in element, "Missing element ID"
        element_id = element["id"]
        try:
            element = default_client.request("RetrieveElement", id=element_id)
        except ErrorResponse as e:
            if e.status_code == 404:
                logger.warning("Element {!s} not found".format(element_id))
                return None
            raise

        # Find the element's type attributes.
        # next() is given a default on purpose: a bare StopIteration raised from here would be
        # interpreted by an enclosing map()/filter()/for as the end of the iteration, and all
        # the remaining elements would be silently skipped.
        corpus_id = element["corpus"]["id"]
        corpus = next(
            (corpus for corpus in self.corpora if corpus["id"] == corpus_id), None
        )
        if corpus is None:
            logger.warning(
                "Corpus {!s} of element {!s} not found".format(corpus_id, element_id)
            )
            return None

        element_type = next(
            (t for t in corpus["types"] if t["slug"] == element["type"]), None
        )
        if element_type is None:
            logger.warning(
                "Type {!s} of element {!s} not found in corpus {!s}".format(
                    element["type"], element_id, corpus_id
                )
            )
            return None

        if not element_type["folder"]:
            return None
        return element

    def open_image(self, base_url, image_width, max_width=300):
        """
        Open a IIIF image with Pillow, resized to a maximum width, using a base URL

        :param base_url: Base URL of the image, to which the IIIF parameters are appended.
        :param image_width: Width of the image; the requested width never exceeds it.
        :param max_width: Maximum width to request.
        :returns: A Pillow image.
        :raises requests.exceptions.RequestException: When the request fails or returns an error status.
        """
        max_width = min(max_width, image_width)
        url = f"{base_url}/full/{max_width},/0/default.jpg"
        # PIL.Image.open requires the seek(int) method that the urllib responses do not provide
        # We therefore get the whole response content and put it in a BytesIO
        resp = requests.get(url, timeout=(30, 60), verify=should_verify_cert(url))
        resp.raise_for_status()
        return Image.open(BytesIO(resp.content))

    def _list_folder_elements(self, folder_id):
        """
        List all elements in a folder that could have images and could be used as thumbnails.

        This is a generator: no API request is made until it is iterated.
        """
        # First try with the first page of results returned by a non-recursive list, much faster than recursive
        first_page = default_client.request(
            "ListElementChildren", id=folder_id, folder=False, with_zone=True
        )["results"]
        yield from first_page

        # Fallback to going through all of the non-folder elements recursively.
        # If there were no results at all, we do not even try to make the other query,
        # since we already know there will be no results on this one too.
        # NOTE(review): the recursive listing presumably returns the first page's elements again;
        # get_first_images skips images it has already seen.
        if first_page:
            yield from default_client.paginate(
                "ListElementChildren",
                id=folder_id,
                folder=False,
                with_zone=True,
                recursive=True,
            )

    def get_first_images(self, folder_id, n=3, width=900, height=400):
        """
        Retrieve the images corresponding to the n first elements with
        a zone inside a folder

        :param folder_id: ID of the folder to look into.
        :param n: Maximum number of images to return.
        :param width: Maximum width requested for each image.
        :param height: Unused; kept for backwards compatibility.
        :returns: A list of at most n Pillow images, each from a distinct image URL.
        """
        images = []
        if n <= 0:
            # Nothing was asked for: do not even start listing elements
            return images

        # A set gives constant-time lookups of the URLs that were already tried
        seen_urls = set()
        for element in self._list_folder_elements(folder_id):
            zone = element.get("zone")
            if not zone:
                # Skip elements with no image
                continue
            url = zone["image"]["url"]
            if url in seen_urls:
                # Skip images that were already retrieved, or already failed
                continue
            # Store the URL in all cases to not retry this image again,
            # even if it fails and is found on another element
            seen_urls.add(url)

            try:
                image = self.open_image(url, zone["image"]["width"], max_width=width)
            except (RequestException, OSError) as e:
                # Skip elements with a non valid image: either the request failed, or
                # Pillow could not identify the response as an image (an OSError subclass)
                logger.warning(f'Image with URL "{url}" returned an error: {e}')
                continue

            images.append(image)
            if len(images) >= n:
                # Stop right away, without fetching any further element
                break
        return images

    def generate_thumbnail(self, images, width=900, height=400):
        """
        Generate a Pillow image holding a thumbnail from a list of Pillow images.
        The thumbnail will be divided in 'columns', one for each image.
        Each image is then resized and cropped, relative to its center, to fill the whole column.

        :param images: A non-empty list of Pillow images.
        :param width: Total width of the thumbnail, in pixels.
        :param height: Height of the thumbnail, in pixels.
        :returns: A new RGB Pillow image of size (width, height).
        :raises ValueError: When no images are given.
        """
        if not images:
            raise ValueError("At least one image is required to generate a thumbnail")

        thumbnail = Image.new("RGB", (width, height))
        # Width of a single image in the thumbnail
        single_width = int(width / len(images))
        for i, image in enumerate(images):
            # Resize so that the image covers the whole column
            image_width, image_height = image.size
            ratio = max(single_width / image_width, height / image_height)
            # Truncating the scaled size can leave the image one pixel short of the column,
            # which would show as a black border after cropping: never go below the target
            new_size = (
                max(single_width, int(image_width * ratio)),
                max(height, int(image_height * ratio)),
            )
            image = image.resize(new_size, Image.BICUBIC)

            # Crop around the center
            image_width, image_height = image.size
            left = (image_width - single_width) // 2
            top = (image_height - height) // 2
            image = image.crop((left, top, left + single_width, top + height))

            # Assemble
            thumbnail.paste(image, (i * single_width, 0))
        return thumbnail

    @retry(
        reraise=True,
        retry=retry_if_exception_type(requests.RequestException),
        stop=stop_after_attempt(3),
        wait=wait_fixed(5),
    )
    def upload_thumbnail(self, thumbnail, url):
        """
        Upload a Pillow image to a S3 PUT URL

        The upload is attempted up to 3 times, 5 seconds apart, on request errors.

        :param thumbnail: The Pillow image to upload, saved as JPEG.
        :param url: URL to send the PUT request to.
        :raises requests.exceptions.RequestException: When the last attempt failed.
        """
        b = BytesIO()
        thumbnail.save(b, format="jpeg")
        b.seek(0)
        # Same timeouts as in open_image: without any, a stalled connection would
        # block forever and the retry policy would never get a chance to run
        resp = requests.put(
            url, data=b, timeout=(30, 60), verify=should_verify_cert(url)
        )
        resp.raise_for_status()

    def _process_folder(self, element):
        """
        Download images, generate and upload the thumbnail of a single folder.

        :param element: A folder element, as returned by get_folder.
        :returns: True when the thumbnail was uploaded, False on any failure.
        """
        name, folder_id = element["name"], element["id"]

        logger.info(
            "Downloading images of the first elements in folder {} ({})".format(
                name, folder_id
            )
        )
        images = self.get_first_images(folder_id)
        if not images:
            logger.warning(
                "No elements with a valid image were found for folder {} ({}) - skipping generation".format(
                    name, folder_id
                )
            )
            return False

        logger.info("Generating thumbnail for folder {} ({})".format(name, folder_id))
        try:
            thumbnail = self.generate_thumbnail(images)
        except Exception as e:
            # Deliberately broad: a failure on one folder must not stop the others
            logger.warning(
                "Thumbnail generation failed for folder {} ({}): {!s}".format(
                    name, folder_id, e
                )
            )
            return False

        logger.info("Uploading thumbnail for folder {} ({})".format(name, folder_id))
        try:
            self.upload_thumbnail(thumbnail, element["thumbnail_put_url"])
        except Exception as e:
            # Deliberately broad: a failure on one folder must not stop the others
            logger.warning(
                "Thumbnail upload failed for folder {} ({}): {!s}".format(
                    name, folder_id, e
                )
            )
            return False
        return True

    def run(self):
        """
        Generate and upload a thumbnail for every folder among the loaded elements.

        Elements that are not found or are not folders are ignored.
        Exits the process with status 1 when at least one folder was processed and all of them failed.
        """
        self.corpora = default_client.request("ListCorpus")
        processed_count, error_count = 0, 0
        for item in self.elements:
            folder = self.get_folder(item)
            if not folder:
                continue
            processed_count += 1
            if not self._process_folder(folder):
                error_count += 1

        logger.info(
            "Ran on {} folders ({} failed)".format(processed_count, error_count)
        )
        if processed_count and processed_count <= error_count:
            # All folders have failed
            sys.exit(1)
def main():
    """
    Command-line entry point.

    Parses the path of the JSON file and the optional API throttling delay,
    applies the delay to the default API client, then runs the thumbnail generator.
    """
    parser = argparse.ArgumentParser(
        description="Generate thumbnails for one or more folders"
    )
    parser.add_argument(
        "path",
        type=Path,
        help="Path to a JSON file holding a list of elements",
    )
    parser.add_argument(
        "--sleep",
        type=float,
        default=0,
        help="Throttle API requests by waiting for a given number of seconds",
    )
    options = parser.parse_args()

    # The throttling delay is a setting of the API client, not of the generator
    default_client.sleep_duration = options.sleep

    generator = ThumbnailGenerator(path=options.path)
    generator.run()


# Only run when executed as a script, not when imported
if __name__ == "__main__":
    main()
Loading