Skip to content
Snippets Groups Projects
Commit ae518f61, authored by Erwan Rouchet and committed by Bastien Abadie
Browse files

Remove thumbnail generation task

parent 51d01801
No related branches found
No related tags found
1 merge request: !384 "Remove thumbnail generation task"
Pipeline #168134 passed
# -*- coding: utf-8 -*-
import argparse
import json
import logging
import sys
from io import BytesIO
from pathlib import Path
import requests
from apistar.exceptions import ErrorResponse
from PIL import Image
from requests.exceptions import RequestException
from teklia_toolbox.requests import should_verify_cert
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed
from arkindex_tasks import default_client
# Module-level logger, named after this module so its records can be filtered by origin
logger = logging.getLogger(__name__)
class ThumbnailGenerator(object):
    """
    Build and upload a thumbnail for every folder element listed in a JSON file.

    For each folder, up to three images are downloaded from the elements it holds,
    assembled side by side into a single picture, and sent to the folder's
    ``thumbnail_put_url``.
    """

    def __init__(self, path):
        """
        Load the list of elements to process.

        :param path: ``pathlib.Path`` to a JSON file holding a list of elements.
        :raises AssertionError: When the path is not a file, or when the JSON
            document is not a list.
        """
        assert path.is_file(), "Path does not exist"
        with path.open() as f:
            data = json.load(f)
        assert isinstance(data, list), "File should hold a list of elements"
        self.elements = data
        # Filled by run(); initialised here so that get_folder can be called
        # on its own without failing on a missing attribute
        self.corpora = []

    def get_folder(self, element):
        """
        Retrieve an element from an item of elements.json and only return the element if it is a folder.

        :param element: Dict holding at least an ``id`` key.
        :returns: The retrieved element, or None when the element does not exist,
            when its corpus or type cannot be found in ``self.corpora``,
            or when its type is not a folder type.
        :raises AssertionError: When the item has no ``id`` key.
        :raises ErrorResponse: For any API error other than a 404.
        """
        assert "id" in element, "Missing element ID"
        element_id = element["id"]
        try:
            element = default_client.request("RetrieveElement", id=element_id)
        except ErrorResponse as e:
            if e.status_code == 404:
                logger.warning("Element {!s} not found".format(element_id))
                return
            raise

        # Find the element's type attributes.
        # next() gets an explicit default: a bare StopIteration raised from here
        # would be swallowed by any iterator driving this method (such as map),
        # silently ending the processing of all remaining elements.
        corpus_id = element["corpus"]["id"]
        corpus = next(
            (corpus for corpus in self.corpora if corpus["id"] == corpus_id),
            None,
        )
        if corpus is None:
            logger.warning(
                "Corpus {!s} of element {!s} not found".format(corpus_id, element_id)
            )
            return

        element_type = next(
            (t for t in corpus["types"] if t["slug"] == element["type"]),
            None,
        )
        if element_type is None:
            logger.warning(
                "Type {!s} of element {!s} not found in corpus {!s}".format(
                    element["type"], element_id, corpus_id
                )
            )
            return

        if not element_type["folder"]:
            return
        return element

    def open_image(self, base_url, image_width, max_width=300):
        """
        Open a IIIF image with Pillow, resized to a maximum width, using a base URL

        :param base_url: Base URL of the image, without any IIIF parameters.
        :param image_width: Width of the source image; the requested width never exceeds it.
        :param max_width: Upper bound for the requested width.
        :returns: A Pillow image.
        :raises requests.exceptions.RequestException: When the download fails
            or the server answers with an error status.
        """
        max_width = min(max_width, image_width)
        url = f"{base_url}/full/{max_width},/0/default.jpg"
        # PIL.Image.open requires the seek(int) method that the urllib responses do not provide
        # We therefore get the whole response content and put it in a BytesIO
        resp = requests.get(url, timeout=(30, 60), verify=should_verify_cert(url))
        resp.raise_for_status()
        return Image.open(BytesIO(resp.content))

    def _list_folder_elements(self, folder_id):
        """
        List all elements in a folder that could have images and could be used as thumbnails.

        This is a generator: the recursive listing is only requested once the
        caller has consumed the whole first non-recursive page.
        """
        # First try with the first page of results returned by a non-recursive list, much faster than recursive
        first_page = default_client.request(
            "ListElementChildren", id=folder_id, folder=False, with_zone=True
        )["results"]
        yield from first_page

        # Fallback to going through all of the non-folder elements recursively.
        # If there were no results at all, we do not even try to make the other query,
        # since we already know there will be no results on this one too.
        if first_page:
            yield from default_client.paginate(
                "ListElementChildren",
                id=folder_id,
                folder=False,
                with_zone=True,
                recursive=True,
            )

    def get_first_images(self, folder_id, n=3, width=900, height=400):
        """
        Retrieve the images corresponding to the n first elements with
        a zone inside a folder

        :param folder_id: ID of the folder to list elements from.
        :param n: Maximum number of images to return.
        :param width: Maximum width requested for each image.
        :param height: Unused by this method; kept for interface compatibility.
        :returns: A list of at most ``n`` Pillow images, one per distinct image URL.
        """
        elements = self._list_folder_elements(folder_id)
        # Every URL already attempted, whether it succeeded or not
        seen_urls = set()
        images = []

        # The length is checked before pulling the next element, so that no extra
        # API page is requested once enough images have been found
        while len(images) < n:
            try:
                element = next(elements)
            except StopIteration:
                break

            zone = element.get("zone")
            if not zone or zone["image"]["url"] in seen_urls:
                # Skip elements with no image or that were already retrieved
                continue

            image_url = zone["image"]["url"]
            # Store the URL before trying, to not retry this image again
            # even if it fails and is found on another element
            seen_urls.add(image_url)

            try:
                image = self.open_image(
                    image_url, zone["image"]["width"], max_width=width
                )
            except (RequestException, OSError) as e:
                # Skip elements with a non valid image: either the request failed,
                # or Pillow could not identify the downloaded payload as an image
                logger.warning(
                    f'Image with URL "{image_url}" returned an error: {e}'
                )
                continue

            images.append(image)

        return images

    def generate_thumbnail(self, images, width=900, height=400):
        """
        Generate a Pillow image holding a thumbnail from a list of Pillow images.
        The thumbnail will be divided in 'columns', one for each image.
        Each image is then resized and cropped, relative to its center, to fill the whole column.

        :param images: Non-empty list of Pillow images.
        :param width: Width of the resulting thumbnail.
        :param height: Height of the resulting thumbnail.
        :returns: A new RGB Pillow image.
        :raises ZeroDivisionError: When ``images`` is empty.
        """
        thumbnail = Image.new("RGB", (width, height))

        # Width of a single image in the thumbnail
        single_width = int(width / len(images))

        for i, image in enumerate(images):
            # Resize: use the largest of both ratios so the image covers the whole column
            image_width, image_height = image.size
            ratio = max(single_width / image_width, height / image_height)
            newsize = int(image_width * ratio), int(image_height * ratio)
            image = image.resize(newsize, Image.BICUBIC)

            # Crop around the center of the resized image
            image_width, image_height = image.size
            left = int((image_width - single_width) / 2)
            top = int((image_height - height) / 2)
            image = image.crop((left, top, left + single_width, top + height))

            # Assemble
            thumbnail.paste(image, (i * single_width, 0))

        return thumbnail

    @retry(
        reraise=True,
        retry=retry_if_exception_type(requests.RequestException),
        stop=stop_after_attempt(3),
        wait=wait_fixed(5),
    )
    def upload_thumbnail(self, thumbnail, url):
        """
        Upload a Pillow image to a S3 PUT URL

        The upload is attempted up to three times, five seconds apart, when a
        request error occurs; the last error is re-raised.
        """
        # The buffer is rebuilt on every attempt, since the whole method is retried
        b = BytesIO()
        thumbnail.save(b, format="jpeg")
        b.seek(0)
        resp = requests.put(url, data=b, verify=should_verify_cert(url))
        resp.raise_for_status()

    def run(self):
        """
        Generate and upload a thumbnail for each folder found in the elements file.

        Failures on a single folder are logged and counted without stopping the run.
        Exits the process with status 1 when at least one folder was processed
        and every processed folder failed.
        """
        self.corpora = default_client.request("ListCorpus")
        processed_count, error_count = 0, 0

        # Explicit loop rather than filter/map: an exception raised while
        # resolving a folder must surface instead of being mistaken for
        # the end of the iteration
        for item in self.elements:
            element = self.get_folder(item)
            if not element:
                continue
            processed_count += 1

            logger.info(
                "Downloading images of the first elements in folder {} ({})".format(
                    element["name"], element["id"]
                )
            )
            images = self.get_first_images(element["id"])
            if len(images) < 1:
                logger.warning(
                    "No elements with a valid image were found for folder {} ({}) - skipping generation".format(
                        element["name"], element["id"]
                    )
                )
                error_count += 1
                continue

            logger.info(
                "Generating thumbnail for folder {} ({})".format(
                    element["name"], element["id"]
                )
            )
            try:
                thumbnail = self.generate_thumbnail(images)
            except Exception as e:
                logger.warning(
                    "Thumbnail generation failed for folder {} ({}): {!s}".format(
                        element["name"], element["id"], e
                    )
                )
                error_count += 1
                continue

            logger.info(
                "Uploading thumbnail for folder {} ({})".format(
                    element["name"], element["id"]
                )
            )
            try:
                self.upload_thumbnail(thumbnail, element["thumbnail_put_url"])
            except Exception as e:
                logger.warning(
                    "Thumbnail upload failed for folder {} ({}): {!s}".format(
                        element["name"], element["id"], e
                    )
                )
                error_count += 1

        logger.info(
            "Ran on {} folders ({} failed)".format(processed_count, error_count)
        )
        if processed_count and processed_count <= error_count:
            # All folders have failed
            sys.exit(1)
def main():
    """
    Command-line entry point: parse the arguments, configure the API
    throttling delay, then run the thumbnail generation.
    """
    cli = argparse.ArgumentParser(
        description="Generate thumbnails for one or more folders"
    )
    cli.add_argument(
        "path",
        type=Path,
        help="Path to a JSON file holding a list of elements",
    )
    cli.add_argument(
        "--sleep",
        type=float,
        default=0,
        help="Throttle API requests by waiting for a given number of seconds",
    )
    options = vars(cli.parse_args())

    # The throttling delay is a client setting, not a generator argument:
    # take it out of the options before they are forwarded
    default_client.sleep_duration = options.pop("sleep")

    generator = ThumbnailGenerator(**options)
    generator.run()


if __name__ == "__main__":
    main()
# -*- coding: utf-8 -*-
import math
from PIL import ImageChops
def root_mean_square(img_a, img_b):
    """
    Compute the root-mean-square difference between two images.
    See https://effbot.org/zone/pil-comparing-images.htm

    The result is normalised by the pixel count of ``img_a``.
    """
    histogram = ImageChops.difference(img_a, img_b).histogram()
    pixel_count = float(img_a.size[0] * img_a.size[1])

    squared_total = 0
    for index, count in enumerate(histogram):
        # presumably the histogram concatenates 256 bins per band, so the
        # modulo recovers the intensity level — confirm against Pillow docs
        level = index % 256
        squared_total += count * (level ** 2)

    return math.sqrt(squared_total / pixel_count)
tests/samples/img2.jpg

18.3 KiB

tests/samples/img3.jpg

18.3 KiB

tests/samples/thumb.jpg

65.2 KiB

# -*- coding: utf-8 -*-
import json
import tempfile
from pathlib import Path
from unittest import TestCase
from unittest.mock import patch
import requests_mock
from PIL import Image, ImageChops
from arkindex_tasks.generate_thumbnails import ThumbnailGenerator
from .image_helpers import root_mean_square
# Absolute path to the "samples" directory located next to this test module
SAMPLES = Path(__file__).absolute().parent / "samples"
@patch(
    "arkindex_tasks.generate_thumbnails.ThumbnailGenerator.upload_thumbnail.retry.wait.wait_fixed",
    new=0,
)
class TestThumbnailGenerator(TestCase):
    """
    Tests for ThumbnailGenerator, with all HTTP traffic served by requests_mock.

    The class-level patch sets the fixed wait of the upload retry to zero,
    so that retried uploads do not slow the tests down.
    """

    def setUp(self):
        # Path of the temporary elements file, when a test has created one
        self.path = None

    @staticmethod
    def _sample(name):
        """
        Return the raw bytes of a sample image.
        Reading the bytes up front avoids leaving an open file handle in each mocked response.
        """
        return (SAMPLES / name).read_bytes()

    def _write_file(self, data):
        """
        Dump data as JSON into a new temporary file and return its path.
        The file is removed in tearDown.
        """
        # NamedTemporaryFile hands back a file object that the with-block closes,
        # unlike mkstemp whose raw file descriptor would stay open
        with tempfile.NamedTemporaryFile("w", delete=False) as f:
            json.dump(data, f)
        self.path = Path(f.name)
        return self.path

    def tearDown(self):
        if self.path:
            self.path.unlink()

    def test_file_exists(self):
        """A path that is not a regular file is rejected"""
        with self.assertRaisesRegex(AssertionError, "does not exist"):
            ThumbnailGenerator(Path("/dev/null"))

    def test_file_is_list(self):
        """A JSON document that is not a list is rejected"""
        self._write_file({"not": "a list"})
        with self.assertRaisesRegex(AssertionError, "should hold a list"):
            ThumbnailGenerator(self.path)

    def test_get_folder_requires_id(self):
        """Items without an ID cannot be retrieved"""
        self._write_file([{"id": "volumeid"}])
        with self.assertRaisesRegex(AssertionError, "Missing element ID"):
            ThumbnailGenerator(self.path).get_folder({"without": "an ID"})

    @requests_mock.Mocker()
    def test_get_folder_not_found(self, mock):
        """A 404 on the element is not an error: the element is just ignored"""
        mock.get("/api/v1/element/notfound/", status_code=404)
        self._write_file([{"id": "notfound"}])
        self.assertIsNone(ThumbnailGenerator(self.path).get_folder({"id": "notfound"}))

    @requests_mock.Mocker()
    def test_get_folder_not_a_folder(self, mock):
        """Elements whose type is not a folder type are ignored"""
        self._write_file([{"id": "pageid"}])
        mock.get(
            "/api/v1/element/pageid/",
            json={"id": "pageid", "type": "page", "corpus": {"id": "corpusid"}},
        )
        gen = ThumbnailGenerator(self.path)
        gen.corpora = [
            {
                "id": "corpusid",
                "types": [{"slug": "page", "display_name": "Page", "folder": False}],
            }
        ]
        self.assertIsNone(gen.get_folder({"id": "pageid"}))

    @requests_mock.Mocker()
    def test_get_folder(self, mock):
        """Folder elements are returned as retrieved from the API"""
        self._write_file([{"id": "volumeid"}])
        volume_data = {"id": "volumeid", "type": "volume", "corpus": {"id": "corpusid"}}
        mock.get("/api/v1/element/volumeid/", json=volume_data)
        gen = ThumbnailGenerator(self.path)
        gen.corpora = [
            {
                "id": "corpusid",
                "types": [{"slug": "volume", "display_name": "Volume", "folder": True}],
            }
        ]
        self.assertDictEqual(gen.get_folder({"id": "volumeid"}), volume_data)

    @requests_mock.Mocker()
    def test_get_first_images_max(self, mock):
        """Retrieval stops as soon as n images have been opened"""
        self._write_file([{"id": "volumeid"}])
        mock.get(
            "/api/v1/elements/volumeid/children/?folder=False&with_zone=True",
            # Require an exact match of the entire query string, not just a portion of it
            complete_qs=True,
            json={
                "count": 3,
                "number": 1,
                "results": [
                    {"zone": {"image": {"url": "https://url1", "width": 1000}}},
                    {"zone": {"image": {"url": "https://url2", "width": 1000}}},
                    {"zone": {"image": {"url": "https://url3", "width": 1000}}},
                ],
            },
        )
        mock.get(
            "https://url1/full/50,/0/default.jpg",
            [{"content": self._sample("img1.jpg")}],
        )
        mock.get(
            "https://url2/full/50,/0/default.jpg",
            [{"content": self._sample("img2.jpg")}],
        )
        gen = ThumbnailGenerator(self.path)
        self.assertListEqual(
            gen.get_first_images("volumeid", n=2, width=50, height=40),
            [Image.open(SAMPLES / "img1.jpg"), Image.open(SAMPLES / "img2.jpg")],
        )
        self.assertEqual(mock.call_count, 3)

    @requests_mock.Mocker()
    def test_get_first_images_deduplicates_url(self, mock):
        """An image URL shared by several elements is only downloaded once"""
        self._write_file([{"id": "volumeid"}])
        mock.get(
            "/api/v1/elements/volumeid/children/?folder=False&with_zone=True",
            complete_qs=True,
            json={
                "count": 3,
                "number": 1,
                "results": [
                    {"zone": {"image": {"url": "https://url1", "width": 1000}}},
                    {"zone": {"image": {"url": "https://url1", "width": 1000}}},
                ],
            },
        )
        mock.get(
            "/api/v1/elements/volumeid/children/?folder=False&with_zone=True&recursive=True",
            complete_qs=True,
            json={
                "count": 3,
                "number": 1,
                "results": [
                    {"zone": {"image": {"url": "https://url1", "width": 1000}}},
                    {"zone": {"image": {"url": "https://url1", "width": 1000}}},
                ],
            },
        )
        mock.get(
            "https://url1/full/50,/0/default.jpg",
            [{"content": self._sample("img1.jpg")}],
        )
        gen = ThumbnailGenerator(self.path)
        self.assertListEqual(
            gen.get_first_images("volumeid", n=2, width=50, height=40),
            [Image.open(SAMPLES / "img1.jpg")],
        )
        self.assertEqual(mock.call_count, 3)

    @requests_mock.Mocker()
    def test_get_first_images_handles_errors(self, mock):
        """
        In case of errors, element images are skipped until we got n images or no element are left
        """
        self._write_file([{"id": "volumeid"}])
        mock.get(
            "/api/v1/elements/volumeid/children/?folder=False&with_zone=True",
            complete_qs=True,
            json={
                "count": 3,
                "number": 1,
                # This should not be called, since only the first non-recursive page should be retrieved
                "next": "/nowhere",
                "results": [
                    {"zone": {"image": {"url": "https://url1", "width": 1000}}},
                    {"zone": {"image": {"url": "https://url2", "width": 1000}}},
                    {"zone": {"image": {"url": "https://url3", "width": 1000}}},
                ],
            },
        )
        mock.get(
            "/api/v1/elements/volumeid/children/?folder=False&with_zone=True&recursive=True",
            complete_qs=True,
            json={
                "count": 3,
                "number": 1,
                "next": "/api/v1/elements/volumeid/children/?folder=False&with_zone=True&recursive=True&page=2",
                "results": [
                    {"zone": {"image": {"url": "https://url1", "width": 1000}}},
                ],
            },
        )
        mock.get(
            "/api/v1/elements/volumeid/children/?folder=False&with_zone=True&recursive=True&page=2",
            complete_qs=True,
            json={
                "count": 3,
                "number": 1,
                "next": "/api/v1/elements/volumeid/children/?folder=False&with_zone=True&recursive=True&page=3",
                "results": [
                    {"zone": {"image": {"url": "https://url2", "width": 1000}}},
                ],
            },
        )
        mock.get(
            "/api/v1/elements/volumeid/children/?folder=False&with_zone=True&recursive=True&page=3",
            complete_qs=True,
            json={
                "count": 3,
                "number": 1,
                "next": None,
                "results": [
                    {"zone": {"image": {"url": "https://url3", "width": 1000}}},
                ],
            },
        )
        mock.get("https://url1/full/50,/0/default.jpg", status_code=404)
        mock.get("https://url2/full/50,/0/default.jpg", status_code=502)
        mock.get(
            "https://url3/full/50,/0/default.jpg",
            [{"content": self._sample("img3.jpg")}],
        )
        gen = ThumbnailGenerator(self.path)
        self.assertListEqual(
            gen.get_first_images("volumeid", n=2, width=50, height=40),
            [Image.open(SAMPLES / "img3.jpg")],
        )
        self.assertEqual(mock.call_count, 7)

    @requests_mock.Mocker()
    def test_open_image(self, mock):
        """The requested width is capped by the width of the source image"""
        self._write_file([{"id": "volumeid"}])
        mock.get(
            "http://someimage/full/50,/0/default.jpg",
            content=self._sample("img1.jpg"),
        )
        gen = ThumbnailGenerator(self.path)
        expected_img = Image.open(str(SAMPLES / "img1.jpg"))
        actual_img = gen.open_image(
            "http://someimage",
            image_width=50,
            max_width=100,
        )
        # See https://effbot.org/zone/pil-comparing-images.htm
        self.assertIsNone(ImageChops.difference(expected_img, actual_img).getbbox())
        self.assertEqual(mock.call_count, 1)

    @requests_mock.Mocker()
    def test_open_image_max_width(self, mock):
        """The requested width is capped by max_width for larger source images"""
        self._write_file([{"id": "volumeid"}])
        mock.get(
            "http://someimage/full/100,/0/default.jpg",
            content=self._sample("img1.jpg"),
        )
        gen = ThumbnailGenerator(self.path)
        gen.open_image(
            "http://someimage",
            image_width=1000,
            max_width=100,
        )
        self.assertEqual(mock.call_count, 1)

    @requests_mock.Mocker()
    def test_generate_thumbnail(self, mock):
        """The assembled thumbnail stays close to the reference sample"""
        self._write_file([{"id": "volumeid"}])
        images = [Image.open(SAMPLES / f"img{n}.jpg") for n in range(1, 4)]
        gen = ThumbnailGenerator(self.path)
        expected_thumbnail = Image.open(str(SAMPLES / "thumb.jpg"))
        actual_thumbnail = gen.generate_thumbnail(images)
        self.assertLessEqual(
            root_mean_square(expected_thumbnail, actual_thumbnail), 10.0
        )

    @requests_mock.Mocker()
    def test_upload_thumbnail(self, mock):
        """The upload is retried until the server accepts it"""
        self._write_file([{"id": "volumeid"}])
        mock.put(
            "http://s3/somewhere",
            [{"status_code": 502}, {"status_code": 503}, {"status_code": 200}],
        )
        gen = ThumbnailGenerator(self.path)
        gen.upload_thumbnail(
            thumbnail=Image.open(str(SAMPLES / "thumb.jpg")), url="http://s3/somewhere"
        )
        self.assertEqual(mock.call_count, 3)

    @requests_mock.Mocker()
    def test_run(self, mock):
        """Full run: only the folder gets a thumbnail, and requests happen in the expected order"""
        self._write_file([{"id": "notfound"}, {"id": "pageid"}, {"id": "volumeid"}])
        mock.get(
            "/api/v1/corpus/",
            json=[
                {
                    "id": "corpusid",
                    "types": [
                        {"slug": "volume", "display_name": "Volume", "folder": True},
                        {"slug": "page", "display_name": "Page", "folder": False},
                    ],
                }
            ],
        )
        mock.get("/api/v1/element/notfound/", status_code=404)
        mock.get(
            "/api/v1/element/pageid/",
            json={"id": "pageid", "type": "page", "corpus": {"id": "corpusid"}},
        )
        mock.get(
            "/api/v1/element/volumeid/",
            json={
                "id": "volumeid",
                "name": "Some volume",
                "type": "volume",
                "corpus": {"id": "corpusid"},
                "thumbnail_put_url": "http://s3/somewhere",
            },
        )
        mock.get(
            "/api/v1/elements/volumeid/children/?folder=False&with_zone=True",
            complete_qs=True,
            json={
                "count": 2,
                "number": 1,
                "results": [
                    {
                        "zone": {
                            "image": {"url": "http://url1", "width": 500},
                        }
                    },
                    {
                        "zone": {
                            "image": {"url": "http://url2", "width": 750},
                        }
                    },
                ],
            },
        )
        mock.get(
            "/api/v1/elements/volumeid/children/?folder=False&with_zone=True&recursive=True",
            complete_qs=True,
            json={
                "count": 4,
                "number": 1,
                "results": [
                    {
                        "zone": {
                            "image": {"url": "http://url1", "width": 500},
                        }
                    },
                    {
                        "zone": {
                            "image": {"url": "http://url2", "width": 750},
                        }
                    },
                    {
                        "zone": {
                            "image": {"url": "http://url3", "width": 1000},
                        }
                    },
                    {
                        "zone": {
                            "image": {"url": "http://url4", "width": 1000},
                        }
                    },
                ],
            },
        )
        mock.get(
            "http://url1/full/500,/0/default.jpg",
            content=self._sample("img1.jpg"),
        )
        mock.get(
            "http://url2/full/750,/0/default.jpg",
            content=self._sample("img2.jpg"),
        )
        mock.get(
            "http://url3/full/900,/0/default.jpg",
            content=self._sample("img3.jpg"),
        )
        mock.put("http://s3/somewhere", [{"status_code": 502}, {"status_code": 200}])
        gen = ThumbnailGenerator(self.path)
        gen.run()
        self.assertListEqual(
            [(req.method, req.url) for req in mock.request_history],
            [
                ("GET", "https://arkindex.teklia.com/api/v1/corpus/"),
                ("GET", "https://arkindex.teklia.com/api/v1/element/notfound/"),
                ("GET", "https://arkindex.teklia.com/api/v1/element/pageid/"),
                ("GET", "https://arkindex.teklia.com/api/v1/element/volumeid/"),
                (
                    "GET",
                    "https://arkindex.teklia.com/api/v1/elements/volumeid/children/?folder=False&with_zone=True",
                ),
                ("GET", "http://url1/full/500,/0/default.jpg"),
                ("GET", "http://url2/full/750,/0/default.jpg"),
                (
                    "GET",
                    "https://arkindex.teklia.com/api/v1/elements/volumeid/children/?folder=False&recursive=True&with_zone=True",
                ),
                ("GET", "http://url3/full/900,/0/default.jpg"),
                ("PUT", "http://s3/somewhere"),
                ("PUT", "http://s3/somewhere"),
            ],
        )
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment