diff --git a/dan/datasets/extract/utils.py b/dan/datasets/extract/utils.py
index 8d69766a4328f64566cf6cc02be09d9c9ff281bb..79e5ef6f97ace895d5103a97632b8337f20ae972 100644
--- a/dan/datasets/extract/utils.py
+++ b/dan/datasets/extract/utils.py
@@ -50,10 +50,18 @@ def download_image(url):
     Download an image and open it with Pillow
     """
     assert url.startswith("http"), "Image URL must be HTTP(S)"
 
+    # Download the image
     # Cannot use stream=True as urllib's responses do not support the seek(int) method,
     # which is explicitly required by Image.open on file-like objects
-    resp = _retried_request(url)
+    try:
+        resp = _retried_request(url)
+    except requests.HTTPError as e:
+        if "/full/" in url and 400 <= e.response.status_code < 500:
+            # Retry with max instead of full as IIIF size
+            resp = _retried_request(url.replace("/full/", "/max/"))
+        else:
+            raise e
 
     # Preprocess the image and prepare it for classification
     image = Image.open(BytesIO(resp.content)).convert("RGB")
diff --git a/tests/test_extract.py b/tests/test_extract.py
index cfd78846d19351d206e4fc689b6ac742e9c759ab..7ae10ecfb8a7967dded76c844b5a554e00075673 100644
--- a/tests/test_extract.py
+++ b/tests/test_extract.py
@@ -4,7 +4,7 @@
 import json
 import logging
 import pickle
 import re
-from operator import methodcaller
+from operator import attrgetter, methodcaller
 from typing import NamedTuple
 from unittest.mock import patch
@@ -20,6 +20,7 @@ from dan.datasets.extract.exceptions import (
 from dan.datasets.extract.extract import IIIF_FULL_SIZE, ArkindexExtractor
 from dan.datasets.extract.utils import (
     EntityType,
+    download_image,
     insert_token,
     normalize_linebreaks,
     normalize_spaces,
@@ -566,6 +567,40 @@ def test_download_image_error(iiif_url, caplog, capsys):
     assert captured.out == "deadbeef: Image URL must be HTTP(S)\n"
 
 
+def test_download_image_error_try_max(responses):
+    # An image's URL
+    url = (
+        "https://blabla.com/iiif/2/image_path.jpg/231,699,2789,3659/full/0/default.jpg"
+    )
+    fixed_url = (
+        "https://blabla.com/iiif/2/image_path.jpg/231,699,2789,3659/max/0/default.jpg"
+    )
+
+    # Fake responses error
+    responses.add(
+        responses.GET,
+        url,
+        status=400,
+    )
+    # Correct response with max
+    responses.add(
+        responses.GET,
+        fixed_url,
+        status=200,
+        body=next((FIXTURES / "prediction" / "images").iterdir()).read_bytes(),
+    )
+
+    image = download_image(url)
+
+    assert image
+    # We try 3 times with the first URL
+    # Then the first try with the new URL is successful
+    assert len(responses.calls) == 4
+    assert list(map(attrgetter("request.url"), responses.calls)) == [url] * 3 + [
+        fixed_url
+    ]
+
+
 @pytest.mark.parametrize("allow_empty", (True, False))
 def test_empty_transcription(allow_empty, mock_database):
     extractor = ArkindexExtractor(
diff --git a/tox.ini b/tox.ini
index 885c472ae0a71ddb8c9f596d48126981b831a8aa..4101490b058e18162fd68c98493af3d14e82bdc9 100644
--- a/tox.ini
+++ b/tox.ini
@@ -10,6 +10,7 @@ wheel_build_env = .pkg
 deps =
     pytest>=6
     pytest-lazy-fixture
+    pytest-responses
     -rrequirements.txt
 commands =
     pytest {tty:--color=yes} {posargs}