Skip to content
Snippets Groups Projects
Commit a267de74 authored by Yoann Schneider's avatar Yoann Schneider :tennis:
Browse files

Retry image download

parent 755cfd99
No related branches found
No related tags found
1 merge request!293Retry image download
......@@ -50,10 +50,18 @@ def download_image(url):
Download an image and open it with Pillow
"""
assert url.startswith("http"), "Image URL must be HTTP(S)"
# Download the image
# Cannot use stream=True as urllib's responses do not support the seek(int) method,
# which is explicitly required by Image.open on file-like objects
resp = _retried_request(url)
try:
resp = _retried_request(url)
except requests.HTTPError as e:
if "/full/" in url and 400 <= e.response.status_code < 500:
# Retry with max instead of full as IIIF size
resp = _retried_request(url.replace("/full/", "/max/"))
else:
raise e
# Preprocess the image and prepare it for classification
image = Image.open(BytesIO(resp.content)).convert("RGB")
......
......@@ -4,7 +4,7 @@ import json
import logging
import pickle
import re
from operator import methodcaller
from operator import attrgetter, methodcaller
from typing import NamedTuple
from unittest.mock import patch
......@@ -20,6 +20,7 @@ from dan.datasets.extract.exceptions import (
from dan.datasets.extract.extract import IIIF_FULL_SIZE, ArkindexExtractor
from dan.datasets.extract.utils import (
EntityType,
download_image,
insert_token,
normalize_linebreaks,
normalize_spaces,
......@@ -566,6 +567,40 @@ def test_download_image_error(iiif_url, caplog, capsys):
assert captured.out == "deadbeef: Image URL must be HTTP(S)\n"
def test_download_image_error_try_max(responses):
    """A 4xx on an IIIF `/full/` size URL must trigger a fallback retry with `/max/`."""
    full_size_url = (
        "https://blabla.com/iiif/2/image_path.jpg/231,699,2789,3659/full/0/default.jpg"
    )
    max_size_url = (
        "https://blabla.com/iiif/2/image_path.jpg/231,699,2789,3659/max/0/default.jpg"
    )
    # The `/full/` URL always answers with a client error
    responses.add(
        responses.GET,
        full_size_url,
        status=400,
    )
    # The `/max/` fallback answers successfully with a real image payload
    sample_image = next((FIXTURES / "prediction" / "images").iterdir())
    responses.add(
        responses.GET,
        max_size_url,
        status=200,
        body=sample_image.read_bytes(),
    )

    assert download_image(full_size_url)

    # Three retried attempts against the original URL,
    # then a single successful call against the `/max/` fallback
    assert len(responses.calls) == 4
    requested_urls = [call.request.url for call in responses.calls]
    assert requested_urls == [full_size_url] * 3 + [max_size_url]
@pytest.mark.parametrize("allow_empty", (True, False))
def test_empty_transcription(allow_empty, mock_database):
extractor = ArkindexExtractor(
......
......@@ -10,6 +10,7 @@ wheel_build_env = .pkg
deps =
pytest>=6
pytest-lazy-fixture
pytest-responses
-rrequirements.txt
commands =
pytest {tty:--color=yes} {posargs}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment