Skip to content
Snippets Groups Projects
Commit aeafcb13 authored by Alexis Imbert's avatar Alexis Imbert Committed by Erwan Rouchet
Browse files

Retry HTTP requests in IIIF imports

parent bd801137
No related branches found
No related tags found
1 merge request!373Retry HTTP requests in IIIF imports
Pipeline #159534 passed
......@@ -5,11 +5,11 @@ import uuid
from io import BytesIO
from urllib.parse import urlparse
import requests
from apistar.exceptions import ErrorResponse
from arkindex_tasks import USER_AGENT
from arkindex_tasks.import_iiif.parser import IIIFParser
from arkindex_tasks.import_iiif.utils import retried_iiif_get
from arkindex_tasks.utils import default_client, retried_request
......@@ -45,12 +45,11 @@ def main():
if not parsed.netloc: # Local path
args["stream"] = open(path, "rb")
else:
resp = requests.get(
resp = retried_iiif_get(
path,
headers={"User-Agent": USER_AGENT},
timeout=30,
)
resp.raise_for_status()
args["stream"] = BytesIO(resp.content)
try:
......
......@@ -7,19 +7,17 @@ from urllib.parse import urljoin, urlparse
import ijson
import requests
from apistar.exceptions import ErrorResponse
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed
from arkindex_tasks import USER_AGENT
from arkindex_tasks.base import WORKER_RUN_ID
from arkindex_tasks.enums import MetaType
from arkindex_tasks.import_iiif.utils import retried_iiif_get
from arkindex_tasks.utils import retried_request
logger = logging.getLogger(__name__)
# Timeout for the info.json request on images
IIIF_INFO_TIMEOUT = 15
# Time to wait before retrying the IIIF image information fetching
IIIF_INFO_RETRY_BACKOFF = 10
def to_primitive(data, many=False):
......@@ -315,20 +313,13 @@ class ManifestParser(IIIFParser):
return image
@retry(
reraise=True,
retry=retry_if_exception_type(requests.RequestException),
stop=stop_after_attempt(2),
wait=wait_fixed(IIIF_INFO_RETRY_BACKOFF),
)
def fetch_image_information(self, info_url):
resp = requests.get(
resp = retried_iiif_get(
info_url,
headers={"User-Agent": USER_AGENT},
timeout=IIIF_INFO_TIMEOUT,
allow_redirects=True,
)
resp.raise_for_status()
return resp.json()
def parse_canvas(self, canvas, default_name=None):
......
# -*- coding: utf-8 -*-
import requests
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed
from ..utils import HTTP_GET_RETRY_BACKOFF
@retry(
    reraise=True,
    retry=retry_if_exception_type(requests.RequestException),
    stop=stop_after_attempt(2),
    wait=wait_fixed(HTTP_GET_RETRY_BACKOFF),
)
def retried_iiif_get(url, headers, timeout, **kwargs):
    """
    Wrapper for requests.get with a retry mechanism.

    The GET call is attempted at most 2 times: the initial call plus a single
    retry, with a fixed wait of HTTP_GET_RETRY_BACKOFF between both attempts.
    Any requests.RequestException triggers the retry; this includes the
    HTTPError raised by raise_for_status on an error status code.
    If the 2nd attempt still gives an exception, that exception is re-raised
    unchanged and the caller should catch it.

    :param url: URL to fetch.
    :param headers: HTTP headers sent with the request.
    :param timeout: Timeout forwarded to requests.get.
    :param kwargs: Any other keyword argument forwarded to requests.get.
    :returns: The response of the first successful attempt.
    :raises requests.RequestException: When both attempts failed.
    """
    # NOTE(review): the response is closed when leaving this block. A regular
    # (non-streamed) body has already been read at that point, but a caller
    # passing stream=True would get back a closed response - confirm no caller does.
    with requests.get(url, headers=headers, timeout=timeout, **kwargs) as response:
        response.raise_for_status()
        return response
......@@ -16,6 +16,9 @@ from tenacity import (
from arkindex_tasks import default_client
# Time to wait before retrying a failed HTTP request
HTTP_GET_RETRY_BACKOFF = 10
logger = logging.getLogger(__name__)
DOWNLOAD_CHUNK_SIZE = 8192
......@@ -66,7 +69,7 @@ def retried_request(*args, **kwargs):
reraise=True,
retry=retry_if_exception_type(requests.RequestException),
stop=stop_after_attempt(3),
wait=wait_fixed(5),
wait=wait_fixed(HTTP_GET_RETRY_BACKOFF),
)
def download_file(url, path):
"""
......
......@@ -6,6 +6,7 @@ from unittest import TestCase
from unittest.mock import call, patch
from uuid import uuid4
import requests
import requests_mock
from arkindex_tasks import default_client
......@@ -14,6 +15,10 @@ from arkindex_tasks.import_iiif.__main__ import main
@patch("arkindex_tasks.import_iiif.__main__.IIIFParser")
@patch("arkindex_tasks.import_iiif.__main__.argparse.ArgumentParser")
@patch(
"arkindex_tasks.import_iiif.utils.retried_iiif_get.retry.wait.wait_fixed",
new=0,
)
@requests_mock.Mocker()
class TestMain(TestCase):
def test_path(self, args_mock, parser_mock, req_mock):
......@@ -212,3 +217,22 @@ class TestMain(TestCase):
main()
self.assertEqual(default_client.sleep_duration, 0.01)
def test_retried_get(self, args_mock, parser_mock, req_mock):
    """
    A manifest URL that always answers with an HTTP 500 is requested twice,
    then the HTTP error is raised by main
    """
    manifest_url = "http://manifest"
    req_mock.get(manifest_url, status_code=500)

    cli_args = Namespace(
        path=manifest_url,
        corpus_id=uuid4(),
        folder_type="folder",
        element_type="page",
        sleep=0,
    )
    args_mock().parse_args.return_value = cli_args

    with self.assertRaises(requests.exceptions.HTTPError):
        main()

    # One initial attempt and one retry, both on the manifest URL
    self.assertEqual(req_mock.call_count, 2)
    first_attempt, second_attempt = req_mock.request_history
    for attempt in (first_attempt, second_attempt):
        self.assertEqual(attempt.method, "GET")
        self.assertEqual(attempt.url, "http://manifest/")
......@@ -13,7 +13,7 @@ SAMPLES = Path(__file__).absolute().parent / "samples"
@patch(
"arkindex_tasks.import_iiif.parser.ManifestParser.fetch_image_information.retry.wait.wait_fixed",
"arkindex_tasks.import_iiif.utils.retried_iiif_get.retry.wait.wait_fixed",
new=0,
)
class TestManifestParser(TestCase):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment