diff --git a/dan/datasets/extract/arkindex.py b/dan/datasets/extract/arkindex.py index c91c5c2246e0f7702540ceeb58a20976aa338644..1e953cf83a2a186e15d22b8cfea493634716897b 100644 --- a/dan/datasets/extract/arkindex.py +++ b/dan/datasets/extract/arkindex.py @@ -259,7 +259,7 @@ class ArkindexExtractor: except Exception as e: raise ImageDownloadError( - split=split, path=str(destination), url=download_url, exc=e + split=split, path=destination, url=download_url, exc=e ) def format_text(self, text: str, charset: Optional[set] = None): diff --git a/dan/datasets/extract/exceptions.py b/dan/datasets/extract/exceptions.py index 74e1b332c0fd2d91748cc45bf76025769292e8ea..2a703b1a8b26d270e73e1869ed960fe95e03fc85 100644 --- a/dan/datasets/extract/exceptions.py +++ b/dan/datasets/extract/exceptions.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from pathlib import Path class ProcessingError(Exception): @@ -26,13 +27,13 @@ class ImageDownloadError(Exception): """ def __init__( - self, split: str, path: str, url: str, exc: Exception, *args: object + self, split: str, path: Path, url: str, exc: Exception, *args: object ) -> None: super().__init__(*args) self.split: str = split - self.path: str = path + self.path: str = str(path) self.url: str = url - self.message = str(exc) + self.message = f"{exc} for element {Path(path).stem}" class NoTranscriptionError(ElementProcessingError): diff --git a/tests/test_extract.py b/tests/test_extract.py index 4240e2fb7779d761b1bb1bb384e790fd0543e438..d56484c37d55dcdcf90ab737d77eac9e7bd7d7b1 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -5,6 +5,7 @@ import logging import pickle import re from operator import attrgetter, methodcaller +from pathlib import Path from typing import NamedTuple from unittest.mock import patch @@ -701,7 +702,7 @@ def test_download_image_error(iiif_url, caplog, capsys): "split": "train", "polygon": [], "image_url": "deadbeef", - "destination": "/dev/null", + "destination": Path("/dev/null"), } # Make 
download_image crash iiif_url.return_value = BoundingBox(0, 0, 0, 0), task["image_url"] @@ -723,7 +724,7 @@ def test_download_image_error(iiif_url, caplog, capsys): extractor.tasks = [task] # Add the key in data - extractor.data[task["split"]][task["destination"]] = "deadbeefdata" + extractor.data[task["split"]][str(task["destination"])] = "deadbeefdata" extractor.download_images() @@ -738,7 +739,7 @@ def test_download_image_error(iiif_url, caplog, capsys): # Check stdout captured = capsys.readouterr() - assert captured.out == "deadbeef: Image URL must be HTTP(S)\n" + assert captured.out == "deadbeef: Image URL must be HTTP(S) for element null\n" def test_download_image_error_try_max(responses, caplog):