From 35ac8365a118d13f42bec37909e8965d9826f8b1 Mon Sep 17 00:00:00 2001 From: Manon Blanco <blanco@teklia.com> Date: Mon, 20 Nov 2023 08:22:36 +0000 Subject: [PATCH] Log element ID when download has failed --- dan/datasets/extract/arkindex.py | 2 +- dan/datasets/extract/exceptions.py | 7 ++++--- tests/test_extract.py | 7 ++++--- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/dan/datasets/extract/arkindex.py b/dan/datasets/extract/arkindex.py index c91c5c22..1e953cf8 100644 --- a/dan/datasets/extract/arkindex.py +++ b/dan/datasets/extract/arkindex.py @@ -259,7 +259,7 @@ class ArkindexExtractor: except Exception as e: raise ImageDownloadError( - split=split, path=str(destination), url=download_url, exc=e + split=split, path=destination, url=download_url, exc=e ) def format_text(self, text: str, charset: Optional[set] = None): diff --git a/dan/datasets/extract/exceptions.py b/dan/datasets/extract/exceptions.py index 74e1b332..2a703b1a 100644 --- a/dan/datasets/extract/exceptions.py +++ b/dan/datasets/extract/exceptions.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from pathlib import Path class ProcessingError(Exception): @@ -26,13 +27,13 @@ class ImageDownloadError(Exception): """ def __init__( - self, split: str, path: str, url: str, exc: Exception, *args: object + self, split: str, path: Path, url: str, exc: Exception, *args: object ) -> None: super().__init__(*args) self.split: str = split - self.path: str = path + self.path: str = str(path) self.url: str = url - self.message = str(exc) + self.message = f"{str(exc)} for element {path.stem}" class NoTranscriptionError(ElementProcessingError): diff --git a/tests/test_extract.py b/tests/test_extract.py index 4240e2fb..d56484c3 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -5,6 +5,7 @@ import logging import pickle import re from operator import attrgetter, methodcaller +from pathlib import Path from typing import NamedTuple from unittest.mock import patch @@ -701,7 +702,7 @@ def 
test_download_image_error(iiif_url, caplog, capsys): "split": "train", "polygon": [], "image_url": "deadbeef", - "destination": "/dev/null", + "destination": Path("/dev/null"), } # Make download_image crash iiif_url.return_value = BoundingBox(0, 0, 0, 0), task["image_url"] @@ -723,7 +724,7 @@ def test_download_image_error(iiif_url, caplog, capsys): extractor.tasks = [task] # Add the key in data - extractor.data[task["split"]][task["destination"]] = "deadbeefdata" + extractor.data[task["split"]][str(task["destination"])] = "deadbeefdata" extractor.download_images() @@ -738,7 +739,7 @@ def test_download_image_error(iiif_url, caplog, capsys): # Check stdout captured = capsys.readouterr() - assert captured.out == "deadbeef: Image URL must be HTTP(S)\n" + assert captured.out == "deadbeef: Image URL must be HTTP(S) for element null\n" def test_download_image_error_try_max(responses, caplog): -- GitLab