From 35ac8365a118d13f42bec37909e8965d9826f8b1 Mon Sep 17 00:00:00 2001
From: Manon Blanco <blanco@teklia.com>
Date: Mon, 20 Nov 2023 08:22:36 +0000
Subject: [PATCH] Log element ID when download has failed

---
 dan/datasets/extract/arkindex.py   | 2 +-
 dan/datasets/extract/exceptions.py | 7 ++++---
 tests/test_extract.py              | 7 ++++---
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/dan/datasets/extract/arkindex.py b/dan/datasets/extract/arkindex.py
index c91c5c22..1e953cf8 100644
--- a/dan/datasets/extract/arkindex.py
+++ b/dan/datasets/extract/arkindex.py
@@ -259,7 +259,7 @@ class ArkindexExtractor:
 
         except Exception as e:
             raise ImageDownloadError(
-                split=split, path=str(destination), url=download_url, exc=e
+                split=split, path=destination, url=download_url, exc=e
             )
 
     def format_text(self, text: str, charset: Optional[set] = None):
diff --git a/dan/datasets/extract/exceptions.py b/dan/datasets/extract/exceptions.py
index 74e1b332..2a703b1a 100644
--- a/dan/datasets/extract/exceptions.py
+++ b/dan/datasets/extract/exceptions.py
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from pathlib import Path
 
 
 class ProcessingError(Exception):
@@ -26,13 +27,13 @@ class ImageDownloadError(Exception):
     """
 
     def __init__(
-        self, split: str, path: str, url: str, exc: Exception, *args: object
+        self, split: str, path: Path, url: str, exc: Exception, *args: object
     ) -> None:
         super().__init__(*args)
         self.split: str = split
-        self.path: str = path
+        self.path: str = str(path)
         self.url: str = url
-        self.message = str(exc)
+        self.message = f"{str(exc)} for element {path.stem}"
 
 
 class NoTranscriptionError(ElementProcessingError):
diff --git a/tests/test_extract.py b/tests/test_extract.py
index 4240e2fb..d56484c3 100644
--- a/tests/test_extract.py
+++ b/tests/test_extract.py
@@ -5,6 +5,7 @@ import logging
 import pickle
 import re
 from operator import attrgetter, methodcaller
+from pathlib import Path
 from typing import NamedTuple
 from unittest.mock import patch
 
@@ -701,7 +702,7 @@ def test_download_image_error(iiif_url, caplog, capsys):
         "split": "train",
         "polygon": [],
         "image_url": "deadbeef",
-        "destination": "/dev/null",
+        "destination": Path("/dev/null"),
     }
     # Make download_image crash
     iiif_url.return_value = BoundingBox(0, 0, 0, 0), task["image_url"]
@@ -723,7 +724,7 @@ def test_download_image_error(iiif_url, caplog, capsys):
     extractor.tasks = [task]
 
     # Add the key in data
-    extractor.data[task["split"]][task["destination"]] = "deadbeefdata"
+    extractor.data[task["split"]][str(task["destination"])] = "deadbeefdata"
 
     extractor.download_images()
 
@@ -738,7 +739,7 @@ def test_download_image_error(iiif_url, caplog, capsys):
 
     # Check stdout
     captured = capsys.readouterr()
-    assert captured.out == "deadbeef: Image URL must be HTTP(S)\n"
+    assert captured.out == "deadbeef: Image URL must be HTTP(S) for element null\n"
 
 
 def test_download_image_error_try_max(responses, caplog):
-- 
GitLab