diff --git a/dan/datasets/extract/__init__.py b/dan/datasets/extract/__init__.py
index d04412f89d48fe3fc7fc5ca740d676f490c9e9d8..b413d966362ee6d3cdc57fd869fb14df2a114020 100644
--- a/dan/datasets/extract/__init__.py
+++ b/dan/datasets/extract/__init__.py
@@ -159,12 +159,4 @@ def add_extract_parser(subcommands) -> None:
         help="Do not remove beginning, ending and consecutive spaces in transcriptions.",
     )
 
-    parser.add_argument(
-        "--cache",
-        dest="cache_dir",
-        type=pathlib.Path,
-        help="Where the images should be cached.",
-        default=pathlib.Path(".cache"),
-    )
-
     parser.set_defaults(func=run)
diff --git a/dan/datasets/extract/exceptions.py b/dan/datasets/extract/exceptions.py
index 93c8d1ae5983edbb3ad3eaeafee0e46c0c863411..2155a6ca2383cf7ad9b499d95ef7de2eed5daaad 100644
--- a/dan/datasets/extract/exceptions.py
+++ b/dan/datasets/extract/exceptions.py
@@ -20,24 +20,19 @@ class ElementProcessingError(ProcessingError):
         self.element_id = element_id
 
 
-class ImageDownloadError(ElementProcessingError):
+class ImageDownloadError(Exception):
     """
     Raised when an element's image could not be downloaded
     """
 
-    error: Exception
-    """
-    Error encountered.
-    """
-
-    def __init__(self, element_id: str, error: Exception, *args: object) -> None:
-        super().__init__(element_id, *args)
-        self.error = error
-
-    def __str__(self) -> str:
-        return (
-            f"Couldn't retrieve image of element ({self.element_id}: {str(self.error)})"
-        )
+    def __init__(
+        self, split: str, path: str, url: str, exc: Exception, *args: object
+    ) -> None:
+        super().__init__(*args)
+        self.split: str = split
+        self.path: str = path
+        self.url: str = url
+        self.message = str(exc)
 
 
 class NoTranscriptionError(ElementProcessingError):
diff --git a/dan/datasets/extract/extract.py b/dan/datasets/extract/extract.py
index fa1d7d85936998d64c435152e18ace6ef1042c75..4c270f57c3e306a4ff31d307ec18777bf6b68077 100644
--- a/dan/datasets/extract/extract.py
+++ b/dan/datasets/extract/extract.py
@@ -5,11 +5,15 @@ import logging
 import pickle
 import random
 from collections import defaultdict
+from concurrent.futures import Future, ThreadPoolExecutor
+from functools import cached_property
 from pathlib import Path
 from typing import Dict, List, Optional, Union
 from uuid import UUID
 
+import cv2
 import numpy as np
+from PIL import Image
 from tqdm import tqdm
 
 from arkindex_export import open_database
@@ -20,23 +24,28 @@ from dan.datasets.extract.db import (
     get_transcriptions,
 )
 from dan.datasets.extract.exceptions import (
+    ImageDownloadError,
     NoEndTokenError,
     NoTranscriptionError,
     ProcessingError,
 )
 from dan.datasets.extract.utils import (
     download_image,
+    get_bbox,
     insert_token,
     remove_spaces,
 )
 from dan.utils import EntityType, parse_tokens
-from line_image_extractor.extractor import extract, read_img, save_img
-from line_image_extractor.image_utils import Extraction, polygon_to_bbox, resize
+from line_image_extractor.extractor import save_img
+from line_image_extractor.image_utils import deskew_image, polygon_to_bbox
 
 IMAGES_DIR = "images"  # Subpath to the images directory.
 
 SPLIT_NAMES = ["train", "val", "test"]
-IIIF_URL_SUFFIX = "/full/full/0/default.jpg"
+IIIF_URL = "{image_url}/{bbox}/{size}/0/default.jpg"
+# IIIF 2.0 uses `full`
+IIIF_FULL_SIZE = "full"
+
 logger = logging.getLogger(__name__)
 
 
@@ -58,7 +67,6 @@ class ArkindexExtractor:
         entity_worker_version: Optional[Union[str, bool]] = None,
         max_width: Optional[int] = None,
         max_height: Optional[int] = None,
-        cache_dir: Path = Path(".cache"),
         keep_spaces: bool = False,
         image_extension: str = "",
     ) -> None:
@@ -75,23 +83,37 @@ class ArkindexExtractor:
         self.max_height = max_height
         self.image_extension = image_extension
 
-        self.cache_dir = cache_dir
-        # Create cache dir if non existent
-        self.cache_dir.mkdir(exist_ok=True, parents=True)
-
         self.keep_spaces = keep_spaces
 
         self.data: Dict = defaultdict(dict)
         self.charset = set()
 
-    def find_image_in_cache(self, image_id: str) -> Path:
-        """Images are cached to avoid downloading them twice. They are stored under a specific name,
-        based on their Arkindex ID. Images are saved under the JPEG format.
-
-        :param image_id: ID of the image. The image is saved under this name.
-        :return: Where the image should be saved in the cache folder.
-        """
-        return (self.cache_dir / image_id).with_suffix(self.image_extension)
+        # Image download tasks to process
+        self.tasks: List[Dict[str, str]] = []
+
+    @cached_property
+    def max_resize(self):
+        # We keep the aspect ratio, so we only use one dimension: the biggest one
+        if self.max_width > self.max_height:
+            return f"{self.max_width},"
+        return f",{self.max_height}"
+
+    def get_iiif_size_arg(self, width: int, height: int) -> str:
+        if (self.max_width is None or width <= self.max_width) and (
+            self.max_height is None or height <= self.max_height
+        ):
+            return IIIF_FULL_SIZE
+
+        # Resizing if the image is bigger than the wanted size.
+        if self.max_width and self.max_height:
+            return self.max_resize
+        return f"{self.max_width or ''},{self.max_height or ''}"
+
+    def build_iiif_url(self, polygon, image_url):
+        bbox = polygon_to_bbox(json.loads(str(polygon)))
+        size = self.get_iiif_size_arg(width=bbox.width, height=bbox.height)
+        # Rotation/deskewing is applied locally via deskew_image, not requested from the IIIF server
+        return IIIF_URL.format(image_url=image_url, bbox=get_bbox(polygon), size=size)
 
     def _keep_char(self, char: str) -> bool:
         # Keep all text by default if no separator was given
@@ -176,51 +198,38 @@ class ArkindexExtractor:
         )
         return self.reconstruct_text(transcription.text, entities)
 
-    def retrieve_image(self, child: Element):
-        """Get or download image of the element. Checks in cache before downloading.
-
-        :param child: Processed element.
-        :return: The element's image.
-        """
-        cached_img_path = self.find_image_in_cache(child.image.id)
-        if not cached_img_path.exists():
-            # Save in cache
-            download_image(child.image.url + IIIF_URL_SUFFIX).save(
-                cached_img_path, format="jpeg"
-            )
-
-        return read_img(cached_img_path)
-
-    def get_image(self, child: Element, destination: Path) -> None:
+    def get_image(
+        self,
+        split: str,
+        polygon: List[List[int]],
+        image_url: str,
+        destination: Path,
+    ) -> None:
         """Save the element's image to the given path and applies any image operations needed.
 
-        :param child: Processed element.
+        :param split: Dataset split this image belongs to.
+        :param polygon: Polygon of the processed element.
+        :param image_url: Base IIIF URL of the image.
         :param destination: Where the image should be saved.
         """
-        polygon = json.loads(str(child.polygon))
-
-        if self.max_height or self.max_width:
-            polygon = resize(
-                polygon,
-                self.max_width,
-                self.max_height,
-                scale_x=1.0,
-                scale_y_top=1.0,
-                scale_y_bottom=1.0,
+        download_url: str = self.build_iiif_url(polygon=polygon, image_url=image_url)
+        try:
+            img: Image.Image = download_image(download_url)
+
+            # Deskew image
+            image = deskew_image(
+                np.asarray(img), polygon=np.asarray(polygon), max_deskew_angle=45
             )
 
-        # Extract the polygon in the image
-        image = extract(
-            img=self.retrieve_image(child),
-            polygon=np.array(polygon),
-            bbox=polygon_to_bbox(polygon),
-            # Hardcoded while we don't have a configuration file
-            extraction_mode=Extraction.deskew_min_area_rect,
-            max_deskew_angle=45,
-        )
+            # Convert to RGB
+            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
 
-        # Save the image to disk
-        save_img(path=destination, img=image)
+            # Save the image to disk
+            save_img(path=destination, img=image)
+        except Exception as e:
+            raise ImageDownloadError(
+                split=split, path=str(destination), url=download_url, exc=e
+            )
 
     def format_text(self, text: str):
         if not self.keep_spaces:
@@ -238,10 +247,20 @@ class ArkindexExtractor:
         """
         text = self.extract_transcription(element)
 
-        image_path = Path(
-            self.output, IMAGES_DIR, split, f"{element.type}_{element.id}"
-        ).with_suffix(self.image_extension)
-        self.get_image(element, image_path)
+        image_path = Path(self.output, IMAGES_DIR, split, element.id).with_suffix(
+            self.image_extension
+        )
+
+        # Create task for multithreading pool if image does not exist yet
+        if not image_path.exists():
+            self.tasks.append(
+                {
+                    "split": split,
+                    "polygon": json.loads(str(element.polygon)),
+                    "image_url": element.image.url,
+                    "destination": image_path,
+                }
+            )
 
         self.data[split][str(image_path)] = self.format_text(text)
         self.charset = self.charset.union(set(text))
@@ -292,6 +311,40 @@ class ArkindexExtractor:
             pickle.dumps(sorted(list(self.charset)))
         )
 
+    def download_images(self):
+        failed_downloads = []
+        with tqdm(
+            desc="Downloading images", total=len(self.tasks)
+        ) as pbar, ThreadPoolExecutor() as executor:
+
+            def process_future(future: Future):
+                """
+                Callback function called at the end of the thread
+                """
+                # Update the progress bar count
+                pbar.update(1)
+
+                exc = future.exception()
+                if exc is None:
+                    # No error
+                    return
+                # If failed, tag for removal
+                assert isinstance(exc, ImageDownloadError)
+                # Remove transcription from labels dict
+                del self.data[exc.split][exc.path]
+                # Save tried URL
+                failed_downloads.append((exc.url, exc.message))
+
+            # Submit all tasks
+            for task in self.tasks:
+                executor.submit(self.get_image, **task).add_done_callback(
+                    process_future
+                )
+
+        if failed_downloads:
+            logger.error(f"Failed to download {len(failed_downloads)} image(s).")
+            print(*list(map(": ".join, failed_downloads)), sep="\n")
+
     def run(self):
         # Iterate over the subsets to find the page images and labels.
         for folder_id, split in zip(self.folders, SPLIT_NAMES):
@@ -312,6 +365,8 @@ class ArkindexExtractor:
                     # Progress bar updates
                     pbar.update()
                     pbar.refresh()
+
+        self.download_images()
         self.export()
 
 
@@ -331,7 +386,6 @@ def run(
     max_width: Optional[int],
     max_height: Optional[int],
     image_format: str,
-    cache_dir: Path,
     keep_spaces: bool,
 ):
     assert database.exists(), f"No file found @ {database}"
@@ -358,7 +412,6 @@ def run(
         entity_worker_version=entity_worker_version,
         max_width=max_width,
         max_height=max_height,
-        cache_dir=cache_dir,
         keep_spaces=keep_spaces,
         image_extension=image_format,
     ).run()
diff --git a/dan/datasets/extract/utils.py b/dan/datasets/extract/utils.py
index 759c0078cd71ae8af3743b044dc0ea827dd33640..5c40af104a9563ea2a212eadbd0d8ac9f08912e1 100644
--- a/dan/datasets/extract/utils.py
+++ b/dan/datasets/extract/utils.py
@@ -2,9 +2,10 @@
 import logging
 import re
 from io import BytesIO
+from typing import List
 
 import requests
-from PIL import Image
+from PIL import Image, ImageOps
 from tenacity import (
     retry,
     retry_if_exception_type,
@@ -24,7 +25,7 @@ TRIM_REGEX = re.compile(r"\t?(?: +)")
 
 
 def _retry_log(retry_state, *args, **kwargs):
-    logger.warning(
+    logger.debug(
         f"Request to {retry_state.args[0]} failed ({repr(retry_state.outcome.exception())}), "
         f"retrying in {retry_state.idle_for} seconds"
     )
@@ -55,6 +56,10 @@ def download_image(url):
 
     # Preprocess the image and prepare it for classification
     image = Image.open(BytesIO(resp.content))
+
+    # Apply the EXIF orientation so JPEG images are not unexpectedly rotated later (see https://github.com/python-pillow/Pillow/issues/4703)
+    image = ImageOps.exif_transpose(image)
+
     logger.debug(
         "Downloaded image {} - size={}x{}".format(url, image.size[0], image.size[1])
     )
@@ -80,3 +85,14 @@ def remove_spaces(text: str) -> str:
     # remove begin/ending spaces
     # replace \t with regular space and consecutive spaces
     return TRIM_REGEX.sub(" ", text.strip())
+
+
+def get_bbox(polygon: List[List[int]]) -> str:
+    """
+    Returns a comma-separated string with the coordinates of the upper left-most
+    pixel of the polygon's bounding box, followed by its width and height
+    """
+    all_x, all_y = zip(*polygon)
+    x, y = min(all_x), min(all_y)
+    width, height = max(all_x) - x, max(all_y) - y
+    return ",".join(list(map(str, [int(x), int(y), int(width), int(height)])))
diff --git a/docs/usage/datasets/extract.md b/docs/usage/datasets/extract.md
index a7fec23c5d1f931674a8b776ca24f854551ffa60..cdc50ab02f57ac4a4d9436305dd82ea37c68d295 100644
--- a/docs/usage/datasets/extract.md
+++ b/docs/usage/datasets/extract.md
@@ -8,6 +8,8 @@ Use the `teklia-dan dataset extract` command to extract a dataset from an Arkind
 - Create the mapping of the images (identified by its path) to the ground-truth transcription (with NER tokens if needed) (in the `labels.json` file),
 - Store the set of characters encountered in the dataset (in the `charset.pkl` file).
 
+If an image download fails for whatever reason, its transcription won't appear in the `labels.json` file. The reason for the failure will be printed to stdout at the end of the process. Before trying to download an image, the command checks whether it was already downloaded previously: it is thus safe to run this command twice if a few images failed.
+
 | Parameter                        | Description                                                                                                                                                                                                                          | Type            | Default                              |
 | -------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------- | ------------------------------------ |
 | `database`                       | Path to an Arkindex export database in SQLite format.                                                                                                                                                                                | `Path`          |                                      |
diff --git a/tests/conftest.py b/tests/conftest.py
index 0f4b5579ea4f21b0253c8f7597d7b42f12082b0a..93f5870a0844ec72eda04c3ff50e6d45c2a2b221 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -85,16 +85,20 @@ def mock_database(tmp_path_factory):
         element_path = (FIXTURES / "extraction" / "elements" / id).with_suffix(".json")
         element_json = json.loads(element_path.read_text())
 
+        element_type = element_json["type"]
+        image_path = (
+            FIXTURES / "extraction" / "images" / element_type / id
+        ).with_suffix(".jpg")
+
         polygon = element_json.get("polygon")
         # Always use page images because polygons are based on the full image
-        # Reconstruct and reuse the page ID to use the image cache (and avoid downloading through the Arkindex API)
-        image_id = "-".join(id.split("-")[:2])
         image, _ = (
             Image.get_or_create(
-                id=image_id,
+                id=id + "-image",
                 defaults={
                     "server": image_server,
-                    "url": f"http://image/{image_id}/url",
+                    # Use path to image instead of actual URL since we won't be doing any download
+                    "url": image_path,
                     "width": 0,
                     "height": 0,
                 },
@@ -106,7 +110,7 @@ def mock_database(tmp_path_factory):
         element = Element.create(
             id=id,
             name=id,
-            type=element_json["type"],
+            type=element_type,
             image=image,
             polygon=json.dumps(polygon) if polygon else None,
             created=0.0,
diff --git a/tests/data/extraction/images/pages/test-page_1.jpg b/tests/data/extraction/images/double_page/test-page_1.jpg
similarity index 100%
rename from tests/data/extraction/images/pages/test-page_1.jpg
rename to tests/data/extraction/images/double_page/test-page_1.jpg
diff --git a/tests/data/extraction/images/pages/test-page_2.jpg b/tests/data/extraction/images/double_page/test-page_2.jpg
similarity index 100%
rename from tests/data/extraction/images/pages/test-page_2.jpg
rename to tests/data/extraction/images/double_page/test-page_2.jpg
diff --git a/tests/data/extraction/images/pages/train-page_1.jpg b/tests/data/extraction/images/double_page/train-page_1.jpg
similarity index 100%
rename from tests/data/extraction/images/pages/train-page_1.jpg
rename to tests/data/extraction/images/double_page/train-page_1.jpg
diff --git a/tests/data/extraction/images/pages/train-page_2.jpg b/tests/data/extraction/images/double_page/train-page_2.jpg
similarity index 100%
rename from tests/data/extraction/images/pages/train-page_2.jpg
rename to tests/data/extraction/images/double_page/train-page_2.jpg
diff --git a/tests/data/extraction/images/pages/val-page_1.jpg b/tests/data/extraction/images/double_page/val-page_1.jpg
similarity index 100%
rename from tests/data/extraction/images/pages/val-page_1.jpg
rename to tests/data/extraction/images/double_page/val-page_1.jpg
diff --git a/tests/data/extraction/images/lines/text_line_test-page_1-line_1.jpg b/tests/data/extraction/images/lines/text_line_test-page_1-line_1.jpg
deleted file mode 100644
index 407f9dd1439951a9b59d566abf2b5aa6010c4b27..0000000000000000000000000000000000000000
Binary files a/tests/data/extraction/images/lines/text_line_test-page_1-line_1.jpg and /dev/null differ
diff --git a/tests/data/extraction/images/lines/text_line_test-page_1-line_2.jpg b/tests/data/extraction/images/lines/text_line_test-page_1-line_2.jpg
deleted file mode 100644
index 2cfc7c7a19be54d45db8b3f645c7c705b0ab02f8..0000000000000000000000000000000000000000
Binary files a/tests/data/extraction/images/lines/text_line_test-page_1-line_2.jpg and /dev/null differ
diff --git a/tests/data/extraction/images/lines/text_line_test-page_1-line_3.jpg b/tests/data/extraction/images/lines/text_line_test-page_1-line_3.jpg
deleted file mode 100644
index 928a28e57a0a879223ea5d3492e5ea792ae8c48e..0000000000000000000000000000000000000000
Binary files a/tests/data/extraction/images/lines/text_line_test-page_1-line_3.jpg and /dev/null differ
diff --git a/tests/data/extraction/images/lines/text_line_test-page_2-line_1.jpg b/tests/data/extraction/images/lines/text_line_test-page_2-line_1.jpg
deleted file mode 100644
index 363bdf0d49cbdf8099127c6973503a7d9851fb88..0000000000000000000000000000000000000000
Binary files a/tests/data/extraction/images/lines/text_line_test-page_2-line_1.jpg and /dev/null differ
diff --git a/tests/data/extraction/images/lines/text_line_test-page_2-line_2.jpg b/tests/data/extraction/images/lines/text_line_test-page_2-line_2.jpg
deleted file mode 100644
index 2942bec167c441261803c9ce3ee8c40635411d99..0000000000000000000000000000000000000000
Binary files a/tests/data/extraction/images/lines/text_line_test-page_2-line_2.jpg and /dev/null differ
diff --git a/tests/data/extraction/images/lines/text_line_test-page_2-line_3.jpg b/tests/data/extraction/images/lines/text_line_test-page_2-line_3.jpg
deleted file mode 100644
index 06a282dcf6e788916fc69eb965185beab0d0f4b9..0000000000000000000000000000000000000000
Binary files a/tests/data/extraction/images/lines/text_line_test-page_2-line_3.jpg and /dev/null differ
diff --git a/tests/data/extraction/images/lines/text_line_train-page_1-line_1.jpg b/tests/data/extraction/images/lines/text_line_train-page_1-line_1.jpg
deleted file mode 100644
index 8adab6778a143c4382a02e91346f2d6c0d501aa7..0000000000000000000000000000000000000000
Binary files a/tests/data/extraction/images/lines/text_line_train-page_1-line_1.jpg and /dev/null differ
diff --git a/tests/data/extraction/images/lines/text_line_train-page_1-line_2.jpg b/tests/data/extraction/images/lines/text_line_train-page_1-line_2.jpg
deleted file mode 100644
index 182f1584fbfea9c8ae1ba9e3e3ae6306433cb967..0000000000000000000000000000000000000000
Binary files a/tests/data/extraction/images/lines/text_line_train-page_1-line_2.jpg and /dev/null differ
diff --git a/tests/data/extraction/images/lines/text_line_train-page_1-line_3.jpg b/tests/data/extraction/images/lines/text_line_train-page_1-line_3.jpg
deleted file mode 100644
index d69dd6c60aa2de34c8f1738e1699a82c03dc0eb9..0000000000000000000000000000000000000000
Binary files a/tests/data/extraction/images/lines/text_line_train-page_1-line_3.jpg and /dev/null differ
diff --git a/tests/data/extraction/images/lines/text_line_train-page_1-line_4.jpg b/tests/data/extraction/images/lines/text_line_train-page_1-line_4.jpg
deleted file mode 100644
index 97ea72c0bebd64048a613a3207aba6d49bfc9e24..0000000000000000000000000000000000000000
Binary files a/tests/data/extraction/images/lines/text_line_train-page_1-line_4.jpg and /dev/null differ
diff --git a/tests/data/extraction/images/lines/text_line_train-page_2-line_1.jpg b/tests/data/extraction/images/lines/text_line_train-page_2-line_1.jpg
deleted file mode 100644
index 4a38dd3f6e792e6c39cf3b75f088222d374762c0..0000000000000000000000000000000000000000
Binary files a/tests/data/extraction/images/lines/text_line_train-page_2-line_1.jpg and /dev/null differ
diff --git a/tests/data/extraction/images/lines/text_line_train-page_2-line_2.jpg b/tests/data/extraction/images/lines/text_line_train-page_2-line_2.jpg
deleted file mode 100644
index f328d7706ffc23792198a6fd0284ed529a2fef08..0000000000000000000000000000000000000000
Binary files a/tests/data/extraction/images/lines/text_line_train-page_2-line_2.jpg and /dev/null differ
diff --git a/tests/data/extraction/images/lines/text_line_train-page_2-line_3.jpg b/tests/data/extraction/images/lines/text_line_train-page_2-line_3.jpg
deleted file mode 100644
index f5902c789fe0c1a5acd3261827b64a5d2f29f2a0..0000000000000000000000000000000000000000
Binary files a/tests/data/extraction/images/lines/text_line_train-page_2-line_3.jpg and /dev/null differ
diff --git a/tests/data/extraction/images/lines/text_line_val-page_1-line_1.jpg b/tests/data/extraction/images/lines/text_line_val-page_1-line_1.jpg
deleted file mode 100644
index a147b8c4b2a77e09d526ed4119338c47a089e9f0..0000000000000000000000000000000000000000
Binary files a/tests/data/extraction/images/lines/text_line_val-page_1-line_1.jpg and /dev/null differ
diff --git a/tests/data/extraction/images/lines/text_line_val-page_1-line_2.jpg b/tests/data/extraction/images/lines/text_line_val-page_1-line_2.jpg
deleted file mode 100644
index afc9e0b582b261196d8269d5fbfe4a80615ffd06..0000000000000000000000000000000000000000
Binary files a/tests/data/extraction/images/lines/text_line_val-page_1-line_2.jpg and /dev/null differ
diff --git a/tests/data/extraction/images/lines/text_line_val-page_1-line_3.jpg b/tests/data/extraction/images/lines/text_line_val-page_1-line_3.jpg
deleted file mode 100644
index 7aa7ff7e7c8641b595402da059078aa39658ff8a..0000000000000000000000000000000000000000
Binary files a/tests/data/extraction/images/lines/text_line_val-page_1-line_3.jpg and /dev/null differ
diff --git a/tests/data/extraction/images/text_line/test-page_1-line_1.jpg b/tests/data/extraction/images/text_line/test-page_1-line_1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..59b4173da998b2a6c9a00c6fde7a550920e06b47
Binary files /dev/null and b/tests/data/extraction/images/text_line/test-page_1-line_1.jpg differ
diff --git a/tests/data/extraction/images/text_line/test-page_1-line_2.jpg b/tests/data/extraction/images/text_line/test-page_1-line_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..aa4c1f0f923882eebca307773d1099864de0622a
Binary files /dev/null and b/tests/data/extraction/images/text_line/test-page_1-line_2.jpg differ
diff --git a/tests/data/extraction/images/text_line/test-page_1-line_3.jpg b/tests/data/extraction/images/text_line/test-page_1-line_3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..bee0d3159564d31de40a21c553d1486cbe296333
Binary files /dev/null and b/tests/data/extraction/images/text_line/test-page_1-line_3.jpg differ
diff --git a/tests/data/extraction/images/text_line/test-page_2-line_1.jpg b/tests/data/extraction/images/text_line/test-page_2-line_1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..44d4f37898a62d2f4bee272cb35add685b06dfdf
Binary files /dev/null and b/tests/data/extraction/images/text_line/test-page_2-line_1.jpg differ
diff --git a/tests/data/extraction/images/text_line/test-page_2-line_2.jpg b/tests/data/extraction/images/text_line/test-page_2-line_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..c19ec187bbf254e934cd344422ef32731885dba6
Binary files /dev/null and b/tests/data/extraction/images/text_line/test-page_2-line_2.jpg differ
diff --git a/tests/data/extraction/images/text_line/test-page_2-line_3.jpg b/tests/data/extraction/images/text_line/test-page_2-line_3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..4e05ada4d1c1ce67a85847782cf030245a21b3e2
Binary files /dev/null and b/tests/data/extraction/images/text_line/test-page_2-line_3.jpg differ
diff --git a/tests/data/extraction/images/text_line/train-page_1-line_1.jpg b/tests/data/extraction/images/text_line/train-page_1-line_1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6768c8183e841c5326facfce7bb4b1bdf5801eff
Binary files /dev/null and b/tests/data/extraction/images/text_line/train-page_1-line_1.jpg differ
diff --git a/tests/data/extraction/images/text_line/train-page_1-line_2.jpg b/tests/data/extraction/images/text_line/train-page_1-line_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b787d7072945c314c0917c63b3329a78b12bbf97
Binary files /dev/null and b/tests/data/extraction/images/text_line/train-page_1-line_2.jpg differ
diff --git a/tests/data/extraction/images/text_line/train-page_1-line_3.jpg b/tests/data/extraction/images/text_line/train-page_1-line_3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b8ae4811a419bb25d27bd1c684522eeb71e314fa
Binary files /dev/null and b/tests/data/extraction/images/text_line/train-page_1-line_3.jpg differ
diff --git a/tests/data/extraction/images/text_line/train-page_1-line_4.jpg b/tests/data/extraction/images/text_line/train-page_1-line_4.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..41ee3f51abf4926f2d9c930c7fb3e339171687bc
Binary files /dev/null and b/tests/data/extraction/images/text_line/train-page_1-line_4.jpg differ
diff --git a/tests/data/extraction/images/text_line/train-page_2-line_1.jpg b/tests/data/extraction/images/text_line/train-page_2-line_1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..a1d15adc504778437929c9e3f244b5b6bcacde77
Binary files /dev/null and b/tests/data/extraction/images/text_line/train-page_2-line_1.jpg differ
diff --git a/tests/data/extraction/images/text_line/train-page_2-line_2.jpg b/tests/data/extraction/images/text_line/train-page_2-line_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..bc7ff4c07f9f6d4765f1d145e6b6e4544a4c31b1
Binary files /dev/null and b/tests/data/extraction/images/text_line/train-page_2-line_2.jpg differ
diff --git a/tests/data/extraction/images/text_line/train-page_2-line_3.jpg b/tests/data/extraction/images/text_line/train-page_2-line_3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..90e737d16d6cf982cbfe25242985c0da7ee7bc6a
Binary files /dev/null and b/tests/data/extraction/images/text_line/train-page_2-line_3.jpg differ
diff --git a/tests/data/extraction/images/text_line/val-page_1-line_1.jpg b/tests/data/extraction/images/text_line/val-page_1-line_1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..5937da29f438c8e98494b29a5f549b0b398e3723
Binary files /dev/null and b/tests/data/extraction/images/text_line/val-page_1-line_1.jpg differ
diff --git a/tests/data/extraction/images/text_line/val-page_1-line_2.jpg b/tests/data/extraction/images/text_line/val-page_1-line_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..758e2c89b1cd4b37ba2c03a984b417970c384daa
Binary files /dev/null and b/tests/data/extraction/images/text_line/val-page_1-line_2.jpg differ
diff --git a/tests/data/extraction/images/text_line/val-page_1-line_3.jpg b/tests/data/extraction/images/text_line/val-page_1-line_3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..30e2e4319acdb5038fc1da831091018ff871c329
Binary files /dev/null and b/tests/data/extraction/images/text_line/val-page_1-line_3.jpg differ
diff --git a/tests/test_extract.py b/tests/test_extract.py
index 91def38987aa424107082b35f944e21647de07c1..88bf40bdf4be6cf40cec41fc0e2ae2b1e2dd2a9a 100644
--- a/tests/test_extract.py
+++ b/tests/test_extract.py
@@ -1,12 +1,15 @@
 # -*- coding: utf-8 -*-
 
 import json
+import logging
 import pickle
 import re
 from operator import methodcaller
 from typing import NamedTuple
+from unittest.mock import patch
 
 import pytest
+from PIL import Image, ImageChops
 
 from dan.datasets.extract.exceptions import NoEndTokenError
 from dan.datasets.extract.extract import ArkindexExtractor
@@ -243,7 +246,9 @@ def test_reconstruct_text_only_start_token(joined, text_before, text_after):
 @pytest.mark.parametrize(
     "transcription_entities_worker_version", ("worker_version_id", False)
 )
+@patch("dan.datasets.extract.extract.download_image")
 def test_extract(
+    mock_download_image,
     load_entities,
     keep_spaces,
     transcription_entities_worker_version,
@@ -261,7 +266,11 @@ def test_extract(
         if token
     ]
 
-    ArkindexExtractor(
+    def mock_build_image_url(image_url, *args, **kwargs):
+        # During tests, the image URL is its local path
+        return image_url
+
+    extractor = ArkindexExtractor(
         folders=["train", "val", "test"],
         element_type=["text_line"],
         parent_element_type="double_page",
@@ -273,10 +282,14 @@ def test_extract(
         entity_worker_version=transcription_entities_worker_version
         if load_entities
         else None,
-        cache_dir=EXTRACTION_DATA_PATH / "images" / "pages",
         keep_spaces=keep_spaces,
         image_extension=".jpg",
-    ).run()
+    )
+    # Mock build_image_url to simply return the path to the image
+    extractor.build_iiif_url = mock_build_image_url
+    # Mock download_image so that it simply opens it with Pillow
+    mock_download_image.side_effect = Image.open
+    extractor.run()
 
     # Check files
     IMAGE_DIR = output / "images"
@@ -287,24 +300,24 @@ def test_extract(
     expected_paths = [
         output / "charset.pkl",
         # Images of test folder
-        TEST_DIR / "text_line_test-page_1-line_1.jpg",
-        TEST_DIR / "text_line_test-page_1-line_2.jpg",
-        TEST_DIR / "text_line_test-page_1-line_3.jpg",
-        TEST_DIR / "text_line_test-page_2-line_1.jpg",
-        TEST_DIR / "text_line_test-page_2-line_2.jpg",
-        TEST_DIR / "text_line_test-page_2-line_3.jpg",
+        TEST_DIR / "test-page_1-line_1.jpg",
+        TEST_DIR / "test-page_1-line_2.jpg",
+        TEST_DIR / "test-page_1-line_3.jpg",
+        TEST_DIR / "test-page_2-line_1.jpg",
+        TEST_DIR / "test-page_2-line_2.jpg",
+        TEST_DIR / "test-page_2-line_3.jpg",
         # Images of train folder
-        TRAIN_DIR / "text_line_train-page_1-line_1.jpg",
-        TRAIN_DIR / "text_line_train-page_1-line_2.jpg",
-        TRAIN_DIR / "text_line_train-page_1-line_3.jpg",
-        TRAIN_DIR / "text_line_train-page_1-line_4.jpg",
-        TRAIN_DIR / "text_line_train-page_2-line_1.jpg",
-        TRAIN_DIR / "text_line_train-page_2-line_2.jpg",
-        TRAIN_DIR / "text_line_train-page_2-line_3.jpg",
+        TRAIN_DIR / "train-page_1-line_1.jpg",
+        TRAIN_DIR / "train-page_1-line_2.jpg",
+        TRAIN_DIR / "train-page_1-line_3.jpg",
+        TRAIN_DIR / "train-page_1-line_4.jpg",
+        TRAIN_DIR / "train-page_2-line_1.jpg",
+        TRAIN_DIR / "train-page_2-line_2.jpg",
+        TRAIN_DIR / "train-page_2-line_3.jpg",
         # Images of val folder
-        VAL_DIR / "text_line_val-page_1-line_1.jpg",
-        VAL_DIR / "text_line_val-page_1-line_2.jpg",
-        VAL_DIR / "text_line_val-page_1-line_3.jpg",
+        VAL_DIR / "val-page_1-line_1.jpg",
+        VAL_DIR / "val-page_1-line_2.jpg",
+        VAL_DIR / "val-page_1-line_3.jpg",
         output / "labels.json",
     ]
     assert sorted(filter(methodcaller("is_file"), output.rglob("*"))) == expected_paths
@@ -312,58 +325,26 @@ def test_extract(
     # Check "labels.json"
     expected_labels = {
         "test": {
-            str(
-                TEST_DIR / "text_line_test-page_1-line_1.jpg"
-            ): "ⓢCoupez  ⓕLouis  ⓑ7.12.14",
-            str(
-                TEST_DIR / "text_line_test-page_1-line_2.jpg"
-            ): "ⓢPoutrain  ⓕAdolphe  ⓑ9.4.13",
-            str(
-                TEST_DIR / "text_line_test-page_1-line_3.jpg"
-            ): "ⓢGabale  ⓕFrançois  ⓑ26.3.11",
-            str(
-                TEST_DIR / "text_line_test-page_2-line_1.jpg"
-            ): "ⓢDurosoy  ⓕLouis  ⓑ22-4-18",
-            str(
-                TEST_DIR / "text_line_test-page_2-line_2.jpg"
-            ): "ⓢColaiani  ⓕAngels  ⓑ28.11.17",
-            str(
-                TEST_DIR / "text_line_test-page_2-line_3.jpg"
-            ): "ⓢRenouard  ⓕMaurice  ⓑ25.7.04",
+            str(TEST_DIR / "test-page_1-line_1.jpg"): "ⓢCoupez  ⓕLouis  ⓑ7.12.14",
+            str(TEST_DIR / "test-page_1-line_2.jpg"): "ⓢPoutrain  ⓕAdolphe  ⓑ9.4.13",
+            str(TEST_DIR / "test-page_1-line_3.jpg"): "ⓢGabale  ⓕFrançois  ⓑ26.3.11",
+            str(TEST_DIR / "test-page_2-line_1.jpg"): "ⓢDurosoy  ⓕLouis  ⓑ22-4-18",
+            str(TEST_DIR / "test-page_2-line_2.jpg"): "ⓢColaiani  ⓕAngels  ⓑ28.11.17",
+            str(TEST_DIR / "test-page_2-line_3.jpg"): "ⓢRenouard  ⓕMaurice  ⓑ25.7.04",
         },
         "train": {
-            str(
-                TRAIN_DIR / "text_line_train-page_1-line_1.jpg"
-            ): "ⓢCaillet  ⓕMaurice  ⓑ28.9.06",
-            str(
-                TRAIN_DIR / "text_line_train-page_1-line_2.jpg"
-            ): "ⓢReboul  ⓕJean  ⓑ30.9.02",
-            str(
-                TRAIN_DIR / "text_line_train-page_1-line_3.jpg"
-            ): "ⓢBareyre  ⓕJean  ⓑ28.3.11",
-            str(
-                TRAIN_DIR / "text_line_train-page_1-line_4.jpg"
-            ): "ⓢRoussy  ⓕJean  ⓑ4.11.14",
-            str(
-                TRAIN_DIR / "text_line_train-page_2-line_1.jpg"
-            ): "ⓢMarin  ⓕMarcel  ⓑ10.8.06",
-            str(
-                TRAIN_DIR / "text_line_train-page_2-line_2.jpg"
-            ): "ⓢRoques  ⓕEloi  ⓑ11.10.04",
-            str(
-                TRAIN_DIR / "text_line_train-page_2-line_3.jpg"
-            ): "ⓢGiros  ⓕPaul  ⓑ30.10.10",
+            str(TRAIN_DIR / "train-page_1-line_1.jpg"): "ⓢCaillet  ⓕMaurice  ⓑ28.9.06",
+            str(TRAIN_DIR / "train-page_1-line_2.jpg"): "ⓢReboul  ⓕJean  ⓑ30.9.02",
+            str(TRAIN_DIR / "train-page_1-line_3.jpg"): "ⓢBareyre  ⓕJean  ⓑ28.3.11",
+            str(TRAIN_DIR / "train-page_1-line_4.jpg"): "ⓢRoussy  ⓕJean  ⓑ4.11.14",
+            str(TRAIN_DIR / "train-page_2-line_1.jpg"): "ⓢMarin  ⓕMarcel  ⓑ10.8.06",
+            str(TRAIN_DIR / "train-page_2-line_2.jpg"): "ⓢRoques  ⓕEloi  ⓑ11.10.04",
+            str(TRAIN_DIR / "train-page_2-line_3.jpg"): "ⓢGiros  ⓕPaul  ⓑ30.10.10",
         },
         "val": {
-            str(
-                VAL_DIR / "text_line_val-page_1-line_1.jpg"
-            ): "ⓢMonard  ⓕLouis  ⓑ29-7-04",
-            str(
-                VAL_DIR / "text_line_val-page_1-line_2.jpg"
-            ): "ⓢAstier  ⓕArthur  ⓑ11-2-13",
-            str(
-                VAL_DIR / "text_line_val-page_1-line_3.jpg"
-            ): "ⓢDe Vlieger  ⓕJules  ⓑ21-11-11",
+            str(VAL_DIR / "val-page_1-line_1.jpg"): "ⓢMonard  ⓕLouis  ⓑ29-7-04",
+            str(VAL_DIR / "val-page_1-line_2.jpg"): "ⓢAstier  ⓕArthur  ⓑ11-2-13",
+            str(VAL_DIR / "val-page_1-line_3.jpg"): "ⓢDe Vlieger  ⓕJules  ⓑ21-11-11",
         },
     }
 
@@ -407,6 +388,56 @@ def test_extract(
         if expected_path.suffix != ".jpg":
             continue
 
-        assert (
-            EXTRACTION_DATA_PATH / "images" / "lines" / expected_path.name
-        ).read_bytes() == expected_path.read_bytes()
+        assert ImageChops.difference(
+            Image.open(
+                EXTRACTION_DATA_PATH / "images" / "text_line" / expected_path.name
+            ),
+            Image.open(expected_path),
+        )
+
+
+@patch("dan.datasets.extract.extract.ArkindexExtractor.build_iiif_url")
+def test_download_image_error(iiif_url, caplog, capsys):
+    task = {
+        "split": "train",
+        "polygon": [],
+        "image_url": "deadbeef",
+        "destination": "/dev/null",
+    }
+    # Make download_image crash
+    iiif_url.return_value = task["image_url"]
+
+    extractor = ArkindexExtractor(
+        folders=["train", "val", "test"],
+        element_type=["text_line"],
+        parent_element_type="double_page",
+        output=None,
+        load_entities=False,
+        entity_separators=None,
+        tokens=None,
+        transcription_worker_version=None,
+        entity_worker_version=None,
+        keep_spaces=False,
+        image_extension=".jpg",
+    )
+
+    # Build a random task
+    extractor.tasks = [task]
+
+    # Add the key in data
+    extractor.data[task["split"]][task["destination"]] = "deadbeefdata"
+
+    extractor.download_images()
+
+    # Key should have been removed
+    assert task["destination"] not in extractor.data[task["split"]]
+
+    # Check error log
+    assert len(caplog.record_tuples) == 1
+    _, level, msg = caplog.record_tuples[0]
+    assert level == logging.ERROR
+    assert msg == "Failed to download 1 image(s)."
+
+    # Check stdout
+    captured = capsys.readouterr()
+    assert captured.out == "deadbeef: Image URL must be HTTP(S)\n"