From 80017b86128e3ae12e1eccfdd96b31620ad0e9e7 Mon Sep 17 00:00:00 2001
From: Yoann Schneider <yschneider@teklia.com>
Date: Mon, 26 Jun 2023 10:03:42 +0000
Subject: [PATCH] Download images in sub-resolution using the IIIF URL

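Build the element image URL with the IIIF size parameter `!w,h`, which
asks the server to scale the requested region to the best fit within
the given width and height while preserving its aspect ratio, instead
of always requesting the `full` size. Both maximums default to the
element's own dimensions when --max-width and --max-height are not set.

For illustration (hypothetical IIIF endpoint and region shown), with
--max-width 2000 --max-height 1000 a generated URL looks like:

    https://iiif.example.com/iiif/element/0,0,4000,3000/!2000,1000/0/default.jpg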
---
 dan/datasets/extract/__init__.py | 12 ++++++++++++
 dan/datasets/extract/db.py       | 34 ++++++++++++++++++++++++++--------
 dan/datasets/extract/extract.py  | 24 +++++++++++++++++++++---
 docs/usage/datasets/extract.md   |  4 ++++
 4 files changed, 63 insertions(+), 11 deletions(-)

diff --git a/dan/datasets/extract/__init__.py b/dan/datasets/extract/__init__.py
index 3ac33aea..ea0a758c 100644
--- a/dan/datasets/extract/__init__.py
+++ b/dan/datasets/extract/__init__.py
@@ -146,4 +146,16 @@ def add_extract_parser(subcommands) -> None:
         help="Validation set split size.",
     )
 
+    parser.add_argument(
+        "--max-width",
+        type=int,
+        help="Images larger than this width will be resized to this width.",
+    )
+
+    parser.add_argument(
+        "--max-height",
+        type=int,
+        help="Images larger than this height will be resized to this width.",
+    )
+
     parser.set_defaults(func=run)
diff --git a/dan/datasets/extract/db.py b/dan/datasets/extract/db.py
index b77769c8..86a6caac 100644
--- a/dan/datasets/extract/db.py
+++ b/dan/datasets/extract/db.py
@@ -1,8 +1,9 @@
 # -*- coding: utf-8 -*-
 
 import ast
+from dataclasses import dataclass
 from itertools import starmap
-from typing import List, NamedTuple, Union
+from typing import List, NamedTuple, Optional, Union
 from urllib.parse import urljoin
 
 from arkindex_export import Image
@@ -41,13 +42,22 @@ Entity = NamedTuple(
 )
 
 
-class Element(NamedTuple):
+@dataclass
+class Element:
     id: str
     type: str
     polygon: str
     url: str
-    width: str
-    height: str
+    width: int
+    height: int
+
+    max_width: Optional[int] = None
+    max_height: Optional[int] = None
+
+    def __post_init__(self):
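+        # Fall back to the element's own dimensions when no maximum is given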
+        self.max_height = self.max_height or self.height
+        self.max_width = self.max_width or self.width
 
     @property
     def bounding_box(self):
@@ -56,10 +66,19 @@ class Element(NamedTuple):
     @property
     def image_url(self):
         x, y, width, height = self.bounding_box
-        return urljoin(self.url + "/", f"{x},{y},{width},{height}/full/0/default.jpg")
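+        # `!w,h` asks the IIIF server for the best fit within w,h, preserving the aspect ratio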
+        return urljoin(
+            self.url + "/",
+            f"{x},{y},{width},{height}/!{self.max_width},{self.max_height}/0/default.jpg",
+        )
 
 
-def get_elements(parent_id: str, element_type: str) -> List[Element]:
+def get_elements(
+    parent_id: str,
+    element_type: str,
+    max_width: Optional[int] = None,
+    max_height: Optional[int] = None,
+) -> List[Element]:
     """
     Retrieve elements from an SQLite export of an Arkindex corpus
     """
@@ -77,10 +96,9 @@ def get_elements(parent_id: str, element_type: str) -> List[Element]:
             Image.height,
         )
     )
-
     return list(
         starmap(
-            Element,
+            lambda *x: Element(*x, max_width=max_width, max_height=max_height),
             query.tuples(),
         )
     )
diff --git a/dan/datasets/extract/extract.py b/dan/datasets/extract/extract.py
index 60b90e93..d35b08d0 100644
--- a/dan/datasets/extract/extract.py
+++ b/dan/datasets/extract/extract.py
@@ -3,7 +3,7 @@
 import random
 from collections import defaultdict
 from pathlib import Path
-from typing import List, Union
+from typing import List, Optional, Union
 from uuid import UUID
 
 from arkindex_export import open_database
@@ -56,6 +56,8 @@ class ArkindexExtractor:
         entity_worker_version: str = None,
         train_prob: float = None,
         val_prob: float = None,
+        max_width: Optional[int] = None,
+        max_height: Optional[int] = None,
     ) -> None:
         self.element_type = element_type
         self.parent_element_type = parent_element_type
@@ -67,6 +69,8 @@ class ArkindexExtractor:
         self.entity_worker_version = entity_worker_version
         self.train_prob = train_prob
         self.val_prob = val_prob
+        self.max_width = max_width
+        self.max_height = max_height
 
         self.subsets = self.get_subsets(folders)
 
@@ -179,7 +183,12 @@ class ArkindexExtractor:
         # Extract children elements
         else:
             for element_type in self.element_type:
-                for element in get_elements(parent.id, element_type):
+                for element in get_elements(
+                    parent.id,
+                    element_type,
+                    max_width=self.max_width,
+                    max_height=self.max_height,
+                ):
                     try:
                         data[element_type].append(self.process_element(element, split))
                     except ProcessingError as e:
@@ -192,7 +201,12 @@ class ArkindexExtractor:
         for idx, subset in enumerate(self.subsets, start=1):
             # Iterate over the pages to create splits at page level.
             for parent in tqdm(
-                get_elements(subset.id, self.parent_element_type),
+                get_elements(
+                    subset.id,
+                    self.parent_element_type,
+                    max_width=self.max_width,
+                    max_height=self.max_height,
+                ),
                 desc=f"Processing {subset} {idx}/{len(self.subsets)}",
             ):
                 split = subset.split or self.get_random_split()
@@ -219,6 +233,8 @@ def run(
     entity_worker_version: Union[str, bool],
     train_prob,
     val_prob,
+    max_width: Optional[int],
+    max_height: Optional[int],
 ):
     assert (
         use_existing_split or parent
@@ -265,4 +281,6 @@ def run(
         entity_worker_version=entity_worker_version,
         train_prob=train_prob,
         val_prob=val_prob,
+        max_width=max_width,
+        max_height=max_height,
     ).run()
diff --git a/docs/usage/datasets/extract.md b/docs/usage/datasets/extract.md
index 907a1c39..3c6a6cf2 100644
--- a/docs/usage/datasets/extract.md
+++ b/docs/usage/datasets/extract.md
@@ -21,6 +21,10 @@ Use the `teklia-dan dataset extract` command to extract a dataset from an Arkind
 | `--entity-worker-version`        | Filter transcriptions entities by worker_version. Use `manual` for manual filtering | `str|uuid` |         |
 | `--train-prob`                   | Training set split size                                                             | `float`    | `0.7`     |
 | `--val-prob`                     | Validation set split size                                                           | `float`    | `0.15`    |
+| `--max-width`                    | Images larger than this width will be resized to this width                         | `int`      |         |
+| `--max-height`                   | Images larger than this height will be resized to this height                       | `int`      |         |
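+
+For example, with `--max-width 2000 --max-height 2000`, images are requested with the IIIF size parameter `!2000,2000` and are downloaded scaled to fit within a 2000x2000 pixel box, keeping their aspect ratio.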
 
 The `--tokens` argument expects a YAML-formatted file with a specific format. A list of entries with each entry describing a NER entity. The label of the entity is the key to a dict mapping the starting and ending tokens respectively.
 ```yaml
-- 
GitLab