diff --git a/dan/datasets/extract/__init__.py b/dan/datasets/extract/__init__.py
index 3ac33aea608f8e2e3e8907e2fd20e41e393bf4c7..ea0a758c8352f0bbb9a8afb000c7e9fcc4aa17ba 100644
--- a/dan/datasets/extract/__init__.py
+++ b/dan/datasets/extract/__init__.py
@@ -146,4 +146,16 @@ def add_extract_parser(subcommands) -> None:
         help="Validation set split size.",
     )
 
+    parser.add_argument(
+        "--max-width",
+        type=int,
+        help="Images larger than this width will be resized to this width.",
+    )
+
+    parser.add_argument(
+        "--max-height",
+        type=int,
+        help="Images larger than this height will be resized to this height.",
+    )
+
     parser.set_defaults(func=run)
diff --git a/dan/datasets/extract/db.py b/dan/datasets/extract/db.py
index b77769c81247c40761a75490bd4e0da69eca1bca..86a6caac4bef9014218d50c0936c87c7b5712462 100644
--- a/dan/datasets/extract/db.py
+++ b/dan/datasets/extract/db.py
@@ -1,8 +1,9 @@
 # -*- coding: utf-8 -*-
 import ast
+from dataclasses import dataclass
 from itertools import starmap
-from typing import List, NamedTuple, Union
+from typing import List, NamedTuple, Optional, Union
 from urllib.parse import urljoin
 
 from arkindex_export import Image
 
@@ -41,13 +42,21 @@ Entity = NamedTuple(
 )
 
 
-class Element(NamedTuple):
+@dataclass
+class Element:
     id: str
     type: str
     polygon: str
     url: str
-    width: str
-    height: str
+    width: int
+    height: int
+
+    max_width: Optional[int] = None
+    max_height: Optional[int] = None
+
+    def __post_init__(self):
+        self.max_height = self.max_height or self.height
+        self.max_width = self.max_width or self.width
 
     @property
     def bounding_box(self):
@@ -56,10 +65,18 @@ class Element(NamedTuple):
     @property
     def image_url(self):
         x, y, width, height = self.bounding_box
-        return urljoin(self.url + "/", f"{x},{y},{width},{height}/full/0/default.jpg")
+        return urljoin(
+            self.url + "/",
+            f"{x},{y},{width},{height}/!{self.max_width},{self.max_height}/0/default.jpg",
+        )
 
 
-def get_elements(parent_id: str, element_type: str) -> List[Element]:
+def get_elements(
+    parent_id: str,
+    element_type: str,
+    max_width: Optional[int] = None,
+    max_height: Optional[int] = None,
+) -> List[Element]:
     """
     Retrieve elements from an SQLite export of an Arkindex corpus
     """
@@ -77,10 +94,9 @@ def get_elements(parent_id: str, element_type: str) -> List[Element]:
             Image.height,
         )
     )
-
     return list(
         starmap(
-            Element,
+            lambda *x: Element(*x, max_width=max_width, max_height=max_height),
             query.tuples(),
         )
     )
diff --git a/dan/datasets/extract/extract.py b/dan/datasets/extract/extract.py
index 60b90e933d24831b1a662c2c350123d3525178d0..d35b08d0400192bd31d6454e87e9b5687f246e08 100644
--- a/dan/datasets/extract/extract.py
+++ b/dan/datasets/extract/extract.py
@@ -3,7 +3,7 @@
 import random
 from collections import defaultdict
 from pathlib import Path
-from typing import List, Union
+from typing import List, Optional, Union
 from uuid import UUID
 
 from arkindex_export import open_database
@@ -56,6 +56,8 @@ class ArkindexExtractor:
         entity_worker_version: str = None,
         train_prob: float = None,
         val_prob: float = None,
+        max_width: Optional[int] = None,
+        max_height: Optional[int] = None,
     ) -> None:
         self.element_type = element_type
         self.parent_element_type = parent_element_type
@@ -67,6 +69,8 @@ class ArkindexExtractor:
         self.entity_worker_version = entity_worker_version
         self.train_prob = train_prob
         self.val_prob = val_prob
+        self.max_width = max_width
+        self.max_height = max_height
 
         self.subsets = self.get_subsets(folders)
 
@@ -179,7 +183,12 @@ class ArkindexExtractor:
         # Extract children elements
         else:
             for element_type in self.element_type:
-                for element in get_elements(parent.id, element_type):
+                for element in get_elements(
+                    parent.id,
+                    element_type,
+                    max_width=self.max_width,
+                    max_height=self.max_height,
+                ):
                     try:
                         data[element_type].append(self.process_element(element, split))
                     except ProcessingError as e:
@@ -192,7 +201,12 @@ class ArkindexExtractor:
         for idx, subset in enumerate(self.subsets, start=1):
             # Iterate over the pages to create splits at page level.
             for parent in tqdm(
-                get_elements(subset.id, self.parent_element_type),
+                get_elements(
+                    subset.id,
+                    self.parent_element_type,
+                    max_width=self.max_width,
+                    max_height=self.max_height,
+                ),
                 desc=f"Processing {subset} {idx}/{len(self.subsets)}",
             ):
                 split = subset.split or self.get_random_split()
@@ -219,6 +233,8 @@ def run(
     entity_worker_version: Union[str, bool],
     train_prob,
     val_prob,
+    max_width: Optional[int],
+    max_height: Optional[int],
 ):
     assert (
         use_existing_split or parent
@@ -265,4 +281,6 @@ def run(
         entity_worker_version=entity_worker_version,
         train_prob=train_prob,
         val_prob=val_prob,
+        max_width=max_width,
+        max_height=max_height,
     ).run()
diff --git a/docs/usage/datasets/extract.md b/docs/usage/datasets/extract.md
index 907a1c39a2aa0db62063ae636bc7b919797db9ce..3c6a6cf2428517daa910d118c64d75aac7d4bc7f 100644
--- a/docs/usage/datasets/extract.md
+++ b/docs/usage/datasets/extract.md
@@ -21,6 +21,8 @@ Use the `teklia-dan dataset extract` command to extract a dataset from an Arkind
 | `--entity-worker-version` | Filter transcriptions entities by worker_version. Use `manual` for manual filtering | `str|uuid` | |
 | `--train-prob` | Training set split size | `float` | `0.7` |
 | `--val-prob` | Validation set split size | `float` | `0.15` |
+| `--max-width` | Images larger than this width will be resized to this width. | `int` | |
+| `--max-height` | Images larger than this height will be resized to this height. | `int` | |
 
 The `--tokens` argument expects a YAML-formatted file with a specific format. A list of entries with each entry describing a NER entity. The label of the entity is the key to a dict mapping the starting and ending tokens respectively.
 
 ```yaml
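For context, here is a minimal, self-contained sketch of the new `Element.image_url` behavior introduced by this patch. The `bounding_box` property below is a simplified stand-in (the real helper is not shown in this diff), and the host name and element values are invented for illustration. In IIIF Image API size syntax, `!w,h` asks the server to scale the requested region to fit within a `w`×`h` box while preserving the aspect ratio; when `max_width`/`max_height` are not given, `__post_init__` falls back to the element's own dimensions, so the request stays at full size.

```python
from dataclasses import dataclass
from typing import Optional
from urllib.parse import urljoin
import ast


@dataclass
class Element:
    id: str
    type: str
    polygon: str
    url: str
    width: int
    height: int
    max_width: Optional[int] = None
    max_height: Optional[int] = None

    def __post_init__(self):
        # Unset bounds fall back to the element's own dimensions,
        # so the URL still requests the region at full size.
        self.max_height = self.max_height or self.height
        self.max_width = self.max_width or self.width

    @property
    def bounding_box(self):
        # Simplified stand-in for the real helper: axis-aligned
        # bounding box of the polygon's points.
        points = ast.literal_eval(self.polygon)
        xs, ys = zip(*points)
        return min(xs), min(ys), max(xs) - min(xs), max(ys) - min(ys)

    @property
    def image_url(self):
        x, y, width, height = self.bounding_box
        return urljoin(
            self.url + "/",
            f"{x},{y},{width},{height}/!{self.max_width},{self.max_height}/0/default.jpg",
        )


# Hypothetical element: a 2000x1000 region, capped at 800px wide.
element = Element(
    id="element-id",
    type="text_line",
    polygon="[[0, 0], [2000, 0], [2000, 1000], [0, 1000]]",
    url="https://iiif.example.com/image",
    width=2000,
    height=1000,
    max_width=800,
)
# `!800,1000` fits the 2000x1000 crop into an 800x1000 box while
# preserving aspect ratio, so the server returns it at 800x400.
print(element.image_url)
# https://iiif.example.com/image/0,0,2000,1000/!800,1000/0/default.jpg
```

The switch from `NamedTuple` to `@dataclass` is what makes this default-filling hook possible: named tuples are immutable and have no `__post_init__` step.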