Skip to content
Snippets Groups Projects
Commit 4347bfd8 authored by Yoann Schneider's avatar Yoann Schneider :tennis: Committed by Mélodie
Browse files

Download images in subresolution using IIIF url

parent 5fe511cf
No related branches found
No related tags found
No related merge requests found
...@@ -146,4 +146,16 @@ def add_extract_parser(subcommands) -> None: ...@@ -146,4 +146,16 @@ def add_extract_parser(subcommands) -> None:
help="Validation set split size.", help="Validation set split size.",
) )
parser.add_argument(
"--max-width",
type=int,
help="Images larger than this width will be resized to this width.",
)
parser.add_argument(
"--max-height",
type=int,
    help="Images larger than this height will be resized to this height.",
)
parser.set_defaults(func=run) parser.set_defaults(func=run)
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import ast import ast
from dataclasses import dataclass
from itertools import starmap from itertools import starmap
from typing import List, NamedTuple, Union from typing import List, NamedTuple, Optional, Union
from urllib.parse import urljoin from urllib.parse import urljoin
from arkindex_export import Image from arkindex_export import Image
...@@ -41,13 +42,21 @@ Entity = NamedTuple( ...@@ -41,13 +42,21 @@ Entity = NamedTuple(
) )
class Element(NamedTuple): @dataclass
class Element:
id: str id: str
type: str type: str
polygon: str polygon: str
url: str url: str
width: str width: int
height: str height: int
max_width: Optional[int] = None
max_height: Optional[int] = None
def __post_init__(self):
self.max_height = self.max_height or self.height
self.max_width = self.max_width or self.width
@property @property
def bounding_box(self): def bounding_box(self):
...@@ -56,10 +65,18 @@ class Element(NamedTuple): ...@@ -56,10 +65,18 @@ class Element(NamedTuple):
@property @property
def image_url(self): def image_url(self):
x, y, width, height = self.bounding_box x, y, width, height = self.bounding_box
return urljoin(self.url + "/", f"{x},{y},{width},{height}/full/0/default.jpg") return urljoin(
self.url + "/",
f"{x},{y},{width},{height}/!{self.max_width},{self.max_height}/0/default.jpg",
)
def get_elements(parent_id: str, element_type: str) -> List[Element]: def get_elements(
parent_id: str,
element_type: str,
max_width: Optional[int] = None,
max_height: Optional[int] = None,
) -> List[Element]:
""" """
Retrieve elements from an SQLite export of an Arkindex corpus Retrieve elements from an SQLite export of an Arkindex corpus
""" """
...@@ -77,10 +94,9 @@ def get_elements(parent_id: str, element_type: str) -> List[Element]: ...@@ -77,10 +94,9 @@ def get_elements(parent_id: str, element_type: str) -> List[Element]:
Image.height, Image.height,
) )
) )
return list( return list(
starmap( starmap(
Element, lambda *x: Element(*x, max_width=max_width, max_height=max_height),
query.tuples(), query.tuples(),
) )
) )
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
import random import random
from collections import defaultdict from collections import defaultdict
from pathlib import Path from pathlib import Path
from typing import List, Union from typing import List, Optional, Union
from uuid import UUID from uuid import UUID
from arkindex_export import open_database from arkindex_export import open_database
...@@ -56,6 +56,8 @@ class ArkindexExtractor: ...@@ -56,6 +56,8 @@ class ArkindexExtractor:
entity_worker_version: str = None, entity_worker_version: str = None,
train_prob: float = None, train_prob: float = None,
val_prob: float = None, val_prob: float = None,
max_width: Optional[int] = None,
max_height: Optional[int] = None,
) -> None: ) -> None:
self.element_type = element_type self.element_type = element_type
self.parent_element_type = parent_element_type self.parent_element_type = parent_element_type
...@@ -67,6 +69,8 @@ class ArkindexExtractor: ...@@ -67,6 +69,8 @@ class ArkindexExtractor:
self.entity_worker_version = entity_worker_version self.entity_worker_version = entity_worker_version
self.train_prob = train_prob self.train_prob = train_prob
self.val_prob = val_prob self.val_prob = val_prob
self.max_width = max_width
self.max_height = max_height
self.subsets = self.get_subsets(folders) self.subsets = self.get_subsets(folders)
...@@ -179,7 +183,12 @@ class ArkindexExtractor: ...@@ -179,7 +183,12 @@ class ArkindexExtractor:
# Extract children elements # Extract children elements
else: else:
for element_type in self.element_type: for element_type in self.element_type:
for element in get_elements(parent.id, element_type): for element in get_elements(
parent.id,
element_type,
max_width=self.max_width,
max_height=self.max_height,
):
try: try:
data[element_type].append(self.process_element(element, split)) data[element_type].append(self.process_element(element, split))
except ProcessingError as e: except ProcessingError as e:
...@@ -192,7 +201,12 @@ class ArkindexExtractor: ...@@ -192,7 +201,12 @@ class ArkindexExtractor:
for idx, subset in enumerate(self.subsets, start=1): for idx, subset in enumerate(self.subsets, start=1):
# Iterate over the pages to create splits at page level. # Iterate over the pages to create splits at page level.
for parent in tqdm( for parent in tqdm(
get_elements(subset.id, self.parent_element_type), get_elements(
subset.id,
self.parent_element_type,
max_width=self.max_width,
max_height=self.max_height,
),
desc=f"Processing {subset} {idx}/{len(self.subsets)}", desc=f"Processing {subset} {idx}/{len(self.subsets)}",
): ):
split = subset.split or self.get_random_split() split = subset.split or self.get_random_split()
...@@ -219,6 +233,8 @@ def run( ...@@ -219,6 +233,8 @@ def run(
entity_worker_version: Union[str, bool], entity_worker_version: Union[str, bool],
train_prob, train_prob,
val_prob, val_prob,
max_width: Optional[int],
max_height: Optional[int],
): ):
assert ( assert (
use_existing_split or parent use_existing_split or parent
...@@ -265,4 +281,6 @@ def run( ...@@ -265,4 +281,6 @@ def run(
entity_worker_version=entity_worker_version, entity_worker_version=entity_worker_version,
train_prob=train_prob, train_prob=train_prob,
val_prob=val_prob, val_prob=val_prob,
max_width=max_width,
max_height=max_height,
).run() ).run()
...@@ -21,6 +21,8 @@ Use the `teklia-dan dataset extract` command to extract a dataset from an Arkind ...@@ -21,6 +21,8 @@ Use the `teklia-dan dataset extract` command to extract a dataset from an Arkind
| `--entity-worker-version` | Filter transcriptions entities by worker_version. Use `manual` for manual filtering | `str|uuid` | | | `--entity-worker-version` | Filter transcriptions entities by worker_version. Use `manual` for manual filtering | `str|uuid` | |
| `--train-prob` | Training set split size | `float` | `0.7` | | `--train-prob` | Training set split size | `float` | `0.7` |
| `--val-prob` | Validation set split size | `float` | `0.15` | | `--val-prob` | Validation set split size | `float` | `0.15` |
| `--max-width` | Images larger than this width will be resized to this width. | `int` | |
| `--max-height` | Images larger than this height will be resized to this height. | `int` | |
The `--tokens` argument expects a YAML-formatted file with a specific format. A list of entries with each entry describing a NER entity. The label of the entity is the key to a dict mapping the starting and ending tokens respectively. The `--tokens` argument expects a YAML-formatted file with a specific format. A list of entries with each entry describing a NER entity. The label of the entity is the key to a dict mapping the starting and ending tokens respectively.
```yaml ```yaml
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment