Commit 80017b86 authored by Yoann Schneider, committed by Mélodie Boillet

Download images in subresolution using IIIF url

parent fa3013b4
1 merge request: !175 Download images in subresolution using IIIF url
@@ -146,4 +146,16 @@ def add_extract_parser(subcommands) -> None:
        help="Validation set split size.",
    )
+   parser.add_argument(
+       "--max-width",
+       type=int,
+       help="Images larger than this width will be resized to this width.",
+   )
+   parser.add_argument(
+       "--max-height",
+       type=int,
+       help="Images larger than this height will be resized to this height.",
+   )
    parser.set_defaults(func=run)
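As a sanity check, here is a minimal, self-contained sketch of how these two new options behave in isolation (only the two option definitions mirror the commit; the program name and values are illustrative):

```python
import argparse

# Minimal stand-in for the real extract subcommand parser: only the two
# options added by this commit are reproduced here.
parser = argparse.ArgumentParser(prog="teklia-dan dataset extract")
parser.add_argument(
    "--max-width",
    type=int,
    help="Images larger than this width will be resized to this width.",
)
parser.add_argument(
    "--max-height",
    type=int,
    help="Images larger than this height will be resized to this height.",
)

# Both options default to None when omitted; the extraction code then
# falls back to each element's own dimensions (see Element.__post_init__).
args = parser.parse_args(["--max-width", "2000"])
print(args.max_width, args.max_height)  # 2000 None
```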
# -*- coding: utf-8 -*-
import ast
from dataclasses import dataclass
from itertools import starmap
-from typing import List, NamedTuple, Union
+from typing import List, NamedTuple, Optional, Union
from urllib.parse import urljoin

from arkindex_export import Image
@@ -41,13 +42,21 @@ Entity = NamedTuple(
)

-class Element(NamedTuple):
+@dataclass
+class Element:
    id: str
    type: str
    polygon: str
    url: str
-   width: str
-   height: str
+   width: int
+   height: int
+   max_width: Optional[int] = None
+   max_height: Optional[int] = None
+
+   def __post_init__(self):
+       self.max_height = self.max_height or self.height
+       self.max_width = self.max_width or self.width

    @property
    def bounding_box(self):
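Switching `Element` from a `NamedTuple` to a `@dataclass` is what makes the `__post_init__` hook available, so unset maxima can fall back to the element's real dimensions. A trimmed-down sketch of that defaulting behaviour, with the field set reduced for brevity:

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class ElementSketch:
    # Only the size-related fields of the real Element are kept here.
    width: int
    height: int
    max_width: Optional[int] = None
    max_height: Optional[int] = None

    def __post_init__(self):
        # An unset maximum defaults to the element's own dimension, so the
        # IIIF size request never asks for more pixels than the original has.
        self.max_height = self.max_height or self.height
        self.max_width = self.max_width or self.width

print(ElementSketch(width=800, height=600).max_width)                 # 800
print(ElementSketch(width=800, height=600, max_width=400).max_width)  # 400
```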
@@ -56,10 +65,18 @@ class Element(NamedTuple):
    @property
    def image_url(self):
        x, y, width, height = self.bounding_box
-       return urljoin(self.url + "/", f"{x},{y},{width},{height}/full/0/default.jpg")
+       return urljoin(
+           self.url + "/",
+           f"{x},{y},{width},{height}/!{self.max_width},{self.max_height}/0/default.jpg",
+       )

-def get_elements(parent_id: str, element_type: str) -> List[Element]:
+def get_elements(
+   parent_id: str,
+   element_type: str,
+   max_width: Optional[int] = None,
+   max_height: Optional[int] = None,
+) -> List[Element]:
    """
    Retrieve elements from an SQLite export of an Arkindex corpus
    """
@@ -77,10 +94,9 @@ def get_elements(parent_id: str, element_type: str) -> List[Element]:
            Image.height,
        )
    )
    return list(
        starmap(
-           Element,
+           lambda *x: Element(*x, max_width=max_width, max_height=max_height),
            query.tuples(),
        )
    )
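Because `query.tuples()` yields one plain tuple per row, `starmap` unpacks each row into positional arguments; the lambda then appends the same `max_width`/`max_height` keywords to every `Element` it builds. The pattern in isolation, with invented row data:

```python
from itertools import starmap

def build(a, b, scale=1):
    return (a * scale, b * scale)

rows = [(1, 2), (3, 4)]  # stand-in for query.tuples()

# starmap calls the lambda with each row unpacked positionally, while the
# closure supplies the same keyword argument to every call.
print(list(starmap(lambda *x: build(*x, scale=10), rows)))
# [(10, 20), (30, 40)]
```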
@@ -3,7 +3,7 @@
import random
from collections import defaultdict
from pathlib import Path
-from typing import List, Union
+from typing import List, Optional, Union
from uuid import UUID

from arkindex_export import open_database
@@ -56,6 +56,8 @@ class ArkindexExtractor:
        entity_worker_version: str = None,
        train_prob: float = None,
        val_prob: float = None,
+       max_width: Optional[int] = None,
+       max_height: Optional[int] = None,
    ) -> None:
        self.element_type = element_type
        self.parent_element_type = parent_element_type
@@ -67,6 +69,8 @@ class ArkindexExtractor:
        self.entity_worker_version = entity_worker_version
        self.train_prob = train_prob
        self.val_prob = val_prob
+       self.max_width = max_width
+       self.max_height = max_height
        self.subsets = self.get_subsets(folders)
@@ -179,7 +183,12 @@ class ArkindexExtractor:
        # Extract children elements
        else:
            for element_type in self.element_type:
-               for element in get_elements(parent.id, element_type):
+               for element in get_elements(
+                   parent.id,
+                   element_type,
+                   max_width=self.max_width,
+                   max_height=self.max_height,
+               ):
                    try:
                        data[element_type].append(self.process_element(element, split))
                    except ProcessingError as e:
@@ -192,7 +201,12 @@ class ArkindexExtractor:
        for idx, subset in enumerate(self.subsets, start=1):
            # Iterate over the pages to create splits at page level.
            for parent in tqdm(
-               get_elements(subset.id, self.parent_element_type),
+               get_elements(
+                   subset.id,
+                   self.parent_element_type,
+                   max_width=self.max_width,
+                   max_height=self.max_height,
+               ),
                desc=f"Processing {subset} {idx}/{len(self.subsets)}",
            ):
                split = subset.split or self.get_random_split()
@@ -219,6 +233,8 @@ def run(
    entity_worker_version: Union[str, bool],
    train_prob,
    val_prob,
+   max_width: Optional[int],
+   max_height: Optional[int],
):
    assert (
        use_existing_split or parent
@@ -265,4 +281,6 @@ def run(
        entity_worker_version=entity_worker_version,
        train_prob=train_prob,
        val_prob=val_prob,
+       max_width=max_width,
+       max_height=max_height,
    ).run()
@@ -21,6 +21,8 @@ Use the `teklia-dan dataset extract` command to extract a dataset from an Arkindex export database
| `--entity-worker-version` | Filter transcription entities by worker_version. Use `manual` for manual filtering | `str|uuid` | |
| `--train-prob` | Training set split size | `float` | `0.7` |
| `--val-prob` | Validation set split size | `float` | `0.15` |
+| `--max-width` | Images larger than this width will be resized to this width | `int` | |
+| `--max-height` | Images larger than this height will be resized to this height | `int` | |
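
For example, passing `--max-width 2000 --max-height 2000` (values chosen here purely for illustration) makes the extractor request every image through IIIF at a size fitting within a 2000×2000 box instead of at full resolution.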
The `--tokens` argument expects a YAML-formatted file listing the NER entities: each entry is keyed by the entity label and maps to a dict giving the entity's starting and ending tokens respectively.
```yaml
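# Illustrative sketch only: these labels and tokens are invented examples,
# not values taken from this commit.
INTITULE:
  start: ⓘ
  end: Ⓘ
DATE:
  start: ⓓ
  end: Ⓓ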
```