Skip to content
Snippets Groups Projects

Skip lines where page could not be downloaded

Merged Yoann Schneider requested to merge skip-failed-pages into master
Files
2
@@ -7,7 +7,7 @@ from pathlib import Path
from typing import Any, Dict
import numpy as np
from arkindex_export import Dataset, Element, open_database
from arkindex_export import Dataset, Element, Transcription, open_database
from line_image_extractor.extractor import extract, read_img, save_img
from line_image_extractor.image_utils import polygon_to_bbox, resize
from PIL import Image
@@ -107,7 +107,9 @@ class DataGenerator:
bbox = polygon_to_bbox(polygon)
# Skip if line is vertical and these are forbidden
if self.filter.skip_vertical_lines and _is_vertical(bbox):
return
raise ValueError(
"Image is vertical and current configuration forbids these."
)
# Extract the polygon in the image
image = extract(
@@ -129,11 +131,30 @@ class DataGenerator:
)
)
# Save the image to disk
save_img(path=destination, img=image)
status = save_img(path=destination, img=image)
if not status:
raise ValueError("Image download failed.")
def parse_image_path(self, image_path: Path):
    """
    Return the string form of the given image path.

    The returned value is what `process_child` uses as the key under which
    a transcription is stored in `self.data[split]`.
    """
    path_as_text = str(image_path)
    return path_as_text
def process_child(self, child: Transcription, split: str, image_path: Path) -> None:
    """
    Fetch the image of a child and record its transcription.

    The image is retrieved first through `self.get_image`. If that call
    raises, a warning mentioning the child's id is logged and nothing is
    written to `self.data` for this child. Otherwise the parsed
    transcription is stored in `self.data[split]`, keyed by the parsed
    image path.
    """
    try:
        self.get_image(child.element, image_path)
    except Exception as e:
        # Deliberate best-effort: a failing image must not abort the whole
        # run, so the child is reported and left out of the data.
        logger.warning(f"Skipping element ({child.id}): {e}")
    else:
        # Parse the text before resolving the key, keeping the same
        # evaluation order as a single subscript assignment would.
        transcription = self.parse_transcription(child.text, split=split)
        self.data[split][self.parse_image_path(image_path)] = transcription
def process_parent(self, parent: Element, split: str):
"""
Process every children under this parent element.
@@ -151,13 +172,7 @@ class DataGenerator:
/ "images"
/ f"{parent.id}_{child.element.name.split('_')[-1]}_{child.element_id}.jpg"
)
# Store transcription
self.data[split][
self.parse_image_path(image_path)
] = self.parse_transcription(child.text, split=split)
# Extract the image
self.get_image(child.element, image_path)
self.process_child(child=child, split=split, image_path=image_path)
def export(self):
"""
Loading