Skip to content
Snippets Groups Projects

Skip lines where page could not be downloaded

Merged Yoann Schneider requested to merge skip-failed-pages into master
Files
2
@@ -7,7 +7,7 @@ from pathlib import Path
from typing import Any, Dict
import numpy as np
from arkindex_export import Dataset, Element, open_database
from arkindex_export import Dataset, Element, Transcription, open_database
from line_image_extractor.extractor import extract, read_img, save_img
from line_image_extractor.image_utils import polygon_to_bbox, resize
from PIL import Image
@@ -134,6 +134,19 @@ class DataGenerator:
def parse_image_path(self, image_path: Path) -> str:
    """
    Return the string form of an image path.

    The returned value is what callers use as the key under which a
    transcription is stored.

    :param image_path: Location of the extracted image.
    :return: ``str(image_path)``.
    """
    path_as_text = str(image_path)
    return path_as_text
def process_child(self, child: Transcription, split: str, image_path: Path) -> None:
    """
    Extract the image of one transcription and record its text.

    The image is produced through ``self.get_image``. If that call raises,
    the transcription is skipped: a warning is logged and nothing is
    written to ``self.data``.

    :param child: Transcription whose ``element`` image is extracted and
        whose ``text`` is stored.
    :param split: Key of ``self.data`` under which the entry is stored;
        also forwarded to ``self.parse_transcription``.
    :param image_path: Destination path handed to ``self.get_image``.
    """
    try:
        # Extract the image
        self.get_image(child.element, image_path)
    except Exception as e:
        # Best effort: one failing image must not abort the whole run.
        # `Logger.warn` is a deprecated alias, so use `Logger.warning`.
        logger.warning(f"Skipping element ({child.id}): {e}")
        return

    # Store the transcription only once the image was successfully extracted
    self.data[split][self.parse_image_path(image_path)] = self.parse_transcription(
        child.text, split=split
    )
def process_parent(self, parent: Element, split: str):
"""
Process every child under this parent element.
@@ -151,17 +164,7 @@ class DataGenerator:
/ "images"
/ f"{parent.id}_{child.element.name.split('_')[-1]}_{child.element_id}.jpg"
)
try:
# Extract the image
self.get_image(child.element, image_path)
except Exception as e:
logger.warn(f"Skipping element ({child.id}): {e}")
continue
# Store transcription
self.data[split][
self.parse_image_path(image_path)
] = self.parse_transcription(child.text, split=split)
self.process_child(child=child, split=split, image_path=image_path)
def export(self):
"""
Loading