Skip to content
Snippets Groups Projects

Draft: Resolve "support getting a certain element type's transcriptions and not just a page's"

1 file
+ 14
4
Compare changes
  • Side-by-side
  • Inline
@@ -38,7 +38,6 @@ MANUAL = "manual"
TEXT_LINE = "text_line"
WHITE = 255
DEFAULT_RESCALE = 1.0
ROTATION_CLASSES_TO_ANGLES = {
"rotate_0": 0,
"rotate_left_90": 90,
@@ -105,6 +104,7 @@ class HTRDataGenerator:
self.max_deskew_angle = max_deskew_angle
self.skew_angle = skew_angle
self.should_rotate = should_rotate
if scale_x or scale_y_top or scale_y_bottom:
self.should_resize_polygons = True
# use 1.0 as default - no resize, if not specified
@@ -504,13 +504,16 @@ class HTRDataGenerator:
logger.debug(f"Page {page_id}")
self.extract_lines(page_id, image_data)
def run_volumes(self, volume_ids: list, element_type: str = "page"):
    """Process every element of ``element_type`` found inside the given volumes.

    For each volume, the children are listed recursively through the
    ``ListElementChildren`` endpoint, filtered on ``element_type``, and the
    resulting elements are handed to ``run_pages``.

    Args:
        volume_ids: Identifiers of the volumes to walk through.
        element_type: Element type to request from the API. Defaults to
            ``"page"``, the value that used to be hard-coded here, so callers
            that only pass ``volume_ids`` keep their previous behaviour.
    """
    for volume_id in tqdm.tqdm(volume_ids):
        logger.info(f"Volume {volume_id}")
        # Materialise the paginated listing into a list before processing it.
        elements = list(
            self.api_client.cached_paginate(
                "ListElementChildren",
                id=volume_id,
                recursive=True,
                type=element_type,
            )
        )
        self.run_pages(elements)
@@ -809,6 +812,13 @@ def create_parser():
help="Cache dir where to save the full size downloaded images. Change it to force redownload.",
)
parser.add_argument(
"--element_type",
type=str,
default="page",
help="The type of the element from which you want to extract the transcriptions",
)
return parser
@@ -849,7 +859,7 @@ def main():
if args.pages:
data_generator.run_pages(args.pages)
if args.volumes:
data_generator.run_volumes(args.volumes)
data_generator.run_volumes(args.volumes, args.element_type)
if args.folders:
data_generator.run_folders(args.folders, args.volume_type)
if args.corpora:
Loading