From 5f3fc3d853943a99e3c4d893c1bea49c43babcc3 Mon Sep 17 00:00:00 2001 From: Chaza_Abdelwahab <abdelwahab@teklia.com> Date: Thu, 6 Jan 2022 14:36:48 +0100 Subject: [PATCH] tested changes --- kaldi_data_generator/main.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/kaldi_data_generator/main.py b/kaldi_data_generator/main.py index e1dadc9..49f9f70 100644 --- a/kaldi_data_generator/main.py +++ b/kaldi_data_generator/main.py @@ -38,7 +38,6 @@ MANUAL = "manual" TEXT_LINE = "text_line" WHITE = 255 DEFAULT_RESCALE = 1.0 - ROTATION_CLASSES_TO_ANGLES = { "rotate_0": 0, "rotate_left_90": 90, @@ -105,6 +104,7 @@ class HTRDataGenerator: self.max_deskew_angle = max_deskew_angle self.skew_angle = skew_angle self.should_rotate = should_rotate + if scale_x or scale_y_top or scale_y_bottom: self.should_resize_polygons = True # use 1.0 as default - no resize, if not specified @@ -504,13 +504,16 @@ class HTRDataGenerator: logger.debug(f"Page {page_id}") self.extract_lines(page_id, image_data) - def run_volumes(self, volume_ids: list): + def run_volumes(self, volume_ids: list, element_type): for volume_id in tqdm.tqdm(volume_ids): logger.info(f"Volume {volume_id}") pages = [ page for page in self.api_client.cached_paginate( - "ListElementChildren", id=volume_id, recursive=True, type="page" + "ListElementChildren", + id=volume_id, + recursive=True, + type=element_type, ) ] self.run_pages(pages) @@ -809,6 +812,13 @@ def create_parser(): help="Cache dir where to save the full size downloaded images. Change it to force redownload.", ) + parser.add_argument( + "--element_type", + type=str, + default="page", + help="The type of the element from which you want to extract the transcriptions", + ) + return parser @@ -849,7 +859,7 @@ def main(): if args.pages: data_generator.run_pages(args.pages) if args.volumes: - data_generator.run_volumes(args.volumes) + data_generator.run_volumes(args.volumes, args.element_type) if args.folders: data_generator.run_folders(args.folders, args.volume_type) if args.corpora: -- GitLab