From 2ecb07cd66b171e9301079643dc5c06ca90e8816 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sol=C3=A8ne=20Tarride?= <starride@teklia.com> Date: Fri, 2 Dec 2022 17:56:12 +0000 Subject: [PATCH] generate synthetic documents with curriculum (number of lines) --- dan/manager/ocr.py | 145 ++++++++++++++++++++++++--------------------- 1 file changed, 77 insertions(+), 68 deletions(-) diff --git a/dan/manager/ocr.py b/dan/manager/ocr.py index 372f597d..43d46894 100644 --- a/dan/manager/ocr.py +++ b/dan/manager/ocr.py @@ -388,11 +388,16 @@ class OCRDataset(GenericDataset): ) ) else: - label = self.samples[randint(0, len(self))]["label"] + # Get a page-level transcription and split it by lines + texts = self.samples[randint(0, len(self))]["label"].split("\n") + # Select some lines to be generated + n_lines = min(len(texts), nb_lines_per_page) + i = randint(0, len(texts) - n_lines + 1) + texts = texts[i : i + n_lines] + # Generate the synthetic document (of n_lines) pages.append( - generate_typed_text_paragraph_image( - config=self.params["config"]["synthetic_data"]["config"], - texts=label.split("\n"), + self.generate_typed_text_paragraph_image( + texts=texts, same_font_size=True, ) ) @@ -432,6 +437,74 @@ class OCRDataset(GenericDataset): text, self.params["config"]["synthetic_data"]["config"] ) + def generate_typed_text_paragraph_image( + self, texts, padding_value=255, max_pad_left_ratio=0.1, same_font_size=False + ): + config = self.params["config"]["synthetic_data"]["config"] + if same_font_size: + images = list() + txt_color = config["text_color_default"] + bg_color = config["background_color_default"] + font_size = randint(config["font_size_min"], config["font_size_max"] + 1) + for text in texts: + font_path = config["valid_fonts"][ + randint(0, len(config["valid_fonts"])) + ] + fnt = ImageFont.truetype(font_path, font_size) + text_width, text_height = fnt.getsize(text) + padding_top = int( + rand_uniform( + config["padding_top_ratio_min"], config["padding_top_ratio_max"] + ) + * text_height + ) + padding_bottom = int( + rand_uniform( + config["padding_bottom_ratio_min"], + config["padding_bottom_ratio_max"], + ) + * text_height + ) + padding_left = int( + rand_uniform( + config["padding_left_ratio_min"], + config["padding_left_ratio_max"], + ) + * text_width + ) + padding_right = int( + rand_uniform( + config["padding_right_ratio_min"], + config["padding_right_ratio_max"], + ) + * text_width + ) + padding = [padding_top, padding_bottom, padding_left, padding_right] + images.append( + generate_typed_text_line_image_from_params( + text, fnt, bg_color, txt_color, config["color_mode"], padding + ) + ) + else: + images = [generate_typed_text_line_image(t) for t in texts] + + max_width = max([img.shape[1] for img in images]) + padded_images = [ + pad_image_width_random( + img, + max_width, + padding_value=padding_value, + max_pad_left_ratio=max_pad_left_ratio, + ) + for img in images + ] + label = { + "sem": "\n".join(texts), + "begin": "\n".join(texts), + "raw": "\n".join(texts), + } + return [np.concatenate(padded_images, axis=0), label, 1] # image, label, n_col + class OCRCollateFunction: """ @@ -618,70 +691,6 @@ def generate_typed_text_line_image_from_params( return np.array(img) -def generate_typed_text_paragraph_image( - config, texts, padding_value=255, max_pad_left_ratio=0.1, same_font_size=False -): - if same_font_size: - images = list() - txt_color = config["text_color_default"] - bg_color = config["background_color_default"] - font_size = randint(config["font_size_min"], config["font_size_max"] + 1) - for text in texts: - font_path = config["valid_fonts"][randint(0, len(config["valid_fonts"]))] - fnt = ImageFont.truetype(font_path, font_size) - text_width, text_height = fnt.getsize(text) - padding_top = int( - rand_uniform( - config["padding_top_ratio_min"], config["padding_top_ratio_max"] - ) - * text_height - ) - padding_bottom = int( - rand_uniform( - config["padding_bottom_ratio_min"], - config["padding_bottom_ratio_max"], - ) - * text_height - ) - padding_left = int( - rand_uniform( - config["padding_left_ratio_min"], config["padding_left_ratio_max"] - ) - * text_width - ) - padding_right = int( - rand_uniform( - config["padding_right_ratio_min"], config["padding_right_ratio_max"] - ) - * text_width - ) - padding = [padding_top, padding_bottom, padding_left, padding_right] - images.append( - generate_typed_text_line_image_from_params( - text, fnt, bg_color, txt_color, config["color_mode"], padding - ) - ) - else: - images = [generate_typed_text_line_image(t) for t in texts] - - max_width = max([img.shape[1] for img in images]) - padded_images = [ - pad_image_width_random( - img, - max_width, - padding_value=padding_value, - max_pad_left_ratio=max_pad_left_ratio, - ) - for img in images - ] - label = { - "sem": "\n".join(texts), - "begin": "\n".join(texts), - "raw": "\n".join(texts), - } - return [np.concatenate(padded_images, axis=0), label, 1] # image, label, n_col - - def char_in_font(unicode_char, font_path): with TTFont(font_path) as font: for cmap in font["cmap"].tables: -- GitLab