Generate synthetic documents with a curriculum on the number of lines

2ecb07cd · Solene Tarride · f021b5d0 · 2ecb07cd
Commit 2ecb07cd authored 2 years ago by Solene Tarride
--- a/dan/manager/ocr.py
+++ b/dan/manager/ocr.py
@@ -388,11 +388,16 @@ class OCRDataset(GenericDataset):
                    )
                )
            else:
-                label = self.samples[randint(0, len(self))]["label"]
+                # Get a page-level transcription and split it by lines
+                texts = self.samples[randint(0, len(self))]["label"].split("\n")
+                # Select some lines to be generated
+                n_lines = min(len(texts), nb_lines_per_page)
+                i = randint(0, len(texts) - n_lines + 1)
+                texts = texts[i : i + n_lines]
+                # Generate the synthetic document (of n_lines)
                pages.append(
-                    generate_typed_text_paragraph_image(
-                        config=self.params["config"]["synthetic_data"]["config"],
-                        texts=label.split("\n"),
+                    self.generate_typed_text_paragraph_image(
+                        texts=texts,
                        same_font_size=True,
                    )
                )
@@ -432,6 +437,74 @@ class OCRDataset(GenericDataset):
            text, self.params["config"]["synthetic_data"]["config"]
        )

+    def generate_typed_text_paragraph_image(
+        self, texts, padding_value=255, max_pad_left_ratio=0.1, same_font_size=False
+    ):  # Render each entry of `texts` as a line image, pad them to a common width, stack them; returns [image, label, n_col].
+        config = self.params["config"]["synthetic_data"]["config"]  # synthetic-data settings read from self.params
+        if same_font_size:
+            images = list()
+            txt_color = config["text_color_default"]
+            bg_color = config["background_color_default"]
+            font_size = randint(config["font_size_min"], config["font_size_max"] + 1)  # drawn once, shared by every line; the + 1 presumably offsets an exclusive upper bound — TODO confirm
+            for text in texts:
+                font_path = config["valid_fonts"][
+                    randint(0, len(config["valid_fonts"]))
+                ]  # the font file is re-drawn for each line; only the size is shared
+                fnt = ImageFont.truetype(font_path, font_size)
+                text_width, text_height = fnt.getsize(text)  # NOTE(review): FreeTypeFont.getsize was removed in Pillow 10 — confirm the pinned Pillow version
+                padding_top = int(
+                    rand_uniform(
+                        config["padding_top_ratio_min"], config["padding_top_ratio_max"]
+                    )
+                    * text_height
+                )  # vertical paddings are a drawn ratio times the text height
+                padding_bottom = int(
+                    rand_uniform(
+                        config["padding_bottom_ratio_min"],
+                        config["padding_bottom_ratio_max"],
+                    )
+                    * text_height
+                )
+                padding_left = int(
+                    rand_uniform(
+                        config["padding_left_ratio_min"],
+                        config["padding_left_ratio_max"],
+                    )
+                    * text_width
+                )  # horizontal paddings are a drawn ratio times the text width
+                padding_right = int(
+                    rand_uniform(
+                        config["padding_right_ratio_min"],
+                        config["padding_right_ratio_max"],
+                    )
+                    * text_width
+                )
+                padding = [padding_top, padding_bottom, padding_left, padding_right]  # order: top, bottom, left, right
+                images.append(
+                    generate_typed_text_line_image_from_params(
+                        text, fnt, bg_color, txt_color, config["color_mode"], padding
+                    )
+                )
+        else:
+            images = [generate_typed_text_line_image(t) for t in texts]  # NOTE(review): called with the text only, no config is passed here — confirm this matches the helper's signature
+
+        max_width = max([img.shape[1] for img in images])  # widest line (axis 1) sets the common width; max() raises ValueError if texts is empty
+        padded_images = [
+            pad_image_width_random(
+                img,
+                max_width,
+                padding_value=padding_value,
+                max_pad_left_ratio=max_pad_left_ratio,
+            )
+            for img in images
+        ]  # every line image goes through pad_image_width_random with the common max_width
+        label = {
+            "sem": "\n".join(texts),
+            "begin": "\n".join(texts),
+            "raw": "\n".join(texts),
+        }  # the same newline-joined text is stored under all three keys
+        return [np.concatenate(padded_images, axis=0), label, 1]  # image (lines concatenated along axis 0), label, n_col (always 1)
+

 class OCRCollateFunction:
    """
@@ -618,70 +691,6 @@ def generate_typed_text_line_image_from_params(
    return np.array(img)


-def generate_typed_text_paragraph_image(
-    config, texts, padding_value=255, max_pad_left_ratio=0.1, same_font_size=False
-):
-    if same_font_size:
-        images = list()
-        txt_color = config["text_color_default"]
-        bg_color = config["background_color_default"]
-        font_size = randint(config["font_size_min"], config["font_size_max"] + 1)
-        for text in texts:
-            font_path = config["valid_fonts"][randint(0, len(config["valid_fonts"]))]
-            fnt = ImageFont.truetype(font_path, font_size)
-            text_width, text_height = fnt.getsize(text)
-            padding_top = int(
-                rand_uniform(
-                    config["padding_top_ratio_min"], config["padding_top_ratio_max"]
-                )
-                * text_height
-            )
-            padding_bottom = int(
-                rand_uniform(
-                    config["padding_bottom_ratio_min"],
-                    config["padding_bottom_ratio_max"],
-                )
-                * text_height
-            )
-            padding_left = int(
-                rand_uniform(
-                    config["padding_left_ratio_min"], config["padding_left_ratio_max"]
-                )
-                * text_width
-            )
-            padding_right = int(
-                rand_uniform(
-                    config["padding_right_ratio_min"], config["padding_right_ratio_max"]
-                )
-                * text_width
-            )
-            padding = [padding_top, padding_bottom, padding_left, padding_right]
-            images.append(
-                generate_typed_text_line_image_from_params(
-                    text, fnt, bg_color, txt_color, config["color_mode"], padding
-                )
-            )
-    else:
-        images = [generate_typed_text_line_image(t) for t in texts]
-
-    max_width = max([img.shape[1] for img in images])
-    padded_images = [
-        pad_image_width_random(
-            img,
-            max_width,
-            padding_value=padding_value,
-            max_pad_left_ratio=max_pad_left_ratio,
-        )
-        for img in images
-    ]
-    label = {
-        "sem": "\n".join(texts),
-        "begin": "\n".join(texts),
-        "raw": "\n".join(texts),
-    }
-    return [np.concatenate(padded_images, axis=0), label, 1]  # image, label, n_col
-
-
 def char_in_font(unicode_char, font_path):
    with TTFont(font_path) as font:
        for cmap in font["cmap"].tables: