Compare revisions

Solene Tarride · Yoann Schneider · Mélodie Boillet · Yoann Schneider · Yoann Schneider · 5a7dd7ef
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -5,7 +5,7 @@ repos:
      - id: isort
        args: ["--profile", "black"]
  - repo: https://github.com/ambv/black
-    rev: 23.1.0
+    rev: 23.3.0
    hooks:
    - id: black
  - repo: https://github.com/pycqa/flake8
@@ -35,7 +35,7 @@ repos:
      - id: end-of-file-fixer
      - id: mixed-line-ending
  - repo: https://github.com/codespell-project/codespell
-    rev: v2.2.2
+    rev: v2.2.5
    hooks:
      - id: codespell
        args: ['--write-changes']

--- a/dan/manager/dataset.py
+++ b/dan/manager/dataset.py
@@ -3,15 +3,18 @@ import json
 import os
 import random

-import cv2
 import numpy as np
 import torch
-from PIL import Image
 from torch.utils.data import DataLoader, Dataset
 from torch.utils.data.distributed import DistributedSampler
+from torchvision.io import ImageReadMode, read_image

 from dan.datasets.utils import natural_sort
-from dan.transforms import apply_data_augmentation
+from dan.transforms import (
+    get_augmentation_transforms,
+    get_normalization_transforms,
+    get_preprocessing_transforms,
+)


 class DatasetManager:
@@ -60,11 +63,13 @@ class DatasetManager:
            "train",
            self.params["train"]["name"],
            self.get_paths_and_sets(self.params["train"]["datasets"]),
+            normalization_transforms=get_normalization_transforms(),
+            augmentation_transforms=(
+                get_augmentation_transforms()
+                if self.params["config"]["augmentation"]
+                else None
+            ),
        )
-        (
-            self.params["config"]["mean"],
-            self.params["config"]["std"],
-        ) = self.train_dataset.compute_std_mean()

        self.my_collate_function = self.train_dataset.collate_function(
            self.params["config"]
@@ -77,6 +82,8 @@ class DatasetManager:
                "val",
                custom_name,
                self.get_paths_and_sets(self.params["val"][custom_name]),
+                normalization_transforms=get_normalization_transforms(),
+                augmentation_transforms=None,
            )
            self.apply_specific_treatment_after_dataset_loading(
                self.valid_datasets[custom_name]
@@ -155,7 +162,11 @@ class DatasetManager:
                {"path": self.params["datasets"][set_info[0]], "set_name": set_info[1]}
            )
        self.test_datasets[custom_name] = self.dataset_class(
-            self.params, "test", custom_name, paths_and_sets
+            self.params,
+            "test",
+            custom_name,
+            paths_and_sets,
+            normalization_transforms=get_normalization_transforms(),
        )
        self.apply_specific_treatment_after_dataset_loading(
            self.test_datasets[custom_name]
@@ -199,29 +210,18 @@ class GenericDataset(Dataset):
        self.params = params
        self.name = custom_name
        self.set_name = set_name
-        self.mean = (
-            np.array(params["config"]["mean"])
-            if "mean" in params["config"].keys()
-            else None
-        )
-        self.std = (
-            np.array(params["config"]["std"])
-            if "std" in params["config"].keys()
-            else None
-        )

+        self.preprocessing_transforms = get_preprocessing_transforms(
+            params["config"]["preprocessings"]
+        )
        self.load_in_memory = (
            self.params["config"]["load_in_memory"]
            if "load_in_memory" in self.params["config"]
            else True
        )

-        self.samples = self.load_samples(
-            paths_and_sets, load_in_memory=self.load_in_memory
-        )
-
-        if self.load_in_memory:
-            self.apply_preprocessing(params["config"]["preprocessings"])
+        # Load samples (images are preprocessed now only if load_in_memory is True)
+        self.samples = self.load_samples(paths_and_sets)

        self.curriculum_config = None

@@ -230,15 +230,13 @@ class GenericDataset(Dataset):

    @staticmethod
    def load_image(path):
-        with Image.open(path) as pil_img:
-            img = np.array(pil_img)
-            # grayscale images
-            if len(img.shape) == 2:
-                img = np.expand_dims(img, axis=2)
-        return img
+        """
+        Load an image as a torch.Tensor and scale the values between 0 and 1.
+        """
+        img = read_image(path, mode=ImageReadMode.RGB)
+        return img.to(dtype=torch.get_default_dtype()).div(255)

-    @staticmethod
-    def load_samples(paths_and_sets, load_in_memory=True):
+    def load_samples(self, paths_and_sets):
        """
        Load images and labels
        """
@@ -262,64 +260,12 @@ class GenericDataset(Dataset):
                        "path": os.path.abspath(filename),
                    }
                )
-                if load_in_memory:
-                    samples[-1]["img"] = GenericDataset.load_image(filename)
+                if self.load_in_memory:
+                    samples[-1]["img"] = self.preprocessing_transforms(
+                        self.load_image(filename)
+                    )
        return samples

-    def apply_preprocessing(self, preprocessings):
-        for i in range(len(self.samples)):
-            (
-                self.samples[i]["img"],
-                self.samples[i]["resize_ratio"],
-            ) = apply_preprocessing(self.samples[i]["img"], preprocessings)
-
-    def compute_std_mean(self):
-        """
-        Compute cumulated variance and mean of whole dataset
-        """
-        if self.mean is not None and self.std is not None:
-            return self.mean, self.std
-
-        sum = np.zeros((3,))
-        diff = np.zeros((3,))
-        nb_pixels = 0
-        for metric in ["mean", "std"]:
-            for ind in range(len(self.samples)):
-                img = (
-                    self.get_sample_img(ind)
-                    if self.load_in_memory
-                    else apply_preprocessing(
-                        self.get_sample_img(ind),
-                        self.params["config"]["preprocessings"],
-                    )[0]
-                )
-
-                if metric == "mean":
-                    sum += np.sum(img, axis=(0, 1))
-                    nb_pixels += np.prod(img.shape[:2])
-                elif metric == "std":
-                    diff += [
-                        np.sum((img[:, :, k] - self.mean[k]) ** 2) for k in range(3)
-                    ]
-            if metric == "mean":
-                self.mean = sum / nb_pixels
-            elif metric == "std":
-                self.std = np.sqrt(diff / nb_pixels)
-        return self.mean, self.std
-
-    def apply_data_augmentation(self, img):
-        """
-        Apply data augmentation strategy on the input image
-        """
-        augs = [
-            self.params["config"][key] if key in self.params["config"].keys() else None
-            for key in ["augmentation", "valid_augmentation", "test_augmentation"]
-        ]
-        for aug, set_name in zip(augs, ["train", "val", "test"]):
-            if aug and self.set_name == set_name:
-                return apply_data_augmentation(img, aug)
-        return img
-
    def get_sample_img(self, i):
        """
        Get image by index
@@ -327,60 +273,6 @@ class GenericDataset(Dataset):
        if self.load_in_memory:
            return self.samples[i]["img"]
        else:
-            return GenericDataset.load_image(self.samples[i]["path"])
-
-
-def apply_preprocessing(img, preprocessings):
-    """
-    Apply preprocessings on an image
-    """
-    resize_ratio = [1, 1]
-    for preprocessing in preprocessings:
-        if preprocessing["type"] == "to_grayscaled":
-            temp_img = img
-            h, w, c = temp_img.shape
-            if c == 3:
-                img = np.expand_dims(
-                    0.2125 * temp_img[:, :, 0]
-                    + 0.7154 * temp_img[:, :, 1]
-                    + 0.0721 * temp_img[:, :, 2],
-                    axis=2,
-                ).astype(np.uint8)
-
-        if preprocessing["type"] == "to_RGB":
-            temp_img = img
-            h, w, c = temp_img.shape
-            if c == 1:
-                img = np.concatenate([temp_img, temp_img, temp_img], axis=2)
-
-        if preprocessing["type"] == "resize":
-            keep_ratio = preprocessing["keep_ratio"]
-            max_h, max_w = preprocessing["max_height"], preprocessing["max_width"]
-            temp_img = img
-            h, w, c = temp_img.shape
-
-            ratio_h = max_h / h if max_h else 1
-            ratio_w = max_w / w if max_w else 1
-            if keep_ratio:
-                ratio_h = ratio_w = min(ratio_w, ratio_h)
-            new_h = min(max_h, int(h * ratio_h))
-            new_w = min(max_w, int(w * ratio_w))
-            temp_img = cv2.resize(temp_img, (new_w, new_h))
-            if len(temp_img.shape) == 2:
-                temp_img = np.expand_dims(temp_img, axis=2)
-
-            img = temp_img
-            resize_ratio = [ratio_h, ratio_w]
-
-        if preprocessing["type"] == "fixed_height":
-            new_h = preprocessing["height"]
-            temp_img = img
-            h, w, c = temp_img.shape
-            ratio = new_h / h
-            temp_img = cv2.resize(temp_img, (int(w * ratio), new_h))
-            if len(temp_img.shape) == 2:
-                temp_img = np.expand_dims(temp_img, axis=2)
-            img = temp_img
-            resize_ratio = [ratio, ratio]
-
-    return img, resize_ratio
+            return self.preprocessing_transforms(
+                self.load_image(self.samples[i]["path"])
+            )
--- a/dan/manager/ocr.py
+++ b/dan/manager/ocr.py
@@ -2,11 +2,9 @@
 import os
 import pickle

-import cv2
 import numpy as np
-import torch

-from dan.manager.dataset import DatasetManager, GenericDataset, apply_preprocessing
+from dan.manager.dataset import DatasetManager, GenericDataset
 from dan.utils import pad_images, pad_sequences_1D, token_to_ind


@@ -52,43 +50,43 @@ class OCRDataset(GenericDataset):
    Specific class to handle OCR/HTR datasets
    """

-    def __init__(self, params, set_name, custom_name, paths_and_sets):
+    def __init__(
+        self,
+        params,
+        set_name,
+        custom_name,
+        paths_and_sets,
+        normalization_transforms,
+        augmentation_transforms=None,
+    ):
        super(OCRDataset, self).__init__(params, set_name, custom_name, paths_and_sets)
        self.charset = None
        self.tokens = None
        # Factor to reduce the height and width of the feature vector before feeding the decoder.
        self.reduce_dims_factor = np.array([32, 8, 1])
        self.collate_function = OCRCollateFunction
+        self.normalization_transforms = normalization_transforms
+        self.augmentation_transforms = augmentation_transforms

    def __getitem__(self, idx):
        sample = dict(**self.samples[idx])

        if not self.load_in_memory:
            sample["img"] = self.get_sample_img(idx)
-            sample["img"], sample["resize_ratio"] = apply_preprocessing(
-                sample["img"], self.params["config"]["preprocessings"]
-            )

        # Data augmentation
-        sample["img"] = self.apply_data_augmentation(sample["img"])
-
-        if "max_size" in self.params["config"] and self.params["config"]["max_size"]:
-            max_ratio = max(
-                sample["img"].shape[0]
-                / self.params["config"]["max_size"]["max_height"],
-                sample["img"].shape[1] / self.params["config"]["max_size"]["max_width"],
-            )
-            if max_ratio > 1:
-                new_h, new_w = int(np.ceil(sample["img"].shape[0] / max_ratio)), int(
-                    np.ceil(sample["img"].shape[1] / max_ratio)
-                )
-                sample["img"] = cv2.resize(sample["img"], (new_w, new_h))
+        if self.augmentation_transforms:
+            sample["img"] = self.augmentation_transforms(image=np.array(sample["img"]))[
+                "image"
+            ]

        # Normalization
-        sample["img"] = (sample["img"] - self.mean) / self.std
+        sample["img"] = self.normalization_transforms(sample["img"])

+        # Get final channel count, height and width
+        final_c, final_h, final_w = sample["img"].shape
        sample["img_reduced_shape"] = np.ceil(
-            sample["img"].shape / self.reduce_dims_factor
+            [final_h, final_w, final_c] / self.reduce_dims_factor
        ).astype(int)

        if self.set_name == "train":
@@ -97,8 +95,8 @@ class OCRDataset(GenericDataset):
            ]

        sample["img_position"] = [
-            [0, sample["img"].shape[0]],
-            [0, sample["img"].shape[1]],
+            [0, final_h],
+            [0, final_w],
        ]
        return sample

@@ -131,12 +129,10 @@ class OCRCollateFunction:

    def __call__(self, batch_data):
        labels = [batch_data[i]["token_label"] for i in range(len(batch_data))]
-        labels = pad_sequences_1D(labels, padding_value=self.label_padding_value)
-        labels = torch.tensor(labels).long()
+        labels = pad_sequences_1D(labels, padding_value=self.label_padding_value).long()

        imgs = [batch_data[i]["img"] for i in range(len(batch_data))]
        imgs = pad_images(imgs)
-        imgs = torch.tensor(imgs).float().permute(0, 3, 1, 2)

        formatted_batch_data = {
            formatted_key: [batch_data[i][initial_key] for i in range(len(batch_data))]

--- a/dan/ocr/document/train.py
+++ b/dan/ocr/document/train.py
@@ -15,7 +15,7 @@ from dan.encoder import FCN_Encoder
 from dan.manager.training import Manager
 from dan.mlflow import MLFLOW_AVAILABLE
 from dan.schedulers import exponential_dropout_scheduler
-from dan.transforms import aug_config
+from dan.transforms import Preprocessing
 from dan.utils import MLflowNotInstalled

 if MLFLOW_AVAILABLE:
@@ -107,11 +107,12 @@ def get_config():
                "worker_per_gpu": 4,  # Num of parallel processes per gpu for data loading
                "preprocessings": [
                    {
-                        "type": "to_RGB",
-                        # if grayscaled image, produce RGB one (3 channels with same value) otherwise do nothing
-                    },
+                        "type": Preprocessing.MaxResize,
+                        "max_width": 2000,
+                        "max_height": 2000,
+                    }
                ],
-                "augmentation": aug_config(0.9, 0.1),
+                "augmentation": True,
            },
        },
        "model_params": {
@@ -257,18 +258,18 @@ def serialize_config(config):
    return serialized_config


-def start_training(config) -> None:
+def start_training(config, mlflow_logging: bool) -> None:
    if (
        config["training_params"]["use_ddp"]
        and not config["training_params"]["force_cpu"]
    ):
        mp.spawn(
            train_and_test,
-            args=(config, True),
+            args=(config, mlflow_logging),
            nprocs=config["training_params"]["nb_gpu"],
        )
    else:
-        train_and_test(0, config, True)
+        train_and_test(0, config, mlflow_logging)


 def run():
@@ -285,7 +286,7 @@ def run():
        raise MLflowNotInstalled()

    if "mlflow" not in config:
-        start_training(config)
+        start_training(config, mlflow_logging=False)
    else:
        labels_path = (
            Path(config["dataset_params"]["datasets"][dataset_name]) / "labels.json"
@@ -313,4 +314,4 @@ def run():
                    dictionary=artifact,
                    artifact_file=filename,
                )
-            start_training(config)
+            start_training(config, mlflow_logging=True)
--- a/dan/predict/prediction.py
+++ b/dan/predict/prediction.py
@@ -20,6 +20,7 @@ from dan.predict.attention import (
    plot_attention,
    split_text_and_confidences,
 )
+from dan.transforms import get_normalization_transforms
 from dan.utils import ind_to_token, read_image


@@ -74,7 +75,7 @@ class DAN:

        self.encoder = encoder
        self.decoder = decoder
-        self.mean, self.std = parameters["mean"], parameters["std"]
+        self.normalization = get_normalization_transforms()
        self.max_chars = parameters["max_char_prediction"]

    def preprocess(self, input_image):
@@ -89,7 +90,7 @@ class DAN:
        if len(input_image.shape) < 3:
            input_image = cv2.cvtColor(input_image, cv2.COLOR_GRAY2RGB)

-        input_image = (input_image - self.mean) / self.std
+        input_image = self.normalization(input_image)
        return input_image

    def predict(

--- a/dan/transforms.py
+++ b/dan/transforms.py
@@ -2,354 +2,206 @@
 """
 Each transform class defined here takes as input a PIL Image and returns the modified PIL Image
 """
-import math
+from enum import Enum
+from random import randint

-import cv2
 import numpy as np
-from cv2 import dilate, erode, normalize
-from numpy import random
-from PIL import Image
-from torch import rand, randint
-from torch.distributions.uniform import Uniform
-from torchvision.transforms import (
+from albumentations import OneOf, SomeOf
+from albumentations.augmentations import (
+    Affine,
+    CoarseDropout,
    ColorJitter,
+    Downscale,
+    ElasticTransform,
    GaussianBlur,
-    RandomCrop,
-    RandomPerspective,
+    GaussNoise,
+    Perspective,
+    PiecewiseAffine,
+    Sharpen,
+    ToGray,
 )
-from torchvision.transforms.functional import InterpolationMode
-
+from cv2 import INTER_NEAREST, dilate, erode
+from numpy import random
+from torch import Tensor
+from torchvision.transforms import Compose, Normalize, ToPILImage, ToTensor
+from torchvision.transforms.functional import resize

-class Dilation:
-    """
-    OCR: stroke width increasing
-    """
+IMAGENET_MEAN = [0.485, 0.456, 0.406]
+IMAGENET_STD = [0.229, 0.224, 0.225]

-    def __init__(self, kernel, iterations):
-        self.kernel = np.ones(kernel, np.uint8)
-        self.iterations = iterations

-    def __call__(self, x):
-        return Image.fromarray(
-            dilate(np.array(x), self.kernel, iterations=self.iterations)
-        )
+class Preprocessing(str, Enum):
+    # If the image is bigger than the given size, resize it while keeping the original ratio
+    MaxResize = "max_resize"
+    # Resize the height to a fixed value while keeping the original ratio
+    FixedHeightResize = "fixed_height_resize"
+    # Resize the width to a fixed value while keeping the original ratio
+    FixedWidthResize = "fixed_width_resize"


-class Erosion:
+class FixedHeightResize:
    """
-    OCR: stroke width decreasing
+    Resize an image tensor to a fixed height
    """

-    def __init__(self, kernel, iterations):
-        self.kernel = np.ones(kernel, np.uint8)
-        self.iterations = iterations
+    def __init__(self, height: int) -> None:
+        self.height = height

-    def __call__(self, x):
-        return Image.fromarray(
-            erode(np.array(x), self.kernel, iterations=self.iterations)
-        )
+    def __call__(self, img: Tensor) -> Tensor:
+        size = (self.height, self._calc_new_width(img))
+        return resize(img, size, antialias=False)

+    def _calc_new_width(self, img: Tensor) -> int:
+        aspect_ratio = img.shape[2] / img.shape[1]
+        return round(self.height * aspect_ratio)

-class GaussianNoise:
-    """
-    Add Gaussian Noise
-    """
-
-    def __init__(self, std):
-        self.std = std

-    def __call__(self, x):
-        x_np = np.array(x)
-        mean, std = np.mean(x_np), np.std(x_np)
-        std = math.copysign(max(abs(std), 0.000001), std)
-        min_, max_ = np.min(
-            x_np,
-        ), np.max(x_np)
-        normal_noise = np.random.randn(*x_np.shape)
-        if (
-            len(x_np.shape) == 3
-            and x_np.shape[2] == 3
-            and np.all(x_np[:, :, 0] == x_np[:, :, 1])
-            and np.all(x_np[:, :, 0] == x_np[:, :, 2])
-        ):
-            normal_noise[:, :, 1] = normal_noise[:, :, 2] = normal_noise[:, :, 0]
-        x_np = ((x_np - mean) / std + normal_noise * self.std) * std + mean
-        x_np = normalize(x_np, x_np, max_, min_, cv2.NORM_MINMAX)
-
-        return Image.fromarray(x_np.astype(np.uint8))
-
-
-class Sharpen:
+class FixedWidthResize:
    """
-    Add Gaussian Noise
+    Resize an image tensor to a fixed width
    """

-    def __init__(self, alpha, strength):
-        self.alpha = alpha
-        self.strength = strength
+    def __init__(self, width: int) -> None:
+        self.width = width

-    def __call__(self, x):
-        x_np = np.array(x)
-        id_matrix = np.array([[0, 0, 0], [0, 1, 0], [0, 0, 0]])
-        effect_matrix = np.array([[1, 1, 1], [1, -(8 + self.strength), 1], [1, 1, 1]])
-        kernel = (1 - self.alpha) * id_matrix - self.alpha * effect_matrix
-        kernel = np.expand_dims(kernel, axis=2)
-        kernel = np.concatenate([kernel, kernel, kernel], axis=2)
-        sharpened = cv2.filter2D(x_np, -1, kernel=kernel[:, :, 0])
-        return Image.fromarray(sharpened.astype(np.uint8))
+    def __call__(self, img: Tensor) -> Tensor:
+        size = (self._calc_new_height(img), self.width)
+        return resize(img, size, antialias=False)
+
+    def _calc_new_height(self, img: Tensor) -> int:
+        aspect_ratio = img.shape[1] / img.shape[2]
+        return round(self.width * aspect_ratio)


-class ZoomRatio:
+class MaxResize:
    """
-    Crop by ratio
-    Preserve dimensions if keep_dim = True (= zoom)
+    Resize an image tensor if it is bigger than the maximum size
    """

-    def __init__(self, ratio_h, ratio_w, keep_dim=True):
-        self.ratio_w = ratio_w
-        self.ratio_h = ratio_h
-        self.keep_dim = keep_dim
+    def __init__(self, height: int, width: int) -> None:
+        self.max_width = width
+        self.max_height = height

-    def __call__(self, x):
-        w, h = x.size
-        x = RandomCrop((int(h * self.ratio_h), int(w * self.ratio_w)))(x)
-        if self.keep_dim:
-            x = x.resize((w, h), Image.BILINEAR)
-        return x
+    def __call__(self, img: Tensor) -> Tensor:
+        height, width = img.shape[1:]
+        if width <= self.max_width and height <= self.max_height:
+            return img
+        width_ratio = self.max_width / width
+        height_ratio = self.max_height / height
+        ratio = min(height_ratio, width_ratio)
+        new_width = int(width * ratio)
+        new_height = int(height * ratio)
+        return resize(img, (new_height, new_width), antialias=False)


-class ElasticDistortion:
-    def __init__(self, kernel_size=(7, 7), sigma=5, alpha=1):
-        self.kernel_size = kernel_size
-        self.sigma = sigma
-        self.alpha = alpha
+class Dilation:
+    """
+    OCR: stroke width increasing
+    """
+
+    def __init__(self, kernel, iterations):
+        self.kernel = kernel
+        self.iterations = iterations

    def __call__(self, x):
-        x_np = np.array(x)
+        return dilate(np.array(x), self.kernel, iterations=self.iterations)

-        h, w = x_np.shape[:2]

-        dx = np.random.uniform(-1, 1, (h, w))
-        dy = np.random.uniform(-1, 1, (h, w))
-
-        x_gauss = cv2.GaussianBlur(dx, self.kernel_size, self.sigma)
-        y_gauss = cv2.GaussianBlur(dy, self.kernel_size, self.sigma)
+class Erosion:
+    """
+    OCR: stroke width decreasing
+    """

-        n = np.sqrt(x_gauss**2 + y_gauss**2)
+    def __init__(self, kernel, iterations):
+        self.kernel = kernel
+        self.iterations = iterations

-        nd_x = self.alpha * x_gauss / n
-        nd_y = self.alpha * y_gauss / n
+    def __call__(self, x):
+        return erode(np.array(x), self.kernel, iterations=self.iterations)

-        ind_y, ind_x = np.indices((h, w), dtype=np.float32)

-        map_x = nd_x + ind_x
-        map_x = map_x.reshape(h, w).astype(np.float32)
-        map_y = nd_y + ind_y
-        map_y = map_y.reshape(h, w).astype(np.float32)
+class ErosionDilation:
+    """
+    Random erosion or dilation
+    """

-        dst = cv2.remap(x_np, map_x, map_y, cv2.INTER_LINEAR)
-        return Image.fromarray(dst.astype(np.uint8))
+    def __init__(self, min_kernel, max_kernel, iterations, p=1.0):
+        self.min_kernel = min_kernel
+        self.max_kernel = max_kernel
+        self.iterations = iterations
+        self.p = p
+        self.always_apply = False
+
+    def __call__(self, image, force_apply=False):
+        if not (random.random() <= self.p or self.always_apply or force_apply):
+            return {"image": image}
+        kernel_h = randint(self.min_kernel, self.max_kernel)
+        kernel_w = randint(self.min_kernel, self.max_kernel)
+        kernel = np.ones((kernel_h, kernel_w), np.uint8)
+        augmented_image = (
+            Erosion(kernel, iterations=self.iterations)(image)
+            if random.random() < 0.5
+            else Dilation(kernel=kernel, iterations=self.iterations)(image)
+        )
+        return {"image": augmented_image}


-def get_list_augmenters(img, aug_configs, fill_value):
+def get_preprocessing_transforms(preprocessings: list) -> Compose:
    """
-    Randomly select a list of data augmentation techniques to used based on aug_configs
+    Return the Compose of configured resize transforms, ending with ToPILImage.
    """
-    augmenters = list()
-    for aug_config in aug_configs:
-        if rand((1,)) > aug_config["proba"]:
-            continue
-
-        if aug_config["type"] == "zoom_ratio":
-            ratio_h = Uniform(
-                aug_config["min_ratio_h"], aug_config["max_ratio_h"]
-            ).sample()
-            ratio_w = Uniform(
-                aug_config["min_ratio_w"], aug_config["max_ratio_w"]
-            ).sample()
-            augmenters.append(
-                ZoomRatio(
-                    ratio_h=ratio_h, ratio_w=ratio_w, keep_dim=aug_config["keep_dim"]
-                )
-            )
-
-        elif aug_config["type"] == "perspective":
-            scale = Uniform(aug_config["min_factor"], aug_config["max_factor"]).sample()
-            augmenters.append(
-                RandomPerspective(
-                    distortion_scale=scale,
-                    p=1,
-                    interpolation=InterpolationMode.BILINEAR,
-                    fill=fill_value,
-                )
-            )
-
-        elif aug_config["type"] == "elastic_distortion":
-            kernel_size = (
-                randint(
-                    aug_config["min_kernel_size"], aug_config["max_kernel_size"], (1,)
-                ).item()
-            ) // 2 * 2 + 1
-            sigma = (
-                Uniform(aug_config["min_sigma"], aug_config["max_sigma"])
-                .sample()
-                .item()
-            )
-            alpha = (
-                Uniform(aug_config["min_alpha"], aug_config["max_alpha"])
-                .sample()
-                .item()
-            )
-            augmenters.append(
-                ElasticDistortion(
-                    kernel_size=(kernel_size, kernel_size), sigma=sigma, alpha=alpha
+    transforms = []
+    for preprocessing in preprocessings:
+        match preprocessing["type"]:
+            case Preprocessing.MaxResize:
+                transforms.append(
+                    MaxResize(
+                        height=preprocessing["max_height"],
+                        width=preprocessing["max_width"],
+                    )
                )
-            )
-
-        elif aug_config["type"] == "dilation_erosion":
-            kernel_h = randint(
-                aug_config["min_kernel"], aug_config["max_kernel"] + 1, (1,)
-            )
-            kernel_w = randint(
-                aug_config["min_kernel"], aug_config["max_kernel"] + 1, (1,)
-            )
-            if randint(0, 2, (1,)) == 0:
-                augmenters.append(
-                    Erosion((kernel_w, kernel_h), aug_config["iterations"])
-                )
-            else:
-                augmenters.append(
-                    Dilation((kernel_w, kernel_h), aug_config["iterations"])
+            case Preprocessing.FixedHeightResize:
+                transforms.append(
+                    FixedHeightResize(height=preprocessing["fixed_height"])
                )
+            case Preprocessing.FixedWidthResize:
+                transforms.append(FixedWidthResize(width=preprocessing["fixed_width"]))
+    transforms.append(ToPILImage())
+    return Compose(transforms)

-        elif aug_config["type"] == "color_jittering":
-            augmenters.append(
-                ColorJitter(
-                    contrast=aug_config["factor_contrast"],
-                    brightness=aug_config["factor_brightness"],
-                    saturation=aug_config["factor_saturation"],
-                    hue=aug_config["factor_hue"],
-                )
-            )
-
-        elif aug_config["type"] == "gaussian_blur":
-            max_kernel_h = min(aug_config["max_kernel"], img.size[1])
-            max_kernel_w = min(aug_config["max_kernel"], img.size[0])
-            kernel_h = (
-                randint(aug_config["min_kernel"], max_kernel_h + 1, (1,)).item()
-            ) // 2 * 2 + 1
-            kernel_w = (
-                randint(aug_config["min_kernel"], max_kernel_w + 1, (1,)).item()
-            ) // 2 * 2 + 1
-            sigma = (
-                Uniform(aug_config["min_sigma"], aug_config["max_sigma"])
-                .sample()
-                .item()
-            )
-            augmenters.append(
-                GaussianBlur(kernel_size=(kernel_w, kernel_h), sigma=sigma)
-            )
-
-        elif aug_config["type"] == "gaussian_noise":
-            augmenters.append(GaussianNoise(std=aug_config["std"]))
-
-        elif aug_config["type"] == "sharpen":
-            alpha = Uniform(aug_config["min_alpha"], aug_config["max_alpha"]).sample()
-            strength = Uniform(
-                aug_config["min_strength"], aug_config["max_strength"]
-            ).sample()
-            augmenters.append(Sharpen(alpha=alpha, strength=strength))
-
-        else:
-            print("Error - unknown augmentor: {}".format(aug_config["type"]))
-            exit(-1)
-
-    return augmenters
-
-
-def apply_data_augmentation(img, da_config):
+
+def get_augmentation_transforms() -> SomeOf:
    """
-    Apply data augmentation strategy on input image
+    Return an albumentations SomeOf that picks n=2 of the transforms below (p=0.9).
    """
-    if da_config["proba"] != 1 and rand((1,)) > da_config["proba"]:
-        return img
+    return SomeOf(
+        [
+            Perspective(scale=(0.05, 0.09), fit_output=True),
+            GaussianBlur(sigma_limit=2.5),
+            GaussNoise(var_limit=50**2),
+            ColorJitter(contrast=0.2, brightness=0.2, saturation=0.2, hue=0.2),
+            OneOf(
+                [
+                    ElasticTransform(
+                        alpha=20.0, sigma=5.0, alpha_affine=1.0, border_mode=0
+                    ),
+                    PiecewiseAffine(scale=(0.01, 0.04), nb_rows=1, nb_cols=4),
+                ]
+            ),
+            Sharpen(alpha=(0.0, 1.0)),
+            ErosionDilation(min_kernel=1, max_kernel=4, iterations=1),
+            Affine(shear={"x": (-20, 20), "y": (0, 0)}),
+            CoarseDropout(),
+            Downscale(scale_min=0.5, scale_max=0.9, interpolation=INTER_NEAREST),
+            ToGray(),
+        ],
+        n=2,
+        p=0.9,
+    )

-    # Convert to PIL Image
-    img = img[:, :, 0] if img.shape[2] == 1 else img
-    img = Image.fromarray(img)

-    fill_value = da_config["fill_value"] if "fill_value" in da_config else 255
-    augmenters = get_list_augmenters(
-        img, da_config["augmentations"], fill_value=fill_value
-    )
-    if da_config["order"] == "random":
-        random.shuffle(augmenters)
-
-    for augmenter in augmenters:
-        img = augmenter(img)
-
-    # convert to numpy array
-    img = np.array(img)
-    img = np.expand_dims(img, axis=2) if len(img.shape) == 2 else img
-    return img
-
-
-def aug_config(proba_use_da, p):
-    return {
-        "order": "random",
-        "proba": proba_use_da,
-        "augmentations": [
-            {
-                "type": "perspective",
-                "proba": p,
-                "min_factor": 0,
-                "max_factor": 0.4,
-            },
-            {
-                "type": "elastic_distortion",
-                "proba": p,
-                "min_alpha": 0.5,
-                "max_alpha": 1,
-                "min_sigma": 1,
-                "max_sigma": 10,
-                "min_kernel_size": 3,
-                "max_kernel_size": 9,
-            },
-            {
-                "type": "dilation_erosion",
-                "proba": p,
-                "min_kernel": 1,
-                "max_kernel": 3,
-                "iterations": 1,
-            },
-            {
-                "type": "color_jittering",
-                "proba": p,
-                "factor_hue": 0.2,
-                "factor_brightness": 0.4,
-                "factor_contrast": 0.4,
-                "factor_saturation": 0.4,
-            },
-            {
-                "type": "gaussian_blur",
-                "proba": p,
-                "min_kernel": 3,
-                "max_kernel": 5,
-                "min_sigma": 3,
-                "max_sigma": 5,
-            },
-            {
-                "type": "gaussian_noise",
-                "proba": p,
-                "std": 0.5,
-            },
-            {
-                "type": "sharpen",
-                "proba": p,
-                "min_alpha": 0,
-                "max_alpha": 1,
-                "min_strength": 0,
-                "max_strength": 1,
-            },
-        ],
-    }
+def get_normalization_transforms() -> Compose:
+    """
+    Return a Compose of ToTensor and Normalize (ImageNet mean and std).
+    """
+    return Compose([ToTensor(), Normalize(IMAGENET_MEAN, IMAGENET_STD)])
--- a/dan/utils.py
+++ b/dan/utils.py
 # -*- coding: utf-8 -*-
 import cv2
-import numpy as np
+import torch

 # Layout begin-token to end-token
 SEM_MATCHING_TOKENS = {"ⓘ": "Ⓘ", "ⓓ": "Ⓓ", "ⓢ": "Ⓢ", "ⓒ": "Ⓒ", "ⓟ": "Ⓟ", "ⓐ": "Ⓐ"}
@@ -18,9 +18,9 @@ def pad_sequences_1D(data, padding_value):
    """
    x_lengths = [len(x) for x in data]
    longest_x = max(x_lengths)
-    padded_data = np.ones((len(data), longest_x)).astype(np.int32) * padding_value
+    padded_data = torch.ones((len(data), longest_x), dtype=torch.int32) * padding_value
    for i, x_len in enumerate(x_lengths):
-        padded_data[i, :x_len] = data[i][:x_len]
+        padded_data[i, :x_len] = torch.tensor(data[i][:x_len])
    return padded_data


@@ -30,19 +30,19 @@ def pad_images(data):
    :param data: List of numpy arrays.
    :return padded_data: A tensor containing all the padded images.
    """
-    longest_x = max([x.shape[0] for x in data])
-    longest_y = max([x.shape[1] for x in data])
-    padded_data = np.zeros((len(data), longest_x, longest_y, data[0].shape[2]))
+    longest_x = max([x.shape[1] for x in data])
+    longest_y = max([x.shape[2] for x in data])
+    padded_data = torch.zeros((len(data), data[0].shape[0], longest_x, longest_y))
    for index, image in enumerate(data):
-        delta_x = longest_x - image.shape[0]
-        delta_y = longest_y - image.shape[1]
+        delta_x = longest_x - image.shape[1]
+        delta_y = longest_y - image.shape[2]
        top, bottom = delta_x // 2, delta_x - (delta_x // 2)
        left, right = delta_y // 2, delta_y - (delta_y // 2)
        padded_data[
            index,
-            top : padded_data.shape[1] - bottom,
-            left : padded_data.shape[2] - right,
            :,
+            top : padded_data.shape[2] - bottom,
+            left : padded_data.shape[3] - right,
        ] = image
    return padded_data


--- a/docs/assets/augmentations/document_color_jitter.png
+++ b/docs/assets/augmentations/document_color_jitter.png
--- a/docs/assets/augmentations/document_downscale.png
+++ b/docs/assets/augmentations/document_downscale.png
--- a/docs/assets/augmentations/document_dropout.png
+++ b/docs/assets/augmentations/document_dropout.png
--- a/docs/assets/augmentations/document_elastic.png
+++ b/docs/assets/augmentations/document_elastic.png
--- a/docs/assets/augmentations/document_erosion_dilation.png
+++ b/docs/assets/augmentations/document_erosion_dilation.png
--- a/docs/assets/augmentations/document_full_pipeline.png
+++ b/docs/assets/augmentations/document_full_pipeline.png
--- a/docs/assets/augmentations/document_full_pipeline_2.png
+++ b/docs/assets/augmentations/document_full_pipeline_2.png
--- a/docs/assets/augmentations/document_gaussian_blur.png
+++ b/docs/assets/augmentations/document_gaussian_blur.png
--- a/docs/assets/augmentations/document_gaussian_noise.png
+++ b/docs/assets/augmentations/document_gaussian_noise.png
--- a/docs/assets/augmentations/document_grayscale.png
+++ b/docs/assets/augmentations/document_grayscale.png
--- a/docs/assets/augmentations/document_perspective.png
+++ b/docs/assets/augmentations/document_perspective.png
--- a/docs/assets/augmentations/document_piecewise.png
+++ b/docs/assets/augmentations/document_piecewise.png
--- a/docs/assets/augmentations/document_sharpen.png
+++ b/docs/assets/augmentations/document_sharpen.png
No results found