Verified Commit 8dace579 authored by Mélodie Boillet
Fix rebase

parent f627d8e6
Merge request !224: Fix version 0.2.0-dev3 and later
@@ -22,18 +22,20 @@ class OCRDataset(Dataset):
charset,
tokens,
preprocessing_transforms,
normalization_transforms,
augmentation_transforms,
load_in_memory=False,
mean=None,
std=None,
):
self.set_name = set_name
self.charset = charset
self.tokens = tokens
self.load_in_memory = load_in_memory
self.mean = mean
self.std = std
# Pre-processing, augmentation, normalization
# Pre-processing, augmentation
self.preprocessing_transforms = preprocessing_transforms
self.normalization_transforms = normalization_transforms
self.augmentation_transforms = augmentation_transforms
# Factor to reduce the height and width of the feature vector before feeding the decoder.
@@ -55,20 +57,20 @@ class OCRDataset(Dataset):
"""
Return an item from the dataset (image and label)
"""
# Load preprocessed image
sample = dict(**self.samples[idx])
sample = copy.deepcopy(self.samples[idx])
if not self.load_in_memory:
sample["img"] = self.get_sample_img(idx)
# Convert to numpy
sample["img"] = np.array(sample["img"])
# Apply data augmentation
if self.augmentation_transforms:
sample["img"] = self.augmentation_transforms(image=np.array(sample["img"]))[
"image"
]
sample["img"] = self.augmentation_transforms(image=sample["img"])["image"]
# Image normalization
sample["img"] = self.normalization_transforms(sample["img"])
sample["img"] = (sample["img"] - self.mean) / self.std
# Get final height and width
sample["img_reduced_shape"], sample["img_position"] = self.compute_final_size(
@@ -113,28 +115,51 @@ class OCRDataset(Dataset):
def get_sample_img(self, i):
"""
Compute the final image size and position after feature extraction
Get image by index
"""
if self.load_in_memory:
return self.samples[i]["img"]
return self.preprocessing_transforms(read_image(self.samples[i]["path"]))
def compute_std_mean(self):
"""
Compute the mean and standard deviation of the whole dataset
"""
if self.mean is not None and self.std is not None:
return self.mean, self.std
sum = np.zeros((3,))
diff = np.zeros((3,))
nb_pixels = 0
for metric in ["mean", "std"]:
for ind in range(len(self.samples)):
img = np.array(self.get_sample_img(ind))
if metric == "mean":
sum += np.sum(img, axis=(0, 1))
nb_pixels += np.prod(img.shape[:2])
elif metric == "std":
diff += [
np.sum((img[:, :, k] - self.mean[k]) ** 2) for k in range(3)
]
if metric == "mean":
self.mean = sum / nb_pixels
elif metric == "std":
self.std = np.sqrt(diff / nb_pixels)
return self.mean, self.std
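The method above makes two passes over the dataset: the first accumulates channel sums for the mean, the second accumulates squared deviations for the standard deviation. A self-contained check of that logic against NumPy's own estimators (toy images, random values):

```python
import numpy as np

images = [np.random.rand(h, w, 3) for h, w in [(32, 64), (48, 40)]]
nb_pixels = sum(np.prod(img.shape[:2]) for img in images)

total = np.zeros((3,))
for img in images:
    total += img.sum(axis=(0, 1))          # first pass: channel sums
mean = total / nb_pixels

diff = np.zeros((3,))
for img in images:
    diff += [((img[:, :, k] - mean[k]) ** 2).sum() for k in range(3)]
std = np.sqrt(diff / nb_pixels)            # second pass: squared deviations

# Same result as flattening every pixel per channel
pixels = np.concatenate([img.reshape(-1, 3) for img in images])
assert np.allclose(mean, pixels.mean(axis=0))
assert np.allclose(std, pixels.std(axis=0))
```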
def compute_final_size(self, img):
"""
Compute the final image size and position after feature extraction
"""
final_c, final_h, final_w = img.shape
image_reduced_shape = np.ceil(
[final_h, final_w, final_c] / self.reduce_dims_factor
).astype(int)
image_reduced_shape = np.ceil(img.shape / self.reduce_dims_factor).astype(int)
if self.set_name == "train":
image_reduced_shape = [max(1, t) for t in image_reduced_shape]
image_position = [
[0, final_h],
[0, final_w],
[0, img.shape[0]],
[0, img.shape[1]],
]
return image_reduced_shape, image_position
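Note that the new one-liner works because `img.shape` (a plain tuple) divided by a NumPy array broadcasts element-wise. A quick check of the arithmetic (the reduction factors here are illustrative, not taken from the diff):

```python
import numpy as np

img_shape = (3, 128, 1024)                 # (C, H, W) after preprocessing
reduce_dims_factor = np.array([1, 32, 8])  # hypothetical per-dimension factors

reduced = np.ceil(img_shape / reduce_dims_factor).astype(int)
print(reduced)                             # [  3   4 128]

# During training, every dimension is clamped to at least 1
reduced = [max(1, t) for t in reduced]
```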
......
@@ -9,11 +9,7 @@ from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from dan.manager.dataset import OCRDataset
from dan.transforms import (
get_augmentation_transforms,
get_normalization_transforms,
get_preprocessing_transforms,
)
from dan.transforms import get_augmentation_transforms, get_preprocessing_transforms
from dan.utils import pad_images, pad_sequences_1D
@@ -36,6 +32,17 @@ class OCRDatasetManager:
self.valid_samplers = dict()
self.test_samplers = dict()
self.mean = (
np.array(params["config"]["mean"])
if "mean" in params["config"].keys()
else None
)
self.std = (
np.array(params["config"]["std"])
if "std" in params["config"].keys()
else None
)
self.generator = torch.Generator()
self.generator.manual_seed(0)
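The manager now reads optional dataset statistics straight from the configuration; the same pattern on a toy config dict (values are made up):

```python
import numpy as np

config = {"mean": [166.8, 166.8, 166.8]}  # "std" left out on purpose

mean = np.array(config["mean"]) if "mean" in config else None
std = np.array(config["std"]) if "std" in config else None

# Whatever is still None is filled in later by train_dataset.compute_std_mean()
assert mean is not None and std is None
```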
@@ -49,7 +56,6 @@ class OCRDatasetManager:
self.params["config"]["padding_token"] = self.tokens["pad"]
self.my_collate_function = OCRCollateFunction(self.params["config"])
self.normalization = get_normalization_transforms(from_pil_image=True)
self.augmentation = (
get_augmentation_transforms()
if self.params["config"]["augmentation"]
@@ -69,11 +75,14 @@ class OCRDatasetManager:
charset=self.charset,
tokens=self.tokens,
preprocessing_transforms=self.preprocessing,
normalization_transforms=self.normalization,
augmentation_transforms=self.augmentation,
load_in_memory=self.load_in_memory,
mean=self.mean,
std=self.std,
)
self.mean, self.std = self.train_dataset.compute_std_mean()
for custom_name in self.params["val"].keys():
self.valid_datasets[custom_name] = OCRDataset(
set_name="val",
@@ -81,9 +90,10 @@ class OCRDatasetManager:
charset=self.charset,
tokens=self.tokens,
preprocessing_transforms=self.preprocessing,
normalization_transforms=self.normalization,
augmentation_transforms=None,
load_in_memory=self.load_in_memory,
mean=self.mean,
std=self.std,
)
def load_ddp_samplers(self):
@@ -167,9 +177,10 @@ class OCRDatasetManager:
charset=self.charset,
tokens=self.tokens,
preprocessing_transforms=self.preprocessing,
normalization_transforms=self.normalization,
augmentation_transforms=None,
load_in_memory=self.load_in_memory,
mean=self.mean,
std=self.std,
)
if self.params["use_ddp"]:
@@ -181,6 +192,7 @@
)
else:
self.test_samplers[custom_name] = None
self.test_loaders[custom_name] = DataLoader(
self.test_datasets[custom_name],
batch_size=1,
......
# -*- coding: utf-8 -*-
import os
import random
from copy import deepcopy
from enum import Enum
from time import time
@@ -452,22 +453,68 @@ class GenericTrainingManager:
def save_params(self):
"""
Output yaml file containing a summary of all hyperparameters chosen for the training
Output a yaml file containing a summary of all hyperparameters chosen for the training
and a yaml file containing parameters used for inference
"""
path = os.path.join(self.paths["results"], "parameters.yml")
def compute_nb_params(module):
return sum([np.prod(p.size()) for p in list(module.parameters())])
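`compute_nb_params` sums the element count of every parameter tensor. A quick sanity check on a toy module (the `nn.Linear` here is purely illustrative):

```python
import numpy as np
import torch.nn as nn

module = nn.Linear(10, 4)  # 10*4 weights + 4 biases = 44 parameters
nb_params = sum(np.prod(p.size()) for p in module.parameters())
assert nb_params == 44
print("{:,}".format(nb_params))  # thousands separators, as used below
```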
def class_to_str_dict(my_dict):
for key in my_dict.keys():
if key == "preprocessings":
my_dict[key] = [
{
key: value.value if isinstance(value, Enum) else value
for key, value in preprocessing.items()
}
for preprocessing in my_dict[key]
]
elif callable(my_dict[key]):
my_dict[key] = my_dict[key].__name__
elif isinstance(my_dict[key], np.ndarray):
my_dict[key] = my_dict[key].tolist()
elif isinstance(my_dict[key], list) and isinstance(
my_dict[key][0], tuple
):
my_dict[key] = [list(elt) for elt in my_dict[key]]
elif isinstance(my_dict[key], dict):
my_dict[key] = class_to_str_dict(my_dict[key])
return my_dict
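The recursive walk exists because `yaml.dump` cannot serialize NumPy arrays, Enum members, or callables in a readable way. A sketch of the conversions it performs, on toy values:

```python
import numpy as np
import yaml
from enum import Enum

class Preprocessing(str, Enum):
    MaxResize = "max_resize"

raw = {
    "mean": np.array([166.8]),        # ndarray  -> plain list
    "type": Preprocessing.MaxResize,  # Enum     -> its value
    "fn": np.tanh,                    # callable -> its __name__
}
clean = {
    "mean": raw["mean"].tolist(),
    "type": raw["type"].value,
    "fn": raw["fn"].__name__,
}
print(yaml.dump(clean))  # plain YAML, no Python object tags
```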
# Save training parameters
path = os.path.join(self.paths["results"], "training_parameters.yml")
if os.path.isfile(path):
return
params = class_to_str_dict(my_dict=deepcopy(self.params))
total_params = 0
for model_name in self.models.keys():
current_params = compute_nb_params(self.models[model_name])
params["model_params"]["models"][model_name] = [
params["model_params"]["models"][model_name],
"{:,}".format(current_params),
]
total_params += current_params
params["model_params"]["total_params"] = "{:,}".format(total_params)
params["mean"] = self.dataset.mean.tolist()
params["std"] = self.dataset.std.tolist()
with open(path, "w") as f:
yaml.dump(params, f)
params = {
# Save inference parameters
path = os.path.join(self.paths["results"], "inference_parameters.yml")
if os.path.isfile(path):
return
inference_params = {
"parameters": {
"max_char_prediction": self.params["training_params"][
"max_char_prediction"
],
"mean": params["mean"],
"std": params["std"],
"max_char_prediction": params["training_params"]["max_char_prediction"],
"encoder": {
"dropout": self.params["model_params"]["dropout"],
"dropout": params["model_params"]["dropout"],
},
"decoder": {
key: self.params["model_params"][key]
key: params["model_params"][key]
for key in [
"enc_dim",
"l_max",
@@ -483,20 +530,11 @@
"attention_win",
]
},
"preprocessings": [
{
key: value.value if isinstance(value, Enum) else value
for key, value in preprocessing.items()
}
for preprocessing in self.params["dataset_params"]["config"].get(
"preprocessings", []
)
],
"preprocessings": params["dataset_params"]["config"]["preprocessings"],
},
}
with open(path, "w") as f:
yaml.dump(params, f)
yaml.dump(inference_params, f)
def backward_loss(self, loss, retain_graph=False):
self.scaler.scale(loss).backward(retain_graph=retain_graph)
......
@@ -303,7 +303,6 @@ def plot_attention(
:param line_separators: List of line separators
:param display_polygons: Whether to plot extracted polygons
"""
image = to_pil_image(image)
attention_map = []
......
@@ -19,7 +19,7 @@ from dan.predict.attention import (
plot_attention,
split_text_and_confidences,
)
from dan.transforms import get_normalization_transforms, get_preprocessing_transforms
from dan.transforms import get_preprocessing_transforms
from dan.utils import ind_to_token, list_to_batches, pad_images, read_image
@@ -74,7 +74,10 @@ class DAN:
self.encoder = encoder
self.decoder = decoder
self.normalization = get_normalization_transforms()
self.mean, self.std = (
torch.tensor(parameters["mean"]) / 255,
torch.tensor(parameters["std"]) / 255,
)
self.preprocessing_transforms = get_preprocessing_transforms(
parameters.get("preprocessings", [])
)
@@ -87,7 +90,12 @@
"""
image = read_image(path)
preprocessed_image = self.preprocessing_transforms(image)
return preprocessed_image, self.normalization(preprocessed_image)
normalized_image = torch.zeros(preprocessed_image.shape)
for ch in range(preprocessed_image.shape[0]):
normalized_image[ch, :, :] = (
preprocessed_image[ch, :, :] - self.mean[ch]
) / self.std[ch]
return preprocessed_image, normalized_image
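The channel loop is equivalent to broadcasting the statistics over the spatial dimensions; a self-contained check on a dummy tensor (shapes and values are made up):

```python
import torch

image = torch.rand(3, 64, 128)                    # (C, H, W)
mean = torch.tensor([166.8, 166.8, 166.8]) / 255
std = torch.tensor([34.1, 34.1, 34.1]) / 255

# Reshaping to (C, 1, 1) broadcasts over H and W, one value per channel
vectorized = (image - mean[:, None, None]) / std[:, None, None]

looped = torch.zeros(image.shape)
for ch in range(image.shape[0]):
    looped[ch] = (image[ch] - mean[ch]) / std[ch]

assert torch.allclose(vectorized, looped)
```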
def predict(
self,
@@ -276,7 +284,6 @@ def process_batch(
# Convert to tensor of size (batch_size, channel, height, width) with batch_size=1
input_tensor = pad_images(input_images).to(device)
visu_tensor = pad_images(visu_images).to(device)
logger.info("Images preprocessed!")
# Parse delimiters to regex
@@ -297,8 +304,8 @@
threshold_method=threshold_method,
threshold_value=threshold_value,
)
logger.info("Prediction parsing...")
logger.info("Prediction parsing...")
for idx, image_path in enumerate(image_batch):
predicted_text = prediction["text"][idx]
result = {"text": predicted_text}
@@ -319,7 +326,6 @@
]
# calculates scores by token
result["confidences"]["by ner token"] = [
{
"text": f"{predicted_text[current: next_token]}".replace("\n", " "),
......
@@ -5,8 +5,8 @@ Each transform class defined here takes as input a PIL Image and returns the mod
from enum import Enum
from random import randint
import albumentations as A
import numpy as np
from albumentations import SomeOf
from albumentations.augmentations import (
Affine,
CoarseDropout,
@@ -18,15 +18,15 @@ from albumentations.augmentations import (
Sharpen,
ToGray,
)
from cv2 import INTER_NEAREST, dilate, erode
from albumentations.core.transforms_interface import ImageOnlyTransform
from cv2 import dilate, erode
from numpy import random
from PIL import Image
from torch import Tensor
from torchvision.transforms import Compose, Normalize, ToPILImage, ToTensor
from torch.distributions.uniform import Uniform
from torchvision.transforms import Compose, ToPILImage
from torchvision.transforms.functional import resize
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]
class Preprocessing(str, Enum):
# If the image is bigger than the given size, resize it while keeping the original ratio
@@ -200,43 +200,38 @@ def get_preprocessing_transforms(
)
case Preprocessing.FixedWidthResize:
transforms.append(FixedWidthResize(width=preprocessing["fixed_width"]))
if to_pil_image:
transforms.append(ToPILImage())
return Compose(transforms)
def get_augmentation_transforms() -> SomeOf:
def get_augmentation_transforms() -> A.Compose:
"""
Returns a list of transformations to be applied to the image.
Returns the transformation pipeline to be applied to the image.
"""
return SomeOf(
return A.Compose(
[
Perspective(scale=(0.05, 0.09), fit_output=True),
GaussianBlur(sigma_limit=2.5),
GaussNoise(var_limit=50**2),
ColorJitter(contrast=0.2, brightness=0.2, saturation=0.2, hue=0.2),
ElasticTransform(alpha=20.0, sigma=5.0, alpha_affine=1.0, border_mode=0),
Sharpen(alpha=(0.0, 1.0)),
ErosionDilation(min_kernel=1, max_kernel=4, iterations=1),
Affine(shear={"x": (-20, 20), "y": (0, 0)}),
CoarseDropout(),
Downscale(scale_min=0.5, scale_max=0.9, interpolation=INTER_NEAREST),
ToGray(),
DPIAdjusting(min_factor=0.75, max_factor=1),
A.SomeOf(
[
ErosionDilation(min_kernel=1, max_kernel=4, iterations=1),
Perspective(scale=(0.05, 0.09), fit_output=True, p=0.4),
GaussianBlur(sigma_limit=2.5, p=1),
GaussNoise(var_limit=50**2, p=1),
ColorJitter(
contrast=0.2, brightness=0.2, saturation=0.2, hue=0.2, p=1
),
ElasticTransform(
alpha=20.0, sigma=5.0, alpha_affine=1.0, border_mode=0, p=1
),
Sharpen(alpha=(0.0, 1.0), p=1),
Affine(shear={"x": (-20, 20), "y": (0, 0)}, p=1),
CoarseDropout(p=1),
ToGray(p=0.5),
],
n=2,
p=0.9,
),
],
p=0.9,
)
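A usage sketch for the rebuilt pipeline (assuming `albumentations` is installed; the input image is just random noise). Albumentations transforms are called with keyword arguments and return a dict:

```python
import numpy as np

pipeline = get_augmentation_transforms()
img = np.random.randint(0, 256, size=(128, 512, 3), dtype=np.uint8)

augmented = pipeline(image=img)["image"]
# With p=0.9 on the outer Compose, roughly one image in ten passes through
# unchanged; otherwise the inner SomeOf draws n=2 transforms from its list.
```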
def get_normalization_transforms(from_pil_image: bool = False) -> Compose:
"""
Returns a list of normalization transformations.
"""
transforms = []
if from_pil_image:
transforms.append(ToTensor())
transforms.append(Normalize(IMAGENET_MEAN, IMAGENET_STD))
return Compose(transforms)
@@ -32,19 +32,15 @@ def pad_images(images):
:param images: List of images as torch tensors.
:return padded_images: A tensor containing all the padded images.
"""
longest_x = max([x.shape[1] for x in data])
longest_y = max([x.shape[2] for x in data])
padded_data = torch.zeros((len(data), data[0].shape[0], longest_x, longest_y))
for index, image in enumerate(data):
delta_x = longest_x - image.shape[1]
delta_y = longest_y - image.shape[2]
top, bottom = delta_x // 2, delta_x - (delta_x // 2)
left, right = delta_y // 2, delta_y - (delta_y // 2)
padded_data[
longest_x = max([x.shape[1] for x in images])
longest_y = max([x.shape[2] for x in images])
padded_images = torch.zeros((len(images), images[0].shape[0], longest_x, longest_y))
for index, image in enumerate(images):
padded_images[
index,
:,
top : padded_data.shape[2] - bottom,
left : padded_data.shape[3] - right,
0 : image.shape[1],
0 : image.shape[2],
] = image
return padded_images
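The rewrite anchors every image at the top-left corner instead of centering it in the padded canvas. A quick check on two dummy tensors:

```python
import torch

a = torch.rand(3, 100, 200)
b = torch.rand(3, 120, 150)
padded = pad_images([a, b])

assert padded.shape == (2, 3, 120, 200)          # max height, max width
assert torch.equal(padded[0, :, :100, :200], a)  # content sits top-left
assert torch.equal(padded[1, :, :120, :150], b)
assert padded[0, :, 100:, :].abs().sum() == 0    # the remainder is zero padding
```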
......
@@ -42,4 +42,4 @@ The training command does not take any input parameters for now. To train a DAN
## 3. Predict
Once the training is complete, you can apply a trained DAN model on an image using the [predict command](../usage/predict.md).
Once the training is complete, you can apply a trained DAN model on an image using the [predict command](../usage/predict.md) and the `inference_parameters.yml` file, located in `{training_params.output_folder}/results`.
@@ -6,26 +6,26 @@ This page lists data augmentation transforms used in DAN.
### Elastic Transform
| | Elastic Transform |
| ---------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation applies local distortions that rotate characters locally. |
| Comments | The impact of this transformation is mostly visible on documents, not so much on lines. Results are comparable to the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/geometric/transforms/#albumentations.augmentations.geometric.transforms.ElasticTransform). |
| Examples | ![](../../assets/augmentations/line_elastic.png) ![](../../assets/augmentations/document_elastic.png) |
| CPU time (seconds/10 images) | 0.44 (3013x128 pixels) / 0.86 (1116x581 pixels) |
| | Elastic Transform |
| ---------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation applies local distortions that rotate characters locally. |
| Comments | The impact of this transformation is mostly visible on documents, not so much on lines. Results are comparable to the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/geometric/transforms/#albumentations.augmentations.geometric.transforms.ElasticTransform) |
| Examples | ![](../../assets/augmentations/line_elastic.png) ![](../../assets/augmentations/document_elastic.png) |
| CPU time (seconds/10 images) | 0.44 (3013x128 pixels) / 0.86 (1116x581 pixels) |
### PieceWise Affine
!!! warning
This transform is temporarily removed from the pipeline until [this issue](https://github.com/albumentations-team/albumentations/issues/1442) is fixed.
| | PieceWise Affine |
| ---------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation also applies local distortions but with a larger grid than ElasticTransform. |
| Comments | This transformation is very slow. It is a new transform that was not in the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/geometric/transforms/#albumentations.augmentations.geometric.transforms.PiecewiseAffine). |
| Examples | ![](../../assets/augmentations/line_piecewise.png) ![](../../assets/augmentations/document_piecewise.png) |
| CPU time (seconds/10 images) | 2.92 (3013x128 pixels) / 3.76 (1116x581 pixels) |
| | PieceWise Affine |
| ---------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation also applies local distortions but with a larger grid than ElasticTransform. |
| Comments | This transformation is very slow. It is a new transform that was not in the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/geometric/transforms/#albumentations.augmentations.geometric.transforms.PiecewiseAffine) |
| Examples | ![](../../assets/augmentations/line_piecewise.png) ![](../../assets/augmentations/document_piecewise.png) |
| CPU time (seconds/10 images) | 2.92 (3013x128 pixels) / 3.76 (1116x581 pixels) |
### Dilation Erosion
@@ -33,99 +33,96 @@ This page lists data augmentation transforms used in DAN.
| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| Description | This transformation makes the pen stroke thicker or thinner. |
| Comments | The `RandomDilationErosion` class randomly selects a kernel size and applies a dilation or an erosion to the image. It relies on opencv and is similar to the original DAN implementation. |
| Documentation | See the [`opencv` documentation](https://docs.opencv.org/3.4/db/df6/tutorial_erosion_dilatation.html). |
| Documentation | See the [`opencv` documentation](https://docs.opencv.org/3.4/db/df6/tutorial_erosion_dilatation.html) |
| Examples | ![](../../assets/augmentations/line_erosion_dilation.png) ![](../../assets/augmentations/document_erosion_dilation.png) |
| CPU time (seconds/10 images) | 0.02 (3013x128 pixels) / 0.03 (1116x581 pixels) |
### Sharpen
| | Sharpen |
| ---------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation makes the image sharper. |
| Comments | Similar to the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.Sharpen). |
| Examples | ![](../../assets/augmentations/line_sharpen.png) ![](../../assets/augmentations/document_sharpen.png) |
| CPU time (seconds/10 images) | 0.02 (3013x128 pixels) / 0.04 (1116x581 pixels) |
| | Sharpen |
| ---------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation makes the image sharper. |
| Comments | Similar to the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.Sharpen) |
| Examples | ![](../../assets/augmentations/line_sharpen.png) ![](../../assets/augmentations/document_sharpen.png) |
| CPU time (seconds/10 images) | 0.02 (3013x128 pixels) / 0.04 (1116x581 pixels) |
### Color Jittering
| | Color Jittering |
| ---------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation alters the colors of the image. |
| Comments | Similar to the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.ColorJitter). |
| Examples | ![](../../assets/augmentations/line_color_jitter.png) ![](../../assets/augmentations/document_color_jitter.png) |
| CPU time (seconds/10 images) | 0.03 (3013x128 pixels) / 0.04 (1116x581 pixels) |
| | Color Jittering |
| ---------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation alters the colors of the image. |
| Comments | Similar to the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.ColorJitter) |
| Examples | ![](../../assets/augmentations/line_color_jitter.png) ![](../../assets/augmentations/document_color_jitter.png) |
| CPU time (seconds/10 images) | 0.03 (3013x128 pixels) / 0.04 (1116x581 pixels) |
### Gaussian Noise
| | Gaussian Noise |
| ---------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation adds Gaussian noise to the image. |
| Comments | The noise from the original DAN implementation is more uniform. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.GaussianNoise). |
| Examples | ![](../../assets/augmentations/line_gaussian_noise.png) ![](../../assets/augmentations/document_gaussian_noise.png) |
| CPU time (seconds/10 images) | 0.29 (3013x128 pixels) / 0.53 (1116x581 pixels) |
| | Gaussian Noise |
| ---------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation adds Gaussian noise to the image. |
| Comments | The noise from the original DAN implementation is more uniform. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.GaussianNoise) |
| Examples | ![](../../assets/augmentations/line_gaussian_noise.png) ![](../../assets/augmentations/document_gaussian_noise.png) |
| CPU time (seconds/10 images) | 0.29 (3013x128 pixels) / 0.53 (1116x581 pixels) |
### Gaussian Blur
| | Gaussian Blur |
| ---------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation blurs the image. |
| Comments | Similar to the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.GaussianBlur). |
| Examples | ![](../../assets/augmentations/line_gaussian_blur.png) ![](../../assets/augmentations/document_gaussian_blur.png) |
| CPU time (seconds/10 images) | 0.01 (3013x128 pixels) / 0.02 (1116x581 pixels) |
| | Gaussian Blur |
| ---------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation blurs the image. |
| Comments | Similar to the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.GaussianBlur) |
| Examples | ![](../../assets/augmentations/line_gaussian_blur.png) ![](../../assets/augmentations/document_gaussian_blur.png) |
| CPU time (seconds/10 images) | 0.01 (3013x128 pixels) / 0.02 (1116x581 pixels) |
### Random Perspective
| | Random Perspective |
| ---------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation changes the perspective from which the photo is taken. |
| Comments | Similar to the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.Perspective). |
| Examples | ![](../../assets/augmentations/line_perspective.png) ![](../../assets/augmentations/document_perspective.png) |
| CPU time (seconds/10 images) | 0.05 (3013x128 pixels) / 0.05 (1116x581 pixels) |
| | Random Perspective |
| ---------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation changes the perspective from which the photo is taken. |
| Comments | Similar to the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.Perspective) |
| Examples | ![](../../assets/augmentations/line_perspective.png) ![](../../assets/augmentations/document_perspective.png) |
| CPU time (seconds/10 images) | 0.05 (3013x128 pixels) / 0.05 (1116x581 pixels) |
### Shearing (x-axis)
| | Shearing (x-axis) |
| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| Description | This transformation changes the slant of the text on the image. |
| Comments | New transform that was not in the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/geometric/transforms/#albumentations.augmentations.geometric.transforms.Affine). |
| Examples | ![](../../assets/augmentations/line_shearx.png) ![](../../assets/augmentations/document_shearx.png) |
| CPU time (seconds/10 images) | 0.05 (3013x128 pixels) / 0.04 (1116x581 pixels) |
| | Shearing (x-axis) |
| ---------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation changes the slant of the text on the image. |
| Comments | New transform that was not in the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/geometric/transforms/#albumentations.augmentations.geometric.transforms.Affine) |
| Examples | ![](../../assets/augmentations/line_shearx.png) ![](../../assets/augmentations/document_shearx.png) |
| CPU time (seconds/10 images) | 0.05 (3013x128 pixels) / 0.04 (1116x581 pixels) |
### Coarse Dropout
| | Coarse Dropout |
| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation adds dropout on the image, turning small patches into black pixels. |
| Comments | It is a new transform that was not in the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/dropout/coarse_dropout/#coarsedropout-augmentation-augmentationsdropoutcoarse_dropout). |
| Examples | ![](../../assets/augmentations/line_dropout.png) ![](../../assets/augmentations/document_dropout.png) |
| CPU time (seconds/10 images) | 0.02 (3013x128 pixels) / 0.02 (1116x581 pixels) |
| | Coarse Dropout |
| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| Description | This transformation adds dropout on the image, turning small patches into black pixels. |
| Comments | It is a new transform that was not in the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/dropout/coarse_dropout/#coarsedropout-augmentation-augmentationsdropoutcoarse_dropout) |
| Examples | ![](../../assets/augmentations/line_dropout.png) ![](../../assets/augmentations/document_dropout.png) |
| CPU time (seconds/10 images) | 0.02 (3013x128 pixels) / 0.02 (1116x581 pixels) |
### DPIAdjusting
| | Downscale |
| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation downscales the image by a random factor. |
| Comments | It is a new transform that was not in the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.Downscale). |
| Examples | ![](../../assets/augmentations/line_downscale.png) ![](../../assets/augmentations/document_downscale.png) |
| CPU time (seconds/10 images) | 0.03 (3013x128 pixels) / 0.03 (1116x581 pixels) |
| | DPIAdjusting |
| ----------- | -------------------------------------------------------------- |
| Description | This transformation downscales the image by a random factor.  |
| Comments | Similar to the original DAN implementation. |
### ToGray
| | Grayscale |
| ---------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation transforms an RGB image into grayscale. |
| Comments | It is a new transform that was not in the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.ToGray). |
| Examples | ![](../../assets/augmentations/line_grayscale.png) ![](../../assets/augmentations/document_grayscale.png) |
| CPU time (seconds/10 images) | 0.02 (3013x128 pixels) / 0.02 (1116x581 pixels) |
| | ToGray |
| ---------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation transforms an RGB image into grayscale. |
| Comments | It is a new transform that was not in the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.ToGray) |
| Examples | ![](../../assets/augmentations/line_grayscale.png) ![](../../assets/augmentations/document_grayscale.png) |
| CPU time (seconds/10 images) | 0.02 (3013x128 pixels) / 0.02 (1116x581 pixels) |
## Full augmentation pipeline
......
@@ -16,6 +16,7 @@ All hyperparameters are specified and editable in the training scripts `dan/ocr/
| `dataset_params.config.augmentation` | Whether to use data augmentation on the training set. | `bool` | `True` (see [dedicated section](#data-augmentation)) |
!!! warning
The variables `dataset_name`, `dataset_level`, `dataset_variant` and `dataset_path` must have values such that the data is located in `{dataset_path}/{dataset_name}_{dataset_level}{dataset_variant}`.
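As a concrete illustration of how the pieces compose (all values below are made up):

```py
dataset_path = "/data"
dataset_name = "my_dataset"
dataset_level = "page"
dataset_variant = "_v1"

# The expected location of the data
path = f"{dataset_path}/{dataset_name}_{dataset_level}{dataset_variant}"
print(path)  # /data/my_dataset_page_v1
```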
### Data preprocessing
@@ -90,21 +91,30 @@ DAN takes advantage of transforms from [albumentations](https://albumentations.a
The following configuration is used by default when using the `teklia-dan train document` command. Data augmentation is applied with a probability of 0.9. In this case, two transformations are randomly selected to be applied.
```py
transforms = SomeOf(
transforms = A.Compose(
[
Perspective(scale=(0.05, 0.09), fit_output=True),
GaussianBlur(sigma_limit=2.5),
GaussNoise(var_limit=50**2),
ColorJitter(contrast=0.2, brightness=0.2, saturation=0.2, hue=0.2),
ElasticTransform(alpha=20.0, sigma=5.0, alpha_affine=1.0, border_mode=0),
Sharpen(alpha=(0.0, 1.0)),
ErosionDilation(min_kernel=1, max_kernel=4, iterations=1),
Affine(shear={"x": (-20, 20), "y": (0, 0)}),
CoarseDropout(),
Downscale(scale_min=0.5, scale_max=0.9, interpolation=INTER_NEAREST),
ToGray(),
DPIAdjusting(min_factor=0.75, max_factor=1),
A.SomeOf(
[
ErosionDilation(min_kernel=1, max_kernel=4, iterations=1),
Perspective(scale=(0.05, 0.09), fit_output=True, p=0.4),
GaussianBlur(sigma_limit=2.5, p=1),
GaussNoise(var_limit=50**2, p=1),
ColorJitter(
contrast=0.2, brightness=0.2, saturation=0.2, hue=0.2, p=1
),
ElasticTransform(
alpha=20.0, sigma=5.0, alpha_affine=1.0, border_mode=0, p=1
),
Sharpen(alpha=(0.0, 1.0), p=1),
Affine(shear={"x": (-20, 20), "y": (0, 0)}, p=1),
CoarseDropout(p=1),
ToGray(p=0.5),
],
n=2,
p=0.9,
),
],
n=2,
p=0.9,
)
```
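As a usage sketch (assuming `image` is an HWC `numpy` array; the pipeline returns a dict keyed by target):

```py
augmented = transforms(image=image)["image"]
```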
......
---
parameters:
mean: [166.8418783515498, 166.8418783515498, 166.8418783515498]
std: [34.084189571536385, 34.084189571536385, 34.084189571536385]
max_char_prediction: 200
encoder:
dropout: 0.5
......
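A sketch of consuming this file the way the predictor does: the stored 0-255 statistics are rescaled to 0-1 before normalization (the path below is illustrative):

```python
import torch
import yaml

with open("results/inference_parameters.yml") as f:
    parameters = yaml.safe_load(f)["parameters"]

mean = torch.tensor(parameters["mean"]) / 255
std = torch.tensor(parameters["std"]) / 255
```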
Two source diffs could not be displayed: they are stored in LFS.
@@ -98,7 +98,7 @@ def test_predict(
"by ner token": [],
"total": 0.93,
"word": [
{"text": "ⓈBellisson", "confidence": 0.92},
{"text": "ⓈBellisson", "confidence": 0.93},
{"text": "ⒻGeorges", "confidence": 0.94},
{"text": "Ⓑ91", "confidence": 0.92},
{"text": "ⓁP", "confidence": 0.94},
@@ -169,7 +169,7 @@ def test_predict(
{"text": "p", "confidence": 1.0},
{"text": "l", "confidence": 1.0},
{"text": "i", "confidence": 1.0},
{"text": "é", "confidence": 0.86},
{"text": "é", "confidence": 0.85},
{"text": " ", "confidence": 1.0},
{"text": "", "confidence": 1.0},
{"text": "M", "confidence": 1.0},
......
@@ -16,7 +16,7 @@ from tests.conftest import FIXTURES
"last_3.pt",
{
"nb_chars": 43,
"cer": 1.2558,
"cer": 1.3023,
"nb_words": 9,
"wer": 1.0,
"nb_words_no_punct": 9,
@@ -66,6 +66,12 @@ from tests.conftest import FIXTURES
"type": "max_resize",
}
],
"mean": [
242.10595854671013,
242.10595854671013,
242.10595854671013,
],
"std": [28.29919517652322, 28.29919517652322, 28.29919517652322],
},
},
),
@@ -175,12 +181,12 @@ def test_train_and_test(
}
assert res == expected_res
# Check that the parameters file is correct
# Check that the inference parameters file is correct
with (
tmp_path
/ training_config["training_params"]["output_folder"]
/ "results"
/ "parameters.yml"
/ "inference_parameters.yml"
).open() as f:
res = yaml.safe_load(f)
assert res == params_res