Compare revisions

Manon Blanco · Manon Blanco · Manon Blanco · Mélodie Boillet · Manon Blanco · Mélodie Boillet
--- a/dan/manager/ocr.py
+++ b/dan/manager/ocr.py
@@ -6,10 +6,10 @@ import pickle
 import cv2
 import numpy as np
 import torch
+from torch import randint
 from dan.manager.dataset import DatasetManager, GenericDataset, apply_preprocessing
-from dan.ocr.utils import LM_str_to_ind
+from dan.utils import pad_image, pad_images, pad_sequences_1D, token_to_ind
-from dan.utils import pad_image, pad_images, pad_sequences_1D, randint
 class OCRDatasetManager(DatasetManager):
@@ -111,9 +111,8 @@ class OCRDataset(GenericDataset):
                )
                sample["img"] = cv2.resize(sample["img"], (new_w, new_h))
-        # Normalization if requested
+        # Normalization
-        if "normalize" in self.params["config"] and self.params["config"]["normalize"]:
+        sample["img"] = (sample["img"] - self.mean) / self.std
-            sample["img"] = (sample["img"] - self.mean) / self.std
        sample["img_reduced_shape"] = np.ceil(
            sample["img"].shape / self.reduce_dims_factor
@@ -137,12 +136,12 @@ class OCRDataset(GenericDataset):
                min_pad = self.params["config"]["padding"]["min_pad"]
                max_pad = self.params["config"]["padding"]["max_pad"]
                pad_width = (
-                    randint(min_pad, max_pad)
+                    randint(min_pad, max_pad, (1,))
                    if min_pad is not None and max_pad is not None
                    else None
                )
                pad_height = (
-                    randint(min_pad, max_pad)
+                    randint(min_pad, max_pad, (1,))
                    if min_pad is not None and max_pad is not None
                    else None
                )
@@ -174,12 +173,10 @@ class OCRDataset(GenericDataset):
            full_label = label
        sample["label"] = full_label
-        sample["token_label"] = LM_str_to_ind(self.charset, full_label)
+        sample["token_label"] = token_to_ind(self.charset, full_label)
-        if "add_eot" in self.params["config"]["constraints"]:
+        sample["token_label"].append(self.tokens["end"])
-            sample["token_label"].append(self.tokens["end"])
        sample["label_len"] = len(sample["token_label"])
-        if "add_sot" in self.params["config"]["constraints"]:
+        sample["token_label"].insert(0, self.tokens["start"])
-            sample["token_label"].insert(0, self.tokens["start"])
        return sample

--- a/dan/manager/training.py
+++ b/dan/manager/training.py
@@ -20,8 +20,8 @@ from tqdm import tqdm
 from dan.manager.metrics import MetricManager
 from dan.manager.ocr import OCRDatasetManager
 from dan.mlflow import MLFLOW_AVAILABLE, logging_metrics, logging_tags_metrics
-from dan.ocr.utils import LM_ind_to_str
 from dan.schedulers import DropoutScheduler
+from dan.utils import ind_to_token
 if MLFLOW_AVAILABLE:
    import mlflow
@@ -1010,7 +1010,7 @@ class Manager(OCRManager):
            predicted_tokens = torch.argmax(pred, dim=1).detach().cpu().numpy()
            predicted_tokens = [predicted_tokens[i, : y_len[i]] for i in range(b)]
            str_x = [
-                LM_ind_to_str(self.dataset.charset, t, oov_symbol="")
+                ind_to_token(self.dataset.charset, t, oov_symbol="")
                for t in predicted_tokens
            ]
@@ -1130,7 +1130,7 @@ class Manager(OCRManager):
                confidence_scores[i, : prediction_len[i]].tolist() for i in range(b)
            ]
            str_x = [
-                LM_ind_to_str(self.dataset.charset, t, oov_symbol="")
+                ind_to_token(self.dataset.charset, t, oov_symbol="")
                for t in predicted_tokens
            ]

--- a/dan/ocr/document/train.py
+++ b/dan/ocr/document/train.py
@@ -109,11 +109,7 @@ def get_config():
                "height_divisor": 32,  # Image height will be divided by 32
                "padding_value": 0,  # Image padding value
                "padding_token": None,  # Label padding value
-                "constraints": [
+                "constraints": [],
-                    "add_eot",
-                    "add_sot",
-                ],  # add end-of-transcription and start-of-transcription tokens in labels
-                "normalize": True,  # Normalize with mean and variance of training dataset
                "preprocessings": [
                    {
                        "type": "to_RGB",

--- a/dan/ocr/utils.py
+++ b/dan/ocr/utils.py
-# -*- coding: utf-8 -*-
-# Charset / labels conversion
-def LM_str_to_ind(labels, str):
-    return [labels.index(c) for c in str]
-def LM_ind_to_str(labels, ind, oov_symbol=None):
-    if oov_symbol is not None:
-        res = []
-        for i in ind:
-            if i < len(labels):
-                res.append(labels[i])
-            else:
-                res.append(oov_symbol)
-    else:
-        res = [labels[i] for i in ind]
-    return "".join(res)
--- a/dan/predict/attention.py
+++ b/dan/predict/attention.py
@@ -6,7 +6,6 @@ import numpy as np
 from PIL import Image
 from dan import logger
-from dan.utils import round_floats
 def parse_delimiters(delimiters):
@@ -78,7 +77,7 @@ def split_text_and_confidences(
        offset = 1
    else:
        logger.error("Level should be either 'char', 'word', or 'line'")
-    return texts, round_floats(probs), offset
+    return texts, [np.around(num, 2) for num in probs], offset
 def get_predicted_polygons_with_confidence(

--- a/dan/predict/prediction.py
+++ b/dan/predict/prediction.py
@@ -2,6 +2,7 @@
 import os
 import pickle
+from itertools import pairwise
 from pathlib import Path
 import cv2
@@ -13,14 +14,13 @@ from dan import logger
 from dan.datasets.extract.utils import save_json
 from dan.decoder import GlobalHTADecoder
 from dan.encoder import FCN_Encoder
-from dan.ocr.utils import LM_ind_to_str
 from dan.predict.attention import (
    get_predicted_polygons_with_confidence,
    parse_delimiters,
    plot_attention,
    split_text_and_confidences,
 )
-from dan.utils import pairwise, read_image
+from dan.utils import ind_to_token, read_image
 class DAN:
@@ -220,7 +220,7 @@ class DAN:
            # Transform tokens to characters
            predicted_text = [
-                LM_ind_to_str(self.charset, t, oov_symbol="") for t in predicted_tokens
+                ind_to_token(self.charset, t, oov_symbol="") for t in predicted_tokens
            ]
            logger.info("Images processed")

--- a/dan/transforms.py
+++ b/dan/transforms.py
@@ -9,6 +9,8 @@ import numpy as np
 from cv2 import dilate, erode, normalize
 from numpy import random
 from PIL import Image
+from torch import rand, randint
+from torch.distributions.uniform import Uniform
 from torchvision.transforms import (
    ColorJitter,
    GaussianBlur,
@@ -17,8 +19,6 @@ from torchvision.transforms import (
 )
 from torchvision.transforms.functional import InterpolationMode
-from dan.utils import rand, rand_uniform, randint
 class DPIAdjusting:
    """
@@ -173,14 +173,14 @@ def get_list_augmenters(img, aug_configs, fill_value):
    """
    augmenters = list()
    for aug_config in aug_configs:
-        if rand() > aug_config["proba"]:
+        if rand((1,)) > aug_config["proba"]:
            continue
        if aug_config["type"] == "dpi":
            valid_factor = False
            while not valid_factor:
-                factor = rand_uniform(
+                factor = Uniform(
                    aug_config["min_factor"], aug_config["max_factor"]
-                )
+                ).sample()
                valid_factor = not (
                    (
                        "max_width" in aug_config
@@ -202,8 +202,12 @@ def get_list_augmenters(img, aug_configs, fill_value):
            augmenters.append(DPIAdjusting(factor))
        elif aug_config["type"] == "zoom_ratio":
-            ratio_h = rand_uniform(aug_config["min_ratio_h"], aug_config["max_ratio_h"])
+            ratio_h = Uniform(
-            ratio_w = rand_uniform(aug_config["min_ratio_w"], aug_config["max_ratio_w"])
+                aug_config["min_ratio_h"], aug_config["max_ratio_h"]
+            ).sample()
+            ratio_w = Uniform(
+                aug_config["min_ratio_w"], aug_config["max_ratio_w"]
+            ).sample()
            augmenters.append(
                ZoomRatio(
                    ratio_h=ratio_h, ratio_w=ratio_w, keep_dim=aug_config["keep_dim"]
@@ -211,7 +215,7 @@ def get_list_augmenters(img, aug_configs, fill_value):
            )
        elif aug_config["type"] == "perspective":
-            scale = rand_uniform(aug_config["min_factor"], aug_config["max_factor"])
+            scale = Uniform(aug_config["min_factor"], aug_config["max_factor"]).sample()
            augmenters.append(
                RandomPerspective(
                    distortion_scale=scale,
@@ -223,13 +227,20 @@ def get_list_augmenters(img, aug_configs, fill_value):
        elif aug_config["type"] == "elastic_distortion":
            kernel_size = (
-                randint(aug_config["min_kernel_size"], aug_config["max_kernel_size"])
+                randint(
-                // 2
+                    aug_config["min_kernel_size"], aug_config["max_kernel_size"], (1,)
-                * 2
+                ).item()
-                + 1
+            ) // 2 * 2 + 1
+            sigma = (
+                Uniform(aug_config["min_sigma"], aug_config["max_sigma"])
+                .sample()
+                .item()
+            )
+            alpha = (
+                Uniform(aug_config["min_alpha"], aug_config["max_alpha"])
+                .sample()
+                .item()
            )
-            sigma = rand_uniform(aug_config["min_sigma"], aug_config["max_sigma"])
-            alpha = rand_uniform(aug_config["min_alpha"], aug_config["max_alpha"])
            augmenters.append(
                ElasticDistortion(
                    kernel_size=(kernel_size, kernel_size), sigma=sigma, alpha=alpha
@@ -237,9 +248,13 @@ def get_list_augmenters(img, aug_configs, fill_value):
            )
        elif aug_config["type"] == "dilation_erosion":
-            kernel_h = randint(aug_config["min_kernel"], aug_config["max_kernel"] + 1)
+            kernel_h = randint(
-            kernel_w = randint(aug_config["min_kernel"], aug_config["max_kernel"] + 1)
+                aug_config["min_kernel"], aug_config["max_kernel"] + 1, (1,)
-            if randint(0, 2) == 0:
+            )
+            kernel_w = randint(
+                aug_config["min_kernel"], aug_config["max_kernel"] + 1, (1,)
+            )
+            if randint(0, 2, (1,)) == 0:
                augmenters.append(
                    Erosion((kernel_w, kernel_h), aug_config["iterations"])
                )
@@ -261,9 +276,17 @@ def get_list_augmenters(img, aug_configs, fill_value):
        elif aug_config["type"] == "gaussian_blur":
            max_kernel_h = min(aug_config["max_kernel"], img.size[1])
            max_kernel_w = min(aug_config["max_kernel"], img.size[0])
-            kernel_h = randint(aug_config["min_kernel"], max_kernel_h + 1) // 2 * 2 + 1
+            kernel_h = (
-            kernel_w = randint(aug_config["min_kernel"], max_kernel_w + 1) // 2 * 2 + 1
+                randint(aug_config["min_kernel"], max_kernel_h + 1, (1,)).item()
-            sigma = rand_uniform(aug_config["min_sigma"], aug_config["max_sigma"])
+            ) // 2 * 2 + 1
+            kernel_w = (
+                randint(aug_config["min_kernel"], max_kernel_w + 1, (1,)).item()
+            ) // 2 * 2 + 1
+            sigma = (
+                Uniform(aug_config["min_sigma"], aug_config["max_sigma"])
+                .sample()
+                .item()
+            )
            augmenters.append(
                GaussianBlur(kernel_size=(kernel_w, kernel_h), sigma=sigma)
            )
@@ -272,10 +295,10 @@ def get_list_augmenters(img, aug_configs, fill_value):
            augmenters.append(GaussianNoise(std=aug_config["std"]))
        elif aug_config["type"] == "sharpen":
-            alpha = rand_uniform(aug_config["min_alpha"], aug_config["max_alpha"])
+            alpha = Uniform(aug_config["min_alpha"], aug_config["max_alpha"]).sample()
-            strength = rand_uniform(
+            strength = Uniform(
                aug_config["min_strength"], aug_config["max_strength"]
-            )
+            ).sample()
            augmenters.append(Sharpen(alpha=alpha, strength=strength))
        else:
@@ -289,7 +312,7 @@ def apply_data_augmentation(img, da_config):
    """
    Apply data augmentation strategy on input image
    """
-    if da_config["proba"] != 1 and rand() > da_config["proba"]:
+    if da_config["proba"] != 1 and rand((1,)) > da_config["proba"]:
        return img
    # Convert to PIL Image

--- a/dan/utils.py
+++ b/dan/utils.py
 # -*- coding: utf-8 -*-
-from itertools import tee
 import cv2
 import numpy as np
-import torch
+from torch import randint
-from torch.distributions.uniform import Uniform
 # Layout begin-token to end-token
 SEM_MATCHING_TOKENS = {"ⓘ": "Ⓘ", "ⓓ": "Ⓓ", "ⓢ": "Ⓢ", "ⓒ": "Ⓒ", "ⓟ": "Ⓟ", "ⓐ": "Ⓐ"}
@@ -16,27 +13,6 @@ class MLflowNotInstalled(Exception):
    """
-def randint(low, high):
-    """
-    call torch.randint to preserve random among dataloader workers
-    """
-    return int(torch.randint(low, high, (1,)))
-def rand():
-    """
-    call torch.rand to preserve random among dataloader workers
-    """
-    return float(torch.rand((1,)))
-def rand_uniform(low, high):
-    """
-    call torch uniform to preserve random among dataloader workers
-    """
-    return float(Uniform(low, high).sample())
 def pad_sequences_1D(data, padding_value):
    """
    Pad data with padding_value to get same length
@@ -70,8 +46,8 @@ def pad_images(data, padding_value, padding_mode="br"):
        elif padding_mode == "random":
            xmax = longest_x - x_len
            ymax = longest_y - y_len
-            xi = randint(0, xmax) if xmax >= 1 else 0
+            xi = randint(0, xmax, (1,)) if xmax >= 1 else 0
-            yi = randint(0, ymax) if ymax >= 1 else 0
+            yi = randint(0, ymax, (1,)) if ymax >= 1 else 0
            padded_data[i, xi : xi + x_len, yi : yi + y_len, ...] = data[i]
        else:
            raise NotImplementedError("Undefined padding mode: {}".format(padding_mode))
@@ -120,8 +96,8 @@ def pad_image(
        elif padding_mode == "tl":
            hi, wi = pad_height, pad_width
        elif padding_mode == "random":
-            hi = randint(0, pad_height) if pad_height >= 1 else 0
+            hi = randint(0, pad_height, (1,)) if pad_height >= 1 else 0
-            wi = randint(0, pad_width) if pad_width >= 1 else 0
+            wi = randint(0, pad_width, (1,)) if pad_width >= 1 else 0
        else:
            raise NotImplementedError("Undefined padding mode: {}".format(padding_mode))
        padded_image[hi : hi + h, wi : wi + w, ...] = image
@@ -156,11 +132,19 @@ def round_floats(float_list, decimals=2):
    return [np.around(num, decimals) for num in float_list]
-def pairwise(iterable):
+# Charset / labels conversion
-    """
+def token_to_ind(labels, str):
-    Not necessary when using 3.10. See https://docs.python.org/3/library/itertools.html#itertools.pairwise.
+    return [labels.index(c) for c in str]
-    """
-    # pairwise('ABCDEFG') --> AB BC CD DE EF FG
-    a, b = tee(iterable)
+def ind_to_token(labels, ind, oov_symbol=None):
-    next(b, None)
+    if oov_symbol is not None:
-    return zip(a, b)
+        res = []
+        for i in ind:
+            if i < len(labels):
+                res.append(labels[i])
+            else:
+                res.append(oov_symbol)
+    else:
+        res = [labels[i] for i in ind]
+    return "".join(res)
--- a/docs/ref/ocr/utils.md
+++ b/docs/ref/ocr/utils.md
-# Utils
-::: dan.ocr.utils
--- a/docs/usage/train/parameters.md
+++ b/docs/usage/train/parameters.md
@@ -8,7 +8,7 @@ All hyperparameters are specified and editable in the training scripts (meaning
 | `dataset_name`                          | Name of the dataset.                                                                   | `str`        |                                                |
 | `dataset_level`                         | Level of the dataset. Should be named after the element type.                          | `str`        |                                                |
 | `dataset_variant`                       | Variant of the dataset. Usually empty for HTR datasets, `"_sem"` for HTR+NER datasets. | `str`        |                                                |
-| `dataset_path`                          | Path to the dataset.                                                                   | `str`        |
+| `dataset_path`                          | Path to the dataset.                                                                   | `str`        |                                                |
 | `dataset_params.config.dataset_manager` | Dataset manager class.                                                                 | custom class | `OCRDatasetManager`                            |
 | `dataset_params.config.dataset_class`   | Dataset class.                                                                         | custom class | `OCRDataset`                                   |
 | `dataset_params.config.datasets`        | Dataset dictionary with the dataset name as key and dataset path as value.             | `dict`       |                                                |
@@ -18,8 +18,7 @@ All hyperparameters are specified and editable in the training scripts (meaning
 | `dataset_params.config.width_divisor`   | Factor to reduce the width of the feature vector before feeding the decoder.           | `int`        | `32`                                           |
 | `dataset_params.config.padding_value`   | Image padding value.                                                                   | `int`        | `0`                                            |
 | `dataset_params.config.padding_token`   | Transcription padding value.                                                           | `int`        | `None`                                         |
-| `dataset_params.config.constraints`     | Whether to add end-of-transcription and start-of-transcription tokens in labels.       | `list`       | `["add_eot", "add_sot"]`                       |
+| `dataset_params.config.constraints`     | Label constraints (end- and start-of-transcription tokens are always added).           | `list`       | `[]`                                           |
-| `dataset_params.config.normalize`       | Normalize with mean and variance of training dataset.                                  | `bool`       | `True`                                         |
 | `dataset_params.config.preprocessings`  | List of pre-processing functions to apply to input images.                             | `list`       | (see [dedicated section](#data-preprocessing)) |
 | `dataset_params.config.augmentation`    | Configuration for data augmentation.                                                   | `dict`       | (see [dedicated section](#data-augmentation))  |

--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -86,7 +86,6 @@ nav:
      - Training managers: ref/managers/training.md
    - OCR:
      - ref/ocr/index.md
-      - Utils: ref/ocr/utils.md
      - Document:
        - ref/ocr/document/index.md
        - Training: ref/ocr/document/train.md

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -72,11 +72,7 @@ def training_config():
                "height_divisor": 32,  # Image height will be divided by 32
                "padding_value": 0,  # Image padding value
                "padding_token": None,  # Label padding value
-                "constraints": [
+                "constraints": [],
-                    "add_eot",
-                    "add_sot",
-                ],  # add end-of-transcription and start-of-transcription tokens in labels
-                "normalize": True,  # Normalize with mean and variance of training dataset
                "preprocessings": [
                    {
                        "type": "to_RGB",
No results found