From 32ce7f17e3f893e3db6597641f7b79e30950d6df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A9lodie=20Boillet?= <boillet@teklia.com> Date: Fri, 4 Aug 2023 12:04:02 +0000 Subject: [PATCH] Fix version 0.2.0-dev3 and later --- dan/manager/dataset.py | 56 ++++++--- dan/manager/ocr.py | 35 ++++-- dan/manager/training.py | 76 +++++++++--- dan/predict/attention.py | 1 - dan/predict/prediction.py | 18 ++- dan/transforms.py | 117 +++++++++++-------- dan/utils.py | 28 ++--- docs/get_started/training.md | 2 +- docs/usage/train/augmentation.md | 165 +++++++++++++-------------- docs/usage/train/parameters.md | 35 +++--- tests/data/prediction/parameters.yml | 2 + tests/data/training/models/best_0.pt | 2 +- tests/data/training/models/last_3.pt | 2 +- tests/test_prediction.py | 4 +- tests/test_training.py | 16 ++- 15 files changed, 340 insertions(+), 219 deletions(-) diff --git a/dan/manager/dataset.py b/dan/manager/dataset.py index d441c6a1..d84b4372 100644 --- a/dan/manager/dataset.py +++ b/dan/manager/dataset.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +import copy import json import os @@ -21,18 +22,20 @@ class OCRDataset(Dataset): charset, tokens, preprocessing_transforms, - normalization_transforms, augmentation_transforms, load_in_memory=False, + mean=None, + std=None, ): self.set_name = set_name self.charset = charset self.tokens = tokens self.load_in_memory = load_in_memory + self.mean = mean + self.std = std - # Pre-processing, augmentation, normalization + # Pre-processing, augmentation self.preprocessing_transforms = preprocessing_transforms - self.normalization_transforms = normalization_transforms self.augmentation_transforms = augmentation_transforms # Factor to reduce the height and width of the feature vector before feeding the decoder. @@ -54,20 +57,20 @@ class OCRDataset(Dataset): """ Return an item from the dataset (image and label) """ - # Load preprocessed image - sample = dict(**self.samples[idx]) + sample = copy.deepcopy(self.samples[idx]) if not self.load_in_memory: sample["img"] = self.get_sample_img(idx) + # Convert to numpy + sample["img"] = np.array(sample["img"]) + # Apply data augmentation if self.augmentation_transforms: - sample["img"] = self.augmentation_transforms(image=np.array(sample["img"]))[ - "image" - ] + sample["img"] = self.augmentation_transforms(image=sample["img"])["image"] # Image normalization - sample["img"] = self.normalization_transforms(sample["img"]) + sample["img"] = (sample["img"] - self.mean) / self.std # Get final height and width sample["img_reduced_shape"], sample["img_position"] = self.compute_final_size( @@ -119,21 +122,44 @@ class OCRDataset(Dataset): return self.preprocessing_transforms(read_image(self.samples[i]["path"])) + def compute_std_mean(self): + """ + Compute cumulated variance and mean of whole dataset + """ + if self.mean is not None and self.std is not None: + return self.mean, self.std + + total = np.zeros((3,)) + diff = np.zeros((3,)) + nb_pixels = 0 + for metric in ["mean", "std"]: + for ind in range(len(self.samples)): + img = np.array(self.get_sample_img(ind)) + if metric == "mean": + total += np.sum(img, axis=(0, 1)) + nb_pixels += np.prod(img.shape[:2]) + elif metric == "std": + diff += [ + np.sum((img[:, :, k] - self.mean[k]) ** 2) for k in range(3) + ] + if metric == "mean": + self.mean = total / nb_pixels + elif metric == "std": + self.std = np.sqrt(diff / nb_pixels) + return self.mean, self.std + def compute_final_size(self, img): """ Compute the final image size and position after feature extraction """ - final_c, 
final_h, final_w = img.shape - image_reduced_shape = np.ceil( - [final_h, final_w, final_c] / self.reduce_dims_factor - ).astype(int) + image_reduced_shape = np.ceil(img.shape / self.reduce_dims_factor).astype(int) if self.set_name == "train": image_reduced_shape = [max(1, t) for t in image_reduced_shape] image_position = [ - [0, final_h], - [0, final_w], + [0, img.shape[0]], + [0, img.shape[1]], ] return image_reduced_shape, image_position diff --git a/dan/manager/ocr.py b/dan/manager/ocr.py index fb79581f..077f965c 100644 --- a/dan/manager/ocr.py +++ b/dan/manager/ocr.py @@ -9,11 +9,7 @@ from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler from dan.manager.dataset import OCRDataset -from dan.transforms import ( - get_augmentation_transforms, - get_normalization_transforms, - get_preprocessing_transforms, -) +from dan.transforms import get_augmentation_transforms, get_preprocessing_transforms from dan.utils import pad_images, pad_sequences_1D @@ -36,6 +32,17 @@ class OCRDatasetManager: self.valid_samplers = dict() self.test_samplers = dict() + self.mean = ( + np.array(params["config"]["mean"]) + if "mean" in params["config"].keys() + else None + ) + self.std = ( + np.array(params["config"]["std"]) + if "std" in params["config"].keys() + else None + ) + self.generator = torch.Generator() self.generator.manual_seed(0) @@ -49,7 +56,6 @@ class OCRDatasetManager: self.params["config"]["padding_token"] = self.tokens["pad"] self.my_collate_function = OCRCollateFunction(self.params["config"]) - self.normalization = get_normalization_transforms(from_pil_image=True) self.augmentation = ( get_augmentation_transforms() if self.params["config"]["augmentation"] @@ -69,11 +75,14 @@ class OCRDatasetManager: charset=self.charset, tokens=self.tokens, preprocessing_transforms=self.preprocessing, - normalization_transforms=self.normalization, augmentation_transforms=self.augmentation, load_in_memory=self.load_in_memory, + mean=self.mean, + std=self.std, ) + self.mean, self.std = self.train_dataset.compute_std_mean() + for custom_name in self.params["val"].keys(): self.valid_datasets[custom_name] = OCRDataset( set_name="val", @@ -81,9 +90,10 @@ class OCRDatasetManager: charset=self.charset, tokens=self.tokens, preprocessing_transforms=self.preprocessing, - normalization_transforms=self.normalization, augmentation_transforms=None, load_in_memory=self.load_in_memory, + mean=self.mean, + std=self.std, ) def load_ddp_samplers(self): @@ -167,9 +177,10 @@ class OCRDatasetManager: charset=self.charset, tokens=self.tokens, preprocessing_transforms=self.preprocessing, - normalization_transforms=self.normalization, augmentation_transforms=None, load_in_memory=self.load_in_memory, + mean=self.mean, + std=self.std, ) if self.params["use_ddp"]: @@ -181,6 +192,7 @@ class OCRDatasetManager: ) else: self.test_samplers[custom_name] = None + self.test_loaders[custom_name] = DataLoader( self.test_datasets[custom_name], batch_size=1, @@ -243,7 +255,10 @@ class OCRCollateFunction: labels = [batch_data[i]["token_label"] for i in range(len(batch_data))] labels = pad_sequences_1D(labels, padding_value=self.label_padding_value).long() - imgs = [batch_data[i]["img"] for i in range(len(batch_data))] + imgs = [ + torch.from_numpy(batch_data[i]["img"]).permute(2, 0, 1) + for i in range(len(batch_data)) + ] imgs = pad_images(imgs) formatted_batch_data = { diff --git a/dan/manager/training.py b/dan/manager/training.py index 7b73a487..af6156cc 100644 --- a/dan/manager/training.py +++ 
b/dan/manager/training.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import os import random +from copy import deepcopy from enum import Enum from time import time @@ -452,22 +453,68 @@ class GenericTrainingManager: def save_params(self): """ - Output yaml file containing a summary of all hyperparameters chosen for the training + Output a yaml file containing a summary of all hyperparameters chosen for the training + and a yaml file containing parameters used for inference """ - path = os.path.join(self.paths["results"], "parameters.yml") + + def compute_nb_params(module): + return sum([np.prod(p.size()) for p in list(module.parameters())]) + + def class_to_str_dict(my_dict): + for key in my_dict: + if key == "preprocessings": + my_dict[key] = [ + { + key: value.value if isinstance(value, Enum) else value + for key, value in preprocessing.items() + } + for preprocessing in my_dict[key] + ] + elif callable(my_dict[key]): + my_dict[key] = my_dict[key].__name__ + elif isinstance(my_dict[key], np.ndarray): + my_dict[key] = my_dict[key].tolist() + elif isinstance(my_dict[key], list) and isinstance( + my_dict[key][0], tuple + ): + my_dict[key] = [list(elt) for elt in my_dict[key]] + elif isinstance(my_dict[key], dict): + my_dict[key] = class_to_str_dict(my_dict[key]) + return my_dict + + # Save training parameters + path = os.path.join(self.paths["results"], "training_parameters.yml") if os.path.isfile(path): return + params = class_to_str_dict(my_dict=deepcopy(self.params)) + total_params = 0 + for model_name in self.models.keys(): + current_params = compute_nb_params(self.models[model_name]) + params["model_params"]["models"][model_name] = [ + params["model_params"]["models"][model_name], + "{:,}".format(current_params), + ] + total_params += current_params + params["model_params"]["total_params"] = "{:,}".format(total_params) + params["mean"] = self.dataset.mean.tolist() + params["std"] = self.dataset.std.tolist() + with open(path, "w") as f: + yaml.dump(params, f) - params = { + # Save inference parameters + path = os.path.join(self.paths["results"], "inference_parameters.yml") + if os.path.isfile(path): + return + inference_params = { "parameters": { - "max_char_prediction": self.params["training_params"][ - "max_char_prediction" - ], + "mean": params["mean"], + "std": params["std"], + "max_char_prediction": params["training_params"]["max_char_prediction"], "encoder": { - "dropout": self.params["model_params"]["dropout"], + "dropout": params["model_params"]["dropout"], }, "decoder": { - key: self.params["model_params"][key] + key: params["model_params"][key] for key in [ "enc_dim", "l_max", @@ -483,20 +530,11 @@ class GenericTrainingManager: "attention_win", ] }, - "preprocessings": [ - { - key: value.value if isinstance(value, Enum) else value - for key, value in preprocessing.items() - } - for preprocessing in self.params["dataset_params"]["config"].get( - "preprocessings", [] - ) - ], + "preprocessings": params["dataset_params"]["config"]["preprocessings"], }, } - with open(path, "w") as f: - yaml.dump(params, f) + yaml.dump(inference_params, f) def backward_loss(self, loss, retain_graph=False): self.scaler.scale(loss).backward(retain_graph=retain_graph) diff --git a/dan/predict/attention.py b/dan/predict/attention.py index dc8c9397..6c569f56 100644 --- a/dan/predict/attention.py +++ b/dan/predict/attention.py @@ -303,7 +303,6 @@ def plot_attention( :param line_separators: List of line separators :param display_polygons: Whether to plot extracted polygons """ - image = to_pil_image(image) 
attention_map = [] diff --git a/dan/predict/prediction.py b/dan/predict/prediction.py index e79029fe..f2e2ee16 100644 --- a/dan/predict/prediction.py +++ b/dan/predict/prediction.py @@ -19,7 +19,7 @@ from dan.predict.attention import ( plot_attention, split_text_and_confidences, ) -from dan.transforms import get_normalization_transforms, get_preprocessing_transforms +from dan.transforms import get_preprocessing_transforms from dan.utils import ind_to_token, list_to_batches, pad_images, read_image @@ -74,7 +74,10 @@ class DAN: self.encoder = encoder self.decoder = decoder - self.normalization = get_normalization_transforms() + self.mean, self.std = ( + torch.tensor(parameters["mean"]) / 255, + torch.tensor(parameters["std"]) / 255, + ) self.preprocessing_transforms = get_preprocessing_transforms( parameters.get("preprocessings", []) ) @@ -87,7 +90,12 @@ class DAN: """ image = read_image(path) preprocessed_image = self.preprocessing_transforms(image) - return preprocessed_image, self.normalization(preprocessed_image) + normalized_image = torch.zeros(preprocessed_image.shape) + for ch in range(preprocessed_image.shape[0]): + normalized_image[ch, :, :] = ( + preprocessed_image[ch, :, :] - self.mean[ch] + ) / self.std[ch] + return preprocessed_image, normalized_image def predict( self, @@ -276,7 +284,6 @@ def process_batch( # Convert to tensor of size (batch_size, channel, height, width) with batch_size=1 input_tensor = pad_images(input_images).to(device) visu_tensor = pad_images(visu_images).to(device) - logger.info("Images preprocessed!") # Parse delimiters to regex @@ -297,8 +304,8 @@ def process_batch( threshold_method=threshold_method, threshold_value=threshold_value, ) - logger.info("Prediction parsing...") + logger.info("Prediction parsing...") for idx, image_path in enumerate(image_batch): predicted_text = prediction["text"][idx] result = {"text": predicted_text} @@ -319,7 +326,6 @@ def process_batch( ] # calculates scores by token - result["confidences"]["by ner token"] = [ { "text": f"{predicted_text[current: next_token]}".replace("\n", " "), diff --git a/dan/transforms.py b/dan/transforms.py index b0501e82..f17aa900 100644 --- a/dan/transforms.py +++ b/dan/transforms.py @@ -5,13 +5,12 @@ Each transform class defined here takes as input a PIL Image and returns the mod from enum import Enum from random import randint +import albumentations as A import numpy as np -from albumentations import SomeOf from albumentations.augmentations import ( Affine, CoarseDropout, ColorJitter, - Downscale, ElasticTransform, GaussianBlur, GaussNoise, @@ -19,15 +18,15 @@ from albumentations.augmentations import ( Sharpen, ToGray, ) -from cv2 import INTER_NEAREST, dilate, erode +from albumentations.core.transforms_interface import ImageOnlyTransform +from cv2 import dilate, erode from numpy import random +from PIL import Image from torch import Tensor -from torchvision.transforms import Compose, Normalize, ToPILImage, ToTensor +from torch.distributions.uniform import Uniform +from torchvision.transforms import Compose, ToPILImage from torchvision.transforms.functional import resize -IMAGENET_MEAN = [0.485, 0.456, 0.406] -IMAGENET_STD = [0.229, 0.224, 0.225] - class Preprocessing(str, Enum): # If the image is bigger than the given size, resize it while keeping the original ratio @@ -119,30 +118,64 @@ class Erosion: return erode(np.array(x), self.kernel, iterations=self.iterations) -class ErosionDilation: +class ErosionDilation(ImageOnlyTransform): """ Random erosion or dilation """ - def __init__(self, 
min_kernel, max_kernel, iterations, p=1.0): + def __init__( + self, + min_kernel: int, + max_kernel: int, + iterations: int, + always_apply: bool = False, + p: float = 1.0, + ): + super(ErosionDilation, self).__init__(always_apply, p) self.min_kernel = min_kernel self.max_kernel = max_kernel self.iterations = iterations self.p = p self.always_apply = False - def __call__(self, image, force_apply=False): - if not (random.random() <= self.p or self.always_apply or force_apply): - return {"image": image} + def apply(self, img: np.ndarray, **params): kernel_h = randint(self.min_kernel, self.max_kernel) kernel_w = randint(self.min_kernel, self.max_kernel) kernel = np.ones((kernel_h, kernel_w), np.uint8) augmented_image = ( - Erosion(kernel, iterations=self.iterations)(image) + Erosion(kernel, iterations=self.iterations)(img) if random.random() < 0.5 - else Dilation(kernel=kernel, iterations=self.iterations)(image) + else Dilation(kernel=kernel, iterations=self.iterations)(img) ) - return {"image": augmented_image} + return augmented_image + + +class DPIAdjusting(ImageOnlyTransform): + """ + Resolution modification + """ + + def __init__( + self, + min_factor: float = 0.75, + max_factor: float = 1, + always_apply: bool = False, + p: float = 1.0, + ): + super(DPIAdjusting, self).__init__(always_apply, p) + self.min_factor = min_factor + self.max_factor = max_factor + self.p = p + self.always_apply = False + + def apply(self, img: np.ndarray, **params): + factor = float(Uniform(self.min_factor, self.max_factor).sample()) + img = Image.fromarray(img) + augmented_image = img.resize( + (int(np.ceil(img.width * factor)), int(np.ceil(img.height * factor))), + Image.BILINEAR, + ) + return np.array(augmented_image) def get_preprocessing_transforms( @@ -167,44 +200,38 @@ def get_preprocessing_transforms( ) case Preprocessing.FixedWidthResize: transforms.append(FixedWidthResize(width=preprocessing["fixed_width"])) - if to_pil_image: transforms.append(ToPILImage()) - return Compose(transforms) -def get_augmentation_transforms() -> SomeOf: +def get_augmentation_transforms() -> A.Compose: """ - Returns a list of transformations to be applied to the image. + Returns a list of transformation to be applied to the image. """ - return SomeOf( + return A.Compose( [ - Perspective(scale=(0.05, 0.09), fit_output=True), - GaussianBlur(sigma_limit=2.5), - GaussNoise(var_limit=50**2), - ColorJitter(contrast=0.2, brightness=0.2, saturation=0.2, hue=0.2), - ElasticTransform(alpha=20.0, sigma=5.0, alpha_affine=1.0, border_mode=0), - Sharpen(alpha=(0.0, 1.0)), - ErosionDilation(min_kernel=1, max_kernel=4, iterations=1), - Affine(shear={"x": (-20, 20), "y": (0, 0)}), - CoarseDropout(), - Downscale(scale_min=0.5, scale_max=0.9, interpolation=INTER_NEAREST), - ToGray(), + DPIAdjusting(min_factor=0.75, max_factor=1), + A.SomeOf( + [ + ErosionDilation(min_kernel=1, max_kernel=4, iterations=1), + Perspective(scale=(0.05, 0.09), fit_output=True, p=0.4), + GaussianBlur(sigma_limit=2.5, p=1), + GaussNoise(var_limit=50**2, p=1), + ColorJitter( + contrast=0.2, brightness=0.2, saturation=0.2, hue=0.2, p=1 + ), + ElasticTransform( + alpha=20.0, sigma=5.0, alpha_affine=1.0, border_mode=0, p=1 + ), + Sharpen(alpha=(0.0, 1.0), p=1), + Affine(shear={"x": (-20, 20), "y": (0, 0)}, p=1), + CoarseDropout(p=1), + ToGray(p=0.5), + ], + n=2, + p=0.9, + ), ], - n=2, p=0.9, ) - - -def get_normalization_transforms(from_pil_image: bool = False) -> Compose: - """ - Returns a list of normalization transformations. 
- """ - transforms = [] - - if from_pil_image: - transforms.append(ToTensor()) - - transforms.append(Normalize(IMAGENET_MEAN, IMAGENET_STD)) - return Compose(transforms) diff --git a/dan/utils.py b/dan/utils.py index 9eddb9c5..0e39aba6 100644 --- a/dan/utils.py +++ b/dan/utils.py @@ -26,27 +26,23 @@ def pad_sequences_1D(data, padding_value): return padded_data -def pad_images(data): +def pad_images(images): """ - Pad the images so that they are in the middle of the large padded image (tb-lr mode). - :param data: List of numpy arrays. - :return padded_data: A tensor containing all the padded images. + Pad the images so that they are at the top left of the large padded image. + :param images: List of images as torch tensors. + :return padded_images: A tensor containing all the padded images. """ - longest_x = max([x.shape[1] for x in data]) - longest_y = max([x.shape[2] for x in data]) - padded_data = torch.zeros((len(data), data[0].shape[0], longest_x, longest_y)) - for index, image in enumerate(data): - delta_x = longest_x - image.shape[1] - delta_y = longest_y - image.shape[2] - top, bottom = delta_x // 2, delta_x - (delta_x // 2) - left, right = delta_y // 2, delta_y - (delta_y // 2) - padded_data[ + longest_x = max([x.shape[1] for x in images]) + longest_y = max([x.shape[2] for x in images]) + padded_images = torch.zeros((len(images), images[0].shape[0], longest_x, longest_y)) + for index, image in enumerate(images): + padded_images[ index, :, - top : padded_data.shape[2] - bottom, - left : padded_data.shape[3] - right, + 0 : image.shape[1], + 0 : image.shape[2], ] = image - return padded_data + return padded_images def read_image(path): diff --git a/docs/get_started/training.md b/docs/get_started/training.md index 3bc0bd48..b996f018 100644 --- a/docs/get_started/training.md +++ b/docs/get_started/training.md @@ -42,4 +42,4 @@ The training command does not take any input parameters for now. To train a DAN ## 3. Predict -Once the training is complete, you can apply a trained DAN model on an image using the [predict command](../usage/predict.md). +Once the training is complete, you can apply a trained DAN model on an image using the [predict command](../usage/predict.md) and the `inference_parameters.yml` file, located in `{training_params.output_folder}/results`. diff --git a/docs/usage/train/augmentation.md b/docs/usage/train/augmentation.md index ab9f3870..1f98e024 100644 --- a/docs/usage/train/augmentation.md +++ b/docs/usage/train/augmentation.md @@ -6,26 +6,26 @@ This page lists data augmentation transforms used in DAN. ### Elastic Transform -| | Elastic Transform | -| ---------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| Description | This transformation applies local distortions that rotate characters locally. | -| Comments | The impact of this transformation is mostly visible on documents, not so much on lines. Results are comparable to the original DAN implementation. | -| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/geometric/transforms/#albumentations.augmentations.geometric.transforms.ElasticTransform). 
| -| Examples |   | -| CPU time (seconds/10 images) | 0.44 (3013x128 pixels) / 0.86 (1116x581 pixels) | +| | Elastic Transform | +| ---------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Description | This transformation applies local distortions that rotate characters locally. | +| Comments | The impact of this transformation is mostly visible on documents, not so much on lines. Results are comparable to the original DAN implementation. | +| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/geometric/transforms/#albumentations.augmentations.geometric.transforms.ElasticTransform) | +| Examples |   | +| CPU time (seconds/10 images) | 0.44 (3013x128 pixels) / 0.86 (1116x581 pixels) | ### PieceWise Affine !!! warning This transform is temporarily removed from the pipeline until [this issue](https://github.com/albumentations-team/albumentations/issues/1442) is fixed. -| | PieceWise Affine | -| ---------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| Description | This transformation also applies local distortions but with a larger grid than ElasticTransform. | -| Comments | This transformation is very slow. It is a new transform that was not in the original DAN implementation. | -| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/geometric/transforms/#albumentations.augmentations.geometric.transforms.PiecewiseAffine). | -| Examples |   | -| CPU time (seconds/10 images) | 2.92 (3013x128 pixels) / 3.76 (1116x581 pixels) | +| | PieceWise Affine | +| ---------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Description | This transformation also applies local distortions but with a larger grid than ElasticTransform. | +| Comments | This transformation is very slow. It is a new transform that was not in the original DAN implementation. | +| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/geometric/transforms/#albumentations.augmentations.geometric.transforms.PiecewiseAffine) | +| Examples |   | +| CPU time (seconds/10 images) | 2.92 (3013x128 pixels) / 3.76 (1116x581 pixels) | ### Dilation Erosion @@ -33,99 +33,96 @@ This page lists data augmentation transforms used in DAN. | ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | Description | This transformation makes the pen stroke thicker or thinner. | | Comments | The `RandomDilationErosion` class randomly selects a kernel size and applies a dilation or an erosion to the image. It relies on opencv and is similar to the original DAN implementation. | -| Documentation | See the [`opencv` documentation](https://docs.opencv.org/3.4/db/df6/tutorial_erosion_dilatation.html). 
| +| Documentation | See the [`opencv` documentation](https://docs.opencv.org/3.4/db/df6/tutorial_erosion_dilatation.html) | | Examples |   | | CPU time (seconds/10 images) | 0.02 (3013x128 pixels) / 0.03 (1116x581 pixels) | ### Sharpen -| | Sharpen | -| ---------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| Description | This transformation makes the image sharper. | -| Comments | Similar to the original DAN implementation. | -| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.Sharpen). | -| Examples |   | -| CPU time (seconds/10 images) | 0.02 (3013x128 pixels) / 0.04 (1116x581 pixels) | +| | Sharpen | +| ---------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Description | This transformation makes the image sharper. | +| Comments | Similar to the original DAN implementation. | +| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.Sharpen) | +| Examples |   | +| CPU time (seconds/10 images) | 0.02 (3013x128 pixels) / 0.04 (1116x581 pixels) | ### Color Jittering -| | Color Jittering | -| ---------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| Description | This transformation alters the colors of the image. | -| Comments | Similar to the original DAN implementation. | -| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.ColorJitter). | -| Examples |   | -| CPU time (seconds/10 images) | 0.03 (3013x128 pixels) / 0.04 (1116x581 pixels) | +| | Color Jittering | +| ---------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Description | This transformation alters the colors of the image. | +| Comments | Similar to the original DAN implementation. | +| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.ColorJitter) | +| Examples |   | +| CPU time (seconds/10 images) | 0.03 (3013x128 pixels) / 0.04 (1116x581 pixels) | ### Gaussian Noise -| | Gaussian Noise | -| ---------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| Description | This transformation adds Gaussian noise to the image. | -| Comments | The noise from the original DAN implementation is more uniform. | -| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.GaussianNoise). 
| -| Examples |   | -| CPU time (seconds/10 images) | 0.29 (3013x128 pixels) / 0.53 (1116x581 pixels) | +| | Gaussian Noise | +| ---------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Description | This transformation adds Gaussian noise to the image. | +| Comments | The noise from the original DAN implementation is more uniform. | +| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.GaussianNoise) | +| Examples |   | +| CPU time (seconds/10 images) | 0.29 (3013x128 pixels) / 0.53 (1116x581 pixels) | ### Gaussian Blur -| | Gaussian Blur | -| ---------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| Description | This transformation blurs the image. | -| Comments | Similar to the original DAN implementation. | -| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.GaussianBlur). | -| Examples |   | -| CPU time (seconds/10 images) | 0.01 (3013x128 pixels) / 0.02 (1116x581 pixels) | +| | Gaussian Blur | +| ---------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Description | This transformation blurs the image. | +| Comments | Similar to the original DAN implementation. | +| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.GaussianBlur) | +| Examples |   | +| CPU time (seconds/10 images) | 0.01 (3013x128 pixels) / 0.02 (1116x581 pixels) | ### Random Perspective -| | Random Perspective | -| ---------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| Description | This transformation changes the perspective from which the photo is taken. | -| Comments | Similar to the original DAN implementation. | -| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.Perspective). | -| Examples |   | -| CPU time (seconds/10 images) | 0.05 (3013x128 pixels) / 0.05 (1116x581 pixels) | +| | Random Perspective | +| ---------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Description | This transformation changes the perspective from which the photo is taken. | +| Comments | Similar to the original DAN implementation. 
| +| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.Perspective) | +| Examples |   | +| CPU time (seconds/10 images) | 0.05 (3013x128 pixels) / 0.05 (1116x581 pixels) | ### Shearing (x-axis) -| | Shearing (x-axis) | -| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| Description | This transformation changes the slant of the text on the image. | -| Comments | New transform that was not in the original DAN implementation. | -| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/geometric/transforms/#albumentations.augmentations.geometric.transforms.Affine). | -| Examples |   | -| CPU time (seconds/10 images) | 0.05 (3013x128 pixels) / 0.04 (1116x581 pixels) | +| | Shearing (x-axis) | +| ---------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Description | This transformation changes the slant of the text on the image. | +| Comments | New transform that was not in the original DAN implementation. | +| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/geometric/transforms/#albumentations.augmentations.geometric.transforms.Affine) | +| Examples |   | +| CPU time (seconds/10 images) | 0.05 (3013x128 pixels) / 0.04 (1116x581 pixels) | ### Coarse Dropout -| | Coarse Dropout | -| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| Description | This transformation adds dropout on the image, turning small patches into black pixels. | -| Comments | It is a new transform that was not in the original DAN implementation. | -| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/dropout/coarse_dropout/#coarsedropout-augmentation-augmentationsdropoutcoarse_dropout). | -| Examples |   | -| CPU time (seconds/10 images) | 0.02 (3013x128 pixels) / 0.02 (1116x581 pixels) | - -### Downscale - -| | Downscale | -| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| Description | This transformation downscales the image by a random factor. | -| Comments | It is a new transform that was not in the original DAN implementation. | -| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.Downscale). | -| Examples |   | -| CPU time (seconds/10 images) | 0.03 (3013x128 pixels) / 0.03 (1116x581 pixels) | - -### Grayscale - -| | Grayscale | -| ---------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| Description | This transformation transforms an RGB image into grayscale. 
| -| Comments | It is a new transform that was not in the original DAN implementation. | -| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.ToGray). | -| Examples |   | -| CPU time (seconds/10 images) | 0.02 (3013x128 pixels) / 0.02 (1116x581 pixels) | +| | Coarse Dropout | +| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| Description | This transformation adds dropout on the image, turning small patches into black pixels. | +| Comments | It is a new transform that was not in the original DAN implementation. | +| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/dropout/coarse_dropout/#coarsedropout-augmentation-augmentationsdropoutcoarse_dropout) | +| Examples |   | +| CPU time (seconds/10 images) | 0.02 (3013x128 pixels) / 0.02 (1116x581 pixels) | + +### DPIAdjusting + +| | DPIAdjusting | +| ----------- | -------------------------------------------------------------- | +| Description | This transformation downscales the image from a random factor. | +| Comments | Similar to the original DAN implementation. | + +### ToGray + +| | ToGray | +| ---------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Description | This transformation transforms an RGB image into grayscale. | +| Comments | It is a new transform that was not in the original DAN implementation. | +| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.ToGray) | +| Examples |   | +| CPU time (seconds/10 images) | 0.02 (3013x128 pixels) / 0.02 (1116x581 pixels) | ## Full augmentation pipeline diff --git a/docs/usage/train/parameters.md b/docs/usage/train/parameters.md index feb0a48c..c4a97265 100644 --- a/docs/usage/train/parameters.md +++ b/docs/usage/train/parameters.md @@ -91,21 +91,30 @@ DAN takes advantage of transforms from [albumentations](https://albumentations.a The following configuration is used by default when using the `teklia-dan train document` command. Data augmentation is applied with a probability of 0.9. In this case, two transformations are randomly selected to be applied. 
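A minimal usage sketch (not part of the default configuration): it assumes `albumentations`, NumPy and Pillow are installed and uses a placeholder image path. Like any `A.Compose` pipeline, it is called on a NumPy image and returns the augmented array under the `"image"` key.

```py
import albumentations as A
import numpy as np
from PIL import Image

# Placeholder input: any RGB image as a NumPy uint8 array.
img = np.array(Image.open("page.jpg").convert("RGB"))

# Tiny stand-in pipeline with the same calling convention as the default one below.
demo_transforms = A.Compose([A.GaussianBlur(sigma_limit=2.5, p=1)], p=0.9)

augmented = demo_transforms(image=img)["image"]  # augmented image as a NumPy array
```

The default pipeline used by the `teklia-dan train document` command is reproduced below and is applied in exactly the same way.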
```py -transforms = SomeOf( +transforms = A.Compose( [ - Perspective(scale=(0.05, 0.09), fit_output=True), - GaussianBlur(sigma_limit=2.5), - GaussNoise(var_limit=50**2), - ColorJitter(contrast=0.2, brightness=0.2, saturation=0.2, hue=0.2), - ElasticTransform(alpha=20.0, sigma=5.0, alpha_affine=1.0, border_mode=0), - Sharpen(alpha=(0.0, 1.0)), - ErosionDilation(min_kernel=1, max_kernel=4, iterations=1), - Affine(shear={"x": (-20, 20), "y": (0, 0)}), - CoarseDropout(), - Downscale(scale_min=0.5, scale_max=0.9, interpolation=INTER_NEAREST), - ToGray(), + DPIAdjusting(min_factor=0.75, max_factor=1), + A.SomeOf( + [ + ErosionDilation(min_kernel=1, max_kernel=4, iterations=1), + Perspective(scale=(0.05, 0.09), fit_output=True, p=0.4), + GaussianBlur(sigma_limit=2.5, p=1), + GaussNoise(var_limit=50**2, p=1), + ColorJitter( + contrast=0.2, brightness=0.2, saturation=0.2, hue=0.2, p=1 + ), + ElasticTransform( + alpha=20.0, sigma=5.0, alpha_affine=1.0, border_mode=0, p=1 + ), + Sharpen(alpha=(0.0, 1.0), p=1), + Affine(shear={"x": (-20, 20), "y": (0, 0)}, p=1), + CoarseDropout(p=1), + ToGray(p=0.5), + ], + n=2, + p=0.9, + ), ], - n=2, p=0.9, ) ``` diff --git a/tests/data/prediction/parameters.yml b/tests/data/prediction/parameters.yml index db1880b0..f6014227 100644 --- a/tests/data/prediction/parameters.yml +++ b/tests/data/prediction/parameters.yml @@ -1,5 +1,7 @@ --- parameters: + mean: [166.8418783515498, 166.8418783515498, 166.8418783515498] + std: [34.084189571536385, 34.084189571536385, 34.084189571536385] max_char_prediction: 200 encoder: dropout: 0.5 diff --git a/tests/data/training/models/best_0.pt b/tests/data/training/models/best_0.pt index 0a64dc36..79bcb28a 100644 --- a/tests/data/training/models/best_0.pt +++ b/tests/data/training/models/best_0.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c6c70bcb4f97182b750d0ed206725c0f5e28b494907ee7d993aac210ce0b2bb8 +oid sha256:428ceb4d08363c05b6e60e87e5e1ae65560d345756926c23f13e6d191dc33d69 size 84773087 diff --git a/tests/data/training/models/last_3.pt b/tests/data/training/models/last_3.pt index abd80b84..c7ecb5db 100644 --- a/tests/data/training/models/last_3.pt +++ b/tests/data/training/models/last_3.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:851fabb41bc61e79edff2db4edbb477005181bafa58eb21f115e139eee91cb67 +oid sha256:c2029a5822c5a8d4253a95c33a05357e707b9b46d056988dcab730945dfd5775 size 84773087 diff --git a/tests/test_prediction.py b/tests/test_prediction.py index 5395c16e..a644fa80 100644 --- a/tests/test_prediction.py +++ b/tests/test_prediction.py @@ -98,7 +98,7 @@ def test_predict( "by ner token": [], "total": 0.93, "word": [ - {"text": "ⓈBellisson", "confidence": 0.92}, + {"text": "ⓈBellisson", "confidence": 0.93}, {"text": "â’»Georges", "confidence": 0.94}, {"text": "â’·91", "confidence": 0.92}, {"text": "â“P", "confidence": 0.94}, @@ -169,7 +169,7 @@ def test_predict( {"text": "p", "confidence": 1.0}, {"text": "l", "confidence": 1.0}, {"text": "i", "confidence": 1.0}, - {"text": "é", "confidence": 0.86}, + {"text": "é", "confidence": 0.85}, {"text": " ", "confidence": 1.0}, {"text": "â’»", "confidence": 1.0}, {"text": "M", "confidence": 1.0}, diff --git a/tests/test_training.py b/tests/test_training.py index 92a3c6bb..1dc2201e 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -16,7 +16,7 @@ from tests.conftest import FIXTURES "last_3.pt", { "nb_chars": 43, - "cer": 1.2558, + "cer": 1.3023, "nb_words": 9, "wer": 1.0, "nb_words_no_punct": 9, @@ -25,7 +25,7 @@ 
from tests.conftest import FIXTURES }, { "nb_chars": 41, - "cer": 1.2927, + "cer": 1.2683, "nb_words": 9, "wer": 1.0, "nb_words_no_punct": 9, @@ -34,7 +34,7 @@ from tests.conftest import FIXTURES }, { "nb_chars": 49, - "cer": 1.102, + "cer": 1.1224, "nb_words": 9, "wer": 1.0, "nb_words_no_punct": 9, @@ -66,6 +66,12 @@ from tests.conftest import FIXTURES "type": "max_resize", } ], + "mean": [ + 242.10595854671013, + 242.10595854671013, + 242.10595854671013, + ], + "std": [28.29919517652322, 28.29919517652322, 28.29919517652322], }, }, ), @@ -175,12 +181,12 @@ def test_train_and_test( } assert res == expected_res - # Check that the parameters file is correct + # Check that the inference parameters file is correct with ( tmp_path / training_config["training_params"]["output_folder"] / "results" - / "parameters.yml" + / "inference_parameters.yml" ).open() as f: res = yaml.safe_load(f) assert res == params_res -- GitLab
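For reference, a minimal sketch of the dataset normalization this patch introduces (image paths are placeholders): the two-pass computation mirrors `OCRDataset.compute_std_mean`, and the final lines mirror the per-sample `(img - mean) / std` applied in `OCRDataset.__getitem__`. The resulting `mean` and `std` are the values that `save_params` writes to `inference_parameters.yml`; at prediction time, the `prediction.py` hunk above loads them, divides them by 255 and normalizes each channel of the preprocessed image.

```py
import numpy as np
from PIL import Image

# Placeholder training image paths.
paths = ["train/0001.jpg", "train/0002.jpg"]

# First pass: per-channel mean over every pixel of the training set (0-255 scale).
total, nb_pixels = np.zeros(3), 0
for p in paths:
    img = np.asarray(Image.open(p).convert("RGB"), dtype=np.float64)
    total += img.sum(axis=(0, 1))
    nb_pixels += img.shape[0] * img.shape[1]
mean = total / nb_pixels

# Second pass: per-channel standard deviation around that mean.
diff = np.zeros(3)
for p in paths:
    img = np.asarray(Image.open(p).convert("RGB"), dtype=np.float64)
    diff += ((img - mean) ** 2).sum(axis=(0, 1))
std = np.sqrt(diff / nb_pixels)

# Per-sample normalization, as applied in OCRDataset.__getitem__.
sample = np.asarray(Image.open(paths[0]).convert("RGB"), dtype=np.float64)
normalized = (sample - mean) / std
```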