diff --git a/dan/ocr/transforms.py b/dan/ocr/transforms.py index 8a10ce170857a284b43715972e4a0e1c3580e1b6..3a8c74f64ac5f586a02ce070f4ca6aaf1c3bd03f 100644 --- a/dan/ocr/transforms.py +++ b/dan/ocr/transforms.py @@ -6,6 +6,7 @@ from enum import Enum from random import randint import albumentations as A +import cv2 import numpy as np from albumentations.augmentations import ( Affine, @@ -15,16 +16,16 @@ from albumentations.augmentations import ( GaussianBlur, GaussNoise, Perspective, + RandomScale, Sharpen, ToGray, ) from albumentations.core.transforms_interface import ImageOnlyTransform -from cv2 import dilate, erode, resize +from cv2 import dilate, erode from numpy import random from torch import Tensor -from torch.distributions.uniform import Uniform from torchvision.transforms import Compose, ToPILImage -from torchvision.transforms.functional import resize as resize_tensor +from torchvision.transforms.functional import resize class Preprocessing(str, Enum): @@ -54,7 +55,7 @@ class FixedHeightResize: def __call__(self, img: Tensor) -> Tensor: size = (self.height, self._calc_new_width(img)) - return resize_tensor(img, size, antialias=False) + return resize(img, size, antialias=False) def _calc_new_width(self, img: Tensor) -> int: aspect_ratio = img.shape[2] / img.shape[1] @@ -71,7 +72,7 @@ class FixedWidthResize: def __call__(self, img: Tensor) -> Tensor: size = (self._calc_new_height(img), self.width) - return resize_tensor(img, size, antialias=False) + return resize(img, size, antialias=False) def _calc_new_height(self, img: Tensor) -> int: aspect_ratio = img.shape[1] / img.shape[2] @@ -96,7 +97,7 @@ class MaxResize: ratio = min(height_ratio, width_ratio) new_width = int(width * ratio) new_height = int(height * ratio) - return resize_tensor(img, (new_height, new_width), antialias=False) + return resize(img, (new_height, new_width), antialias=False) class Dilation: @@ -156,29 +157,6 @@ class ErosionDilation(ImageOnlyTransform): ) -class 
DPIAdjusting(ImageOnlyTransform): - """ - Resolution modification - """ - - def __init__( - self, - min_factor: float = 0.75, - max_factor: float = 1, - always_apply: bool = False, - p: float = 1.0, - ): - super(DPIAdjusting, self).__init__(always_apply, p) - self.min_factor = min_factor - self.max_factor = max_factor - self.p = p - self.always_apply = False - - def apply(self, img: np.ndarray, **params): - factor = float(Uniform(self.min_factor, self.max_factor).sample()) - return resize(img, None, fx=factor, fy=factor) - - def get_preprocessing_transforms( preprocessings: list, to_pil_image: bool = False ) -> Compose: @@ -212,7 +190,10 @@ def get_augmentation_transforms() -> A.Compose: """ return A.Compose( [ - DPIAdjusting(min_factor=0.75, max_factor=1), + # Scale between 0.75 and 1.0 + RandomScale( + scale_limit=[-0.25, 0], always_apply=True, interpolation=cv2.INTER_AREA + ), A.SomeOf( [ ErosionDilation(min_kernel=1, max_kernel=4, iterations=1), @@ -220,10 +201,18 @@ def get_augmentation_transforms() -> A.Compose: GaussianBlur(sigma_limit=2.5, p=1), GaussNoise(var_limit=50**2, p=1), ColorJitter( - contrast=0.2, brightness=0.2, saturation=0.2, hue=0.2, p=1 + contrast=0.2, + brightness=0.2, + saturation=0.2, + hue=0.2, + p=1, ), ElasticTransform( - alpha=20.0, sigma=5.0, alpha_affine=1.0, border_mode=0, p=1 + alpha=20.0, + sigma=5.0, + alpha_affine=1.0, + border_mode=0, + p=1, ), Sharpen(alpha=(0.0, 1.0), p=1), Affine(shear={"x": (-20, 20), "y": (0, 0)}, p=1), diff --git a/docs/assets/augmentations/document_original.png b/docs/assets/augmentations/document_original.png new file mode 100644 index 0000000000000000000000000000000000000000..57fe507d33f2ee10c831c6c5bf7fa789904d110b Binary files /dev/null and b/docs/assets/augmentations/document_original.png differ diff --git a/docs/assets/augmentations/document_random_scale.png b/docs/assets/augmentations/document_random_scale.png new file mode 100644 index 
0000000000000000000000000000000000000000..1da83f90866f7fa1393f1f0f6419b987eabe9e68 Binary files /dev/null and b/docs/assets/augmentations/document_random_scale.png differ diff --git a/docs/assets/augmentations/line_full_pipeline_2.png b/docs/assets/augmentations/line_full_pipeline_2.png new file mode 100644 index 0000000000000000000000000000000000000000..ca0e9203977dd46c28e86548b69bf6f75dab96fd Binary files /dev/null and b/docs/assets/augmentations/line_full_pipeline_2.png differ diff --git a/docs/assets/augmentations/line_original.png b/docs/assets/augmentations/line_original.png new file mode 100644 index 0000000000000000000000000000000000000000..46d9cc3421ad47fda6fc12791296c037b4096c74 Binary files /dev/null and b/docs/assets/augmentations/line_original.png differ diff --git a/docs/assets/augmentations/line_random_scale.png b/docs/assets/augmentations/line_random_scale.png new file mode 100644 index 0000000000000000000000000000000000000000..1c402e682ec6c7fe7b687be7ceac837bf0d4f4bd Binary files /dev/null and b/docs/assets/augmentations/line_random_scale.png differ diff --git a/docs/usage/train/augmentation.md b/docs/usage/train/augmentation.md index 1f98e0244ca9346b0f27ecdeac21c8ee4baff277..cc18d45a797362dc356d56b74aa47ebff87cb8ea 100644 --- a/docs/usage/train/augmentation.md +++ b/docs/usage/train/augmentation.md @@ -107,14 +107,16 @@ This page lists data augmentation transforms used in DAN. | Examples |   | | CPU time (seconds/10 images) | 0.02 (3013x128 pixels) / 0.02 (1116x581 pixels) | -### DPIAdjusting +### Random Scale -| | DPIAdjusting | -| ----------- | -------------------------------------------------------------- | -| Description | This transformation downscales the image from a random factor. | -| Comments | Similar to the original DAN implementation. 
| +| | RandomScale | +| ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Description | This transformation downscales the image by a random factor. | +| Comments | The original DAN implementation reimplemented it as [DPIAdjusting](https://github.com/FactoDeepLearning/DAN/blob/da3046a1cc83e9be3e54dd31a5e74d6134d1ebdc/basic/transforms.py#L62). | +| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/geometric/resize/#albumentations.augmentations.geometric.resize.RandomScale) | +| Examples |   | -### ToGray +### To Gray | | ToGray | | ---------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | diff --git a/docs/usage/train/config.md b/docs/usage/train/config.md index 148f0fa96536b453afd2337607d46063c4ef402e..a435fdc6340f2e8a9ed3987021c0dfc00eb24724 100644 --- a/docs/usage/train/config.md +++ b/docs/usage/train/config.md @@ -134,7 +134,8 @@ The following configuration is used by default when using the `teklia-dan train` ```py transforms = A.Compose( [ - DPIAdjusting(min_factor=0.75, max_factor=1), + # Scale between 0.75 and 1.0 + RandomScale(scale_limit=[-0.25, 0], always_apply=True, interpolation=cv2.INTER_AREA), A.SomeOf( [ ErosionDilation(min_kernel=1, max_kernel=4, iterations=1), diff --git a/tests/data/training/models/best_0.pt b/tests/data/training/models/best_0.pt index 79bcb28a2388f266d497ee4d612f1835eba5cae6..889355b039f64ff270086a11714986e13621a50b 100644 --- a/tests/data/training/models/best_0.pt +++ b/tests/data/training/models/best_0.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:428ceb4d08363c05b6e60e87e5e1ae65560d345756926c23f13e6d191dc33d69 +oid
sha256:a1fc3f9bdf52055aadefe49ff3537d9e6c9b1af5b14364d027d3fcdebeabf9a7 size 84773087 diff --git a/tests/data/training/models/last_3.pt b/tests/data/training/models/last_3.pt index c7ecb5dbaa3f26c8d0933faa42685277b66cbd05..2cbaf1a2cf52e11e10390e51c98969d01de67038 100644 --- a/tests/data/training/models/last_3.pt +++ b/tests/data/training/models/last_3.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c2029a5822c5a8d4253a95c33a05357e707b9b46d056988dcab730945dfd5775 +oid sha256:8e8536dc913af4d1560413c1d91aa41d1636c0b8ed557389b82e74fb991312b0 size 84773087