Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • atr/dan
1 result
Show changes
Commits on Source (3)
Showing
with 240 additions and 498 deletions
......@@ -5,7 +5,7 @@ repos:
- id: isort
args: ["--profile", "black"]
- repo: https://github.com/ambv/black
rev: 23.1.0
rev: 23.3.0
hooks:
- id: black
- repo: https://github.com/pycqa/flake8
......@@ -35,7 +35,7 @@ repos:
- id: end-of-file-fixer
- id: mixed-line-ending
- repo: https://github.com/codespell-project/codespell
rev: v2.2.2
rev: v2.2.5
hooks:
- id: codespell
args: ['--write-changes']
......
......@@ -3,15 +3,18 @@ import json
import os
import random
import cv2
import numpy as np
import torch
from PIL import Image
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.distributed import DistributedSampler
from torchvision.io import ImageReadMode, read_image
from dan.datasets.utils import natural_sort
from dan.transforms import apply_data_augmentation
from dan.transforms import (
get_augmentation_transforms,
get_normalization_transforms,
get_preprocessing_transforms,
)
class DatasetManager:
......@@ -60,11 +63,13 @@ class DatasetManager:
"train",
self.params["train"]["name"],
self.get_paths_and_sets(self.params["train"]["datasets"]),
normalization_transforms=get_normalization_transforms(),
augmentation_transforms=(
get_augmentation_transforms()
if self.params["config"]["augmentation"]
else None
),
)
(
self.params["config"]["mean"],
self.params["config"]["std"],
) = self.train_dataset.compute_std_mean()
self.my_collate_function = self.train_dataset.collate_function(
self.params["config"]
......@@ -77,6 +82,8 @@ class DatasetManager:
"val",
custom_name,
self.get_paths_and_sets(self.params["val"][custom_name]),
normalization_transforms=get_normalization_transforms(),
augmentation_transforms=None,
)
self.apply_specific_treatment_after_dataset_loading(
self.valid_datasets[custom_name]
......@@ -155,7 +162,11 @@ class DatasetManager:
{"path": self.params["datasets"][set_info[0]], "set_name": set_info[1]}
)
self.test_datasets[custom_name] = self.dataset_class(
self.params, "test", custom_name, paths_and_sets
self.params,
"test",
custom_name,
paths_and_sets,
normalization_transforms=get_normalization_transforms(),
)
self.apply_specific_treatment_after_dataset_loading(
self.test_datasets[custom_name]
......@@ -199,29 +210,18 @@ class GenericDataset(Dataset):
self.params = params
self.name = custom_name
self.set_name = set_name
self.mean = (
np.array(params["config"]["mean"])
if "mean" in params["config"].keys()
else None
)
self.std = (
np.array(params["config"]["std"])
if "std" in params["config"].keys()
else None
)
self.preprocessing_transforms = get_preprocessing_transforms(
params["config"]["preprocessings"]
)
self.load_in_memory = (
self.params["config"]["load_in_memory"]
if "load_in_memory" in self.params["config"]
else True
)
self.samples = self.load_samples(
paths_and_sets, load_in_memory=self.load_in_memory
)
if self.load_in_memory:
self.apply_preprocessing(params["config"]["preprocessings"])
# Load samples and preprocess images if load_in_memory is True
self.samples = self.load_samples(paths_and_sets)
self.curriculum_config = None
......@@ -230,15 +230,13 @@ class GenericDataset(Dataset):
@staticmethod
def load_image(path):
with Image.open(path) as pil_img:
img = np.array(pil_img)
# grayscale images
if len(img.shape) == 2:
img = np.expand_dims(img, axis=2)
return img
"""
Load an image as a torch.Tensor and scale the values between 0 and 1.
"""
img = read_image(path, mode=ImageReadMode.RGB)
return img.to(dtype=torch.get_default_dtype()).div(255)
@staticmethod
def load_samples(paths_and_sets, load_in_memory=True):
def load_samples(self, paths_and_sets):
"""
Load images and labels
"""
......@@ -262,64 +260,12 @@ class GenericDataset(Dataset):
"path": os.path.abspath(filename),
}
)
if load_in_memory:
samples[-1]["img"] = GenericDataset.load_image(filename)
if self.load_in_memory:
samples[-1]["img"] = self.preprocessing_transforms(
self.load_image(filename)
)
return samples
def apply_preprocessing(self, preprocessings):
for i in range(len(self.samples)):
(
self.samples[i]["img"],
self.samples[i]["resize_ratio"],
) = apply_preprocessing(self.samples[i]["img"], preprocessings)
def compute_std_mean(self):
"""
Compute cumulated variance and mean of whole dataset
"""
if self.mean is not None and self.std is not None:
return self.mean, self.std
sum = np.zeros((3,))
diff = np.zeros((3,))
nb_pixels = 0
for metric in ["mean", "std"]:
for ind in range(len(self.samples)):
img = (
self.get_sample_img(ind)
if self.load_in_memory
else apply_preprocessing(
self.get_sample_img(ind),
self.params["config"]["preprocessings"],
)[0]
)
if metric == "mean":
sum += np.sum(img, axis=(0, 1))
nb_pixels += np.prod(img.shape[:2])
elif metric == "std":
diff += [
np.sum((img[:, :, k] - self.mean[k]) ** 2) for k in range(3)
]
if metric == "mean":
self.mean = sum / nb_pixels
elif metric == "std":
self.std = np.sqrt(diff / nb_pixels)
return self.mean, self.std
def apply_data_augmentation(self, img):
"""
Apply data augmentation strategy on the input image
"""
augs = [
self.params["config"][key] if key in self.params["config"].keys() else None
for key in ["augmentation", "valid_augmentation", "test_augmentation"]
]
for aug, set_name in zip(augs, ["train", "val", "test"]):
if aug and self.set_name == set_name:
return apply_data_augmentation(img, aug)
return img
def get_sample_img(self, i):
"""
Get image by index
......@@ -327,60 +273,6 @@ class GenericDataset(Dataset):
if self.load_in_memory:
return self.samples[i]["img"]
else:
return GenericDataset.load_image(self.samples[i]["path"])
def apply_preprocessing(img, preprocessings):
"""
Apply preprocessings on an image
"""
resize_ratio = [1, 1]
for preprocessing in preprocessings:
if preprocessing["type"] == "to_grayscaled":
temp_img = img
h, w, c = temp_img.shape
if c == 3:
img = np.expand_dims(
0.2125 * temp_img[:, :, 0]
+ 0.7154 * temp_img[:, :, 1]
+ 0.0721 * temp_img[:, :, 2],
axis=2,
).astype(np.uint8)
if preprocessing["type"] == "to_RGB":
temp_img = img
h, w, c = temp_img.shape
if c == 1:
img = np.concatenate([temp_img, temp_img, temp_img], axis=2)
if preprocessing["type"] == "resize":
keep_ratio = preprocessing["keep_ratio"]
max_h, max_w = preprocessing["max_height"], preprocessing["max_width"]
temp_img = img
h, w, c = temp_img.shape
ratio_h = max_h / h if max_h else 1
ratio_w = max_w / w if max_w else 1
if keep_ratio:
ratio_h = ratio_w = min(ratio_w, ratio_h)
new_h = min(max_h, int(h * ratio_h))
new_w = min(max_w, int(w * ratio_w))
temp_img = cv2.resize(temp_img, (new_w, new_h))
if len(temp_img.shape) == 2:
temp_img = np.expand_dims(temp_img, axis=2)
img = temp_img
resize_ratio = [ratio_h, ratio_w]
if preprocessing["type"] == "fixed_height":
new_h = preprocessing["height"]
temp_img = img
h, w, c = temp_img.shape
ratio = new_h / h
temp_img = cv2.resize(temp_img, (int(w * ratio), new_h))
if len(temp_img.shape) == 2:
temp_img = np.expand_dims(temp_img, axis=2)
img = temp_img
resize_ratio = [ratio, ratio]
return img, resize_ratio
return self.preprocessing_transforms(
self.load_image(self.samples[i]["path"])
)
......@@ -2,11 +2,9 @@
import os
import pickle
import cv2
import numpy as np
import torch
from dan.manager.dataset import DatasetManager, GenericDataset, apply_preprocessing
from dan.manager.dataset import DatasetManager, GenericDataset
from dan.utils import pad_images, pad_sequences_1D, token_to_ind
......@@ -52,43 +50,43 @@ class OCRDataset(GenericDataset):
Specific class to handle OCR/HTR datasets
"""
def __init__(self, params, set_name, custom_name, paths_and_sets):
def __init__(
self,
params,
set_name,
custom_name,
paths_and_sets,
normalization_transforms,
augmentation_transforms=None,
):
"""
Initialise the generic dataset, then set the OCR-specific attributes.

:param params: Forwarded unchanged to GenericDataset.__init__.
:param set_name: Forwarded unchanged to GenericDataset.__init__.
:param custom_name: Forwarded unchanged to GenericDataset.__init__.
:param paths_and_sets: Forwarded unchanged to GenericDataset.__init__.
:param normalization_transforms: Callable stored on the instance; __getitem__ applies it to each image.
:param augmentation_transforms: Optional callable stored on the instance; when truthy, __getitem__
calls it with an `image=` keyword and reads the "image" key of the result.
"""
super(OCRDataset, self).__init__(params, set_name, custom_name, paths_and_sets)
# Left unset here; presumably filled in after the dataset is loaded - TODO confirm against the manager
self.charset = None
self.tokens = None
# Factor to reduce the height and width of the feature vector before feeding the decoder.
self.reduce_dims_factor = np.array([32, 8, 1])
# The collate class itself is stored, not an instance
self.collate_function = OCRCollateFunction
self.normalization_transforms = normalization_transforms
self.augmentation_transforms = augmentation_transforms
def __getitem__(self, idx):
sample = dict(**self.samples[idx])
if not self.load_in_memory:
sample["img"] = self.get_sample_img(idx)
sample["img"], sample["resize_ratio"] = apply_preprocessing(
sample["img"], self.params["config"]["preprocessings"]
)
# Data augmentation
sample["img"] = self.apply_data_augmentation(sample["img"])
if "max_size" in self.params["config"] and self.params["config"]["max_size"]:
max_ratio = max(
sample["img"].shape[0]
/ self.params["config"]["max_size"]["max_height"],
sample["img"].shape[1] / self.params["config"]["max_size"]["max_width"],
)
if max_ratio > 1:
new_h, new_w = int(np.ceil(sample["img"].shape[0] / max_ratio)), int(
np.ceil(sample["img"].shape[1] / max_ratio)
)
sample["img"] = cv2.resize(sample["img"], (new_w, new_h))
if self.augmentation_transforms:
sample["img"] = self.augmentation_transforms(image=np.array(sample["img"]))[
"image"
]
# Normalization
sample["img"] = (sample["img"] - self.mean) / self.std
sample["img"] = self.normalization_transforms(sample["img"])
# Get final height and width
final_c, final_h, final_w = sample["img"].shape
sample["img_reduced_shape"] = np.ceil(
sample["img"].shape / self.reduce_dims_factor
[final_h, final_w, final_c] / self.reduce_dims_factor
).astype(int)
if self.set_name == "train":
......@@ -97,8 +95,8 @@ class OCRDataset(GenericDataset):
]
sample["img_position"] = [
[0, sample["img"].shape[0]],
[0, sample["img"].shape[1]],
[0, final_h],
[0, final_w],
]
return sample
......@@ -131,12 +129,10 @@ class OCRCollateFunction:
def __call__(self, batch_data):
labels = [batch_data[i]["token_label"] for i in range(len(batch_data))]
labels = pad_sequences_1D(labels, padding_value=self.label_padding_value)
labels = torch.tensor(labels).long()
labels = pad_sequences_1D(labels, padding_value=self.label_padding_value).long()
imgs = [batch_data[i]["img"] for i in range(len(batch_data))]
imgs = pad_images(imgs)
imgs = torch.tensor(imgs).float().permute(0, 3, 1, 2)
formatted_batch_data = {
formatted_key: [batch_data[i][initial_key] for i in range(len(batch_data))]
......
......@@ -15,7 +15,7 @@ from dan.encoder import FCN_Encoder
from dan.manager.training import Manager
from dan.mlflow import MLFLOW_AVAILABLE
from dan.schedulers import exponential_dropout_scheduler
from dan.transforms import aug_config
from dan.transforms import Preprocessing
from dan.utils import MLflowNotInstalled
if MLFLOW_AVAILABLE:
......@@ -107,11 +107,12 @@ def get_config():
"worker_per_gpu": 4, # Num of parallel processes per gpu for data loading
"preprocessings": [
{
"type": "to_RGB",
# if grayscaled image, produce RGB one (3 channels with same value) otherwise do nothing
},
"type": Preprocessing.MaxResize,
"max_width": 2000,
"max_height": 2000,
}
],
"augmentation": aug_config(0.9, 0.1),
"augmentation": True,
},
},
"model_params": {
......@@ -257,18 +258,18 @@ def serialize_config(config):
return serialized_config
def start_training(config) -> None:
def start_training(config, mlflow_logging: bool) -> None:
if (
config["training_params"]["use_ddp"]
and not config["training_params"]["force_cpu"]
):
mp.spawn(
train_and_test,
args=(config, True),
args=(config, mlflow_logging),
nprocs=config["training_params"]["nb_gpu"],
)
else:
train_and_test(0, config, True)
train_and_test(0, config, mlflow_logging)
def run():
......@@ -285,7 +286,7 @@ def run():
raise MLflowNotInstalled()
if "mlflow" not in config:
start_training(config)
start_training(config, mlflow_logging=False)
else:
labels_path = (
Path(config["dataset_params"]["datasets"][dataset_name]) / "labels.json"
......@@ -313,4 +314,4 @@ def run():
dictionary=artifact,
artifact_file=filename,
)
start_training(config)
start_training(config, mlflow_logging=True)
......@@ -20,6 +20,7 @@ from dan.predict.attention import (
plot_attention,
split_text_and_confidences,
)
from dan.transforms import get_normalization_transforms
from dan.utils import ind_to_token, read_image
......@@ -74,7 +75,7 @@ class DAN:
self.encoder = encoder
self.decoder = decoder
self.mean, self.std = parameters["mean"], parameters["std"]
self.normalization = get_normalization_transforms()
self.max_chars = parameters["max_char_prediction"]
def preprocess(self, input_image):
......@@ -89,7 +90,7 @@ class DAN:
if len(input_image.shape) < 3:
input_image = cv2.cvtColor(input_image, cv2.COLOR_GRAY2RGB)
input_image = (input_image - self.mean) / self.std
input_image = self.normalization(input_image)
return input_image
def predict(
......
......@@ -2,354 +2,206 @@
"""
Each transform defined here takes an image (a torch.Tensor for the resize transforms, an array-like for the morphological ones) and returns the modified image
"""
import math
from enum import Enum
from random import randint
import cv2
import numpy as np
from cv2 import dilate, erode, normalize
from numpy import random
from PIL import Image
from torch import rand, randint
from torch.distributions.uniform import Uniform
from torchvision.transforms import (
from albumentations import OneOf, SomeOf
from albumentations.augmentations import (
Affine,
CoarseDropout,
ColorJitter,
Downscale,
ElasticTransform,
GaussianBlur,
RandomCrop,
RandomPerspective,
GaussNoise,
Perspective,
PiecewiseAffine,
Sharpen,
ToGray,
)
from torchvision.transforms.functional import InterpolationMode
from cv2 import INTER_NEAREST, dilate, erode
from numpy import random
from torch import Tensor
from torchvision.transforms import Compose, Normalize, ToPILImage, ToTensor
from torchvision.transforms.functional import resize
class Dilation:
"""
OCR: stroke width increasing
"""
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]
def __init__(self, kernel, iterations):
self.kernel = np.ones(kernel, np.uint8)
self.iterations = iterations
def __call__(self, x):
return Image.fromarray(
dilate(np.array(x), self.kernel, iterations=self.iterations)
)
class Preprocessing(str, Enum):
    """
    Names of the preprocessing steps that can be requested in the configuration.

    Members also subclass ``str``, so each one compares equal to its raw
    string value (e.g. ``Preprocessing.MaxResize == "max_resize"``).
    """

    # If the image is bigger than the given size, resize it while keeping the original ratio
    MaxResize = "max_resize"
    # Resize the height to a fixed value while keeping the original ratio
    FixedHeightResize = "fixed_height_resize"
    # Resize the width to a fixed value while keeping the original ratio
    FixedWidthResize = "fixed_width_resize"
class Erosion:
class FixedHeightResize:
"""
OCR: stroke width decreasing
Resize an image tensor to a fixed height
"""
def __init__(self, kernel, iterations):
self.kernel = np.ones(kernel, np.uint8)
self.iterations = iterations
def __init__(self, height: int) -> None:
self.height = height
def __call__(self, x):
return Image.fromarray(
erode(np.array(x), self.kernel, iterations=self.iterations)
)
def __call__(self, img: Tensor) -> Tensor:
"""
Resize `img` to the configured height.

The target width comes from _calc_new_width, so the ratio between
img.shape[2] and img.shape[1] is kept (up to rounding).
Assumes img is laid out as (channels, height, width) - TODO confirm.
"""
size = (self.height, self._calc_new_width(img))
# Antialiasing is explicitly disabled for this resize
return resize(img, size, antialias=False)
def _calc_new_width(self, img: Tensor) -> int:
"""
Return the configured height multiplied by img.shape[2] / img.shape[1], passed through round().
"""
aspect_ratio = img.shape[2] / img.shape[1]
return round(self.height * aspect_ratio)
class GaussianNoise:
"""
Add Gaussian Noise
"""
def __init__(self, std):
self.std = std
def __call__(self, x):
x_np = np.array(x)
mean, std = np.mean(x_np), np.std(x_np)
std = math.copysign(max(abs(std), 0.000001), std)
min_, max_ = np.min(
x_np,
), np.max(x_np)
normal_noise = np.random.randn(*x_np.shape)
if (
len(x_np.shape) == 3
and x_np.shape[2] == 3
and np.all(x_np[:, :, 0] == x_np[:, :, 1])
and np.all(x_np[:, :, 0] == x_np[:, :, 2])
):
normal_noise[:, :, 1] = normal_noise[:, :, 2] = normal_noise[:, :, 0]
x_np = ((x_np - mean) / std + normal_noise * self.std) * std + mean
x_np = normalize(x_np, x_np, max_, min_, cv2.NORM_MINMAX)
return Image.fromarray(x_np.astype(np.uint8))
class Sharpen:
class FixedWidthResize:
"""
Add Gaussian Noise
Resize an image tensor to a fixed width
"""
def __init__(self, alpha, strength):
self.alpha = alpha
self.strength = strength
def __init__(self, width: int) -> None:
self.width = width
def __call__(self, x):
x_np = np.array(x)
id_matrix = np.array([[0, 0, 0], [0, 1, 0], [0, 0, 0]])
effect_matrix = np.array([[1, 1, 1], [1, -(8 + self.strength), 1], [1, 1, 1]])
kernel = (1 - self.alpha) * id_matrix - self.alpha * effect_matrix
kernel = np.expand_dims(kernel, axis=2)
kernel = np.concatenate([kernel, kernel, kernel], axis=2)
sharpened = cv2.filter2D(x_np, -1, kernel=kernel[:, :, 0])
return Image.fromarray(sharpened.astype(np.uint8))
def __call__(self, img: Tensor) -> Tensor:
"""
Resize `img` to the configured width.

The target height comes from _calc_new_height, so the ratio between
img.shape[1] and img.shape[2] is kept (up to rounding).
Assumes img is laid out as (channels, height, width) - TODO confirm.
"""
size = (self._calc_new_height(img), self.width)
# Antialiasing is explicitly disabled for this resize
return resize(img, size, antialias=False)
def _calc_new_height(self, img: Tensor) -> int:
"""
Return the configured width multiplied by img.shape[1] / img.shape[2], passed through round().
"""
aspect_ratio = img.shape[1] / img.shape[2]
return round(self.width * aspect_ratio)
class ZoomRatio:
class MaxResize:
"""
Crop by ratio
Preserve dimensions if keep_dim = True (= zoom)
Resize an image tensor if it is bigger than the maximum size
"""
def __init__(self, ratio_h, ratio_w, keep_dim=True):
self.ratio_w = ratio_w
self.ratio_h = ratio_h
self.keep_dim = keep_dim
def __init__(self, height: int, width: int) -> None:
self.max_width = width
self.max_height = height
def __call__(self, x):
w, h = x.size
x = RandomCrop((int(h * self.ratio_h), int(w * self.ratio_w)))(x)
if self.keep_dim:
x = x.resize((w, h), Image.BILINEAR)
return x
def __call__(self, img: Tensor) -> Tensor:
    """
    Downscale `img` so it fits within the maximum size, keeping its ratio.

    :param img: 3-dimensional image tensor; dimensions 1 and 2 are read as
        height and width.
    :return: `img` itself when it already fits within (max_height, max_width),
        otherwise the result of resizing it with antialiasing disabled.
    """
    height, width = img.shape[1:]
    if width <= self.max_width and height <= self.max_height:
        return img

    # A single ratio for both axes keeps the original proportions
    ratio = min(self.max_height / height, self.max_width / width)

    # Truncation can bring the short side of a very elongated image down to 0,
    # which is not a valid target size, so keep at least one pixel per axis.
    new_height = max(1, int(height * ratio))
    new_width = max(1, int(width * ratio))

    return resize(img, (new_height, new_width), antialias=False)
class ElasticDistortion:
def __init__(self, kernel_size=(7, 7), sigma=5, alpha=1):
self.kernel_size = kernel_size
self.sigma = sigma
self.alpha = alpha
class Dilation:
"""
OCR: stroke width increasing
"""
def __init__(self, kernel, iterations):
self.kernel = kernel
self.iterations = iterations
def __call__(self, x):
x_np = np.array(x)
return dilate(np.array(x), self.kernel, iterations=self.iterations)
h, w = x_np.shape[:2]
dx = np.random.uniform(-1, 1, (h, w))
dy = np.random.uniform(-1, 1, (h, w))
x_gauss = cv2.GaussianBlur(dx, self.kernel_size, self.sigma)
y_gauss = cv2.GaussianBlur(dy, self.kernel_size, self.sigma)
class Erosion:
"""
OCR: stroke width decreasing
"""
n = np.sqrt(x_gauss**2 + y_gauss**2)
def __init__(self, kernel, iterations):
self.kernel = kernel
self.iterations = iterations
nd_x = self.alpha * x_gauss / n
nd_y = self.alpha * y_gauss / n
def __call__(self, x):
return erode(np.array(x), self.kernel, iterations=self.iterations)
ind_y, ind_x = np.indices((h, w), dtype=np.float32)
map_x = nd_x + ind_x
map_x = map_x.reshape(h, w).astype(np.float32)
map_y = nd_y + ind_y
map_y = map_y.reshape(h, w).astype(np.float32)
class ErosionDilation:
"""
Random erosion or dilation
"""
dst = cv2.remap(x_np, map_x, map_y, cv2.INTER_LINEAR)
return Image.fromarray(dst.astype(np.uint8))
def __init__(self, min_kernel, max_kernel, iterations, p=1.0):
"""
Store the kernel size bounds, the iteration count and the application probability.

:param min_kernel: Lower bound given to randint when drawing each kernel dimension.
:param max_kernel: Upper bound given to randint when drawing each kernel dimension.
:param iterations: Forwarded as `iterations` to Erosion / Dilation.
:param p: Threshold compared with a random draw in __call__ to decide whether to apply the transform.
"""
self.min_kernel = min_kernel
self.max_kernel = max_kernel
self.iterations = iterations
self.p = p
# Off by default; __call__ applies the transform unconditionally when this is set
self.always_apply = False
def __call__(self, image, force_apply=False):
"""
Randomly erode or dilate `image` with a randomly sized kernel of ones.

:param image: Image handed to Erosion / Dilation.
:param force_apply: When true, the probability check is bypassed.
:return: A dict with a single "image" key holding either the untouched input
(transform skipped) or the eroded / dilated image.
"""
# Skip unless the random draw falls within p, or application is forced
if not (random.random() <= self.p or self.always_apply or force_apply):
return {"image": image}
# NOTE(review): randint is called with two positional bounds - presumably the
# stdlib random.randint (both bounds inclusive); confirm which import is in effect
kernel_h = randint(self.min_kernel, self.max_kernel)
kernel_w = randint(self.min_kernel, self.max_kernel)
kernel = np.ones((kernel_h, kernel_w), np.uint8)
# Erosion when the second draw is below 0.5, dilation otherwise
augmented_image = (
Erosion(kernel, iterations=self.iterations)(image)
if random.random() < 0.5
else Dilation(kernel=kernel, iterations=self.iterations)(image)
)
return {"image": augmented_image}
def get_list_augmenters(img, aug_configs, fill_value):
def get_preprocessing_transforms(preprocessings: list) -> Compose:
"""
Randomly select a list of data augmentation techniques to used based on aug_configs
Returns a Compose of the preprocessing transformations to be applied to the image.
"""
augmenters = list()
for aug_config in aug_configs:
if rand((1,)) > aug_config["proba"]:
continue
if aug_config["type"] == "zoom_ratio":
ratio_h = Uniform(
aug_config["min_ratio_h"], aug_config["max_ratio_h"]
).sample()
ratio_w = Uniform(
aug_config["min_ratio_w"], aug_config["max_ratio_w"]
).sample()
augmenters.append(
ZoomRatio(
ratio_h=ratio_h, ratio_w=ratio_w, keep_dim=aug_config["keep_dim"]
)
)
elif aug_config["type"] == "perspective":
scale = Uniform(aug_config["min_factor"], aug_config["max_factor"]).sample()
augmenters.append(
RandomPerspective(
distortion_scale=scale,
p=1,
interpolation=InterpolationMode.BILINEAR,
fill=fill_value,
)
)
elif aug_config["type"] == "elastic_distortion":
kernel_size = (
randint(
aug_config["min_kernel_size"], aug_config["max_kernel_size"], (1,)
).item()
) // 2 * 2 + 1
sigma = (
Uniform(aug_config["min_sigma"], aug_config["max_sigma"])
.sample()
.item()
)
alpha = (
Uniform(aug_config["min_alpha"], aug_config["max_alpha"])
.sample()
.item()
)
augmenters.append(
ElasticDistortion(
kernel_size=(kernel_size, kernel_size), sigma=sigma, alpha=alpha
transforms = []
for preprocessing in preprocessings:
match preprocessing["type"]:
case Preprocessing.MaxResize:
transforms.append(
MaxResize(
height=preprocessing["max_height"],
width=preprocessing["max_width"],
)
)
)
elif aug_config["type"] == "dilation_erosion":
kernel_h = randint(
aug_config["min_kernel"], aug_config["max_kernel"] + 1, (1,)
)
kernel_w = randint(
aug_config["min_kernel"], aug_config["max_kernel"] + 1, (1,)
)
if randint(0, 2, (1,)) == 0:
augmenters.append(
Erosion((kernel_w, kernel_h), aug_config["iterations"])
)
else:
augmenters.append(
Dilation((kernel_w, kernel_h), aug_config["iterations"])
case Preprocessing.FixedHeightResize:
transforms.append(
FixedHeightResize(height=preprocessing["fixed_height"])
)
case Preprocessing.FixedWidthResize:
transforms.append(FixedWidthResize(width=preprocessing["fixed_width"]))
transforms.append(ToPILImage())
return Compose(transforms)
elif aug_config["type"] == "color_jittering":
augmenters.append(
ColorJitter(
contrast=aug_config["factor_contrast"],
brightness=aug_config["factor_brightness"],
saturation=aug_config["factor_saturation"],
hue=aug_config["factor_hue"],
)
)
elif aug_config["type"] == "gaussian_blur":
max_kernel_h = min(aug_config["max_kernel"], img.size[1])
max_kernel_w = min(aug_config["max_kernel"], img.size[0])
kernel_h = (
randint(aug_config["min_kernel"], max_kernel_h + 1, (1,)).item()
) // 2 * 2 + 1
kernel_w = (
randint(aug_config["min_kernel"], max_kernel_w + 1, (1,)).item()
) // 2 * 2 + 1
sigma = (
Uniform(aug_config["min_sigma"], aug_config["max_sigma"])
.sample()
.item()
)
augmenters.append(
GaussianBlur(kernel_size=(kernel_w, kernel_h), sigma=sigma)
)
elif aug_config["type"] == "gaussian_noise":
augmenters.append(GaussianNoise(std=aug_config["std"]))
elif aug_config["type"] == "sharpen":
alpha = Uniform(aug_config["min_alpha"], aug_config["max_alpha"]).sample()
strength = Uniform(
aug_config["min_strength"], aug_config["max_strength"]
).sample()
augmenters.append(Sharpen(alpha=alpha, strength=strength))
else:
print("Error - unknown augmentor: {}".format(aug_config["type"]))
exit(-1)
return augmenters
def apply_data_augmentation(img, da_config):
def get_augmentation_transforms() -> SomeOf:
"""
Apply data augmentation strategy on input image
Returns a SomeOf composition of the augmentation transformations to be applied to the image.
"""
if da_config["proba"] != 1 and rand((1,)) > da_config["proba"]:
return img
return SomeOf(
[
Perspective(scale=(0.05, 0.09), fit_output=True),
GaussianBlur(sigma_limit=2.5),
GaussNoise(var_limit=50**2),
ColorJitter(contrast=0.2, brightness=0.2, saturation=0.2, hue=0.2),
OneOf(
[
ElasticTransform(
alpha=20.0, sigma=5.0, alpha_affine=1.0, border_mode=0
),
PiecewiseAffine(scale=(0.01, 0.04), nb_rows=1, nb_cols=4),
]
),
Sharpen(alpha=(0.0, 1.0)),
ErosionDilation(min_kernel=1, max_kernel=4, iterations=1),
Affine(shear={"x": (-20, 20), "y": (0, 0)}),
CoarseDropout(),
Downscale(scale_min=0.5, scale_max=0.9, interpolation=INTER_NEAREST),
ToGray(),
],
n=2,
p=0.9,
)
# Convert to PIL Image
img = img[:, :, 0] if img.shape[2] == 1 else img
img = Image.fromarray(img)
fill_value = da_config["fill_value"] if "fill_value" in da_config else 255
augmenters = get_list_augmenters(
img, da_config["augmentations"], fill_value=fill_value
)
if da_config["order"] == "random":
random.shuffle(augmenters)
for augmenter in augmenters:
img = augmenter(img)
# convert to numpy array
img = np.array(img)
img = np.expand_dims(img, axis=2) if len(img.shape) == 2 else img
return img
def aug_config(proba_use_da, p):
return {
"order": "random",
"proba": proba_use_da,
"augmentations": [
{
"type": "perspective",
"proba": p,
"min_factor": 0,
"max_factor": 0.4,
},
{
"type": "elastic_distortion",
"proba": p,
"min_alpha": 0.5,
"max_alpha": 1,
"min_sigma": 1,
"max_sigma": 10,
"min_kernel_size": 3,
"max_kernel_size": 9,
},
{
"type": "dilation_erosion",
"proba": p,
"min_kernel": 1,
"max_kernel": 3,
"iterations": 1,
},
{
"type": "color_jittering",
"proba": p,
"factor_hue": 0.2,
"factor_brightness": 0.4,
"factor_contrast": 0.4,
"factor_saturation": 0.4,
},
{
"type": "gaussian_blur",
"proba": p,
"min_kernel": 3,
"max_kernel": 5,
"min_sigma": 3,
"max_sigma": 5,
},
{
"type": "gaussian_noise",
"proba": p,
"std": 0.5,
},
{
"type": "sharpen",
"proba": p,
"min_alpha": 0,
"max_alpha": 1,
"min_strength": 0,
"max_strength": 1,
},
],
}
def get_normalization_transforms() -> Compose:
    """
    Build the normalization pipeline.

    :return: A Compose that first applies ToTensor, then Normalize with
        IMAGENET_MEAN and IMAGENET_STD.
    """
    return Compose([ToTensor(), Normalize(IMAGENET_MEAN, IMAGENET_STD)])
# -*- coding: utf-8 -*-
import cv2
import numpy as np
import torch
# Layout begin-token to end-token
SEM_MATCHING_TOKENS = {"": "", "": "", "": "", "": "", "": "", "": ""}
......@@ -18,9 +18,9 @@ def pad_sequences_1D(data, padding_value):
"""
x_lengths = [len(x) for x in data]
longest_x = max(x_lengths)
padded_data = np.ones((len(data), longest_x)).astype(np.int32) * padding_value
padded_data = torch.ones((len(data), longest_x), dtype=torch.int32) * padding_value
for i, x_len in enumerate(x_lengths):
padded_data[i, :x_len] = data[i][:x_len]
padded_data[i, :x_len] = torch.tensor(data[i][:x_len])
return padded_data
......@@ -30,19 +30,19 @@ def pad_images(data):
:param data: List of image tensors, channels first (C, H, W).
:return padded_data: A tensor containing all the padded images.
"""
longest_x = max([x.shape[0] for x in data])
longest_y = max([x.shape[1] for x in data])
padded_data = np.zeros((len(data), longest_x, longest_y, data[0].shape[2]))
longest_x = max([x.shape[1] for x in data])
longest_y = max([x.shape[2] for x in data])
padded_data = torch.zeros((len(data), data[0].shape[0], longest_x, longest_y))
for index, image in enumerate(data):
delta_x = longest_x - image.shape[0]
delta_y = longest_y - image.shape[1]
delta_x = longest_x - image.shape[1]
delta_y = longest_y - image.shape[2]
top, bottom = delta_x // 2, delta_x - (delta_x // 2)
left, right = delta_y // 2, delta_y - (delta_y // 2)
padded_data[
index,
top : padded_data.shape[1] - bottom,
left : padded_data.shape[2] - right,
:,
top : padded_data.shape[2] - bottom,
left : padded_data.shape[3] - right,
] = image
return padded_data
......
docs/assets/augmentations/document_color_jitter.png

392 KiB

docs/assets/augmentations/document_downscale.png

365 KiB

docs/assets/augmentations/document_dropout.png

366 KiB

docs/assets/augmentations/document_elastic.png

389 KiB

docs/assets/augmentations/document_erosion_dilation.png

366 KiB

docs/assets/augmentations/document_full_pipeline.png

330 KiB

docs/assets/augmentations/document_full_pipeline_2.png

482 KiB

docs/assets/augmentations/document_gaussian_blur.png

371 KiB

docs/assets/augmentations/document_gaussian_noise.png

612 KiB

docs/assets/augmentations/document_grayscale.png

118 KiB

docs/assets/augmentations/document_perspective.png

379 KiB

docs/assets/augmentations/document_piecewise.png

420 KiB

docs/assets/augmentations/document_sharpen.png

440 KiB