Verified Commit 8dace579 authored by Mélodie Boillet
Fix rebase

parent f627d8e6
Merge request !224: Fix version 0.2.0-dev3 and later
@@ -22,18 +22,20 @@ class OCRDataset(Dataset):
charset,
tokens,
preprocessing_transforms,
normalization_transforms,
augmentation_transforms,
load_in_memory=False,
mean=None,
std=None,
):
self.set_name = set_name
self.charset = charset
self.tokens = tokens
self.load_in_memory = load_in_memory
self.mean = mean
self.std = std
# Pre-processing, augmentation, normalization
# Pre-processing, augmentation
self.preprocessing_transforms = preprocessing_transforms
self.normalization_transforms = normalization_transforms
self.augmentation_transforms = augmentation_transforms
# Factor to reduce the height and width of the feature vector before feeding the decoder.
@@ -55,20 +57,20 @@ class OCRDataset(Dataset):
"""
Return an item from the dataset (image and label)
"""
# Load preprocessed image
sample = dict(**self.samples[idx])
sample = copy.deepcopy(self.samples[idx])
if not self.load_in_memory:
sample["img"] = self.get_sample_img(idx)
# Convert to numpy
sample["img"] = np.array(sample["img"])
# Apply data augmentation
if self.augmentation_transforms:
sample["img"] = self.augmentation_transforms(image=np.array(sample["img"]))[
"image"
]
sample["img"] = self.augmentation_transforms(image=sample["img"])["image"]
# Image normalization
sample["img"] = self.normalization_transforms(sample["img"])
sample["img"] = (sample["img"] - self.mean) / self.std
# Get final height and width
sample["img_reduced_shape"], sample["img_position"] = self.compute_final_size(
@@ -113,28 +115,51 @@ class OCRDataset(Dataset):
def get_sample_img(self, i):
"""
Compute the final image size and position after feature extraction
Get image by index
"""
if self.load_in_memory:
return self.samples[i]["img"]
return self.preprocessing_transforms(read_image(self.samples[i]["path"]))
def compute_std_mean(self):
"""
Compute the mean and standard deviation of the whole dataset
"""
if self.mean is not None and self.std is not None:
return self.mean, self.std
sum = np.zeros((3,))
diff = np.zeros((3,))
nb_pixels = 0
for metric in ["mean", "std"]:
for ind in range(len(self.samples)):
img = np.array(self.get_sample_img(ind))
if metric == "mean":
sum += np.sum(img, axis=(0, 1))
nb_pixels += np.prod(img.shape[:2])
elif metric == "std":
diff += [
np.sum((img[:, :, k] - self.mean[k]) ** 2) for k in range(3)
]
if metric == "mean":
self.mean = sum / nb_pixels
elif metric == "std":
self.std = np.sqrt(diff / nb_pixels)
return self.mean, self.std
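The method above makes two passes over the dataset: the first accumulates channel sums for the mean, the second accumulates squared deviations for the standard deviation. A self-contained check of that logic against NumPy's own estimators (toy images, random values):

```python
import numpy as np

images = [np.random.rand(h, w, 3) for h, w in [(32, 64), (48, 40)]]
nb_pixels = sum(np.prod(img.shape[:2]) for img in images)

total = np.zeros((3,))
for img in images:
    total += img.sum(axis=(0, 1))          # first pass: channel sums
mean = total / nb_pixels

diff = np.zeros((3,))
for img in images:
    diff += [((img[:, :, k] - mean[k]) ** 2).sum() for k in range(3)]
std = np.sqrt(diff / nb_pixels)            # second pass: squared deviations

# Same result as flattening every pixel per channel
pixels = np.concatenate([img.reshape(-1, 3) for img in images])
assert np.allclose(mean, pixels.mean(axis=0))
assert np.allclose(std, pixels.std(axis=0))
```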
def compute_final_size(self, img):
"""
Compute the final image size and position after feature extraction
"""
final_c, final_h, final_w = img.shape
image_reduced_shape = np.ceil(
[final_h, final_w, final_c] / self.reduce_dims_factor
).astype(int)
image_reduced_shape = np.ceil(img.shape / self.reduce_dims_factor).astype(int)
if self.set_name == "train":
image_reduced_shape = [max(1, t) for t in image_reduced_shape]
image_position = [
[0, final_h],
[0, final_w],
[0, img.shape[0]],
[0, img.shape[1]],
]
return image_reduced_shape, image_position
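Note that the new one-liner works because `img.shape` (a plain tuple) divided by a NumPy array broadcasts element-wise. A quick check of the arithmetic (the reduction factors here are illustrative, not taken from the diff):

```python
import numpy as np

img_shape = (3, 128, 1024)                 # (C, H, W) after preprocessing
reduce_dims_factor = np.array([1, 32, 8])  # hypothetical per-dimension factors

reduced = np.ceil(img_shape / reduce_dims_factor).astype(int)
print(reduced)                             # [  3   4 128]

# During training, every dimension is clamped to at least 1
reduced = [max(1, t) for t in reduced]
```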
......
@@ -9,11 +9,7 @@ from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from dan.manager.dataset import OCRDataset
from dan.transforms import (
get_augmentation_transforms,
get_normalization_transforms,
get_preprocessing_transforms,
)
from dan.transforms import get_augmentation_transforms, get_preprocessing_transforms
from dan.utils import pad_images, pad_sequences_1D
@@ -36,6 +32,17 @@ class OCRDatasetManager:
self.valid_samplers = dict()
self.test_samplers = dict()
self.mean = (
np.array(params["config"]["mean"])
if "mean" in params["config"].keys()
else None
)
self.std = (
np.array(params["config"]["std"])
if "std" in params["config"].keys()
else None
)
self.generator = torch.Generator()
self.generator.manual_seed(0)
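The manager now reads optional dataset statistics straight from the configuration; the same pattern on a toy config dict (values are made up):

```python
import numpy as np

config = {"mean": [166.8, 166.8, 166.8]}  # "std" left out on purpose

mean = np.array(config["mean"]) if "mean" in config else None
std = np.array(config["std"]) if "std" in config else None

# Whatever is still None is filled in later by train_dataset.compute_std_mean()
assert mean is not None and std is None
```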
@@ -49,7 +56,6 @@ class OCRDatasetManager:
self.params["config"]["padding_token"] = self.tokens["pad"]
self.my_collate_function = OCRCollateFunction(self.params["config"])
self.normalization = get_normalization_transforms(from_pil_image=True)
self.augmentation = (
get_augmentation_transforms()
if self.params["config"]["augmentation"]
@@ -69,11 +75,14 @@ class OCRDatasetManager:
charset=self.charset,
tokens=self.tokens,
preprocessing_transforms=self.preprocessing,
normalization_transforms=self.normalization,
augmentation_transforms=self.augmentation,
load_in_memory=self.load_in_memory,
mean=self.mean,
std=self.std,
)
self.mean, self.std = self.train_dataset.compute_std_mean()
for custom_name in self.params["val"].keys():
self.valid_datasets[custom_name] = OCRDataset(
set_name="val",
@@ -81,9 +90,10 @@ class OCRDatasetManager:
charset=self.charset,
tokens=self.tokens,
preprocessing_transforms=self.preprocessing,
normalization_transforms=self.normalization,
augmentation_transforms=None,
load_in_memory=self.load_in_memory,
mean=self.mean,
std=self.std,
)
def load_ddp_samplers(self):
@@ -167,9 +177,10 @@ class OCRDatasetManager:
charset=self.charset,
tokens=self.tokens,
preprocessing_transforms=self.preprocessing,
normalization_transforms=self.normalization,
augmentation_transforms=None,
load_in_memory=self.load_in_memory,
mean=self.mean,
std=self.std,
)
if self.params["use_ddp"]:
@@ -181,6 +192,7 @@
)
else:
self.test_samplers[custom_name] = None
self.test_loaders[custom_name] = DataLoader(
self.test_datasets[custom_name],
batch_size=1,
......
# -*- coding: utf-8 -*-
import os
import random
from copy import deepcopy
from enum import Enum
from time import time
@@ -452,22 +453,68 @@ class GenericTrainingManager:
def save_params(self):
"""
Output yaml file containing a summary of all hyperparameters chosen for the training
Output a yaml file containing a summary of all hyperparameters chosen for the training
and a yaml file containing parameters used for inference
"""
path = os.path.join(self.paths["results"], "parameters.yml")
def compute_nb_params(module):
return sum([np.prod(p.size()) for p in list(module.parameters())])
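`compute_nb_params` sums the element count of every parameter tensor. A quick sanity check on a toy module (the `nn.Linear` here is purely illustrative):

```python
import numpy as np
import torch.nn as nn

module = nn.Linear(10, 4)  # 10*4 weights + 4 biases = 44 parameters
nb_params = sum(np.prod(p.size()) for p in module.parameters())
assert nb_params == 44
print("{:,}".format(nb_params))  # thousands separators, as used below
```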
def class_to_str_dict(my_dict):
for key in my_dict.keys():
if key == "preprocessings":
my_dict[key] = [
{
key: value.value if isinstance(value, Enum) else value
for key, value in preprocessing.items()
}
for preprocessing in my_dict[key]
]
elif callable(my_dict[key]):
my_dict[key] = my_dict[key].__name__
elif isinstance(my_dict[key], np.ndarray):
my_dict[key] = my_dict[key].tolist()
elif isinstance(my_dict[key], list) and isinstance(
my_dict[key][0], tuple
):
my_dict[key] = [list(elt) for elt in my_dict[key]]
elif isinstance(my_dict[key], dict):
my_dict[key] = class_to_str_dict(my_dict[key])
return my_dict
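The recursive walk exists because `yaml.dump` cannot serialize NumPy arrays, Enum members, or callables in a readable way. A sketch of the conversions it performs, on toy values:

```python
import numpy as np
import yaml
from enum import Enum

class Preprocessing(str, Enum):
    MaxResize = "max_resize"

raw = {
    "mean": np.array([166.8]),        # ndarray  -> plain list
    "type": Preprocessing.MaxResize,  # Enum     -> its value
    "fn": np.tanh,                    # callable -> its __name__
}
clean = {
    "mean": raw["mean"].tolist(),
    "type": raw["type"].value,
    "fn": raw["fn"].__name__,
}
print(yaml.dump(clean))  # plain YAML, no Python object tags
```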
# Save training parameters
path = os.path.join(self.paths["results"], "training_parameters.yml")
if os.path.isfile(path):
return
params = class_to_str_dict(my_dict=deepcopy(self.params))
total_params = 0
for model_name in self.models.keys():
current_params = compute_nb_params(self.models[model_name])
params["model_params"]["models"][model_name] = [
params["model_params"]["models"][model_name],
"{:,}".format(current_params),
]
total_params += current_params
params["model_params"]["total_params"] = "{:,}".format(total_params)
params["mean"] = self.dataset.mean.tolist()
params["std"] = self.dataset.std.tolist()
with open(path, "w") as f:
yaml.dump(params, f)
params = {
# Save inference parameters
path = os.path.join(self.paths["results"], "inference_parameters.yml")
if os.path.isfile(path):
return
inference_params = {
"parameters": {
"max_char_prediction": self.params["training_params"][
"max_char_prediction"
],
"mean": params["mean"],
"std": params["std"],
"max_char_prediction": params["training_params"]["max_char_prediction"],
"encoder": {
"dropout": self.params["model_params"]["dropout"],
"dropout": params["model_params"]["dropout"],
},
"decoder": {
key: self.params["model_params"][key]
key: params["model_params"][key]
for key in [
"enc_dim",
"l_max",
@@ -483,20 +530,11 @@
"attention_win",
]
},
"preprocessings": [
{
key: value.value if isinstance(value, Enum) else value
for key, value in preprocessing.items()
}
for preprocessing in self.params["dataset_params"]["config"].get(
"preprocessings", []
)
],
"preprocessings": params["dataset_params"]["config"]["preprocessings"],
},
}
with open(path, "w") as f:
yaml.dump(params, f)
yaml.dump(inference_params, f)
def backward_loss(self, loss, retain_graph=False):
self.scaler.scale(loss).backward(retain_graph=retain_graph)
......
@@ -303,7 +303,6 @@ def plot_attention(
:param line_separators: List of line separators
:param display_polygons: Whether to plot extracted polygons
"""
image = to_pil_image(image)
attention_map = []
......
@@ -19,7 +19,7 @@ from dan.predict.attention import (
plot_attention,
split_text_and_confidences,
)
from dan.transforms import get_normalization_transforms, get_preprocessing_transforms
from dan.transforms import get_preprocessing_transforms
from dan.utils import ind_to_token, list_to_batches, pad_images, read_image
@@ -74,7 +74,10 @@ class DAN:
self.encoder = encoder
self.decoder = decoder
self.normalization = get_normalization_transforms()
self.mean, self.std = (
torch.tensor(parameters["mean"]) / 255,
torch.tensor(parameters["std"]) / 255,
)
self.preprocessing_transforms = get_preprocessing_transforms(
parameters.get("preprocessings", [])
)
@@ -87,7 +90,12 @@
"""
image = read_image(path)
preprocessed_image = self.preprocessing_transforms(image)
return preprocessed_image, self.normalization(preprocessed_image)
normalized_image = torch.zeros(preprocessed_image.shape)
for ch in range(preprocessed_image.shape[0]):
normalized_image[ch, :, :] = (
preprocessed_image[ch, :, :] - self.mean[ch]
) / self.std[ch]
return preprocessed_image, normalized_image
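The channel loop is equivalent to broadcasting the statistics over the spatial dimensions; a self-contained check on a dummy tensor (shapes and values are made up):

```python
import torch

image = torch.rand(3, 64, 128)                    # (C, H, W)
mean = torch.tensor([166.8, 166.8, 166.8]) / 255
std = torch.tensor([34.1, 34.1, 34.1]) / 255

# Reshaping to (C, 1, 1) broadcasts over H and W, one value per channel
vectorized = (image - mean[:, None, None]) / std[:, None, None]

looped = torch.zeros(image.shape)
for ch in range(image.shape[0]):
    looped[ch] = (image[ch] - mean[ch]) / std[ch]

assert torch.allclose(vectorized, looped)
```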
def predict(
self,
@@ -276,7 +284,6 @@ def process_batch(
# Convert to tensor of size (batch_size, channel, height, width) with batch_size=1
input_tensor = pad_images(input_images).to(device)
visu_tensor = pad_images(visu_images).to(device)
logger.info("Images preprocessed!")
# Parse delimiters to regex
@@ -297,8 +304,8 @@
threshold_method=threshold_method,
threshold_value=threshold_value,
)
logger.info("Prediction parsing...")
logger.info("Prediction parsing...")
for idx, image_path in enumerate(image_batch):
predicted_text = prediction["text"][idx]
result = {"text": predicted_text}
@@ -319,7 +326,6 @@
]
# calculates scores by token
result["confidences"]["by ner token"] = [
{
"text": f"{predicted_text[current: next_token]}".replace("\n", " "),
......
@@ -5,8 +5,8 @@ Each transform class defined here takes as input a PIL Image and returns the mod
from enum import Enum
from random import randint
import albumentations as A
import numpy as np
from albumentations import SomeOf
from albumentations.augmentations import (
Affine,
CoarseDropout,
@@ -18,15 +18,15 @@ from albumentations.augmentations import (
Sharpen,
ToGray,
)
from cv2 import INTER_NEAREST, dilate, erode
from albumentations.core.transforms_interface import ImageOnlyTransform
from cv2 import dilate, erode
from numpy import random
from PIL import Image
from torch import Tensor
from torchvision.transforms import Compose, Normalize, ToPILImage, ToTensor
from torch.distributions.uniform import Uniform
from torchvision.transforms import Compose, ToPILImage
from torchvision.transforms.functional import resize
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]
class Preprocessing(str, Enum):
# If the image is bigger than the given size, resize it while keeping the original ratio
@@ -200,43 +200,38 @@ def get_preprocessing_transforms(
)
case Preprocessing.FixedWidthResize:
transforms.append(FixedWidthResize(width=preprocessing["fixed_width"]))
if to_pil_image:
transforms.append(ToPILImage())
return Compose(transforms)
def get_augmentation_transforms() -> SomeOf:
def get_augmentation_transforms() -> A.Compose:
"""
Returns a list of transformations to be applied to the image.
Returns the transformation pipeline to be applied to the image.
"""
return SomeOf(
return A.Compose(
[
Perspective(scale=(0.05, 0.09), fit_output=True),
GaussianBlur(sigma_limit=2.5),
GaussNoise(var_limit=50**2),
ColorJitter(contrast=0.2, brightness=0.2, saturation=0.2, hue=0.2),
ElasticTransform(alpha=20.0, sigma=5.0, alpha_affine=1.0, border_mode=0),
Sharpen(alpha=(0.0, 1.0)),
ErosionDilation(min_kernel=1, max_kernel=4, iterations=1),
Affine(shear={"x": (-20, 20), "y": (0, 0)}),
CoarseDropout(),
Downscale(scale_min=0.5, scale_max=0.9, interpolation=INTER_NEAREST),
ToGray(),
DPIAdjusting(min_factor=0.75, max_factor=1),
A.SomeOf(
[
ErosionDilation(min_kernel=1, max_kernel=4, iterations=1),
Perspective(scale=(0.05, 0.09), fit_output=True, p=0.4),
GaussianBlur(sigma_limit=2.5, p=1),
GaussNoise(var_limit=50**2, p=1),
ColorJitter(
contrast=0.2, brightness=0.2, saturation=0.2, hue=0.2, p=1
),
ElasticTransform(
alpha=20.0, sigma=5.0, alpha_affine=1.0, border_mode=0, p=1
),
Sharpen(alpha=(0.0, 1.0), p=1),
Affine(shear={"x": (-20, 20), "y": (0, 0)}, p=1),
CoarseDropout(p=1),
ToGray(p=0.5),
],
n=2,
p=0.9,
),
],
p=0.9,
)
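A usage sketch for the rebuilt pipeline (assuming `albumentations` is installed; the input image is just random noise). Albumentations transforms are called with keyword arguments and return a dict:

```python
import numpy as np

pipeline = get_augmentation_transforms()
img = np.random.randint(0, 256, size=(128, 512, 3), dtype=np.uint8)

augmented = pipeline(image=img)["image"]
# With p=0.9 on the outer Compose, roughly one image in ten passes through
# unchanged; otherwise the inner SomeOf draws n=2 transforms from its list.
```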
def get_normalization_transforms(from_pil_image: bool = False) -> Compose:
"""
Returns a list of normalization transformations.
"""
transforms = []
if from_pil_image:
transforms.append(ToTensor())
transforms.append(Normalize(IMAGENET_MEAN, IMAGENET_STD))
return Compose(transforms)
@@ -32,19 +32,15 @@ def pad_images(images):
:param images: List of images as torch tensors.
:return padded_images: A tensor containing all the padded images.
"""
longest_x = max([x.shape[1] for x in data])
longest_y = max([x.shape[2] for x in data])
padded_data = torch.zeros((len(data), data[0].shape[0], longest_x, longest_y))
for index, image in enumerate(data):
delta_x = longest_x - image.shape[1]
delta_y = longest_y - image.shape[2]
top, bottom = delta_x // 2, delta_x - (delta_x // 2)
left, right = delta_y // 2, delta_y - (delta_y // 2)
padded_data[
longest_x = max([x.shape[1] for x in images])
longest_y = max([x.shape[2] for x in images])
padded_images = torch.zeros((len(images), images[0].shape[0], longest_x, longest_y))
for index, image in enumerate(images):
padded_images[
index,
:,
top : padded_data.shape[2] - bottom,
left : padded_data.shape[3] - right,
0 : image.shape[1],
0 : image.shape[2],
] = image
return padded_images
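The rewrite anchors every image at the top-left corner instead of centering it in the padded canvas. A quick check on two dummy tensors:

```python
import torch

a = torch.rand(3, 100, 200)
b = torch.rand(3, 120, 150)
padded = pad_images([a, b])

assert padded.shape == (2, 3, 120, 200)          # max height, max width
assert torch.equal(padded[0, :, :100, :200], a)  # content sits top-left
assert torch.equal(padded[1, :, :120, :150], b)
assert padded[0, :, 100:, :].abs().sum() == 0    # the remainder is zero padding
```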
......
@@ -42,4 +42,4 @@ The training command does not take any input parameters for now. To train a DAN
## 3. Predict
Once the training is complete, you can apply a trained DAN model on an image using the [predict command](../usage/predict.md).
Once the training is complete, you can apply a trained DAN model on an image using the [predict command](../usage/predict.md) and the `inference_parameters.yml` file, located in `{training_params.output_folder}/results`.
@@ -6,26 +6,26 @@ This page lists data augmentation transforms used in DAN.
### Elastic Transform
| | Elastic Transform |
| ---------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation applies local distortions that rotate characters locally. |
| Comments | The impact of this transformation is mostly visible on documents, not so much on lines. Results are comparable to the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/geometric/transforms/#albumentations.augmentations.geometric.transforms.ElasticTransform). |
| Examples | ![](../../assets/augmentations/line_elastic.png) ![](../../assets/augmentations/document_elastic.png) |
| CPU time (seconds/10 images) | 0.44 (3013x128 pixels) / 0.86 (1116x581 pixels) |
| | Elastic Transform |
| ---------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation applies local distortions that rotate characters locally. |
| Comments | The impact of this transformation is mostly visible on documents, not so much on lines. Results are comparable to the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/geometric/transforms/#albumentations.augmentations.geometric.transforms.ElasticTransform) |
| Examples | ![](../../assets/augmentations/line_elastic.png) ![](../../assets/augmentations/document_elastic.png) |
| CPU time (seconds/10 images) | 0.44 (3013x128 pixels) / 0.86 (1116x581 pixels) |
### PieceWise Affine
!!! warning
This transform is temporarily removed from the pipeline until [this issue](https://github.com/albumentations-team/albumentations/issues/1442) is fixed.
| | PieceWise Affine |
| ---------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation also applies local distortions but with a larger grid than ElasticTransform. |
| Comments | This transformation is very slow. It is a new transform that was not in the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/geometric/transforms/#albumentations.augmentations.geometric.transforms.PiecewiseAffine). |
| Examples | ![](../../assets/augmentations/line_piecewise.png) ![](../../assets/augmentations/document_piecewise.png) |
| CPU time (seconds/10 images) | 2.92 (3013x128 pixels) / 3.76 (1116x581 pixels) |
| | PieceWise Affine |
| ---------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation also applies local distortions but with a larger grid than ElasticTransform. |
| Comments | This transformation is very slow. It is a new transform that was not in the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/geometric/transforms/#albumentations.augmentations.geometric.transforms.PiecewiseAffine) |
| Examples | ![](../../assets/augmentations/line_piecewise.png) ![](../../assets/augmentations/document_piecewise.png) |
| CPU time (seconds/10 images) | 2.92 (3013x128 pixels) / 3.76 (1116x581 pixels) |
### Dilation Erosion
@@ -33,99 +33,96 @@ This page lists data augmentation transforms used in DAN.
| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| Description | This transformation makes the pen stroke thicker or thinner. |
| Comments | The `RandomDilationErosion` class randomly selects a kernel size and applies a dilation or an erosion to the image. It relies on opencv and is similar to the original DAN implementation. |
| Documentation | See the [`opencv` documentation](https://docs.opencv.org/3.4/db/df6/tutorial_erosion_dilatation.html). |
| Documentation | See the [`opencv` documentation](https://docs.opencv.org/3.4/db/df6/tutorial_erosion_dilatation.html) |
| Examples | ![](../../assets/augmentations/line_erosion_dilation.png) ![](../../assets/augmentations/document_erosion_dilation.png) |
| CPU time (seconds/10 images) | 0.02 (3013x128 pixels) / 0.03 (1116x581 pixels) |
### Sharpen
| | Sharpen |
| ---------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation makes the image sharper. |
| Comments | Similar to the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.Sharpen). |
| Examples | ![](../../assets/augmentations/line_sharpen.png) ![](../../assets/augmentations/document_sharpen.png) |
| CPU time (seconds/10 images) | 0.02 (3013x128 pixels) / 0.04 (1116x581 pixels) |
| | Sharpen |
| ---------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation makes the image sharper. |
| Comments | Similar to the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.Sharpen) |
| Examples | ![](../../assets/augmentations/line_sharpen.png) ![](../../assets/augmentations/document_sharpen.png) |
| CPU time (seconds/10 images) | 0.02 (3013x128 pixels) / 0.04 (1116x581 pixels) |
### Color Jittering
| | Color Jittering |
| ---------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation alters the colors of the image. |
| Comments | Similar to the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.ColorJitter). |
| Examples | ![](../../assets/augmentations/line_color_jitter.png) ![](../../assets/augmentations/document_color_jitter.png) |
| CPU time (seconds/10 images) | 0.03 (3013x128 pixels) / 0.04 (1116x581 pixels) |
| | Color Jittering |
| ---------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation alters the colors of the image. |
| Comments | Similar to the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.ColorJitter) |
| Examples | ![](../../assets/augmentations/line_color_jitter.png) ![](../../assets/augmentations/document_color_jitter.png) |
| CPU time (seconds/10 images) | 0.03 (3013x128 pixels) / 0.04 (1116x581 pixels) |
### Gaussian Noise
| | Gaussian Noise |
| ---------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation adds Gaussian noise to the image. |
| Comments | The noise from the original DAN implementation is more uniform. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.GaussianNoise). |
| Examples | ![](../../assets/augmentations/line_gaussian_noise.png) ![](../../assets/augmentations/document_gaussian_noise.png) |
| CPU time (seconds/10 images) | 0.29 (3013x128 pixels) / 0.53 (1116x581 pixels) |
| | Gaussian Noise |
| ---------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation adds Gaussian noise to the image. |
| Comments | The noise from the original DAN implementation is more uniform. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.GaussianNoise) |
| Examples | ![](../../assets/augmentations/line_gaussian_noise.png) ![](../../assets/augmentations/document_gaussian_noise.png) |
| CPU time (seconds/10 images) | 0.29 (3013x128 pixels) / 0.53 (1116x581 pixels) |
### Gaussian Blur
| | Gaussian Blur |
| ---------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation blurs the image. |
| Comments | Similar to the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.GaussianBlur). |
| Examples | ![](../../assets/augmentations/line_gaussian_blur.png) ![](../../assets/augmentations/document_gaussian_blur.png) |
| CPU time (seconds/10 images) | 0.01 (3013x128 pixels) / 0.02 (1116x581 pixels) |
| | Gaussian Blur |
| ---------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation blurs the image. |
| Comments | Similar to the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.GaussianBlur) |
| Examples | ![](../../assets/augmentations/line_gaussian_blur.png) ![](../../assets/augmentations/document_gaussian_blur.png) |
| CPU time (seconds/10 images) | 0.01 (3013x128 pixels) / 0.02 (1116x581 pixels) |
### Random Perspective
| | Random Perspective |
| ---------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation changes the perspective from which the photo is taken. |
| Comments | Similar to the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.Perspective). |
| Examples | ![](../../assets/augmentations/line_perspective.png) ![](../../assets/augmentations/document_perspective.png) |
| CPU time (seconds/10 images) | 0.05 (3013x128 pixels) / 0.05 (1116x581 pixels) |
| | Random Perspective |
| ---------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation changes the perspective from which the photo is taken. |
| Comments | Similar to the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.Perspective) |
| Examples | ![](../../assets/augmentations/line_perspective.png) ![](../../assets/augmentations/document_perspective.png) |
| CPU time (seconds/10 images) | 0.05 (3013x128 pixels) / 0.05 (1116x581 pixels) |
### Shearing (x-axis)
| | Shearing (x-axis) |
| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| Description | This transformation changes the slant of the text on the image. |
| Comments | New transform that was not in the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/geometric/transforms/#albumentations.augmentations.geometric.transforms.Affine). |
| Examples | ![](../../assets/augmentations/line_shearx.png) ![](../../assets/augmentations/document_shearx.png) |
| CPU time (seconds/10 images) | 0.05 (3013x128 pixels) / 0.04 (1116x581 pixels) |
| | Shearing (x-axis) |
| ---------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation changes the slant of the text on the image. |
| Comments | New transform that was not in the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/geometric/transforms/#albumentations.augmentations.geometric.transforms.Affine) |
| Examples | ![](../../assets/augmentations/line_shearx.png) ![](../../assets/augmentations/document_shearx.png) |
| CPU time (seconds/10 images) | 0.05 (3013x128 pixels) / 0.04 (1116x581 pixels) |
### Coarse Dropout
| | Coarse Dropout |
| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation adds dropout on the image, turning small patches into black pixels. |
| Comments | It is a new transform that was not in the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/dropout/coarse_dropout/#coarsedropout-augmentation-augmentationsdropoutcoarse_dropout). |
| Examples | ![](../../assets/augmentations/line_dropout.png) ![](../../assets/augmentations/document_dropout.png) |
| CPU time (seconds/10 images) | 0.02 (3013x128 pixels) / 0.02 (1116x581 pixels) |
| | Coarse Dropout |
| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| Description | This transformation adds dropout on the image, turning small patches into black pixels. |
| Comments | It is a new transform that was not in the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/dropout/coarse_dropout/#coarsedropout-augmentation-augmentationsdropoutcoarse_dropout) |
| Examples | ![](../../assets/augmentations/line_dropout.png) ![](../../assets/augmentations/document_dropout.png) |
| CPU time (seconds/10 images) | 0.02 (3013x128 pixels) / 0.02 (1116x581 pixels) |
### DPIAdjusting
| | Downscale |
| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation downscales the image by a random factor. |
| Comments | It is a new transform that was not in the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.Downscale). |
| Examples | ![](../../assets/augmentations/line_downscale.png) ![](../../assets/augmentations/document_downscale.png) |
| CPU time (seconds/10 images) | 0.03 (3013x128 pixels) / 0.03 (1116x581 pixels) |
| | DPIAdjusting |
| ----------- | -------------------------------------------------------------- |
| Description | This transformation downscales the image by a random factor.  |
| Comments | Similar to the original DAN implementation. |
### ToGray
| | Grayscale |
| ---------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation transforms an RGB image into grayscale. |
| Comments | It is a new transform that was not in the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.ToGray). |
| Examples | ![](../../assets/augmentations/line_grayscale.png) ![](../../assets/augmentations/document_grayscale.png) |
| CPU time (seconds/10 images) | 0.02 (3013x128 pixels) / 0.02 (1116x581 pixels) |
| | ToGray |
| ---------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Description | This transformation transforms an RGB image into grayscale. |
| Comments | It is a new transform that was not in the original DAN implementation. |
| Documentation | See the [`albumentations` documentation](https://albumentations.ai/docs/api_reference/augmentations/transforms/#albumentations.augmentations.transforms.ToGray) |
| Examples | ![](../../assets/augmentations/line_grayscale.png) ![](../../assets/augmentations/document_grayscale.png) |
| CPU time (seconds/10 images) | 0.02 (3013x128 pixels) / 0.02 (1116x581 pixels) |
## Full augmentation pipeline
......
@@ -16,6 +16,7 @@ All hyperparameters are specified and editable in the training scripts `dan/ocr/
| `dataset_params.config.augmentation` | Whether to use data augmentation on the training set. | `bool` | `True` (see [dedicated section](#data-augmentation)) |
!!! warning
The variables `dataset_name`, `dataset_level`, `dataset_variant` and `dataset_path` must have values such that the data is located in `{dataset_path}/{dataset_name}_{dataset_level}{dataset_variant}`.
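As a concrete illustration of how the pieces compose (all values below are made up):

```py
dataset_path = "/data"
dataset_name = "my_dataset"
dataset_level = "page"
dataset_variant = "_v1"

# The expected location of the data
path = f"{dataset_path}/{dataset_name}_{dataset_level}{dataset_variant}"
print(path)  # /data/my_dataset_page_v1
```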
### Data preprocessing
@@ -90,21 +91,30 @@ DAN takes advantage of transforms from [albumentations](https://albumentations.a
The following configuration is used by default when using the `teklia-dan train document` command. Data augmentation is applied with a probability of 0.9. In this case, two transformations are randomly selected to be applied.
```py
transforms = SomeOf(
transforms = A.Compose(
[
Perspective(scale=(0.05, 0.09), fit_output=True),
GaussianBlur(sigma_limit=2.5),
GaussNoise(var_limit=50**2),
ColorJitter(contrast=0.2, brightness=0.2, saturation=0.2, hue=0.2),
ElasticTransform(alpha=20.0, sigma=5.0, alpha_affine=1.0, border_mode=0),
Sharpen(alpha=(0.0, 1.0)),
ErosionDilation(min_kernel=1, max_kernel=4, iterations=1),
Affine(shear={"x": (-20, 20), "y": (0, 0)}),
CoarseDropout(),
Downscale(scale_min=0.5, scale_max=0.9, interpolation=INTER_NEAREST),
ToGray(),
DPIAdjusting(min_factor=0.75, max_factor=1),
A.SomeOf(
[
ErosionDilation(min_kernel=1, max_kernel=4, iterations=1),
Perspective(scale=(0.05, 0.09), fit_output=True, p=0.4),
GaussianBlur(sigma_limit=2.5, p=1),
GaussNoise(var_limit=50**2, p=1),
ColorJitter(
contrast=0.2, brightness=0.2, saturation=0.2, hue=0.2, p=1
),
ElasticTransform(
alpha=20.0, sigma=5.0, alpha_affine=1.0, border_mode=0, p=1
),
Sharpen(alpha=(0.0, 1.0), p=1),
Affine(shear={"x": (-20, 20), "y": (0, 0)}, p=1),
CoarseDropout(p=1),
ToGray(p=0.5),
],
n=2,
p=0.9,
),
],
n=2,
p=0.9,
)
```
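As a usage sketch (assuming `image` is an HWC `numpy` array; the pipeline returns a dict keyed by target):

```py
augmented = transforms(image=image)["image"]
```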
......
---
parameters:
mean: [166.8418783515498, 166.8418783515498, 166.8418783515498]
std: [34.084189571536385, 34.084189571536385, 34.084189571536385]
max_char_prediction: 200
encoder:
dropout: 0.5
......
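A sketch of consuming this file the way the predictor does: the stored 0-255 statistics are rescaled to 0-1 before normalization (the path below is illustrative):

```python
import torch
import yaml

with open("results/inference_parameters.yml") as f:
    parameters = yaml.safe_load(f)["parameters"]

mean = torch.tensor(parameters["mean"]) / 255
std = torch.tensor(parameters["std"]) / 255
```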
Two source diffs could not be displayed: they are stored in LFS.
@@ -98,7 +98,7 @@ def test_predict(
"by ner token": [],
"total": 0.93,
"word": [
{"text": "ⓈBellisson", "confidence": 0.92},
{"text": "ⓈBellisson", "confidence": 0.93},
{"text": "ⒻGeorges", "confidence": 0.94},
{"text": "Ⓑ91", "confidence": 0.92},
{"text": "ⓁP", "confidence": 0.94},
@@ -169,7 +169,7 @@ def test_predict(
{"text": "p", "confidence": 1.0},
{"text": "l", "confidence": 1.0},
{"text": "i", "confidence": 1.0},
{"text": "é", "confidence": 0.86},
{"text": "é", "confidence": 0.85},
{"text": " ", "confidence": 1.0},
{"text": "", "confidence": 1.0},
{"text": "M", "confidence": 1.0},
......
@@ -16,7 +16,7 @@ from tests.conftest import FIXTURES
"last_3.pt",
{
"nb_chars": 43,
"cer": 1.2558,
"cer": 1.3023,
"nb_words": 9,
"wer": 1.0,
"nb_words_no_punct": 9,
@@ -66,6 +66,12 @@ from tests.conftest import FIXTURES
"type": "max_resize",
}
],
"mean": [
242.10595854671013,
242.10595854671013,
242.10595854671013,
],
"std": [28.29919517652322, 28.29919517652322, 28.29919517652322],
},
},
),
@@ -175,12 +181,12 @@ def test_train_and_test(
}
assert res == expected_res
# Check that the parameters file is correct
# Check that the inference parameters file is correct
with (
tmp_path
/ training_config["training_params"]["output_folder"]
/ "results"
/ "parameters.yml"
/ "inference_parameters.yml"
).open() as f:
res = yaml.safe_load(f)
assert res == params_res