Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • atr/dan
1 result
Show changes
Commits on Source (5)
......@@ -6,10 +6,10 @@ import pickle
import cv2
import numpy as np
import torch
from torch import randint
from dan.manager.dataset import DatasetManager, GenericDataset, apply_preprocessing
from dan.ocr.utils import LM_str_to_ind
from dan.utils import pad_image, pad_images, pad_sequences_1D, randint
from dan.utils import pad_image, pad_images, pad_sequences_1D, token_to_ind
class OCRDatasetManager(DatasetManager):
......@@ -111,9 +111,8 @@ class OCRDataset(GenericDataset):
)
sample["img"] = cv2.resize(sample["img"], (new_w, new_h))
# Normalization if requested
if "normalize" in self.params["config"] and self.params["config"]["normalize"]:
sample["img"] = (sample["img"] - self.mean) / self.std
# Normalization
sample["img"] = (sample["img"] - self.mean) / self.std
sample["img_reduced_shape"] = np.ceil(
sample["img"].shape / self.reduce_dims_factor
......@@ -137,12 +136,12 @@ class OCRDataset(GenericDataset):
min_pad = self.params["config"]["padding"]["min_pad"]
max_pad = self.params["config"]["padding"]["max_pad"]
pad_width = (
randint(min_pad, max_pad)
randint(min_pad, max_pad, (1,))
if min_pad is not None and max_pad is not None
else None
)
pad_height = (
randint(min_pad, max_pad)
randint(min_pad, max_pad, (1,))
if min_pad is not None and max_pad is not None
else None
)
......@@ -174,12 +173,10 @@ class OCRDataset(GenericDataset):
full_label = label
sample["label"] = full_label
sample["token_label"] = LM_str_to_ind(self.charset, full_label)
if "add_eot" in self.params["config"]["constraints"]:
sample["token_label"].append(self.tokens["end"])
sample["token_label"] = token_to_ind(self.charset, full_label)
sample["token_label"].append(self.tokens["end"])
sample["label_len"] = len(sample["token_label"])
if "add_sot" in self.params["config"]["constraints"]:
sample["token_label"].insert(0, self.tokens["start"])
sample["token_label"].insert(0, self.tokens["start"])
return sample
......
......@@ -20,8 +20,8 @@ from tqdm import tqdm
from dan.manager.metrics import MetricManager
from dan.manager.ocr import OCRDatasetManager
from dan.mlflow import MLFLOW_AVAILABLE, logging_metrics, logging_tags_metrics
from dan.ocr.utils import LM_ind_to_str
from dan.schedulers import DropoutScheduler
from dan.utils import ind_to_token
if MLFLOW_AVAILABLE:
import mlflow
......@@ -1010,7 +1010,7 @@ class Manager(OCRManager):
predicted_tokens = torch.argmax(pred, dim=1).detach().cpu().numpy()
predicted_tokens = [predicted_tokens[i, : y_len[i]] for i in range(b)]
str_x = [
LM_ind_to_str(self.dataset.charset, t, oov_symbol="")
ind_to_token(self.dataset.charset, t, oov_symbol="")
for t in predicted_tokens
]
......@@ -1130,7 +1130,7 @@ class Manager(OCRManager):
confidence_scores[i, : prediction_len[i]].tolist() for i in range(b)
]
str_x = [
LM_ind_to_str(self.dataset.charset, t, oov_symbol="")
ind_to_token(self.dataset.charset, t, oov_symbol="")
for t in predicted_tokens
]
......
......@@ -109,11 +109,7 @@ def get_config():
"height_divisor": 32, # Image height will be divided by 32
"padding_value": 0, # Image padding value
"padding_token": None, # Label padding value
"constraints": [
"add_eot",
"add_sot",
], # add end-of-transcription and start-of-transcription tokens in labels
"normalize": True, # Normalize with mean and variance of training dataset
"constraints": [],
"preprocessings": [
{
"type": "to_RGB",
......
# -*- coding: utf-8 -*-
# Charset / labels conversion
def LM_str_to_ind(labels, str):
    """
    Convert each element of `str` into its position in `labels`.

    Every element obtained by iterating over `str` is looked up with
    `labels.index`, so the lookup raises whatever `labels.index` raises
    when an element is missing (ValueError for a list).

    NOTE(review): the parameter name `str` shadows the builtin; it is kept
    unchanged so existing keyword callers keep working.
    """
    indices = []
    for symbol in str:
        indices.append(labels.index(symbol))
    return indices
def LM_ind_to_str(labels, ind, oov_symbol=None):
    """
    Join the entries of `labels` selected by the indices in `ind`.

    When `oov_symbol` is None, every index is used directly to index
    `labels`. Otherwise any index that is not strictly below `len(labels)`
    is replaced by `oov_symbol`; no lower-bound check is performed.

    Returns the concatenation of the selected pieces as a single string.
    """
    if oov_symbol is None:
        pieces = [labels[index] for index in ind]
    else:
        pieces = [
            labels[index] if index < len(labels) else oov_symbol for index in ind
        ]
    return "".join(pieces)
......@@ -6,7 +6,6 @@ import numpy as np
from PIL import Image
from dan import logger
from dan.utils import round_floats
def parse_delimiters(delimiters):
......@@ -78,7 +77,7 @@ def split_text_and_confidences(
offset = 1
else:
logger.error("Level should be either 'char', 'word', or 'line'")
return texts, round_floats(probs), offset
return texts, [np.around(num, 2) for num in probs], offset
def get_predicted_polygons_with_confidence(
......
......@@ -2,6 +2,7 @@
import os
import pickle
from itertools import pairwise
from pathlib import Path
import cv2
......@@ -13,14 +14,13 @@ from dan import logger
from dan.datasets.extract.utils import save_json
from dan.decoder import GlobalHTADecoder
from dan.encoder import FCN_Encoder
from dan.ocr.utils import LM_ind_to_str
from dan.predict.attention import (
get_predicted_polygons_with_confidence,
parse_delimiters,
plot_attention,
split_text_and_confidences,
)
from dan.utils import pairwise, read_image
from dan.utils import ind_to_token, read_image
class DAN:
......@@ -220,7 +220,7 @@ class DAN:
# Transform tokens to characters
predicted_text = [
LM_ind_to_str(self.charset, t, oov_symbol="") for t in predicted_tokens
ind_to_token(self.charset, t, oov_symbol="") for t in predicted_tokens
]
logger.info("Images processed")
......
......@@ -9,6 +9,8 @@ import numpy as np
from cv2 import dilate, erode, normalize
from numpy import random
from PIL import Image
from torch import rand, randint
from torch.distributions.uniform import Uniform
from torchvision.transforms import (
ColorJitter,
GaussianBlur,
......@@ -17,8 +19,6 @@ from torchvision.transforms import (
)
from torchvision.transforms.functional import InterpolationMode
from dan.utils import rand, rand_uniform, randint
class DPIAdjusting:
"""
......@@ -173,14 +173,14 @@ def get_list_augmenters(img, aug_configs, fill_value):
"""
augmenters = list()
for aug_config in aug_configs:
if rand() > aug_config["proba"]:
if rand((1,)) > aug_config["proba"]:
continue
if aug_config["type"] == "dpi":
valid_factor = False
while not valid_factor:
factor = rand_uniform(
factor = Uniform(
aug_config["min_factor"], aug_config["max_factor"]
)
).sample()
valid_factor = not (
(
"max_width" in aug_config
......@@ -202,8 +202,12 @@ def get_list_augmenters(img, aug_configs, fill_value):
augmenters.append(DPIAdjusting(factor))
elif aug_config["type"] == "zoom_ratio":
ratio_h = rand_uniform(aug_config["min_ratio_h"], aug_config["max_ratio_h"])
ratio_w = rand_uniform(aug_config["min_ratio_w"], aug_config["max_ratio_w"])
ratio_h = Uniform(
aug_config["min_ratio_h"], aug_config["max_ratio_h"]
).sample()
ratio_w = Uniform(
aug_config["min_ratio_w"], aug_config["max_ratio_w"]
).sample()
augmenters.append(
ZoomRatio(
ratio_h=ratio_h, ratio_w=ratio_w, keep_dim=aug_config["keep_dim"]
......@@ -211,7 +215,7 @@ def get_list_augmenters(img, aug_configs, fill_value):
)
elif aug_config["type"] == "perspective":
scale = rand_uniform(aug_config["min_factor"], aug_config["max_factor"])
scale = Uniform(aug_config["min_factor"], aug_config["max_factor"]).sample()
augmenters.append(
RandomPerspective(
distortion_scale=scale,
......@@ -223,13 +227,20 @@ def get_list_augmenters(img, aug_configs, fill_value):
elif aug_config["type"] == "elastic_distortion":
kernel_size = (
randint(aug_config["min_kernel_size"], aug_config["max_kernel_size"])
// 2
* 2
+ 1
randint(
aug_config["min_kernel_size"], aug_config["max_kernel_size"], (1,)
).item()
) // 2 * 2 + 1
sigma = (
Uniform(aug_config["min_sigma"], aug_config["max_sigma"])
.sample()
.item()
)
alpha = (
Uniform(aug_config["min_alpha"], aug_config["max_alpha"])
.sample()
.item()
)
sigma = rand_uniform(aug_config["min_sigma"], aug_config["max_sigma"])
alpha = rand_uniform(aug_config["min_alpha"], aug_config["max_alpha"])
augmenters.append(
ElasticDistortion(
kernel_size=(kernel_size, kernel_size), sigma=sigma, alpha=alpha
......@@ -237,9 +248,13 @@ def get_list_augmenters(img, aug_configs, fill_value):
)
elif aug_config["type"] == "dilation_erosion":
kernel_h = randint(aug_config["min_kernel"], aug_config["max_kernel"] + 1)
kernel_w = randint(aug_config["min_kernel"], aug_config["max_kernel"] + 1)
if randint(0, 2) == 0:
kernel_h = randint(
aug_config["min_kernel"], aug_config["max_kernel"] + 1, (1,)
)
kernel_w = randint(
aug_config["min_kernel"], aug_config["max_kernel"] + 1, (1,)
)
if randint(0, 2, (1,)) == 0:
augmenters.append(
Erosion((kernel_w, kernel_h), aug_config["iterations"])
)
......@@ -261,9 +276,17 @@ def get_list_augmenters(img, aug_configs, fill_value):
elif aug_config["type"] == "gaussian_blur":
max_kernel_h = min(aug_config["max_kernel"], img.size[1])
max_kernel_w = min(aug_config["max_kernel"], img.size[0])
kernel_h = randint(aug_config["min_kernel"], max_kernel_h + 1) // 2 * 2 + 1
kernel_w = randint(aug_config["min_kernel"], max_kernel_w + 1) // 2 * 2 + 1
sigma = rand_uniform(aug_config["min_sigma"], aug_config["max_sigma"])
kernel_h = (
randint(aug_config["min_kernel"], max_kernel_h + 1, (1,)).item()
) // 2 * 2 + 1
kernel_w = (
randint(aug_config["min_kernel"], max_kernel_w + 1, (1,)).item()
) // 2 * 2 + 1
sigma = (
Uniform(aug_config["min_sigma"], aug_config["max_sigma"])
.sample()
.item()
)
augmenters.append(
GaussianBlur(kernel_size=(kernel_w, kernel_h), sigma=sigma)
)
......@@ -272,10 +295,10 @@ def get_list_augmenters(img, aug_configs, fill_value):
augmenters.append(GaussianNoise(std=aug_config["std"]))
elif aug_config["type"] == "sharpen":
alpha = rand_uniform(aug_config["min_alpha"], aug_config["max_alpha"])
strength = rand_uniform(
alpha = Uniform(aug_config["min_alpha"], aug_config["max_alpha"]).sample()
strength = Uniform(
aug_config["min_strength"], aug_config["max_strength"]
)
).sample()
augmenters.append(Sharpen(alpha=alpha, strength=strength))
else:
......@@ -289,7 +312,7 @@ def apply_data_augmentation(img, da_config):
"""
Apply data augmentation strategy on input image
"""
if da_config["proba"] != 1 and rand() > da_config["proba"]:
if da_config["proba"] != 1 and rand((1,)) > da_config["proba"]:
return img
# Convert to PIL Image
......
# -*- coding: utf-8 -*-
from itertools import tee
import cv2
import numpy as np
import torch
from torch.distributions.uniform import Uniform
from torch import randint
# Layout begin-token to end-token
SEM_MATCHING_TOKENS = {"": "", "": "", "": "", "": "", "": "", "": ""}
......@@ -16,27 +13,6 @@ class MLflowNotInstalled(Exception):
"""
def randint(low, high):
    """
    Draw a single integer through torch.randint and return it as a Python int.

    torch's generator is used so that randomness is preserved among
    dataloader workers. `low` and `high` are forwarded unchanged to
    torch.randint.
    """
    sample = torch.randint(low, high, (1,))
    return int(sample)
def rand():
    """
    Draw a single value through torch.rand and return it as a Python float.

    torch's generator is used so that randomness is preserved among
    dataloader workers.
    """
    sample = torch.rand((1,))
    return float(sample)
def rand_uniform(low, high):
    """
    Sample once from torch's Uniform(low, high) and return a Python float.

    torch's generator is used so that randomness is preserved among
    dataloader workers.
    """
    distribution = Uniform(low, high)
    sample = distribution.sample()
    return float(sample)
def pad_sequences_1D(data, padding_value):
"""
Pad data with padding_value to get same length
......@@ -70,8 +46,8 @@ def pad_images(data, padding_value, padding_mode="br"):
elif padding_mode == "random":
xmax = longest_x - x_len
ymax = longest_y - y_len
xi = randint(0, xmax) if xmax >= 1 else 0
yi = randint(0, ymax) if ymax >= 1 else 0
xi = randint(0, xmax, (1,)) if xmax >= 1 else 0
yi = randint(0, ymax, (1,)) if ymax >= 1 else 0
padded_data[i, xi : xi + x_len, yi : yi + y_len, ...] = data[i]
else:
raise NotImplementedError("Undefined padding mode: {}".format(padding_mode))
......@@ -120,8 +96,8 @@ def pad_image(
elif padding_mode == "tl":
hi, wi = pad_height, pad_width
elif padding_mode == "random":
hi = randint(0, pad_height) if pad_height >= 1 else 0
wi = randint(0, pad_width) if pad_width >= 1 else 0
hi = randint(0, pad_height, (1,)) if pad_height >= 1 else 0
wi = randint(0, pad_width, (1,)) if pad_width >= 1 else 0
else:
raise NotImplementedError("Undefined padding mode: {}".format(padding_mode))
padded_image[hi : hi + h, wi : wi + w, ...] = image
......@@ -156,11 +132,19 @@ def round_floats(float_list, decimals=2):
return [np.around(num, decimals) for num in float_list]
def pairwise(iterable):
    """
    Return an iterator over consecutive overlapping pairs of `iterable`.

    Example: pairwise('ABCDEFG') --> AB BC CD DE EF FG

    Not necessary on Python 3.10+, which provides itertools.pairwise.
    See https://docs.python.org/3/library/itertools.html#itertools.pairwise.
    """
    leading, trailing = tee(iterable)
    # Shift the second copy by one element so zip pairs each item with its successor.
    next(trailing, None)
    return zip(leading, trailing)
# Charset / labels conversion
def token_to_ind(labels, str):
    """
    Map every element of `str` to its position in `labels`.

    Each element produced by iterating over `str` is resolved with
    `labels.index`; a missing element therefore raises whatever
    `labels.index` raises (ValueError for a list).

    NOTE(review): the parameter name `str` shadows the builtin; it is left
    as-is to keep the signature compatible with existing callers.
    """
    positions = []
    for token in str:
        positions.append(labels.index(token))
    return positions
def ind_to_token(labels, ind, oov_symbol=None):
    """
    Build a string from the entries of `labels` addressed by `ind`.

    With `oov_symbol` left as None, each index is used as-is to index
    `labels`. With `oov_symbol` set, any index that is not strictly below
    `len(labels)` contributes `oov_symbol` instead; there is no check on
    the lower bound.

    Returns the selected pieces joined into one string.
    """
    if oov_symbol is None:
        return "".join([labels[position] for position in ind])

    chunks = [
        labels[position] if position < len(labels) else oov_symbol
        for position in ind
    ]
    return "".join(chunks)
# Utils
::: dan.ocr.utils
......@@ -8,7 +8,7 @@ All hyperparameters are specified and editable in the training scripts (meaning
| `dataset_name` | Name of the dataset. | `str` | |
| `dataset_level` | Level of the dataset. Should be named after the element type. | `str` | |
| `dataset_variant` | Variant of the dataset. Usually empty for HTR datasets, `"_sem"` for HTR+NER datasets. | `str` | |
| `dataset_path` | Path to the dataset. | `str` |
| `dataset_path` | Path to the dataset. | `str` | |
| `dataset_params.config.dataset_manager` | Dataset manager class. | custom class | `OCRDatasetManager` |
| `dataset_params.config.dataset_class` | Dataset class. | custom class | `OCRDataset` |
| `dataset_params.config.datasets` | Dataset dictionary with the dataset name as key and dataset path as value. | `dict` | |
......@@ -18,8 +18,7 @@ All hyperparameters are specified and editable in the training scripts (meaning
| `dataset_params.config.width_divisor` | Factor to reduce the width of the feature vector before feeding the decoder. | `int` | `32` |
| `dataset_params.config.padding_value` | Image padding value. | `int` | `0` |
| `dataset_params.config.padding_token` | Transcription padding value. | `int` | `None` |
| `dataset_params.config.constraints` | Whether to add end-of-transcription and start-of-transcription tokens in labels. | `list` | `["add_eot", "add_sot"]` |
| `dataset_params.config.normalize` | Normalize with mean and variance of training dataset. | `bool` | `True` |
| `dataset_params.config.constraints` | Whether to add end-of-transcription and start-of-transcription tokens in labels. | `list` | `[]` |
| `dataset_params.config.preprocessings` | List of pre-processing functions to apply to input images. | `list` | (see [dedicated section](#data-preprocessing)) |
| `dataset_params.config.augmentation` | Configuration for data augmentation. | `dict` | (see [dedicated section](#data-augmentation)) |
......
......@@ -86,7 +86,6 @@ nav:
- Training managers: ref/managers/training.md
- OCR:
- ref/ocr/index.md
- Utils: ref/ocr/utils.md
- Document:
- ref/ocr/document/index.md
- Training: ref/ocr/document/train.md
......
......@@ -72,11 +72,7 @@ def training_config():
"height_divisor": 32, # Image height will be divided by 32
"padding_value": 0, # Image padding value
"padding_token": None, # Label padding value
"constraints": [
"add_eot",
"add_sot",
], # add end-of-transcription and start-of-transcription tokens in labels
"normalize": True, # Normalize with mean and variance of training dataset
"constraints": [],
"preprocessings": [
{
"type": "to_RGB",
......