Skip to content
Snippets Groups Projects
Verified Commit 6bb96aac authored by Yoann Schneider's avatar Yoann Schneider :tennis:
Browse files

remove obsolete code

parent 22353a0a
No related branches found
No related tags found
2 merge requests!23Implement format command,!11Implement extraction command
This commit is part of merge request !11. Comments created here will be created in the context of that merge request.
import os
import shutil
import tarfile
import pickle
import re
from PIL import Image
import numpy as np
class DatasetFormatter:
    """
    Global pipeline/functions for dataset formatting.

    Creates the target folder layout under Datasets/formatted and dispatches
    to a dataset/level-specific "format_function" that subclasses register
    in self.map_datasets_files.
    """

    def __init__(self, dataset_name, level, extra_name="", set_names=None):
        """
        :param dataset_name: name of the dataset to format
        :param level: segmentation level (e.g. "page")
        :param extra_name: optional suffix appended to the target folder name
        :param set_names: dataset splits to process (default: train/valid/test)
        """
        # Fix: a list literal default would be shared across all instances
        # (mutable-default-argument pitfall); use None and build a fresh list.
        if set_names is None:
            set_names = ["train", "valid", "test"]
        self.dataset_name = dataset_name
        self.level = level
        self.set_names = set_names
        self.target_fold_path = os.path.join(
            "Datasets", "formatted", "{}_{}{}".format(dataset_name, level, extra_name))
        # Subclasses register {dataset: {level: {"format_function": callable}}}.
        self.map_datasets_files = dict()
        self.extract_with_dirname = False

    def format(self):
        """Run the full formatting pipeline for the configured dataset/level."""
        self.init_format()
        self.map_datasets_files[self.dataset_name][self.level]["format_function"]()
        self.end_format()

    def init_format(self):
        """
        Create the target folder tree (one sub-folder per split).
        """
        os.makedirs(self.target_fold_path, exist_ok=True)
        for set_name in self.set_names:
            os.makedirs(os.path.join(self.target_fold_path, set_name), exist_ok=True)
class OCRDatasetFormatter(DatasetFormatter):
    """
    Specific pipeline/functions for OCR/HTR dataset formatting.

    Accumulates a character set and a per-split ground-truth dict while
    formatting, then dumps both as pickle files in end_format().
    """

    def __init__(self, source_dataset, level, extra_name="", set_names=None):
        """
        :param source_dataset: name of the source dataset
        :param level: segmentation level (e.g. "page")
        :param extra_name: optional suffix for the target folder name
        :param set_names: dataset splits to process (default: train/valid/test)
        """
        # Fix: avoid the shared mutable default list across instances.
        if set_names is None:
            set_names = ["train", "valid", "test"]
        super().__init__(source_dataset, level, extra_name, set_names)
        self.charset = set()
        # gt[set_name][image_name] -> ground-truth entry for that image.
        self.gt = dict()
        for set_name in set_names:
            self.gt[set_name] = dict()

    def format_text_label(self, label):
        """
        Remove extra space or line break characters.
        """
        # Collapse runs of newlines, then runs of spaces, then trim both ends.
        temp = re.sub(r"(\n)+", '\n', label)
        return re.sub(r"( )+", ' ', temp).strip(" \n")

    def load_resize_save(self, source_path, target_path):
        """
        Load image, apply resolution modification and save it.

        NOTE(review): currently a plain file copy — no resolution change is
        applied (the resize() helper below is not called here).
        """
        shutil.copyfile(source_path, target_path)

    def resize(self, img, source_dpi, target_dpi):
        """
        Apply resolution modification to image.

        Accepts a numpy array or PIL image; returns a numpy array (except
        when DPIs match, where the input is returned unchanged).
        """
        if source_dpi == target_dpi:
            return img
        if isinstance(img, np.ndarray):
            h, w = img.shape[:2]
            img = Image.fromarray(img)
        else:
            w, h = img.size
        ratio = target_dpi / source_dpi
        img = img.resize((int(w*ratio), int(h*ratio)), Image.BILINEAR)
        return np.array(img)

    def end_format(self):
        """
        Save label and charset files (labels.pkl and charset.pkl) in the
        target folder.
        """
        with open(os.path.join(self.target_fold_path, "labels.pkl"), "wb") as f:
            pickle.dump({
                "ground_truth": self.gt,
                "charset": sorted(list(self.charset)),
            }, f)
        with open(os.path.join(self.target_fold_path, "charset.pkl"), "wb") as f:
            pickle.dump(sorted(list(self.charset)), f)
from Datasets.dataset_formatters.generic_dataset_formatter import OCRDatasetFormatter
import os
import numpy as np
from Datasets.dataset_formatters.utils_dataset import natural_sort
from PIL import Image
import xml.etree.ElementTree as ET
import re
from tqdm import tqdm
# Layout string to token
# NOTE(review): every token value below is an empty string — the special
# token characters appear to have been lost (likely non-ASCII markers
# stripped during export). As written, SEM_MATCHING_TOKENS collapses to a
# single {"": ""} entry; the original token characters must be restored
# before these maps can be used meaningfully.
SEM_MATCHING_TOKENS_STR = {
    "INTITULE": "",
    "DATE": "",
    "COTE_SERIE": "",
    "ANALYSE_COMPL": "",
    "PRECISIONS_SUR_COTE": "",
    "COTE_ARTICLE": ""
}
# Layout begin-token to end-token
SEM_MATCHING_TOKENS = {
    "": "",
    "": "",
    "": "",
    "": "",
    "": "",
    "": ""
}
class SimaraDatasetFormatter(OCRDatasetFormatter):
    """
    Formatter for the Simara dataset (page level), optionally using the
    semantic-token ground truth instead of the plain transcription.
    """

    def __init__(self, level, set_names=None, dpi=150, sem_token=True):
        """
        :param level: segmentation level (only "page" is registered)
        :param set_names: dataset splits (default: train/valid/test)
        :param dpi: target resolution — NOTE(review): currently unused,
            images are copied as-is in format_simara_page
        :param sem_token: use the semantic-token labels if True
        """
        # Fix: avoid the shared mutable default list across instances.
        if set_names is None:
            set_names = ["train", "valid", "test"]
        super(SimaraDatasetFormatter, self).__init__("simara", level, "_sem" if sem_token else "", set_names)
        self.dpi = dpi
        self.sem_token = sem_token
        self.map_datasets_files.update({
            "simara": {
                # (1,050 for train, 100 for validation and 100 for test)
                "page": {
                    "format_function": self.format_simara_page,
                },
            }
        })
        self.matching_tokens_str = SEM_MATCHING_TOKENS_STR
        self.matching_tokens = SEM_MATCHING_TOKENS

    def preformat_simara_page(self):
        """
        Extract all information from dataset and correct some annotations.

        Returns a dict split -> list of {"img_path", "label", "sem_label"}.
        """
        dataset = {
            "train": list(),
            "valid": list(),
            "test": list()
        }
        img_folder_path = os.path.join("Datasets", "raw", "simara", "images")
        labels_folder_path = os.path.join("Datasets", "raw", "simara", "labels")
        sem_labels_folder_path = os.path.join("Datasets", "raw", "simara", "labels_sem")
        # File lists are driven by the semantic-label folder; the plain-label
        # path is derived from the same file name.
        train_files = [
            os.path.join(labels_folder_path, 'train', name)
            for name in os.listdir(os.path.join(sem_labels_folder_path, 'train'))]
        valid_files = [
            os.path.join(labels_folder_path, 'valid', name)
            for name in os.listdir(os.path.join(sem_labels_folder_path, 'valid'))]
        test_files = [
            os.path.join(labels_folder_path, 'test', name)
            for name in os.listdir(os.path.join(sem_labels_folder_path, 'test'))]
        for set_name, files in zip(self.set_names, [train_files, valid_files, test_files]):
            for i, label_file in enumerate(tqdm(files, desc='Pre-formatting '+set_name)):
                with open(label_file, 'r') as f:
                    text = f.read()
                with open(label_file.replace('labels', 'labels_sem'), 'r') as f:
                    sem_text = f.read()
                dataset[set_name].append({
                    "img_path": os.path.join(
                        img_folder_path, set_name, label_file.split('/')[-1].replace('txt', 'jpg')),
                    "label": text,
                    "sem_label": sem_text,
                })
        # Fix: removed a leftover debug print() of the entire test split.
        return dataset

    def format_simara_page(self):
        """
        Format simara page dataset: copy every image into the target folder
        and record its text (plain or semantic) plus the running charset.
        """
        dataset = self.preformat_simara_page()
        for set_name in self.set_names:
            fold = os.path.join(self.target_fold_path, set_name)
            for sample in tqdm(dataset[set_name], desc='Formatting '+set_name):
                new_name = sample['img_path'].split('/')[-1]
                new_img_path = os.path.join(fold, new_name)
                # Plain copy; resolution change (self.dpi) is not applied here.
                self.load_resize_save(sample["img_path"], new_img_path)
                page = {
                    "text": sample["label"] if not self.sem_token else sample["sem_label"],
                }
                self.charset = self.charset.union(set(page["text"]))
                self.gt[set_name][new_name] = page
if __name__ == "__main__":
    # Build the page-level Simara dataset with semantic tokens enabled.
    formatter = SimaraDatasetFormatter("page", sem_token=True)
    formatter.format()
from torch.nn.functional import log_softmax
from torch.nn import AdaptiveMaxPool2d, Conv1d
from torch.nn import Module
class Decoder(Module):
    """
    Prediction head: collapse the vertical axis of the encoder feature map
    and project each horizontal position to vocab_size + 1 log-probabilities
    (the extra channel is presumably a CTC blank — confirm against training).
    """

    def __init__(self, params):
        """
        :param params: dict with "vocab_size" and "enc_size" entries
        """
        super().__init__()
        self.vocab_size = params["vocab_size"]
        # Reduce height to 1 while keeping the full width.
        self.ada_pool = AdaptiveMaxPool2d((1, None))
        # 1x1 convolution acting as a per-position linear projection.
        self.end_conv = Conv1d(
            in_channels=params["enc_size"],
            out_channels=self.vocab_size + 1,
            kernel_size=1,
        )

    def forward(self, x):
        """(B, C, H, W) features -> (B, vocab_size + 1, W) log-probabilities."""
        pooled = self.ada_pool(x).squeeze(2)
        logits = self.end_conv(pooled)
        return log_softmax(logits, dim=1)
from basic.generic_training_manager import GenericTrainingManager
import os
from PIL import Image
import pickle
class OCRManager(GenericTrainingManager):
    """
    Training manager specialised for OCR tasks.

    Sets the model vocabulary size from the loaded dataset's charset and can
    synthesize a line-level dataset from the currently loaded dataset.
    """

    def __init__(self, params):
        super(OCRManager, self).__init__(params)
        # Output layer size must match the dataset charset size.
        self.params["model_params"]["vocab_size"] = len(self.dataset.charset)

    def generate_syn_line_dataset(self, name):
        """
        Generate synthetic line dataset from currently loaded dataset.

        Splits every sample label into lines, chunks each line to at most
        100 characters, renders each chunk as a typed-text line image, and
        writes the images plus a labels.pkl (ground truth + sorted charset)
        into a folder called `name` next to the source dataset folder.
        """
        dataset_name = list(self.params['dataset_params']["datasets"].keys())[0]
        # Target folder is a sibling of the source dataset folder.
        path = os.path.join(os.path.dirname(self.params['dataset_params']["datasets"][dataset_name]), name)
        os.makedirs(path, exist_ok=True)
        charset = set()
        dataset = None
        gt = {
            "train": dict(),
            "valid": dict(),
            "test": dict()
        }
        for set_name in ["train", "valid", "test"]:
            set_path = os.path.join(path, set_name)
            os.makedirs(set_path, exist_ok=True)
            # Select the in-memory dataset for this split; the test loader
            # must be generated on demand before it can be read.
            if set_name == "train":
                dataset = self.dataset.train_dataset
            elif set_name == "valid":
                dataset = self.dataset.valid_datasets["{}-valid".format(dataset_name)]
            elif set_name == "test":
                self.dataset.generate_test_loader("{}-test".format(dataset_name), [(dataset_name, "test"), ])
                dataset = self.dataset.test_datasets["{}-test".format(dataset_name)]

            samples = list()
            for sample in dataset.samples:
                for line_label in sample["label"].split("\n"):
                    # Cap each synthetic line image at 100 characters.
                    for chunk in [line_label[i:i+100] for i in range(0, len(line_label), 100)]:
                        charset = charset.union(set(chunk))
                        if len(chunk) > 0:
                            samples.append({
                                "path": sample["path"],
                                "label": chunk,
                                "nb_cols": 1,
                            })

            for i, sample in enumerate(samples):
                # Keep the original file extension for the rendered image.
                ext = sample['path'].split(".")[-1]
                img_name = "{}_{}.{}".format(set_name, i, ext)
                img_path = os.path.join(set_path, img_name)
                img = dataset.generate_typed_text_line_image(sample["label"])
                Image.fromarray(img).save(img_path)
                gt[set_name][img_name] = {
                    "text": sample["label"],
                    "nb_cols": sample["nb_cols"] if "nb_cols" in sample else 1
                }
                if "line_label" in sample:
                    gt[set_name][img_name]["lines"] = sample["line_label"]

        with open(os.path.join(path, "labels.pkl"), "wb") as f:
            pickle.dump({
                "ground_truth": gt,
                "charset": sorted(list(charset)),
            }, f)
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
from torch.nn import Dropout, Dropout2d
import numpy as np
class DropoutScheduler:
    """
    Schedule the dropout probability of every Dropout/Dropout2d layer found
    (recursively) inside a dict of models.

    Each layer is stored with its initial probability; update_dropout_rate()
    recomputes the live probability as function(initial_p, step_num, T).
    """

    def __init__(self, models, function, T=1e5):
        """
        :param models: dict mapping model name -> torch Module
        :param function: callable (initial_p, step_num, T) -> new probability
        :param T: number of gradient updates to converge
        """
        # Pairs of [dropout module, initial dropout probability].
        self.teta_list = list()
        self.init_teta_list(models)
        self.function = function
        self.T = T
        self.step_num = 0

    def step(self, num=1):
        """
        Advance the schedule by num gradient updates (default 1).

        Fix: the class previously defined step() twice — the zero-argument
        overload was shadowed by the second definition (and its body,
        self.step(1), would have recursed had it survived). A single method
        with a default argument supports both call styles.
        """
        self.step_num += num

    def init_teta_list(self, models):
        """Collect every dropout layer from all given models."""
        for model_name in models.keys():
            self.init_teta_list_module(models[model_name])

    def init_teta_list_module(self, module):
        """Recursively record (layer, initial p) for Dropout/Dropout2d children."""
        for child in module.children():
            if isinstance(child, (Dropout, Dropout2d)):
                self.teta_list.append([child, child.p])
            else:
                self.init_teta_list_module(child)

    def update_dropout_rate(self):
        """Apply the scheduling function to every recorded dropout layer."""
        for (module, p) in self.teta_list:
            module.p = self.function(p, self.step_num, self.T)
def exponential_dropout_scheduler(dropout_rate, step, max_step):
    """Ramp dropout from 0 toward dropout_rate following 1 - exp(-10*step/max_step)."""
    ramp = 1 - np.exp(-10 * step / max_step)
    return dropout_rate * ramp
def exponential_scheduler(init_value, end_value, step, max_step):
    """Exponentially anneal from init_value toward end_value; step is clamped to max_step - 1."""
    step = min(step, max_step - 1)
    decay = 1 - np.exp(-10 * step / max_step)
    return init_value - (init_value - end_value) * decay
def linear_scheduler(init_value, end_value, step, max_step):
    """Linearly interpolate from init_value at step 0 to end_value at max_step."""
    span = end_value - init_value
    return init_value + step * span / max_step
\ No newline at end of file
import numpy as np
import torch
from torch.distributions.uniform import Uniform
import cv2
def randint(low, high):
    """
    Draw an integer uniformly from [low, high) via torch.randint so that
    randomness stays consistent across dataloader workers.
    """
    sample = torch.randint(low, high, (1,))
    return int(sample)
def rand():
    """
    Draw a float uniformly from [0, 1) via torch.rand so that randomness
    stays consistent across dataloader workers.
    """
    sample = torch.rand((1,))
    return float(sample)
def rand_uniform(low, high):
    """
    Draw a float uniformly from [low, high) via torch's Uniform distribution
    so that randomness stays consistent across dataloader workers.
    """
    distribution = Uniform(low, high)
    return float(distribution.sample())
def pad_sequences_1D(data, padding_value):
    """
    Right-pad every 1D sequence in data with padding_value so that all rows
    reach the length of the longest sequence; returns a 2D numpy array.
    """
    lengths = [len(seq) for seq in data]
    width = max(lengths)
    padded = np.ones((len(data), width)).astype(np.int32) * padding_value
    for row, (seq, seq_len) in enumerate(zip(data, lengths)):
        padded[row, :seq_len] = seq[:seq_len]
    return padded
def resize_max(img, max_width=None, max_height=None):
    """
    Downscale img (H, W, C numpy array) so that its width fits max_width and
    then its height fits max_height, preserving aspect ratio at each pass.
    Images already within bounds are returned unchanged.
    """
    # Width constraint first, then height, matching the axis/limit pairs.
    for axis, limit in ((1, max_width), (0, max_height)):
        if limit is not None and img.shape[axis] > limit:
            scale = limit / img.shape[axis]
            new_h = int(np.floor(scale * img.shape[0]))
            new_w = int(np.floor(scale * img.shape[1]))
            img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
    return img
def pad_images(data, padding_value, padding_mode="br"):
    """
    Pad a list of (H, W, C) numpy arrays to a common canvas size.

    padding_mode:
        "br"     anchor each image top-left (pad bottom/right)
        "tl"     anchor each image bottom-right (pad top/left)
        "random" place each image at a random offset inside the canvas
    """
    heights = [img.shape[0] for img in data]
    widths = [img.shape[1] for img in data]
    max_h = max(heights)
    max_w = max(widths)
    canvas = np.ones((len(data), max_h, max_w, data[0].shape[2])) * padding_value
    for i, (h, w) in enumerate(zip(heights, widths)):
        if padding_mode == "br":
            canvas[i, :h, :w, ...] = data[i]
        elif padding_mode == "tl":
            canvas[i, -h:, -w:, ...] = data[i]
        elif padding_mode == "random":
            max_dy = max_h - h
            max_dx = max_w - w
            dy = randint(0, max_dy) if max_dy >= 1 else 0
            dx = randint(0, max_dx) if max_dx >= 1 else 0
            canvas[i, dy:dy + h, dx:dx + w, ...] = data[i]
        else:
            raise NotImplementedError("Undefined padding mode: {}".format(padding_mode))
    return canvas
def pad_image(image, padding_value, new_height=None, new_width=None, pad_width=None, pad_height=None, padding_mode="br", return_position=False):
    """
    Pad a single (H, W, C) image with padding_value.

    The amount of padding is given either as absolute pad_width/pad_height or
    as target new_width/new_height (mutually exclusive per axis).
    padding_mode "br"/"tl"/"random" controls where the original pixels land.
    When return_position is True, also returns [[y0, y1], [x0, x1]] locating
    the original image inside the output.
    """
    if pad_width is not None and new_width is not None:
        raise NotImplementedError("pad_with and new_width are not compatible")
    if pad_height is not None and new_height is not None:
        raise NotImplementedError("pad_height and new_height are not compatible")
    h, w, c = image.shape
    if pad_width is None:
        pad_width = max(0, new_width - w) if new_width is not None else 0
    if pad_height is None:
        pad_height = max(0, new_height - h) if new_height is not None else 0
    if pad_width == 0 and pad_height == 0:
        # Nothing to do: return the input untouched.
        hi = wi = 0
        output = image
    else:
        canvas = np.ones((h + pad_height, w + pad_width, c)) * padding_value
        if padding_mode == "br":
            hi, wi = 0, 0
        elif padding_mode == "tl":
            hi, wi = pad_height, pad_width
        elif padding_mode == "random":
            hi = randint(0, pad_height) if pad_height >= 1 else 0
            wi = randint(0, pad_width) if pad_width >= 1 else 0
        else:
            raise NotImplementedError("Undefined padding mode: {}".format(padding_mode))
        canvas[hi:hi + h, wi:wi + w, ...] = image
        output = canvas
    if return_position:
        return output, [[hi, hi + h], [wi, wi + w]]
    return output
def pad_image_width_right(img, new_width, padding_value):
    """
    Pad img on the right side with padding_value to reach new_width columns;
    images already wide enough are returned with no extra columns.
    """
    h, w, c = img.shape
    missing = max(new_width - w, 0)
    filler = np.ones((h, missing, c), dtype=img.dtype) * padding_value
    return np.concatenate([img, filler], axis=1)
def pad_image_width_left(img, new_width, padding_value):
    """
    Pad img on the left side with padding_value to reach new_width columns;
    images already wide enough are returned with no extra columns.
    """
    h, w, c = img.shape
    missing = max(new_width - w, 0)
    filler = np.ones((h, missing, c), dtype=img.dtype) * padding_value
    return np.concatenate([filler, img], axis=1)
def pad_image_width_random(img, new_width, padding_value, max_pad_left_ratio=1):
    """
    Pad img horizontally to new_width, splitting the padding randomly between
    the left and right sides; max_pad_left_ratio caps the share that may go
    on the left.
    """
    h, w, c = img.shape
    total_pad = max(new_width - w, 0)
    left_cap = int(max_pad_left_ratio * total_pad)
    if total_pad != 0 and left_cap > 0:
        left_amount = randint(0, min(total_pad, left_cap))
    else:
        left_amount = 0
    right_amount = total_pad - left_amount
    left_block = np.ones((h, left_amount, c), dtype=img.dtype) * padding_value
    right_block = np.ones((h, right_amount, c), dtype=img.dtype) * padding_value
    return np.concatenate([left_block, img, right_block], axis=1)
def pad_image_height_random(img, new_height, padding_value, max_pad_top_ratio=1):
    """
    Pad img vertically to new_height, splitting the padding randomly between
    the top and bottom sides; max_pad_top_ratio caps the share that may go
    on top.
    """
    h, w, c = img.shape
    total_pad = max(new_height - h, 0)
    top_cap = int(max_pad_top_ratio * total_pad)
    if total_pad != 0 and top_cap > 0:
        top_amount = randint(0, min(total_pad, top_cap))
    else:
        top_amount = 0
    bottom_amount = total_pad - top_amount
    top_block = np.ones((top_amount, w, c), dtype=img.dtype) * padding_value
    bottom_block = np.ones((bottom_amount, w, c), dtype=img.dtype) * padding_value
    return np.concatenate([top_block, img, bottom_block], axis=0)
def pad_image_height_bottom(img, new_height, padding_value):
    """
    Pad img on the bottom side with padding_value to reach new_height rows;
    images already tall enough are returned with no extra rows.

    Fix: the padding block now carries dtype=img.dtype, matching every other
    pad_image_* helper in this module; previously the implicit float64
    padding silently promoted integer images to float64 on concatenation.
    """
    h, w, c = img.shape
    pad_height = max((new_height - h), 0)
    pad_bottom = np.ones((pad_height, w, c), dtype=img.dtype) * padding_value
    img = np.concatenate([img, pad_bottom], axis=0)
    return img
arkindex-client==1.0.11
editdistance==0.6.0
fontTools==4.29.1
imageio==2.16.0
networkx==2.6.3
tensorboard==0.2.1
torchvision==0.12.0
tqdm==4.62.3
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment