Skip to content
Snippets Groups Projects
Verified Commit 6bb96aac authored by Yoann Schneider's avatar Yoann Schneider :tennis:
Browse files

remove obsolete code

parent 22353a0a
No related branches found
No related tags found
2 merge requests!23Implement format command,!11Implement extraction command
This commit is part of merge request !11. Comments created here will be created in the context of that merge request.
import os
import shutil
import tarfile
import pickle
import re
from PIL import Image
import numpy as np
class DatasetFormatter:
    """
    Global pipeline/functions for dataset formatting.

    Creates the target folder layout under Datasets/formatted and dispatches
    to a dataset/level-specific "format_function" that subclasses register
    in self.map_datasets_files.
    """

    def __init__(self, dataset_name, level, extra_name="", set_names=None):
        """
        :param dataset_name: name of the dataset to format
        :param level: segmentation level (e.g. "page")
        :param extra_name: optional suffix appended to the target folder name
        :param set_names: dataset splits to process (default: train/valid/test)
        """
        # Fix: a list literal default would be shared across all instances
        # (mutable-default-argument pitfall); use None and build a fresh list.
        if set_names is None:
            set_names = ["train", "valid", "test"]
        self.dataset_name = dataset_name
        self.level = level
        self.set_names = set_names
        self.target_fold_path = os.path.join(
            "Datasets", "formatted", "{}_{}{}".format(dataset_name, level, extra_name))
        # Subclasses register {dataset: {level: {"format_function": callable}}}.
        self.map_datasets_files = dict()
        self.extract_with_dirname = False

    def format(self):
        """Run the full formatting pipeline for the configured dataset/level."""
        self.init_format()
        self.map_datasets_files[self.dataset_name][self.level]["format_function"]()
        self.end_format()

    def init_format(self):
        """
        Create the target folder tree (one sub-folder per split).
        """
        os.makedirs(self.target_fold_path, exist_ok=True)
        for set_name in self.set_names:
            os.makedirs(os.path.join(self.target_fold_path, set_name), exist_ok=True)
class OCRDatasetFormatter(DatasetFormatter):
    """
    Specific pipeline/functions for OCR/HTR dataset formatting.

    Accumulates a character set and a per-split ground-truth dict while
    formatting, then dumps both as pickle files in end_format().
    """

    def __init__(self, source_dataset, level, extra_name="", set_names=None):
        """
        :param source_dataset: name of the source dataset
        :param level: segmentation level (e.g. "page")
        :param extra_name: optional suffix for the target folder name
        :param set_names: dataset splits to process (default: train/valid/test)
        """
        # Fix: avoid the shared mutable default list across instances.
        if set_names is None:
            set_names = ["train", "valid", "test"]
        super().__init__(source_dataset, level, extra_name, set_names)
        self.charset = set()
        # gt[set_name][image_name] -> ground-truth entry for that image.
        self.gt = dict()
        for set_name in set_names:
            self.gt[set_name] = dict()

    def format_text_label(self, label):
        """
        Remove extra space or line break characters.
        """
        # Collapse runs of newlines, then runs of spaces, then trim both ends.
        temp = re.sub(r"(\n)+", '\n', label)
        return re.sub(r"( )+", ' ', temp).strip(" \n")

    def load_resize_save(self, source_path, target_path):
        """
        Load image, apply resolution modification and save it.

        NOTE(review): currently a plain file copy — no resolution change is
        applied (the resize() helper below is not called here).
        """
        shutil.copyfile(source_path, target_path)

    def resize(self, img, source_dpi, target_dpi):
        """
        Apply resolution modification to image.

        Accepts a numpy array or PIL image; returns a numpy array (except
        when DPIs match, where the input is returned unchanged).
        """
        if source_dpi == target_dpi:
            return img
        if isinstance(img, np.ndarray):
            h, w = img.shape[:2]
            img = Image.fromarray(img)
        else:
            w, h = img.size
        ratio = target_dpi / source_dpi
        img = img.resize((int(w*ratio), int(h*ratio)), Image.BILINEAR)
        return np.array(img)

    def end_format(self):
        """
        Save label and charset files (labels.pkl and charset.pkl) in the
        target folder.
        """
        with open(os.path.join(self.target_fold_path, "labels.pkl"), "wb") as f:
            pickle.dump({
                "ground_truth": self.gt,
                "charset": sorted(list(self.charset)),
            }, f)
        with open(os.path.join(self.target_fold_path, "charset.pkl"), "wb") as f:
            pickle.dump(sorted(list(self.charset)), f)
from Datasets.dataset_formatters.generic_dataset_formatter import OCRDatasetFormatter
import os
import numpy as np
from Datasets.dataset_formatters.utils_dataset import natural_sort
from PIL import Image
import xml.etree.ElementTree as ET
import re
from tqdm import tqdm
# Layout string to token
# NOTE(review): every token value below is an empty string — the special
# token characters appear to have been lost (likely non-ASCII markers
# stripped during export). As written, SEM_MATCHING_TOKENS collapses to a
# single {"": ""} entry; the original token characters must be restored
# before these maps can be used meaningfully.
SEM_MATCHING_TOKENS_STR = {
    "INTITULE": "",
    "DATE": "",
    "COTE_SERIE": "",
    "ANALYSE_COMPL": "",
    "PRECISIONS_SUR_COTE": "",
    "COTE_ARTICLE": ""
}
# Layout begin-token to end-token
SEM_MATCHING_TOKENS = {
    "": "",
    "": "",
    "": "",
    "": "",
    "": "",
    "": ""
}
class SimaraDatasetFormatter(OCRDatasetFormatter):
    """
    Formatter for the Simara dataset (page level), optionally using the
    semantic-token ground truth instead of the plain transcription.
    """

    def __init__(self, level, set_names=None, dpi=150, sem_token=True):
        """
        :param level: segmentation level (only "page" is registered)
        :param set_names: dataset splits (default: train/valid/test)
        :param dpi: target resolution — NOTE(review): currently unused,
            images are copied as-is in format_simara_page
        :param sem_token: use the semantic-token labels if True
        """
        # Fix: avoid the shared mutable default list across instances.
        if set_names is None:
            set_names = ["train", "valid", "test"]
        super(SimaraDatasetFormatter, self).__init__("simara", level, "_sem" if sem_token else "", set_names)
        self.dpi = dpi
        self.sem_token = sem_token
        self.map_datasets_files.update({
            "simara": {
                # (1,050 for train, 100 for validation and 100 for test)
                "page": {
                    "format_function": self.format_simara_page,
                },
            }
        })
        self.matching_tokens_str = SEM_MATCHING_TOKENS_STR
        self.matching_tokens = SEM_MATCHING_TOKENS

    def preformat_simara_page(self):
        """
        Extract all information from dataset and correct some annotations.

        Returns a dict split -> list of {"img_path", "label", "sem_label"}.
        """
        dataset = {
            "train": list(),
            "valid": list(),
            "test": list()
        }
        img_folder_path = os.path.join("Datasets", "raw", "simara", "images")
        labels_folder_path = os.path.join("Datasets", "raw", "simara", "labels")
        sem_labels_folder_path = os.path.join("Datasets", "raw", "simara", "labels_sem")
        # File lists are driven by the semantic-label folder; the plain-label
        # path is derived from the same file name.
        train_files = [
            os.path.join(labels_folder_path, 'train', name)
            for name in os.listdir(os.path.join(sem_labels_folder_path, 'train'))]
        valid_files = [
            os.path.join(labels_folder_path, 'valid', name)
            for name in os.listdir(os.path.join(sem_labels_folder_path, 'valid'))]
        test_files = [
            os.path.join(labels_folder_path, 'test', name)
            for name in os.listdir(os.path.join(sem_labels_folder_path, 'test'))]
        for set_name, files in zip(self.set_names, [train_files, valid_files, test_files]):
            for i, label_file in enumerate(tqdm(files, desc='Pre-formatting '+set_name)):
                with open(label_file, 'r') as f:
                    text = f.read()
                with open(label_file.replace('labels', 'labels_sem'), 'r') as f:
                    sem_text = f.read()
                dataset[set_name].append({
                    "img_path": os.path.join(
                        img_folder_path, set_name, label_file.split('/')[-1].replace('txt', 'jpg')),
                    "label": text,
                    "sem_label": sem_text,
                })
        # Fix: removed a leftover debug print() of the entire test split.
        return dataset

    def format_simara_page(self):
        """
        Format simara page dataset: copy every image into the target folder
        and record its text (plain or semantic) plus the running charset.
        """
        dataset = self.preformat_simara_page()
        for set_name in self.set_names:
            fold = os.path.join(self.target_fold_path, set_name)
            for sample in tqdm(dataset[set_name], desc='Formatting '+set_name):
                new_name = sample['img_path'].split('/')[-1]
                new_img_path = os.path.join(fold, new_name)
                # Plain copy; resolution change (self.dpi) is not applied here.
                self.load_resize_save(sample["img_path"], new_img_path)
                page = {
                    "text": sample["label"] if not self.sem_token else sample["sem_label"],
                }
                self.charset = self.charset.union(set(page["text"]))
                self.gt[set_name][new_name] = page
if __name__ == "__main__":
    # Build the page-level Simara dataset with semantic tokens enabled.
    formatter = SimaraDatasetFormatter("page", sem_token=True)
    formatter.format()
from torch.nn.functional import log_softmax
from torch.nn import AdaptiveMaxPool2d, Conv1d
from torch.nn import Module
class Decoder(Module):
    """
    Prediction head: collapse the vertical axis of the encoder feature map
    and project each horizontal position to vocab_size + 1 log-probabilities
    (the extra channel is presumably a CTC blank — confirm against training).
    """

    def __init__(self, params):
        """
        :param params: dict with "vocab_size" and "enc_size" entries
        """
        super().__init__()
        self.vocab_size = params["vocab_size"]
        # Reduce height to 1 while keeping the full width.
        self.ada_pool = AdaptiveMaxPool2d((1, None))
        # 1x1 convolution acting as a per-position linear projection.
        self.end_conv = Conv1d(
            in_channels=params["enc_size"],
            out_channels=self.vocab_size + 1,
            kernel_size=1,
        )

    def forward(self, x):
        """(B, C, H, W) features -> (B, vocab_size + 1, W) log-probabilities."""
        pooled = self.ada_pool(x).squeeze(2)
        logits = self.end_conv(pooled)
        return log_softmax(logits, dim=1)
from basic.generic_training_manager import GenericTrainingManager
import os
from PIL import Image
import pickle
class OCRManager(GenericTrainingManager):
    """
    Training manager specialised for OCR tasks.

    Sets the model vocabulary size from the loaded dataset's charset and can
    synthesize a line-level dataset from the currently loaded dataset.
    """

    def __init__(self, params):
        super(OCRManager, self).__init__(params)
        # Output layer size must match the dataset charset size.
        self.params["model_params"]["vocab_size"] = len(self.dataset.charset)

    def generate_syn_line_dataset(self, name):
        """
        Generate synthetic line dataset from currently loaded dataset.

        Splits every sample label into lines, chunks each line to at most
        100 characters, renders each chunk as a typed-text line image, and
        writes the images plus a labels.pkl (ground truth + sorted charset)
        into a folder called `name` next to the source dataset folder.
        """
        dataset_name = list(self.params['dataset_params']["datasets"].keys())[0]
        # Target folder is a sibling of the source dataset folder.
        path = os.path.join(os.path.dirname(self.params['dataset_params']["datasets"][dataset_name]), name)
        os.makedirs(path, exist_ok=True)
        charset = set()
        dataset = None
        gt = {
            "train": dict(),
            "valid": dict(),
            "test": dict()
        }
        for set_name in ["train", "valid", "test"]:
            set_path = os.path.join(path, set_name)
            os.makedirs(set_path, exist_ok=True)
            # Select the in-memory dataset for this split; the test loader
            # must be generated on demand before it can be read.
            if set_name == "train":
                dataset = self.dataset.train_dataset
            elif set_name == "valid":
                dataset = self.dataset.valid_datasets["{}-valid".format(dataset_name)]
            elif set_name == "test":
                self.dataset.generate_test_loader("{}-test".format(dataset_name), [(dataset_name, "test"), ])
                dataset = self.dataset.test_datasets["{}-test".format(dataset_name)]

            samples = list()
            for sample in dataset.samples:
                for line_label in sample["label"].split("\n"):
                    # Cap each synthetic line image at 100 characters.
                    for chunk in [line_label[i:i+100] for i in range(0, len(line_label), 100)]:
                        charset = charset.union(set(chunk))
                        if len(chunk) > 0:
                            samples.append({
                                "path": sample["path"],
                                "label": chunk,
                                "nb_cols": 1,
                            })

            for i, sample in enumerate(samples):
                # Keep the original file extension for the rendered image.
                ext = sample['path'].split(".")[-1]
                img_name = "{}_{}.{}".format(set_name, i, ext)
                img_path = os.path.join(set_path, img_name)
                img = dataset.generate_typed_text_line_image(sample["label"])
                Image.fromarray(img).save(img_path)
                gt[set_name][img_name] = {
                    "text": sample["label"],
                    "nb_cols": sample["nb_cols"] if "nb_cols" in sample else 1
                }
                if "line_label" in sample:
                    gt[set_name][img_name]["lines"] = sample["line_label"]

        with open(os.path.join(path, "labels.pkl"), "wb") as f:
            pickle.dump({
                "ground_truth": gt,
                "charset": sorted(list(charset)),
            }, f)
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
from torch.nn import Dropout, Dropout2d
import numpy as np
class DropoutScheduler:
    """
    Schedule the dropout probability of every Dropout/Dropout2d layer found
    (recursively) inside a dict of models.

    Each layer is stored with its initial probability; update_dropout_rate()
    recomputes the live probability as function(initial_p, step_num, T).
    """

    def __init__(self, models, function, T=1e5):
        """
        :param models: dict mapping model name -> torch Module
        :param function: callable (initial_p, step_num, T) -> new probability
        :param T: number of gradient updates to converge
        """
        # Pairs of [dropout module, initial dropout probability].
        self.teta_list = list()
        self.init_teta_list(models)
        self.function = function
        self.T = T
        self.step_num = 0

    def step(self, num=1):
        """
        Advance the schedule by num gradient updates (default 1).

        Fix: the class previously defined step() twice — the zero-argument
        overload was shadowed by the second definition (and its body,
        self.step(1), would have recursed had it survived). A single method
        with a default argument supports both call styles.
        """
        self.step_num += num

    def init_teta_list(self, models):
        """Collect every dropout layer from all given models."""
        for model_name in models.keys():
            self.init_teta_list_module(models[model_name])

    def init_teta_list_module(self, module):
        """Recursively record (layer, initial p) for Dropout/Dropout2d children."""
        for child in module.children():
            if isinstance(child, (Dropout, Dropout2d)):
                self.teta_list.append([child, child.p])
            else:
                self.init_teta_list_module(child)

    def update_dropout_rate(self):
        """Apply the scheduling function to every recorded dropout layer."""
        for (module, p) in self.teta_list:
            module.p = self.function(p, self.step_num, self.T)
def exponential_dropout_scheduler(dropout_rate, step, max_step):
    """Ramp dropout from 0 toward dropout_rate following 1 - exp(-10*step/max_step)."""
    ramp = 1 - np.exp(-10 * step / max_step)
    return dropout_rate * ramp
def exponential_scheduler(init_value, end_value, step, max_step):
    """Exponentially anneal from init_value toward end_value; step is clamped to max_step - 1."""
    step = min(step, max_step - 1)
    decay = 1 - np.exp(-10 * step / max_step)
    return init_value - (init_value - end_value) * decay
def linear_scheduler(init_value, end_value, step, max_step):
    """Linearly interpolate from init_value at step 0 to end_value at max_step."""
    span = end_value - init_value
    return init_value + step * span / max_step
\ No newline at end of file
import numpy as np
import torch
from torch.distributions.uniform import Uniform
import cv2
def randint(low, high):
    """
    Draw an integer uniformly from [low, high) via torch.randint so that
    randomness stays consistent across dataloader workers.
    """
    sample = torch.randint(low, high, (1,))
    return int(sample)
def rand():
    """
    Draw a float uniformly from [0, 1) via torch.rand so that randomness
    stays consistent across dataloader workers.
    """
    sample = torch.rand((1,))
    return float(sample)
def rand_uniform(low, high):
    """
    Draw a float uniformly from [low, high) via torch's Uniform distribution
    so that randomness stays consistent across dataloader workers.
    """
    distribution = Uniform(low, high)
    return float(distribution.sample())
def pad_sequences_1D(data, padding_value):
    """
    Right-pad every 1D sequence in data with padding_value so that all rows
    reach the length of the longest sequence; returns a 2D numpy array.
    """
    lengths = [len(seq) for seq in data]
    width = max(lengths)
    padded = np.ones((len(data), width)).astype(np.int32) * padding_value
    for row, (seq, seq_len) in enumerate(zip(data, lengths)):
        padded[row, :seq_len] = seq[:seq_len]
    return padded
def resize_max(img, max_width=None, max_height=None):
    """
    Downscale img (H, W, C numpy array) so that its width fits max_width and
    then its height fits max_height, preserving aspect ratio at each pass.
    Images already within bounds are returned unchanged.
    """
    # Width constraint first, then height, matching the axis/limit pairs.
    for axis, limit in ((1, max_width), (0, max_height)):
        if limit is not None and img.shape[axis] > limit:
            scale = limit / img.shape[axis]
            new_h = int(np.floor(scale * img.shape[0]))
            new_w = int(np.floor(scale * img.shape[1]))
            img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
    return img
def pad_images(data, padding_value, padding_mode="br"):
    """
    Pad a list of (H, W, C) numpy arrays to a common canvas size.

    padding_mode:
        "br"     anchor each image top-left (pad bottom/right)
        "tl"     anchor each image bottom-right (pad top/left)
        "random" place each image at a random offset inside the canvas
    """
    heights = [img.shape[0] for img in data]
    widths = [img.shape[1] for img in data]
    max_h = max(heights)
    max_w = max(widths)
    canvas = np.ones((len(data), max_h, max_w, data[0].shape[2])) * padding_value
    for i, (h, w) in enumerate(zip(heights, widths)):
        if padding_mode == "br":
            canvas[i, :h, :w, ...] = data[i]
        elif padding_mode == "tl":
            canvas[i, -h:, -w:, ...] = data[i]
        elif padding_mode == "random":
            max_dy = max_h - h
            max_dx = max_w - w
            dy = randint(0, max_dy) if max_dy >= 1 else 0
            dx = randint(0, max_dx) if max_dx >= 1 else 0
            canvas[i, dy:dy + h, dx:dx + w, ...] = data[i]
        else:
            raise NotImplementedError("Undefined padding mode: {}".format(padding_mode))
    return canvas
def pad_image(image, padding_value, new_height=None, new_width=None, pad_width=None, pad_height=None, padding_mode="br", return_position=False):
    """
    Pad a single (H, W, C) image with padding_value.

    The amount of padding is given either as absolute pad_width/pad_height or
    as target new_width/new_height (mutually exclusive per axis).
    padding_mode "br"/"tl"/"random" controls where the original pixels land.
    When return_position is True, also returns [[y0, y1], [x0, x1]] locating
    the original image inside the output.
    """
    if pad_width is not None and new_width is not None:
        raise NotImplementedError("pad_with and new_width are not compatible")
    if pad_height is not None and new_height is not None:
        raise NotImplementedError("pad_height and new_height are not compatible")
    h, w, c = image.shape
    if pad_width is None:
        pad_width = max(0, new_width - w) if new_width is not None else 0
    if pad_height is None:
        pad_height = max(0, new_height - h) if new_height is not None else 0
    if pad_width == 0 and pad_height == 0:
        # Nothing to do: return the input untouched.
        hi = wi = 0
        output = image
    else:
        canvas = np.ones((h + pad_height, w + pad_width, c)) * padding_value
        if padding_mode == "br":
            hi, wi = 0, 0
        elif padding_mode == "tl":
            hi, wi = pad_height, pad_width
        elif padding_mode == "random":
            hi = randint(0, pad_height) if pad_height >= 1 else 0
            wi = randint(0, pad_width) if pad_width >= 1 else 0
        else:
            raise NotImplementedError("Undefined padding mode: {}".format(padding_mode))
        canvas[hi:hi + h, wi:wi + w, ...] = image
        output = canvas
    if return_position:
        return output, [[hi, hi + h], [wi, wi + w]]
    return output
def pad_image_width_right(img, new_width, padding_value):
    """
    Pad img on the right side with padding_value to reach new_width columns;
    images already wide enough are returned with no extra columns.
    """
    h, w, c = img.shape
    missing = max(new_width - w, 0)
    filler = np.ones((h, missing, c), dtype=img.dtype) * padding_value
    return np.concatenate([img, filler], axis=1)
def pad_image_width_left(img, new_width, padding_value):
    """
    Pad img on the left side with padding_value to reach new_width columns;
    images already wide enough are returned with no extra columns.
    """
    h, w, c = img.shape
    missing = max(new_width - w, 0)
    filler = np.ones((h, missing, c), dtype=img.dtype) * padding_value
    return np.concatenate([filler, img], axis=1)
def pad_image_width_random(img, new_width, padding_value, max_pad_left_ratio=1):
    """
    Pad img horizontally to new_width, splitting the padding randomly between
    the left and right sides; max_pad_left_ratio caps the share that may go
    on the left.
    """
    h, w, c = img.shape
    total_pad = max(new_width - w, 0)
    left_cap = int(max_pad_left_ratio * total_pad)
    if total_pad != 0 and left_cap > 0:
        left_amount = randint(0, min(total_pad, left_cap))
    else:
        left_amount = 0
    right_amount = total_pad - left_amount
    left_block = np.ones((h, left_amount, c), dtype=img.dtype) * padding_value
    right_block = np.ones((h, right_amount, c), dtype=img.dtype) * padding_value
    return np.concatenate([left_block, img, right_block], axis=1)
def pad_image_height_random(img, new_height, padding_value, max_pad_top_ratio=1):
    """
    Pad img vertically to new_height, splitting the padding randomly between
    the top and bottom sides; max_pad_top_ratio caps the share that may go
    on top.
    """
    h, w, c = img.shape
    total_pad = max(new_height - h, 0)
    top_cap = int(max_pad_top_ratio * total_pad)
    if total_pad != 0 and top_cap > 0:
        top_amount = randint(0, min(total_pad, top_cap))
    else:
        top_amount = 0
    bottom_amount = total_pad - top_amount
    top_block = np.ones((top_amount, w, c), dtype=img.dtype) * padding_value
    bottom_block = np.ones((bottom_amount, w, c), dtype=img.dtype) * padding_value
    return np.concatenate([top_block, img, bottom_block], axis=0)
def pad_image_height_bottom(img, new_height, padding_value):
    """
    Pad img on the bottom side with padding_value to reach new_height rows;
    images already tall enough are returned with no extra rows.

    Fix: the padding block now carries dtype=img.dtype, matching every other
    pad_image_* helper in this module; previously the implicit float64
    padding silently promoted integer images to float64 on concatenation.
    """
    h, w, c = img.shape
    pad_height = max((new_height - h), 0)
    pad_bottom = np.ones((pad_height, w, c), dtype=img.dtype) * padding_value
    img = np.concatenate([img, pad_bottom], axis=0)
    return img
arkindex-client==1.0.11
editdistance==0.6.0
fontTools==4.29.1
imageio==2.16.0
networkx==2.6.3
tensorboard==0.2.1
torchvision==0.12.0
tqdm==4.62.3
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment