Skip to content
Snippets Groups Projects
Unverified Commit 5d8a8cf8 authored by Yoann Schneider's avatar Yoann Schneider :tennis:
Browse files

Remove obsolete format module

parent 1392f95f
No related branches found
No related tags found
1 merge request: !272 "Remove obsolete format module"
# -*- coding: utf-8 -*-
import json
import pickle
import re
from collections import defaultdict
from pathlib import Path
from tqdm import tqdm
def remove_spaces(text: str) -> str:
    """
    Normalize the spacing of a transcription.

    Leading and trailing whitespace is stripped, then every run of spaces
    and/or tabs is replaced by a single regular space. Other whitespace
    characters located inside the text (e.g. newlines) are left untouched.

    :param text: Raw text to clean.
    :return: The text with normalized spacing.
    """
    # A single pass over "[ \t]+" is equivalent to first turning tabs into
    # spaces and then squeezing consecutive spaces.
    return re.sub(r"[ \t]+", " ", text.strip())
class ATRDatasetFormatter:
    """
    Global pipeline/functions for dataset formatting

    The formatter reads one label file per sample from
    ``<dataset>/labels/<set_name>/`` and associates it with the image of the
    same stem in ``<dataset>/images/<set_name>/``. Running it writes
    ``labels.json`` and ``charset.pkl`` at the root of the dataset folder.
    """

    def __init__(self, dataset: Path, image_format: str, remove_spaces: bool):
        """
        :param dataset: Root folder of the dataset (a ``str`` is accepted too).
        :param image_format: Image extension, with or without a leading dot.
        :param remove_spaces: Whether the spacing of labels must be normalized.
        """
        # Coerce to Path so that the "/" operator below also works when the
        # caller hands over a plain string.
        self.dataset = Path(dataset)
        self.set_names = ["train", "val", "test"]
        self.remove_spaces = remove_spaces
        self.image_folder = self.dataset / "images"
        self.labels_folder = self.dataset / "labels"

        # The extension is stored without its leading dot.
        self.image_format = image_format
        if self.image_format.startswith("."):
            self.image_format = self.image_format[1:]

    def format(self):
        """
        Format ATR dataset

        :return: A ``(ground_truth, charset)`` pair. ``ground_truth`` maps each
            set name to ``{image path (str): {"text": label}}`` and ``charset``
            is the set of characters found in the labels.
        """
        ground_truth = defaultdict(dict)
        charset = set()
        for set_name in self.set_names:
            set_folder = self.labels_folder / set_name
            # Sorting makes the processing order deterministic and gives the
            # progress bar a known total.
            label_files = sorted(set_folder.iterdir())
            for file_name in tqdm(label_files, desc="Formatting " + set_name):
                data = self.parse_labels(set_name, file_name)
                charset.update(data["label"])
                # JSON object keys must be strings: a Path key would make
                # json.dumps fail in run().
                ground_truth[set_name][str(data["img_path"])] = {
                    "text": data["label"],
                }
        return ground_truth, charset

    def read_file(self, file_name: Path) -> str:
        """
        Read a label file and return its cleaned content.

        :param file_name: Path of the label file.
        :return: The stripped text, with normalized spacing when
            ``remove_spaces`` is enabled.
        """
        # Explicit encoding: do not depend on the locale of the machine.
        text = file_name.read_text(encoding="utf-8")
        if self.remove_spaces:
            text = remove_spaces(text)
        return text.strip()

    def parse_labels(self, set_name: str, file_name: Path) -> dict:
        """
        Build the entry describing one sample.

        :param set_name: Name of the set the sample belongs to.
        :param file_name: Path of the label file; only its name is used.
        :return: A dict with the resolved image path (``img_path``) and the
            transcription (``label``).
        """
        # Append the extension by hand: Path.with_suffix() rejects a suffix
        # without a leading dot and would cut stems that contain dots.
        image_name = file_name.stem
        if self.image_format:
            image_name = f"{image_name}.{self.image_format}"
        return {
            "img_path": (self.image_folder / set_name / image_name).resolve(),
            "label": self.read_file(self.labels_folder / set_name / file_name.name),
        }

    def run(self) -> None:
        """
        Format the dataset and write ``labels.json`` and ``charset.pkl``.
        """
        ground_truth, charset = self.format()
        (self.dataset / "labels.json").write_text(
            json.dumps(
                ground_truth,
                sort_keys=True,
                indent=4,
            ),
            encoding="utf-8",
        )
        # The charset is stored as a pickled, sorted list of characters.
        (self.dataset / "charset.pkl").write_bytes(pickle.dumps(sorted(charset)))
def run(dataset, image_format, keep_spaces):
    """
    Format an ATR dataset located in ``dataset``.

    :param dataset: Root folder of the dataset.
    :param image_format: Extension of the images.
    :param keep_spaces: When true, the spacing of the labels is left as is.
    """
    # The formatter takes the opposite flag, hence the negation.
    formatter = ATRDatasetFormatter(
        dataset=dataset,
        image_format=image_format,
        remove_spaces=not keep_spaces,
    )
    formatter.run()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment