Skip to content
Snippets Groups Projects
Verified Commit 61013078 authored by Yoann Schneider's avatar Yoann Schneider :tennis:
Browse files

Do not load dataset information at instantiation

parent 97a9ba29
No related branches found
No related tags found
1 merge request: !453 Do not load dataset information at instantiation
......@@ -46,7 +46,7 @@ class ImageDownloader:
def __init__(
self,
output: Path | None = None,
output: Path,
max_width: int | None = None,
max_height: int | None = None,
image_extension: str = "",
......@@ -57,9 +57,15 @@ class ImageDownloader:
self.max_width = max_width
self.max_height = max_height
self.image_extension = image_extension
self.data: Dict = defaultdict(dict)
self.unknown_token = unknown_token
def load_split_data(self):
"""
Load the dataset stored in `split.json` and initializes the charset.
"""
# Load split file
split_file = self.output / "split.json" if self.output else None
split_file = self.output / "split.json"
self.split: Dict = (
json.loads(split_file.read_text())
if split_file and split_file.is_file()
......@@ -68,9 +74,8 @@ class ImageDownloader:
# Create directories
for split_name in self.split:
(output / IMAGES_DIR / split_name).mkdir(parents=True, exist_ok=True)
(self.output / IMAGES_DIR / split_name).mkdir(parents=True, exist_ok=True)
self.data: Dict = defaultdict(dict)
self.charset = set(
chain.from_iterable(
split_data["text"] for split_data in self.split[TRAIN_NAME].values()
......@@ -78,7 +83,6 @@ class ImageDownloader:
)
# Add unknown token to charset
self.unknown_token = unknown_token
self.charset.add(self.unknown_token)
def check_extraction(self, values: dict) -> str | None:
......@@ -283,6 +287,7 @@ class ImageDownloader:
a mapping of the images that have been correctly uploaded (identified by its path)
to the ground-truth transcription (with NER tokens if needed).
"""
self.load_split_data()
tasks: List[Dict[str, str]] = self.build_tasks()
self.download_images(tasks)
self.export()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment