# -*- coding: utf-8 -*-
import getpass
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Optional

USER = getpass.getuser()


class Style(Enum):
    handwritten: str = "handwritten"
    typewritten: str = "typewritten"
    other: str = "other"


class ExtractionMode(Enum):
    boundingRect: str = "boundingRect"
    min_area_rect: str = "min_area_rect"
    deskew_min_area_rect: str = "deskew_min_area_rect"
    skew_min_area_rect: str = "skew_min_area_rect"
    polygon: str = "polygon"
    skew_polygon: str = "skew_polygon"
    deskew_polygon: str = "deskew_polygon"


class TranscriptionType(Enum):
    word: str = "word"
    text_line: str = "text_line"
    half_subject_line: str = "half_subject_line"
    text_zone: str = "text_zone"
    paragraph: str = "paragraph"
    act: str = "act"
    page: str = "page"
    text: str = "text"


@dataclass
class SelectArgs:
    """
    Arguments to select elements from Arkindex.

    Args:
        corpora (list): List of corpus ids to be used.
        volumes (list): List of volume ids to be used.
        folders (list): List of folder ids to be used. Elements of `volume_type`
            will be searched recursively in these folders.
        pages (list): List of page ids to be used.
        selection (bool): Get elements from the Arkindex selection.
        volume_type (str): Volumes (one level above pages) may have a different
            type name depending on the corpus.
    """

    corpora: Optional[List[str]] = field(default_factory=list)
    volumes: Optional[List[str]] = field(default_factory=list)
    folders: Optional[List[str]] = field(default_factory=list)
    pages: Optional[List[str]] = field(default_factory=list)
    selection: bool = False
    volume_type: str = "volume"


@dataclass
class CommonArgs:
    """
    General arguments.

    Args:
        cache_dir (str): Cache directory where the full-size downloaded images are saved.
        log_parameters (bool): Save all parameters to a JSON file.
    """

    cache_dir: str = f"/tmp/kaldi_data_generator_{USER}/cache/"
    log_parameters: bool = True


@dataclass
class SplitArgs:
    """
    Arguments related to splitting the data into training, validation and test subsets.

    Args:
        train_ratio (float): Ratio of data to be used in the training set.
            Should be between 0 and 1.
        test_ratio (float): Ratio of data to be used in the test set.
            Should be between 0 and 1.
        val_ratio (float): Ratio of data to be used in the validation set.
            The sum of the three ratios should equal 1.
        use_existing_split (bool): Use an existing split instead of a random one.
            Expects line ids to be prefixed with train, val or test.
        split_only (bool): Create the split from already downloaded lines;
            do not download the lines.
        no_split (bool): Do not split the data; just download the lines in the
            right format.
    """

    train_ratio: float = 0.8
    test_ratio: float = 0.1
    val_ratio: float = 1 - train_ratio - test_ratio
    use_existing_split: bool = False
    split_only: bool = False
    no_split: bool = False


@dataclass
class ImageArgs:
    """
    Arguments related to image transformation.

    Args:
        extraction_mode: Mode used to extract the line images:
            {[e.name for e in ExtractionMode]}.
        max_deskew_angle: Maximum angle by which deskewing is allowed to rotate
            the line image. If the angle determined by the deskew tool is larger
            than this maximum, the line will not be deskewed/rotated.
        skew_angle: Angle by which the line image will be rotated. Useful for
            data augmentation: creating skewed text lines makes the model more
            robust. Only used with skew_* extraction modes.
        should_rotate (bool): Use the text line rotation class to rotate lines
            if possible.
        grayscale (bool): Convert images to grayscale (enabled by default).
        scale_x (float): Ratio by which to scale the polygon horizontally
            (1.0 means no rescaling).
        scale_y_top (float): Ratio by which to scale the polygon vertically at
            the top (1.0 means no rescaling).
        scale_y_bottom (float): Ratio by which to scale the polygon vertically at
            the bottom (1.0 means no rescaling).
    """

    extraction_mode: ExtractionMode = ExtractionMode.deskew_min_area_rect
    max_deskew_angle: int = 45
    skew_angle: int = 0
    should_rotate: bool = False
    grayscale: bool = True
    scale_x: Optional[float] = None
    scale_y_top: Optional[float] = None
    scale_y_bottom: Optional[float] = None


@dataclass
class FilterArgs:
    """
    Arguments related to element filtering.

    Args:
        transcription_type: Type of elements whose transcriptions are used
            (page, paragraph, text_line, etc.).
        ignored_classes: List of ignored ml_class names. Filter lines by class.
        accepted_classes: List of accepted ml_class names. Filter lines by class.
        accepted_worker_version_ids: List of accepted worker version ids.
            Filter transcriptions by worker version ids. The order is important:
            at most one transcription is chosen per element (text_line), and the
            worker version order defines the precedence. If a transcription
            exists for the first worker version it is chosen, otherwise the next
            worker version is tried.
            Use `--accepted_worker_version_ids manual` to get only manual
            transcriptions.
        skip_vertical_lines: Skip vertical text lines.
        style: Filter line images by style class. 'other' corresponds to line
            elements that have neither the handwritten nor the typewritten
            class: {[s.name for s in Style]}
        accepted_metadatas: Key-value dictionary where each entry is a mandatory
            Arkindex metadata name/value. Filter lines by metadata.
        filter_parent_metadatas: Also consider the metadata of parent elements
            when filtering by metadata.
    """

    transcription_type: TranscriptionType = TranscriptionType.text_line
    ignored_classes: List[str] = field(default_factory=list)
    accepted_classes: List[str] = field(default_factory=list)
    accepted_worker_version_ids: List[str] = field(default_factory=list)
    skip_vertical_lines: bool = False
    style: Optional[Style] = None
    accepted_metadatas: dict = field(default_factory=dict)
    filter_parent_metadatas: bool = False
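

# The block below is an illustrative usage sketch, not part of the library API:
# it shows how these configuration dataclasses can be combined. The corpus id is
# a made-up placeholder, and "manual" follows the docstring note about selecting
# manual transcriptions.
if __name__ == "__main__":
    select = SelectArgs(corpora=["11111111-1111-1111-1111-111111111111"])
    split = SplitArgs(train_ratio=0.7, test_ratio=0.2)
    image = ImageArgs(extraction_mode=ExtractionMode.polygon, grayscale=False)
    filters = FilterArgs(
        transcription_type=TranscriptionType.text_line,
        accepted_worker_version_ids=["manual"],
    )

    # Note: the default val_ratio is computed once from the class-level defaults
    # (1 - 0.8 - 0.1), so it is not recomputed when train/test ratios are
    # overridden; derive it explicitly in that case.
    val_ratio = 1 - split.train_ratio - split.test_ratio
    print(select, split, image, filters, f"val_ratio={val_ratio:.2f}", sep="\n")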