# -*- coding: utf-8 -*-
"""
Data extraction
"""

import uuid
from pathlib import Path

from teklia_toolbox.config import ConfigParser

from atr_data_generator.arguments import CommonArgs
from atr_data_generator.extract.arguments import (
    ExtractionMode,
    FilterArgs,
    ImageArgs,
    SelectArgs,
    Style,
    TranscriptionType,
)
from atr_data_generator.extract.main import main


def _float(value):
    """Return ``float(value)``, or ``None`` when ``value`` is ``None``."""
    return None if value is None else float(value)


def _style(value):
    """Return ``Style(value)``, or ``None`` when ``value`` is ``None``."""
    return None if value is None else Style(value)


def _register_common(root):
    """Declare the ``common`` section and its options on ``root``."""
    section = root.add_subparser("common")
    # These two options are declared without a default value.
    section.add_option("dataset_name", type=str)
    section.add_option("output_dir", type=Path)
    section.add_option("cache_dir", type=Path, default=Path(".cache"))
    section.add_option("log_parameters", type=bool, default=True)


def _register_image(root):
    """Declare the ``image`` section and its nested ``scale`` section."""
    # NOTE(review): ``default={}`` presumably makes the section optional in
    # the configuration file -- confirm against ConfigParser.
    section = root.add_subparser("image", default={})
    section.add_option(
        "extraction_mode",
        type=ExtractionMode,
        default=ExtractionMode.deskew_min_area_rect,
    )
    for option_name, caster, fallback in (
        ("max_deskew_angle", int, 45),
        ("skew_angle", int, 0),
        ("should_rotate", bool, False),
        ("grayscale", bool, True),
    ):
        section.add_option(option_name, type=caster, default=fallback)

    # The scale factors all share the None-tolerant float converter.
    scaling = section.add_subparser("scale", default={})
    for option_name in ("x", "y_top", "y_bottom"):
        scaling.add_option(option_name, type=_float, default=None)


def _register_filters(root):
    """Declare the ``filter`` section and its options on ``root``."""
    section = root.add_subparser("filter")
    # Declared without a default value.
    section.add_option("transcription_type", type=TranscriptionType)
    # A new empty list is built for each option on every iteration.
    for option_name, caster in (
        ("ignored_classes", str),
        ("accepted_classes", str),
        ("accepted_worker_version_ids", uuid.UUID),
    ):
        section.add_option(option_name, type=caster, many=True, default=[])
    section.add_option("style", type=_style, default=None)
    section.add_option("skip_vertical_lines", type=bool, default=False)
    section.add_option("accepted_metadatas", type=dict, default={})
    section.add_option("filter_parent_metadatas", type=bool, default=False)


def _register_select(root):
    """Declare the ``select`` section and its options on ``root``."""
    section = root.add_subparser("select", default={})
    for option_name in ("corpora", "folders"):
        section.add_option(option_name, type=uuid.UUID, many=True, default=[])
    for option_name in ("parent_type", "element_type"):
        section.add_option(option_name, type=str, default=None)


def get_parser():
    """
    Build the configuration parser used by the extraction command.

    A new ``ConfigParser`` is created on every call and populated with four
    sections, in this order: ``common``, ``image`` (which nests ``scale``),
    ``filter`` and ``select``.

    :return: The populated ``ConfigParser`` instance.
    """
    root = ConfigParser()
    for register in (
        _register_common,
        _register_image,
        _register_filters,
        _register_select,
    ):
        register(root)
    return root


def config_parser(configuration_path: Path):
    """
    Parse a configuration file into the extraction argument objects.

    :param configuration_path: Path handed to ``ConfigParser.parse``.
    :return: A dict with the keys ``common`` (``CommonArgs``), ``image``
        (``ImageArgs``), ``filters`` (``FilterArgs``) and ``select``
        (``SelectArgs``). The ``filters`` entry is built from the ``filter``
        section of the configuration.
    """
    parsed = get_parser().parse(configuration_path)
    return {
        "common": CommonArgs(**parsed["common"]),
        "image": ImageArgs(**parsed["image"]),
        "filters": FilterArgs(**parsed["filter"]),
        "select": SelectArgs(**parsed["select"]),
    }


def add_extract_subparser(subcommands):
    """
    Register the ``extract`` sub-command on ``subcommands``.

    The module docstring is reused as both the description and the help text.
    The sub-command takes a ``--config`` path and sets ``func`` and
    ``config_parser`` as parser defaults.

    :param subcommands: Object exposing ``add_parser``; presumably the result
        of ``argparse.ArgumentParser.add_subparsers()`` -- verify at the
        call site.
    """
    extract = subcommands.add_parser("extract", description=__doc__, help=__doc__)
    extract.add_argument("--config", type=Path, help="Configuration file")
    extract.set_defaults(func=main, config_parser=config_parser)