Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# -*- coding: utf-8 -*-
"""
Data extraction
"""
import uuid
from pathlib import Path
from atr_data_generator.arguments import CommonArgs
from atr_data_generator.extract.arguments import (
ExtractionMode,
FilterArgs,
ImageArgs,
SelectArgs,
Style,
TranscriptionType,
)
from atr_data_generator.extract.main import main
from teklia_toolbox.config import ConfigParser
def _float(value):
    """Convert an optional configuration value to ``float``.

    ``None`` is passed through unchanged so that an unset option stays unset.
    Any other value is handed to the built-in ``float`` constructor, which
    raises ``ValueError`` or ``TypeError`` when it cannot convert its input.
    """
    if value is None:
        return None
    return float(value)
def _style(value):
    """Convert an optional configuration value to a ``Style``.

    ``None`` is passed through unchanged so that an unset option stays unset.
    Any other value is passed to ``Style`` (imported from
    ``atr_data_generator.extract.arguments``; presumably an enum looked up by
    value -- confirm against its definition).
    """
    if value is None:
        return None
    return Style(value)
def get_parser():
    """Build the ``ConfigParser`` describing the extraction configuration.

    The parser declares four top-level sections -- ``common``, ``image``,
    ``filter`` and ``select`` -- plus a ``scale`` section nested inside
    ``image``.

    NOTE(review): ``image``, ``image.scale`` and ``select`` are registered with
    ``default={}``, which presumably makes those sections optional, while
    ``common`` and ``filter`` (and the options declared without a default)
    are presumably mandatory -- confirm against ``teklia_toolbox.config``.

    Returns:
        The configured ``ConfigParser`` instance; a new one on every call.
    """
    parser = ConfigParser()

    # Common arguments
    common = parser.add_subparser("common")
    common.add_option("dataset_name", type=str)
    common.add_option("output_dir", type=Path)
    common.add_option("cache_dir", type=Path, default=Path(".cache"))
    common.add_option("log_parameters", type=bool, default=True)

    # Image arguments
    image = parser.add_subparser("image", default={})
    image.add_option(
        "extraction_mode",
        type=ExtractionMode,
        default=ExtractionMode.deskew_min_area_rect,
    )
    image.add_option("max_deskew_angle", type=int, default=45)
    image.add_option("skew_angle", type=int, default=0)
    image.add_option("should_rotate", type=bool, default=False)
    image.add_option("grayscale", type=bool, default=True)

    # Scaling factors live in their own sub-section of ``image``; ``_float``
    # is used instead of ``float`` so that a ``None`` value is kept as-is.
    scale = image.add_subparser("scale", default={})
    scale.add_option("x", type=_float, default=None)
    scale.add_option("y_top", type=_float, default=None)
    scale.add_option("y_bottom", type=_float, default=None)

    # Filters
    filters = parser.add_subparser("filter")
    filters.add_option("transcription_type", type=TranscriptionType)
    filters.add_option("ignored_classes", type=str, many=True, default=[])
    filters.add_option("accepted_classes", type=str, many=True, default=[])
    filters.add_option(
        "accepted_worker_version_ids", type=uuid.UUID, many=True, default=[]
    )
    # ``_style`` keeps a ``None`` value as-is instead of passing it to ``Style``.
    filters.add_option("style", type=_style, default=None)
    filters.add_option("skip_vertical_lines", type=bool, default=False)
    filters.add_option("accepted_metadatas", type=dict, default={})
    filters.add_option("filter_parent_metadatas", type=bool, default=False)

    # Select
    select = parser.add_subparser("select", default={})
    select.add_option("corpora", type=uuid.UUID, many=True, default=[])
    select.add_option("folders", type=uuid.UUID, many=True, default=[])
    select.add_option("parent_type", type=str, default=None)
    select.add_option("element_type", type=str, default=None)

    return parser
def config_parser(configuration_path: Path):
    """Parse a configuration file into the extraction argument objects.

    The file is parsed with the parser built by ``get_parser`` and each
    top-level section is unpacked as keyword arguments into its argument
    class.

    Args:
        configuration_path: Path of the configuration file to parse.

    Returns:
        A dict with the keys:

        - ``"common"``: ``CommonArgs``
        - ``"image"``: ``ImageArgs``
        - ``"filters"``: ``FilterArgs``
        - ``"select"``: ``SelectArgs``
    """
    config_data = get_parser().parse(configuration_path)

    common = CommonArgs(**config_data["common"])
    image = ImageArgs(**config_data["image"])
    # The configuration section is named "filter" (singular) while the
    # returned key is "filters" (plural).
    filters = FilterArgs(**config_data["filter"])
    select = SelectArgs(**config_data["select"])

    return {"common": common, "image": image, "filters": filters, "select": select}
def add_extract_subparser(subcommands):
    """Register the ``extract`` sub-command on ``subcommands``.

    The sub-command uses this module's docstring as both its description and
    its help text, and accepts a ``--config`` option converted to a ``Path``.
    Its defaults carry ``func`` (the extraction ``main``) and
    ``config_parser`` (this module's ``config_parser``).

    Args:
        subcommands: Object exposing ``add_parser`` -- presumably the action
            returned by ``argparse.ArgumentParser.add_subparsers()``; confirm
            against the caller.
    """
    parser = subcommands.add_parser(
        "extract",
        description=__doc__,
        help=__doc__,
    )
    parser.add_argument("--config", type=Path, help="Configuration file")
    parser.set_defaults(func=main, config_parser=config_parser)