From 64c5d8d80f19dbe3c5bfda3cb12fcbc4a1423b1e Mon Sep 17 00:00:00 2001 From: Yoann Schneider <yschneider@teklia.com> Date: Tue, 19 Mar 2024 14:41:02 +0000 Subject: [PATCH] Do not filter dataset elements --- atr_data_generator/extract/__init__.py | 4 ++-- atr_data_generator/extract/arguments.py | 3 +++ atr_data_generator/extract/base.py | 4 +--- atr_data_generator/extract/db.py | 9 ++------- docs/extract/configuration.md | 2 +- 5 files changed, 9 insertions(+), 13 deletions(-) diff --git a/atr_data_generator/extract/__init__.py b/atr_data_generator/extract/__init__.py index f45e448..9caac84 100644 --- a/atr_data_generator/extract/__init__.py +++ b/atr_data_generator/extract/__init__.py @@ -69,8 +69,8 @@ def get_parser(): filters.add_option("skip_vertical_lines", type=bool, default=False) # Select - select = parser.add_subparser("select", default={}) - select.add_option("dataset", type=str, default=None) + select = parser.add_subparser("select") + select.add_option("dataset", type=str) select.add_option("element_type", type=str, default=None) # Format specific diff --git a/atr_data_generator/extract/arguments.py b/atr_data_generator/extract/arguments.py index 9b91f1e..c70f0f3 100644 --- a/atr_data_generator/extract/arguments.py +++ b/atr_data_generator/extract/arguments.py @@ -53,6 +53,9 @@ class SelectArgs(BaseArgs): def __post_init__(self): assert UUID(self.dataset) + # Configuration parser issue: https://gitlab.teklia.com/tools/python-toolbox/-/issues/2 + if self.element_type == "None": + self.element_type = None @dataclass diff --git a/atr_data_generator/extract/base.py b/atr_data_generator/extract/base.py index 4c67105..0e3a402 100644 --- a/atr_data_generator/extract/base.py +++ b/atr_data_generator/extract/base.py @@ -181,9 +181,7 @@ class DataGenerator: # Iterate over sets for split in dataset.sets.split(","): # Find the dataset elements - for parent in get_dataset_elements( - dataset, split, self.select.element_type - ): + for parent in get_dataset_elements(dataset, split): self.process_parent(parent.element, split) assert sum( diff --git a/atr_data_generator/extract/db.py b/atr_data_generator/extract/db.py index 20f8d1b..5675b7f 100644 --- a/atr_data_generator/extract/db.py +++ b/atr_data_generator/extract/db.py @@ -8,16 +8,15 @@ from arkindex_export.queries import list_children from atr_data_generator.extract.arguments import MANUAL -def get_dataset_elements(dataset: Dataset, split: str, type: Optional[str]): +def get_dataset_elements(dataset: Dataset, split: str): """ Retrieve dataset elements in a specific split from an SQLite export of an Arkindex corpus :param dataset: Dataset object from which the elements come. :param split: Set name of the dataset to use. - :param type: Optionally filter by element type. :return: The filtered list of dataset elements. """ - query = ( + return ( DatasetElement.select(DatasetElement.element) .join(Element) .where( @@ -25,10 +24,6 @@ def get_dataset_elements(dataset: Dataset, split: str, type: Optional[str]): DatasetElement.set_name == split, ) ) - if type: - query = query.where(Element.type == type) - - return query def parse_sources(sources: List[str]): diff --git a/docs/extract/configuration.md b/docs/extract/configuration.md index 6358d9f..152c125 100644 --- a/docs/extract/configuration.md +++ b/docs/extract/configuration.md @@ -5,7 +5,7 @@ The YAML configuration for the `extract` subcommand has 5 sections: - `common`, - `image` (optional), - `filter` (optional), -- `select` (optional). +- `select`. An example configuration file, filled with the default values when there is one, is available at `examples/extraction.yml`. -- GitLab