From 64c5d8d80f19dbe3c5bfda3cb12fcbc4a1423b1e Mon Sep 17 00:00:00 2001
From: Yoann Schneider <yschneider@teklia.com>
Date: Tue, 19 Mar 2024 14:41:02 +0000
Subject: [PATCH] Do not filter dataset elements by element type

---
 atr_data_generator/extract/__init__.py  | 4 ++--
 atr_data_generator/extract/arguments.py | 3 +++
 atr_data_generator/extract/base.py      | 4 +---
 atr_data_generator/extract/db.py        | 9 ++-------
 docs/extract/configuration.md           | 2 +-
 5 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/atr_data_generator/extract/__init__.py b/atr_data_generator/extract/__init__.py
index f45e448..9caac84 100644
--- a/atr_data_generator/extract/__init__.py
+++ b/atr_data_generator/extract/__init__.py
@@ -69,8 +69,8 @@ def get_parser():
     filters.add_option("skip_vertical_lines", type=bool, default=False)
 
     # Select
-    select = parser.add_subparser("select", default={})
-    select.add_option("dataset", type=str, default=None)
+    select = parser.add_subparser("select")
+    select.add_option("dataset", type=str)
     select.add_option("element_type", type=str, default=None)
 
     # Format specific
diff --git a/atr_data_generator/extract/arguments.py b/atr_data_generator/extract/arguments.py
index 9b91f1e..c70f0f3 100644
--- a/atr_data_generator/extract/arguments.py
+++ b/atr_data_generator/extract/arguments.py
@@ -53,6 +53,9 @@ class SelectArgs(BaseArgs):
 
     def __post_init__(self):
         assert UUID(self.dataset)
+        # Workaround: the configuration parser may yield the string "None" instead of None, see https://gitlab.teklia.com/tools/python-toolbox/-/issues/2
+        if self.element_type == "None":
+            self.element_type = None
 
 
 @dataclass
diff --git a/atr_data_generator/extract/base.py b/atr_data_generator/extract/base.py
index 4c67105..0e3a402 100644
--- a/atr_data_generator/extract/base.py
+++ b/atr_data_generator/extract/base.py
@@ -181,9 +181,7 @@ class DataGenerator:
         # Iterate over sets
         for split in dataset.sets.split(","):
             # Find the dataset elements
-            for parent in get_dataset_elements(
-                dataset, split, self.select.element_type
-            ):
+            for parent in get_dataset_elements(dataset, split):
                 self.process_parent(parent.element, split)
 
         assert sum(
diff --git a/atr_data_generator/extract/db.py b/atr_data_generator/extract/db.py
index 20f8d1b..5675b7f 100644
--- a/atr_data_generator/extract/db.py
+++ b/atr_data_generator/extract/db.py
@@ -8,16 +8,15 @@ from arkindex_export.queries import list_children
 from atr_data_generator.extract.arguments import MANUAL
 
 
-def get_dataset_elements(dataset: Dataset, split: str, type: Optional[str]):
+def get_dataset_elements(dataset: Dataset, split: str):
     """
     Retrieve dataset elements in a specific split from an SQLite export of an Arkindex corpus
 
     :param dataset: Dataset object from which the elements come.
     :param split: Set name of the dataset to use.
-    :param type: Optionally filter by element type.
     :return: The filtered list of dataset elements.
     """
-    query = (
+    return (
         DatasetElement.select(DatasetElement.element)
         .join(Element)
         .where(
@@ -25,10 +24,6 @@ def get_dataset_elements(dataset: Dataset, split: str, type: Optional[str]):
             DatasetElement.set_name == split,
         )
     )
-    if type:
-        query = query.where(Element.type == type)
-
-    return query
 
 
 def parse_sources(sources: List[str]):
diff --git a/docs/extract/configuration.md b/docs/extract/configuration.md
index 6358d9f..152c125 100644
--- a/docs/extract/configuration.md
+++ b/docs/extract/configuration.md
@@ -5,7 +5,7 @@ The YAML configuration for the `extract` subcommand has 5 sections:
 - `common`,
 - `image` (optional),
 - `filter` (optional),
-- `select` (optional).
+- `select`.
 
 An example configuration file, filled with the default values when there is one, is available at `examples/extraction.yml`.
 
-- 
GitLab