diff --git a/README.md b/README.md
index 8d4984bc37c45467e09777306843513fd9d75f6f..f68b0a46e3f4bb6f3afc09ede164b4dc668887ca 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,6 @@
 
 This script downloads pages with transcriptions from Arkindex
 and converts data to ATR format.
-It also generates reproducible train, val and test splits.
+It also generates reproducible splits.
 
 A documentation is available at https://atr.pages.teklia.com/data-generator/.
diff --git a/atr_data_generator/extract/__init__.py b/atr_data_generator/extract/__init__.py
index d5af7c91749d057e5236405f2d67945843142756..f45e4484d109c1e676522b9ebafc38d1e16edef9 100644
--- a/atr_data_generator/extract/__init__.py
+++ b/atr_data_generator/extract/__init__.py
@@ -2,7 +2,6 @@
 """
 Data extraction
 """
-import uuid
 from pathlib import Path
 from typing import Optional
 
@@ -20,7 +19,6 @@ from atr_data_generator.extract.base import DataGenerator
 from atr_data_generator.extract.pylaia.arguments import PylaiaArgs
 from atr_data_generator.extract.pylaia.main import PylaiaDataGenerator
 from atr_data_generator.extract.utils import ListedEnum
-from atr_data_generator.split.arguments import SplitArgs
 
 
 class Generators(ListedEnum):
@@ -72,15 +70,9 @@ def get_parser():
 
     # Select
     select = parser.add_subparser("select", default={})
-    select.add_option("parent_type", type=str, default=None)
+    select.add_option("dataset", type=str, default=None)
     select.add_option("element_type", type=str, default=None)
 
-    # Split
-    split = parser.add_subparser("split", default={})
-    split.add_option("train_folder", type=uuid.UUID, default=None)
-    split.add_option("validation_folder", type=uuid.UUID, default=None)
-    split.add_option("test_folder", type=uuid.UUID, default=None)
-
     # Format specific
     # Pylaia
     pylaia = parser.add_subparser("pylaia", default={})
@@ -96,7 +88,6 @@ def config_parser(configuration_path: Path):
     - ImageArgs
     - FilterArgs
     - SelectArgs
-    - SplitArgs
     # Format specific args if provided
     """
     config_data = get_parser().parse(configuration_path)
@@ -105,7 +96,6 @@ def config_parser(configuration_path: Path):
         "image": ImageArgs(**config_data["image"]),
         "filter": FilterArgs(**config_data["filter"]),
         "select": SelectArgs(**config_data["select"]),
-        "split": SplitArgs(**config_data["split"]),
         # Format specific
         "pylaia": PylaiaArgs(**config_data["pylaia"]),
     }
diff --git a/atr_data_generator/extract/arguments.py b/atr_data_generator/extract/arguments.py
index 96790d8f996f377201978621cfe449a074fb55aa..9b91f1eb7b522ecdb9e05c454673c99a6347dc73 100644
--- a/atr_data_generator/extract/arguments.py
+++ b/atr_data_generator/extract/arguments.py
@@ -44,12 +44,15 @@ class SelectArgs(BaseArgs):
     Arguments to select elements from Arkindex
 
     Args:
+        dataset (str): ID (UUID) of the Arkindex dataset to process
         element_type (str): Filter elements to process by type
-        parent_type (str): Filter elements parents to process by type
     """
 
+    dataset: str
     element_type: Optional[str] = None
-    parent_type: Optional[str] = None
+
+    def __post_init__(self):
+        assert UUID(self.dataset)
 
 
 @dataclass
diff --git a/atr_data_generator/extract/base.py b/atr_data_generator/extract/base.py
index 30ee6be4ee8fd0eaf154502c62ef37025700e867..4c671056044b5fa7f97440d0c4256facbf3432b0 100644
--- a/atr_data_generator/extract/base.py
+++ b/atr_data_generator/extract/base.py
@@ -7,7 +7,7 @@ from pathlib import Path
 from typing import Any, Dict
 
 import numpy as np
-from arkindex_export import Element, open_database
+from arkindex_export import Dataset, Element, open_database
 from line_image_extractor.extractor import extract, read_img, save_img
 from line_image_extractor.image_utils import polygon_to_bbox, resize
 from PIL import Image
@@ -15,9 +15,8 @@ from tqdm import tqdm
 
 from atr_data_generator.arguments import CommonArgs
 from atr_data_generator.extract.arguments import FilterArgs, ImageArgs, SelectArgs
-from atr_data_generator.extract.db import get_children, get_children_info
+from atr_data_generator.extract.db import get_children_info, get_dataset_elements
 from atr_data_generator.extract.utils import _is_vertical, resize_image_height
-from atr_data_generator.split.arguments import Partition, SplitArgs
 from atr_data_generator.utils import download_image, export_parameters
 
 logger = logging.getLogger(__name__)
@@ -33,7 +32,6 @@ class DataGenerator:
     image: ImageArgs
     filter: FilterArgs
     select: SelectArgs
-    split: SplitArgs
 
     data: Dict[str, dict] = field(default_factory=lambda: defaultdict(dict))
 
@@ -43,14 +41,12 @@ class DataGenerator:
         image: ImageArgs,
         filter: FilterArgs,
         select: SelectArgs,
-        split: SplitArgs,
         **kwargs,
     ) -> None:
         self.common = common
         self.image = image
         self.filter = filter
         self.select = select
-        self.split = split
         self.data: Dict[str, dict] = defaultdict(dict)
 
     def __post_init__(self):
@@ -87,7 +83,7 @@ class DataGenerator:
 
         return read_img(cached_img_path, self.image.grayscale)
 
-    def parse_transcription(self, transcription: str, **kwargs):
+    def parse_transcription(self, transcription: str, *args, **kwargs):
         return transcription
 
     def get_image(self, child: Element, destination: Path) -> None:
@@ -138,7 +134,7 @@ class DataGenerator:
     def parse_image_path(self, image_path: Path):
         return str(image_path)
 
-    def process_parent(self, parent: Element, partition: Partition):
+    def process_parent(self, parent: Element, split: str):
         """
         Process every children under this parent element.
         """
@@ -148,7 +144,7 @@ class DataGenerator:
                 type=self.select.element_type,
                 sources=self.filter.accepted_worker_version_ids,
             ),
-            desc=f"Extracting data from {parent.type} {parent.name} for split ({partition.value})",
+            desc=f"Extracting data from {parent.type} {parent.name} for split ({split})",
         ):
             image_path = (
                 self.common.output_dir
@@ -156,9 +152,9 @@ class DataGenerator:
                 / f"{parent.id}_{child.element.name.split('_')[-1]}_{child.element_id}.jpg"
             )
             # Store transcription
-            self.data[partition.value][
+            self.data[split][
                 self.parse_image_path(image_path)
-            ] = self.parse_transcription(child.text, partition=partition)
+            ] = self.parse_transcription(child.text, split=split)
 
             # Extract the image
             self.get_image(child.element, image_path)
@@ -173,28 +169,26 @@ class DataGenerator:
 
     def run(self, db_path: Path):
         """
-        Extract data from folders of elements with selected type
+        Extract data from the selected dataset
         """
-        # Either provide folders or an existing split
-        assert (
-            self.split.train_folder
-            or self.split.validation_folder
-            or self.split.test_folder
-        ), "Please provide at least one folder."
-
         self.connect_db(db_path)
 
-        # Iterate over folders
-        for partition, folder_id in zip(Partition, self.split.folders):
-            if folder_id is None:
-                continue
-            # Find the parent elements
-            for parent_element in get_children(folder_id, type=self.select.parent_type):
-                self.process_parent(parent_element, partition)
+        try:
+            dataset = Dataset.get(id=self.select.dataset)
+        except Exception:
+            raise Exception(f"{self.select.dataset} is not a valid dataset ID")
+
+        # Iterate over sets
+        for split in dataset.sets.split(","):
+            # Find the dataset elements
+            for parent in get_dataset_elements(
+                dataset, split, self.select.element_type
+            ):
+                self.process_parent(parent.element, split)
 
         assert sum(
-            len(partition.values()) for partition in self.data.values()
-        ), "No data was extracted from all three train/validation/test folders."
+            len(split.values()) for split in self.data.values()
+        ), "No data was extracted from all the splits."
 
         self.export()
 
@@ -203,5 +197,4 @@ class DataGenerator:
             image=self.image,
             select=self.select,
             filter=self.filter,
-            split=self.split,
         )
diff --git a/atr_data_generator/extract/db.py b/atr_data_generator/extract/db.py
index 6bd29287b26d2f3be24a5e936ef3ecf9eb1606ae..20f8d1b6731b7d7728147b7199e44aeeda7bc6fd 100644
--- a/atr_data_generator/extract/db.py
+++ b/atr_data_generator/extract/db.py
@@ -1,24 +1,34 @@
 # -*- coding: utf-8 -*-
+from operator import attrgetter
 from typing import List, Optional
-from uuid import UUID
 
-from arkindex_export import Element, Transcription
+from arkindex_export import Dataset, DatasetElement, Element, Transcription
 from arkindex_export.queries import list_children
 
 from atr_data_generator.extract.arguments import MANUAL
 
 
-def get_children(parent_id: UUID, type: Optional[str]):
-    """Recursively list children elements.
+def get_dataset_elements(dataset: Dataset, split: str, type: Optional[str]):
+    """
+    Retrieve dataset elements in a specific split from an SQLite export of an Arkindex corpus
 
-    :param parent_id: ID of the parent element.
+    :param dataset: Dataset object from which the elements come.
+    :param split: Name of the dataset set (split) whose elements are retrieved.
     :param type: Optionally filter by element type.
-    :return: The filtered list of children.
+    :return: The filtered list of dataset elements.
     """
-    query = list_children(parent_id)
+    query = (
+        DatasetElement.select(DatasetElement.element)
+        .join(Element)
+        .where(
+            DatasetElement.dataset == dataset,
+            DatasetElement.set_name == split,
+        )
+    )
     if type:
         query = query.where(Element.type == type)
-    return query.order_by(Element.name)
+
+    return query
 
 
 def parse_sources(sources: List[str]):
@@ -56,6 +66,11 @@ def get_children_info(
 
     elements = list_children(parent_id)
 
+    # Also include the parent itself in the query so that it gets processed too
+    elements = Element.select().where(
+        Element.id.in_(list(map(attrgetter("id"), elements)) + [parent_id])
+    )
+
     # Filter by type
     if type:
         elements = elements.where(Element.type == type)
diff --git a/atr_data_generator/extract/pylaia/main.py b/atr_data_generator/extract/pylaia/main.py
index 237beb77a59f690b2f3ec3b2226b0a39b164daf9..9d9fb786c3c1fc46238a1107e973cbc739b79117 100644
--- a/atr_data_generator/extract/pylaia/main.py
+++ b/atr_data_generator/extract/pylaia/main.py
@@ -5,15 +5,18 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Optional
 
+from arkindex_export import Dataset
+
 from atr_data_generator.extract.base import DataGenerator
 from atr_data_generator.extract.pylaia import LM_FILENAME
 from atr_data_generator.extract.pylaia.arguments import PylaiaArgs
 from atr_data_generator.extract.pylaia.syms import Syms
 from atr_data_generator.extract.pylaia.utils import _merge
-from atr_data_generator.split.arguments import Partition
 
 logger = logging.getLogger(__name__)
 
+TRAIN_SPLIT = "train"
+
 
 @dataclass
 class PylaiaDataGenerator(DataGenerator):
@@ -23,7 +26,7 @@ class PylaiaDataGenerator(DataGenerator):
     def __post_init__(self):
         super().__post_init__()
         if self.pylaia.syms_path:
-            # Load pre existing syms for all partitions
+            # Load pre existing syms for all splits
             assert self.pylaia.syms_path.exists()
             self.syms = Syms.from_disk(self.pylaia.syms_path)
             logger.info(f"Loaded symbols from {self.pylaia.syms_path}")
@@ -35,48 +38,47 @@ class PylaiaDataGenerator(DataGenerator):
         """
         return str(image_path.relative_to(image_path.parent))
 
-    def parse_transcription(self, transcription: str, partition: Partition, **kwargs):
+    def parse_transcription(self, transcription: str, split: str, *args, **kwargs):
         return transcription, self.syms.process_line(
-            transcription.strip(), read_only=partition != Partition.Train
+            transcription.strip(), read_only=split != TRAIN_SPLIT
         )
 
-    def export_partition(self, partition):
+    def export_split(self, split):
         """Exports 3 files
-        - <partition_name>.txt, with path_to_image and tokenized transcription
-        - <partition_name>_ids.txt with path_to_image only
-        - text_<partition_name>.txt with path_to_image and not tokenized transcription
+        - <split_name>.txt, with path_to_image and tokenized transcription
+        - <split_name>_ids.txt with path_to_image only
+        - <split_name>_no_space.txt with path_to_image and non-tokenized transcription
 
         The tokenized transcriptions are also exported in a separate file for LM training.
         """
-        data = [(path, *value) for path, value in self.data[partition.value].items()]
+        data = [(path, *value) for path, value in self.data[split].items()]
         if not data:
             return
 
         paths, transcriptions, tokenized = zip(*data)
 
         # Path + tokenized
-        (self.common.output_dir / partition.value).with_suffix(".txt").write_text(
+        (self.common.output_dir / split).with_suffix(".txt").write_text(
             "\n".join(_merge(paths, tokenized))
         )
 
         # Path + not tokenized
-        (self.common.output_dir / f"{partition.value}_no_space.txt").write_text(
+        (self.common.output_dir / f"{split}_no_space.txt").write_text(
             "\n".join(_merge(paths, transcriptions))
         )
 
         # Paths
-        (self.common.output_dir / f"{partition.value}_ids.txt").write_text(
-            "\n".join(paths)
-        )
+        (self.common.output_dir / f"{split}_ids.txt").write_text("\n".join(paths))
 
         # Export data for LM training
-        if partition == Partition.Train:
+        if split == TRAIN_SPLIT:
             (self.common.output_dir / LM_FILENAME).write_text("\n".join(tokenized))
 
     def export(self):
-        # Export syms of training partition
+        # Export syms of training split
         self.syms.export(self.common.output_dir)
 
-        # Export each partition
-        for partition in Partition:
-            self.export_partition(partition)
+        # Export each split
+        dataset = Dataset.get(id=self.select.dataset)
+        for split in dataset.sets.split(","):
+            self.export_split(split)
diff --git a/atr_data_generator/split/__init__.py b/atr_data_generator/split/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/atr_data_generator/split/arguments.py b/atr_data_generator/split/arguments.py
deleted file mode 100644
index 976fc8325173f0b6aa8d146a8a379394c20aad2a..0000000000000000000000000000000000000000
--- a/atr_data_generator/split/arguments.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# -*- coding: utf-8 -*-
-
-from dataclasses import dataclass, fields
-from enum import Enum
-from typing import Optional
-from uuid import UUID
-
-from atr_data_generator.arguments import BaseArgs
-
-
-class Partition(Enum):
-    Train = "train"
-    Validation = "val"
-    Test = "test"
-
-
-@dataclass
-class SplitArgs(BaseArgs):
-    """
-    Arguments related to data splitting into training, validation and test subsets.
-
-    Args:
-        train_ratio (float): Ratio of data to be used in the training set. Should be between 0 and 1.
-        test_ratio (float): Ratio of data to be used in the testing set. Should be between 0 and 1.
-        val_ratio (float): Ratio of data to be used in the validation set. The sum of three variables should equal 1.
-        train_folder (float): ID of the training folder.
-        validation_folder (float): ID of the validation folder.
-        test_folder (float): ID of the testing folder.
-    """
-
-    train_ratio: float = 0.8
-    test_ratio: float = 0.1
-    val_ratio: float = 1 - train_ratio - test_ratio
-
-    # Existing split from Arkindex
-    train_folder: Optional[UUID] = None
-    validation_folder: Optional[UUID] = None
-    test_folder: Optional[UUID] = None
-
-    @property
-    def folders(self):
-        """Get the folders."""
-        return (self.train_folder, self.validation_folder, self.test_folder)
-
-    def json(self):
-        data = super().json()
-
-        # String formatting for UUIDs
-        for field in fields(self):
-            if "folder" not in field.name:
-                continue
-            if (value := getattr(self, field.name)) and value is not None:
-                data[field.name] = str(value)
-        return data
diff --git a/atr_data_generator/utils.py b/atr_data_generator/utils.py
index d8cd1ba0e05bfded9d672370717bfd5c00ad0005..b2715db781d004f3dbc67730905abdff2d197d27 100644
--- a/atr_data_generator/utils.py
+++ b/atr_data_generator/utils.py
@@ -22,7 +22,6 @@ from tenacity import (
 if TYPE_CHECKING:
     from atr_data_generator.arguments import CommonArgs
     from atr_data_generator.extract.arguments import FilterArgs, ImageArgs, SelectArgs
-    from atr_data_generator.split.arguments import SplitArgs
 
 
 logger = logging.getLogger(__name__)
@@ -73,7 +72,6 @@ def download_image(url):
 def export_parameters(
     common: CommonArgs,
     image: Optional[ImageArgs] = None,
-    split: Optional[SplitArgs] = None,
     select: Optional[SelectArgs] = None,
     filter: Optional[FilterArgs] = None,
 ):
@@ -83,8 +81,6 @@ def export_parameters(
     config = {"common": common.json()}
     if image:
         config["image"] = image.json()
-    if split:
-        config["split"] = split.json()
     if select:
         config["select"] = select.json()
     if filter:
diff --git a/docs/extract/configuration.md b/docs/extract/configuration.md
index f9c4705178f6bccf110c58bbdf875104518e02ca..6358d9f44681ee184ff12b501588514f302830b0 100644
--- a/docs/extract/configuration.md
+++ b/docs/extract/configuration.md
@@ -6,7 +6,6 @@ The YAML configuration for the `extract` subcommand has 5 sections:
 - `image` (optional),
 - `filter` (optional),
 - `select` (optional).
-- `split`.
 
 An example configuration file, filled with the default values when there is one, is available at `examples/extraction.yml`.
 
@@ -25,7 +24,3 @@ The full list of fields as well as their default values is available in the [Pyt
 ## Select
 
 The full list of fields as well as their default values is available in the [Python reference](../ref/extract/arguments.md#atr_data_generator.extract.arguments.SelectArgs).
-
-## Split
-
-The full list of fields as well as their default values is available in the [Python reference](../ref/extract/arguments.md#atr_data_generator.split.arguments.SplitArgs).
diff --git a/docs/ref/split/arguments.md b/docs/ref/split/arguments.md
deleted file mode 100644
index dc08ad5399310f7450b98c8945c4e70336ef0f14..0000000000000000000000000000000000000000
--- a/docs/ref/split/arguments.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Arguments
-
-::: atr_data_generator.split.arguments
diff --git a/examples/extraction.yml b/examples/extraction.yml
index c63bf451699b92725a880861f20f5dc045de55eb..9254764a94d29e49bf70ad455ad015ecace5da2d 100644
--- a/examples/extraction.yml
+++ b/examples/extraction.yml
@@ -18,9 +18,5 @@ image:
   should_rotate: false
   skew_angle: 0
 select:
+  dataset: # Fill me
   element_type: null
-  parent_type: null
-split:
-  test_folder: null # Fill me or other folders
-  train_folder: null # Fill me or other folders
-  validation_folder: null # Fill me or other folders
diff --git a/examples/split.yml b/examples/split.yml
deleted file mode 100644
index 5eaebf1fd8af6e4e3e451087eeb3ae83b1c83624..0000000000000000000000000000000000000000
--- a/examples/split.yml
+++ /dev/null
@@ -1,13 +0,0 @@
-common:
-  cache_dir: .cache
-  dataset_name: # Fill me
-  log_parameters: true
-  output_dir: # Fill me
-split:
-  test_ratio: 0.1
-  train_ratio: 0.8
-  use_existing_split: false
-  val_ratio: 0.1
-  train_folder: null
-  validation_folder: null
-  test_folder: null
diff --git a/mkdocs.yml b/mkdocs.yml
index aa3d13453e6caf8d262d0f27b24ef2290e699e7b..fa0183982c7d6eaa6872219771062f5b8adbb511 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -58,8 +58,6 @@ nav:
     - Dataset extraction:
       - Arguments: ref/extract/arguments.md
       - PyLaia-specific arguments: ref/extract/pylaia/arguments.md
-    - Dataset splitting:
-      - Arguments: ref/split/arguments.md
   - Development: dev.md
 
 markdown_extensions:
diff --git a/tests/conftest.py b/tests/conftest.py
index 82e88e6db5cc326a817fa52fad30514a8e360597..46482cb8c4de4fa22cdd4853c8c5753ff2c60917 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -5,6 +5,7 @@ import pytest
 from PIL import Image
 
 FIXTURES = Path(__file__).resolve().parent / "data"
+DATASET_ID = "aa30fea9-3b12-497f-ac6b-eb4c2ee22c48"
 
 
 @pytest.fixture(autouse=True)
diff --git a/tests/data/pylaia/corpus_lm.txt b/tests/data/pylaia/corpus_lm.txt
index d8eac54947bfcb16aa1dcc89113aa26522bcb447..c7bf28d014366a2289a14803f6a141e456ad8463 100644
--- a/tests/data/pylaia/corpus_lm.txt
+++ b/tests/data/pylaia/corpus_lm.txt
@@ -1,3 +1,11 @@
+p l a c e <space> a g r e e <space> w i t h <space> m e <space> b e t t e r <space> t h a n <space> N a p l e s <space> . <space> T h e <space> j o u r n e y
+w i n t e r <space> . <space> I n <space> l a t e <space> N o v e m b e r <space> , <space> h e <space> w a s <space> ' <space> s u f f e r i n g <space> a s
+u s u a l <space> ' <space> , <space> b u t <space> h o p e d <space> , <space> h e <space> t o l d <space> A r t h u r <space> , <space> ' <space> t o <space> f i n d <space> t h i s
+W h e n <space> t h e <space> s a i l i n g <space> s e a s o n <space> w a s <space> p a s t <space> , <space> h e <space> s e n t <space> P e a r l
+& <space> I <space> h a v e <space> n o t <space> y e t <space> t h o u g h t <space> o f <space> a <space> f i r e <space> . . . <space> .
+h a s <space> b e e n <space> a g a i n s t <space> m e <space> , <space> a s <space> t h e r e <space> h a s <space> b e e n <space> m u c h
+r a i n <space> a n d <space> d a m p <space> , <space> b u t <space> t h e <space> t e m p e r a t u r e <space> i s <space> h i g h
+b a c k <space> t o <space> E n g l a n d <space> , <space> a n d <space> r e t u r n e d <space> t o <space> R o m e <space> f o r <space> t h e
 t h e <space> g l a r i n g <space> b e a c o n <space> o f <space> a <space> l i g h t h o u s e <space> .
 t e l l s <space> h i s <space> s t o r y <space> b e s t <space> i n <space> t h e <space> f a c e s <space> o f <space> h i s <space> c r o w d s <space> ,
 t h e <space> w i l d l y <space> f l a p p i n g <space> w h i t e <space> s a i l s <space> s l a s h e d <space> b y
@@ -6,12 +14,4 @@ H e r e <space> , <space> t h e <space> g u e s t s <space> a r r i v e <space>
 A <space> s u m m a r y <space> o f <space> t h e <space> s t o r y <space> c a n <space> g i v e <space> a l m o s t
 t r e m e n d o u s <space> c l i m a x <space> i n <space> t h e <space> i s l a n d <space> o r g y <space> .
 r e c o r d i n g <space> e v e r y <space> w r i n k l e <space> a n d <space> d r o p <space> o f <space> s w e a t
-i n d i c a t i o n <space> o f <space> t h e <space> s c o p e <space> o f <space> W i c k i ' s <space> a r t i s t r y <space> . <space> H e
-p l a c e <space> a g r e e <space> w i t h <space> m e <space> b e t t e r <space> t h a n <space> N a p l e s <space> . <space> T h e <space> j o u r n e y
-w i n t e r <space> . <space> I n <space> l a t e <space> N o v e m b e r <space> , <space> h e <space> w a s <space> ' <space> s u f f e r i n g <space> a s
-u s u a l <space> ' <space> , <space> b u t <space> h o p e d <space> , <space> h e <space> t o l d <space> A r t h u r <space> , <space> ' <space> t o <space> f i n d <space> t h i s
-W h e n <space> t h e <space> s a i l i n g <space> s e a s o n <space> w a s <space> p a s t <space> , <space> h e <space> s e n t <space> P e a r l
-& <space> I <space> h a v e <space> n o t <space> y e t <space> t h o u g h t <space> o f <space> a <space> f i r e <space> . . . <space> .
-h a s <space> b e e n <space> a g a i n s t <space> m e <space> , <space> a s <space> t h e r e <space> h a s <space> b e e n <space> m u c h
-r a i n <space> a n d <space> d a m p <space> , <space> b u t <space> t h e <space> t e m p e r a t u r e <space> i s <space> h i g h
-b a c k <space> t o <space> E n g l a n d <space> , <space> a n d <space> r e t u r n e d <space> t o <space> R o m e <space> f o r <space> t h e
\ No newline at end of file
+i n d i c a t i o n <space> o f <space> t h e <space> s c o p e <space> o f <space> W i c k i ' s <space> a r t i s t r y <space> . <space> H e
\ No newline at end of file
diff --git a/tests/data/pylaia/train.txt b/tests/data/pylaia/train.txt
index 4a0d3bfd1e81605a09c0cf9959b4cb4baa56fbe7..15e03a2addca014e9e438640cc15562bb0153e6e 100644
--- a/tests/data/pylaia/train.txt
+++ b/tests/data/pylaia/train.txt
@@ -1,3 +1,11 @@
+e26e6803-18da-4768-be30-a0a68132107c_g06-018m-04_19e23d82-b2de-4482-94af-7f2a59fd786d.jpg p l a c e <space> a g r e e <space> w i t h <space> m e <space> b e t t e r <space> t h a n <space> N a p l e s <space> . <space> T h e <space> j o u r n e y
+e26e6803-18da-4768-be30-a0a68132107c_g06-018m-02_2c887901-c9ff-4d5b-a242-d7a5503e4421.jpg w i n t e r <space> . <space> I n <space> l a t e <space> N o v e m b e r <space> , <space> h e <space> w a s <space> ' <space> s u f f e r i n g <space> a s
+e26e6803-18da-4768-be30-a0a68132107c_g06-018m-03_34bb1d7e-657d-404d-b602-54d8e78eeed4.jpg u s u a l <space> ' <space> , <space> b u t <space> h o p e d <space> , <space> h e <space> t o l d <space> A r t h u r <space> , <space> ' <space> t o <space> f i n d <space> t h i s
+e26e6803-18da-4768-be30-a0a68132107c_g06-018m-00_6411b293-dee0-4002-ac82-ffc5cdcb499d.jpg W h e n <space> t h e <space> s a i l i n g <space> s e a s o n <space> w a s <space> p a s t <space> , <space> h e <space> s e n t <space> P e a r l
+e26e6803-18da-4768-be30-a0a68132107c_g06-018m-07_c32df3a4-6af6-4bd7-9743-fbee03f74c48.jpg & <space> I <space> h a v e <space> n o t <space> y e t <space> t h o u g h t <space> o f <space> a <space> f i r e <space> . . . <space> .
+e26e6803-18da-4768-be30-a0a68132107c_g06-018m-05_cf904505-2414-495a-9ca0-7ff63f604e39.jpg h a s <space> b e e n <space> a g a i n s t <space> m e <space> , <space> a s <space> t h e r e <space> h a s <space> b e e n <space> m u c h
+e26e6803-18da-4768-be30-a0a68132107c_g06-018m-06_e9987625-0940-4286-bff9-01156b946cf0.jpg r a i n <space> a n d <space> d a m p <space> , <space> b u t <space> t h e <space> t e m p e r a t u r e <space> i s <space> h i g h
+e26e6803-18da-4768-be30-a0a68132107c_g06-018m-01_f613f337-ed74-4035-91c0-cd173336c49e.jpg b a c k <space> t o <space> E n g l a n d <space> , <space> a n d <space> r e t u r n e d <space> t o <space> R o m e <space> f o r <space> t h e
 c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-08_2a1405cc-c8ad-4478-858b-d8f80902dd08.jpg t h e <space> g l a r i n g <space> b e a c o n <space> o f <space> a <space> l i g h t h o u s e <space> .
 c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-02_4509e20f-6ea4-42ce-98a9-2d31f7c1217a.jpg t e l l s <space> h i s <space> s t o r y <space> b e s t <space> i n <space> t h e <space> f a c e s <space> o f <space> h i s <space> c r o w d s <space> ,
 c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-07_4dafcf5b-4ffc-4416-a03f-800bcfc6ea32.jpg t h e <space> w i l d l y <space> f l a p p i n g <space> w h i t e <space> s a i l s <space> s l a s h e d <space> b y
@@ -6,12 +14,4 @@ c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-06_9085a7f8-d52a-4bd9-955c-7ea52a80
 c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-00_96f12fde-ce73-4f02-8fc8-ad952e9b3db6.jpg A <space> s u m m a r y <space> o f <space> t h e <space> s t o r y <space> c a n <space> g i v e <space> a l m o s t
 c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-05_9b7a03bd-d5de-426d-b743-8aebc3fdc960.jpg t r e m e n d o u s <space> c l i m a x <space> i n <space> t h e <space> i s l a n d <space> o r g y <space> .
 c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-03_e21e731c-ca1b-4584-8d86-d585e5b749d3.jpg r e c o r d i n g <space> e v e r y <space> w r i n k l e <space> a n d <space> d r o p <space> o f <space> s w e a t
-c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-01_e78fc32e-d2d4-4bf1-b684-2e3fc1983893.jpg i n d i c a t i o n <space> o f <space> t h e <space> s c o p e <space> o f <space> W i c k i ' s <space> a r t i s t r y <space> . <space> H e
-e26e6803-18da-4768-be30-a0a68132107c_g06-018m-04_19e23d82-b2de-4482-94af-7f2a59fd786d.jpg p l a c e <space> a g r e e <space> w i t h <space> m e <space> b e t t e r <space> t h a n <space> N a p l e s <space> . <space> T h e <space> j o u r n e y
-e26e6803-18da-4768-be30-a0a68132107c_g06-018m-02_2c887901-c9ff-4d5b-a242-d7a5503e4421.jpg w i n t e r <space> . <space> I n <space> l a t e <space> N o v e m b e r <space> , <space> h e <space> w a s <space> ' <space> s u f f e r i n g <space> a s
-e26e6803-18da-4768-be30-a0a68132107c_g06-018m-03_34bb1d7e-657d-404d-b602-54d8e78eeed4.jpg u s u a l <space> ' <space> , <space> b u t <space> h o p e d <space> , <space> h e <space> t o l d <space> A r t h u r <space> , <space> ' <space> t o <space> f i n d <space> t h i s
-e26e6803-18da-4768-be30-a0a68132107c_g06-018m-00_6411b293-dee0-4002-ac82-ffc5cdcb499d.jpg W h e n <space> t h e <space> s a i l i n g <space> s e a s o n <space> w a s <space> p a s t <space> , <space> h e <space> s e n t <space> P e a r l
-e26e6803-18da-4768-be30-a0a68132107c_g06-018m-07_c32df3a4-6af6-4bd7-9743-fbee03f74c48.jpg & <space> I <space> h a v e <space> n o t <space> y e t <space> t h o u g h t <space> o f <space> a <space> f i r e <space> . . . <space> .
-e26e6803-18da-4768-be30-a0a68132107c_g06-018m-05_cf904505-2414-495a-9ca0-7ff63f604e39.jpg h a s <space> b e e n <space> a g a i n s t <space> m e <space> , <space> a s <space> t h e r e <space> h a s <space> b e e n <space> m u c h
-e26e6803-18da-4768-be30-a0a68132107c_g06-018m-06_e9987625-0940-4286-bff9-01156b946cf0.jpg r a i n <space> a n d <space> d a m p <space> , <space> b u t <space> t h e <space> t e m p e r a t u r e <space> i s <space> h i g h
-e26e6803-18da-4768-be30-a0a68132107c_g06-018m-01_f613f337-ed74-4035-91c0-cd173336c49e.jpg b a c k <space> t o <space> E n g l a n d <space> , <space> a n d <space> r e t u r n e d <space> t o <space> R o m e <space> f o r <space> t h e
\ No newline at end of file
+c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-01_e78fc32e-d2d4-4bf1-b684-2e3fc1983893.jpg i n d i c a t i o n <space> o f <space> t h e <space> s c o p e <space> o f <space> W i c k i ' s <space> a r t i s t r y <space> . <space> H e
\ No newline at end of file
diff --git a/tests/data/pylaia/train_ids.txt b/tests/data/pylaia/train_ids.txt
index 24a4df5bb934821da6de30ff5dabc50717753998..3459181dbf3c2860765575b417613774edff732e 100644
--- a/tests/data/pylaia/train_ids.txt
+++ b/tests/data/pylaia/train_ids.txt
@@ -1,3 +1,11 @@
+e26e6803-18da-4768-be30-a0a68132107c_g06-018m-04_19e23d82-b2de-4482-94af-7f2a59fd786d.jpg
+e26e6803-18da-4768-be30-a0a68132107c_g06-018m-02_2c887901-c9ff-4d5b-a242-d7a5503e4421.jpg
+e26e6803-18da-4768-be30-a0a68132107c_g06-018m-03_34bb1d7e-657d-404d-b602-54d8e78eeed4.jpg
+e26e6803-18da-4768-be30-a0a68132107c_g06-018m-00_6411b293-dee0-4002-ac82-ffc5cdcb499d.jpg
+e26e6803-18da-4768-be30-a0a68132107c_g06-018m-07_c32df3a4-6af6-4bd7-9743-fbee03f74c48.jpg
+e26e6803-18da-4768-be30-a0a68132107c_g06-018m-05_cf904505-2414-495a-9ca0-7ff63f604e39.jpg
+e26e6803-18da-4768-be30-a0a68132107c_g06-018m-06_e9987625-0940-4286-bff9-01156b946cf0.jpg
+e26e6803-18da-4768-be30-a0a68132107c_g06-018m-01_f613f337-ed74-4035-91c0-cd173336c49e.jpg
 c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-08_2a1405cc-c8ad-4478-858b-d8f80902dd08.jpg
 c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-02_4509e20f-6ea4-42ce-98a9-2d31f7c1217a.jpg
 c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-07_4dafcf5b-4ffc-4416-a03f-800bcfc6ea32.jpg
@@ -6,12 +14,4 @@ c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-06_9085a7f8-d52a-4bd9-955c-7ea52a80
 c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-00_96f12fde-ce73-4f02-8fc8-ad952e9b3db6.jpg
 c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-05_9b7a03bd-d5de-426d-b743-8aebc3fdc960.jpg
 c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-03_e21e731c-ca1b-4584-8d86-d585e5b749d3.jpg
-c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-01_e78fc32e-d2d4-4bf1-b684-2e3fc1983893.jpg
-e26e6803-18da-4768-be30-a0a68132107c_g06-018m-04_19e23d82-b2de-4482-94af-7f2a59fd786d.jpg
-e26e6803-18da-4768-be30-a0a68132107c_g06-018m-02_2c887901-c9ff-4d5b-a242-d7a5503e4421.jpg
-e26e6803-18da-4768-be30-a0a68132107c_g06-018m-03_34bb1d7e-657d-404d-b602-54d8e78eeed4.jpg
-e26e6803-18da-4768-be30-a0a68132107c_g06-018m-00_6411b293-dee0-4002-ac82-ffc5cdcb499d.jpg
-e26e6803-18da-4768-be30-a0a68132107c_g06-018m-07_c32df3a4-6af6-4bd7-9743-fbee03f74c48.jpg
-e26e6803-18da-4768-be30-a0a68132107c_g06-018m-05_cf904505-2414-495a-9ca0-7ff63f604e39.jpg
-e26e6803-18da-4768-be30-a0a68132107c_g06-018m-06_e9987625-0940-4286-bff9-01156b946cf0.jpg
-e26e6803-18da-4768-be30-a0a68132107c_g06-018m-01_f613f337-ed74-4035-91c0-cd173336c49e.jpg
\ No newline at end of file
+c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-01_e78fc32e-d2d4-4bf1-b684-2e3fc1983893.jpg
\ No newline at end of file
diff --git a/tests/data/pylaia/train_no_space.txt b/tests/data/pylaia/train_no_space.txt
index 0ba886525dfccb4ef7164011af8d981aa1f3de70..085561b4d6f56a180982b7e8de218bd21566bd32 100644
--- a/tests/data/pylaia/train_no_space.txt
+++ b/tests/data/pylaia/train_no_space.txt
@@ -1,3 +1,11 @@
+e26e6803-18da-4768-be30-a0a68132107c_g06-018m-04_19e23d82-b2de-4482-94af-7f2a59fd786d.jpg place agree with me better than Naples . The journey
+e26e6803-18da-4768-be30-a0a68132107c_g06-018m-02_2c887901-c9ff-4d5b-a242-d7a5503e4421.jpg winter . In late November , he was ' suffering as
+e26e6803-18da-4768-be30-a0a68132107c_g06-018m-03_34bb1d7e-657d-404d-b602-54d8e78eeed4.jpg usual ' , but hoped , he told Arthur , ' to find this
+e26e6803-18da-4768-be30-a0a68132107c_g06-018m-00_6411b293-dee0-4002-ac82-ffc5cdcb499d.jpg When the sailing season was past , he sent Pearl
+e26e6803-18da-4768-be30-a0a68132107c_g06-018m-07_c32df3a4-6af6-4bd7-9743-fbee03f74c48.jpg & I have not yet thought of a fire ... .
+e26e6803-18da-4768-be30-a0a68132107c_g06-018m-05_cf904505-2414-495a-9ca0-7ff63f604e39.jpg has been against me , as there has been much
+e26e6803-18da-4768-be30-a0a68132107c_g06-018m-06_e9987625-0940-4286-bff9-01156b946cf0.jpg rain and damp , but the temperature is high
+e26e6803-18da-4768-be30-a0a68132107c_g06-018m-01_f613f337-ed74-4035-91c0-cd173336c49e.jpg back to England , and returned to Rome for the
 c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-08_2a1405cc-c8ad-4478-858b-d8f80902dd08.jpg the glaring beacon of a lighthouse .
 c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-02_4509e20f-6ea4-42ce-98a9-2d31f7c1217a.jpg tells his story best in the faces of his crowds ,
 c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-07_4dafcf5b-4ffc-4416-a03f-800bcfc6ea32.jpg the wildly flapping white sails slashed by
@@ -6,12 +14,4 @@ c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-06_9085a7f8-d52a-4bd9-955c-7ea52a80
 c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-00_96f12fde-ce73-4f02-8fc8-ad952e9b3db6.jpg A summary of the story can give almost
 c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-05_9b7a03bd-d5de-426d-b743-8aebc3fdc960.jpg tremendous climax in the island orgy .
 c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-03_e21e731c-ca1b-4584-8d86-d585e5b749d3.jpg recording every wrinkle and drop of sweat
-c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-01_e78fc32e-d2d4-4bf1-b684-2e3fc1983893.jpg indication of the scope of Wicki's artistry . He
-e26e6803-18da-4768-be30-a0a68132107c_g06-018m-04_19e23d82-b2de-4482-94af-7f2a59fd786d.jpg place agree with me better than Naples . The journey
-e26e6803-18da-4768-be30-a0a68132107c_g06-018m-02_2c887901-c9ff-4d5b-a242-d7a5503e4421.jpg winter . In late November , he was ' suffering as
-e26e6803-18da-4768-be30-a0a68132107c_g06-018m-03_34bb1d7e-657d-404d-b602-54d8e78eeed4.jpg usual ' , but hoped , he told Arthur , ' to find this
-e26e6803-18da-4768-be30-a0a68132107c_g06-018m-00_6411b293-dee0-4002-ac82-ffc5cdcb499d.jpg When the sailing season was past , he sent Pearl
-e26e6803-18da-4768-be30-a0a68132107c_g06-018m-07_c32df3a4-6af6-4bd7-9743-fbee03f74c48.jpg & I have not yet thought of a fire ... .
-e26e6803-18da-4768-be30-a0a68132107c_g06-018m-05_cf904505-2414-495a-9ca0-7ff63f604e39.jpg has been against me , as there has been much
-e26e6803-18da-4768-be30-a0a68132107c_g06-018m-06_e9987625-0940-4286-bff9-01156b946cf0.jpg rain and damp , but the temperature is high
-e26e6803-18da-4768-be30-a0a68132107c_g06-018m-01_f613f337-ed74-4035-91c0-cd173336c49e.jpg back to England , and returned to Rome for the
\ No newline at end of file
+c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-01_e78fc32e-d2d4-4bf1-b684-2e3fc1983893.jpg indication of the scope of Wicki's artistry . He
\ No newline at end of file
diff --git a/tests/data/pylaia/val.txt b/tests/data/pylaia/val.txt
index 68c76fc1718b9ef4f72d1936e3f9eb102b958824..8b84e5012a8ea1efa8c07849291819d6a606b059 100644
--- a/tests/data/pylaia/val.txt
+++ b/tests/data/pylaia/val.txt
@@ -1,13 +1,3 @@
-41b78478-04cc-4995-9787-47794d305a09_f04-032-00_07b7efc7-27bd-4939-8ef6-30daf8ee9fe2.jpg T h e y <space> a l s o <space> h a d <space> t o <space> c o p e <space> w i t h <space> t h e <space> u s u a l <space> f l o o d <space> o f <space> r u m o u r s <space> , <space> s o -
-41b78478-04cc-4995-9787-47794d305a09_f04-032-07_27b27def-1480-4995-a1fc-e83aa92b669f.jpg i n <space> w h i c h <space> s h e <space> h a d <space> d i e d <space> , <space> t h e r e <space> w a s <space> n o t h i n g <space> o n
-41b78478-04cc-4995-9787-47794d305a09_f04-032-06_6b2139e4-3568-44c4-ac03-e5fef565fa5c.jpg t h e <space> v i c t i m <space> a n d <space> i n s p e c t i n g <space> a t <space> W a t e r l o o <space> t h e <space> c a r r i a g e
-41b78478-04cc-4995-9787-47794d305a09_f04-032-09_920f2165-85d8-4f0d-8182-0c430c0c28f5.jpg a d j o u r n e d <space> .
-41b78478-04cc-4995-9787-47794d305a09_f04-032-03_bdb9728f-5d82-4d69-b21b-016c2590c74b.jpg t h e <space> T h u r s d a y <space> e v e n i n g <space> , <space> w i t h <space> b l o o d <space> a c t u a l l y <space> d r i p p i n g <space> f r o m
-41b78478-04cc-4995-9787-47794d305a09_f04-032-04_c724af5d-d78c-4041-8e5f-1548298428c4.jpg h i s <space> h a n d s <space> . <space> T h e <space> i n <unk> u e s t <space> w a s <space> o p e n e d <space> o n <space> <unk> e b r u a r y <space> <unk> <unk> <space> , <space> b u t <space> ,
-41b78478-04cc-4995-9787-47794d305a09_f04-032-02_d0045984-87fc-4675-9ba4-3732943b2dc1.jpg m a n <space> h a d <space> b e e n <space> s e e n <space> f l e e i n g <space> f r o m <space> <unk> a u x h a l l <space> s t a t i o n <space> o n
-41b78478-04cc-4995-9787-47794d305a09_f04-032-01_da7e3e4f-8dc5-4a22-b650-28006c981691.jpg m e <space> w e l l <space> m e a n t <space> , <space> s o m e <space> m i s c h i e v o u s <space> , <space> i n c l u d i n g <space> o n e <space> t h a t <space> a
-41b78478-04cc-4995-9787-47794d305a09_f04-032-05_e2fc7a77-2b60-4a40-bebc-a698d42ed134.jpg b e y o n d <space> t h e <space> j u r y <space> h e a r i n g <space> a <space> f o r m a l <space> i d e n t i f i c a t i o n <space> o f
-41b78478-04cc-4995-9787-47794d305a09_f04-032-08_ffda995f-ae71-499d-b963-a54d9ae6a9da.jpg w h i c h <space> t o <space> p r o c e e d <space> , <space> a n d <space> t h e <space> i n <unk> u e s t <space> w a s
 cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-07_025b5770-f876-43d6-9466-7acd58ee3463.jpg a r t i c l e <space> i n <space> t h e i r <space> f e s t a l <space> p r e p a r a t i o n s <space> <unk> <space> a n d <space> i t <space> i s <space> t h e
 cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-05_36ca3b5a-626c-4b1d-9c77-53ee91db5632.jpg w a s <space> a <space> s u b j e c t <space> o f <space> n o <space> l i t t l e <space> i m p o r t a n c e <space> t o <space> t h e <space> R o m a n s <space> .
 cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-00_5110fb10-7122-4d2f-9245-6f50ec81ec13.jpg <unk> y <space> d e l i c a t e <space> a p p l i c a t i o n <space> o f <space> o d o u r s <space> a n d <space> r i c h l y - d i s t i l l e d
@@ -18,4 +8,14 @@ cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-10_8ffd0c0f-188c-48d9-abf2-d1f09b1
 cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-02_be65ccfb-d3f4-4446-959c-3be85b554c8a.jpg f a i n t i n g <space> a p p e t i t e <space> a n d <space> a d d e d <space> a <space> m o r e <space> e x <unk> u i s i t e <space> a n d
 cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-06_cffe26ef-d0d2-4761-9d8f-471bc8b2432f.jpg T h e y <space> c o n s i d e r e d <space> f l o w e r s <space> a s <space> f o r m i n g <space> a <space> v e r y <space> e s s e n t i a l
 cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-03_dafccdac-2359-4f10-a958-aa901598c4ae.jpg e t h e r e a l <space> e n j o y m e n t <space> t o <space> t h e <space> g r o s s e r <space> p l e a s u r e s <space> o f <space> t h e
-cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-04_ffc1422b-f551-4748-999a-86eede57cab2.jpg b o a r d <space> . <space> T h e <space> g r a t i f i c a t i o n <space> o f <space> t h e <space> s e n s e <space> o f <space> s m e l l i n g
\ No newline at end of file
+cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-04_ffc1422b-f551-4748-999a-86eede57cab2.jpg b o a r d <space> . <space> T h e <space> g r a t i f i c a t i o n <space> o f <space> t h e <space> s e n s e <space> o f <space> s m e l l i n g
+41b78478-04cc-4995-9787-47794d305a09_f04-032-00_07b7efc7-27bd-4939-8ef6-30daf8ee9fe2.jpg T h e y <space> a l s o <space> h a d <space> t o <space> c o p e <space> w i t h <space> t h e <space> u s u a l <space> f l o o d <space> o f <space> r u m o u r s <space> , <space> s o -
+41b78478-04cc-4995-9787-47794d305a09_f04-032-07_27b27def-1480-4995-a1fc-e83aa92b669f.jpg i n <space> w h i c h <space> s h e <space> h a d <space> d i e d <space> , <space> t h e r e <space> w a s <space> n o t h i n g <space> o n
+41b78478-04cc-4995-9787-47794d305a09_f04-032-06_6b2139e4-3568-44c4-ac03-e5fef565fa5c.jpg t h e <space> v i c t i m <space> a n d <space> i n s p e c t i n g <space> a t <space> W a t e r l o o <space> t h e <space> c a r r i a g e
+41b78478-04cc-4995-9787-47794d305a09_f04-032-09_920f2165-85d8-4f0d-8182-0c430c0c28f5.jpg a d j o u r n e d <space> .
+41b78478-04cc-4995-9787-47794d305a09_f04-032-03_bdb9728f-5d82-4d69-b21b-016c2590c74b.jpg t h e <space> T h u r s d a y <space> e v e n i n g <space> , <space> w i t h <space> b l o o d <space> a c t u a l l y <space> d r i p p i n g <space> f r o m
+41b78478-04cc-4995-9787-47794d305a09_f04-032-04_c724af5d-d78c-4041-8e5f-1548298428c4.jpg h i s <space> h a n d s <space> . <space> T h e <space> i n <unk> u e s t <space> w a s <space> o p e n e d <space> o n <space> <unk> e b r u a r y <space> <unk> <unk> <space> , <space> b u t <space> ,
+41b78478-04cc-4995-9787-47794d305a09_f04-032-02_d0045984-87fc-4675-9ba4-3732943b2dc1.jpg m a n <space> h a d <space> b e e n <space> s e e n <space> f l e e i n g <space> f r o m <space> <unk> a u x h a l l <space> s t a t i o n <space> o n
+41b78478-04cc-4995-9787-47794d305a09_f04-032-01_da7e3e4f-8dc5-4a22-b650-28006c981691.jpg m e <space> w e l l <space> m e a n t <space> , <space> s o m e <space> m i s c h i e v o u s <space> , <space> i n c l u d i n g <space> o n e <space> t h a t <space> a
+41b78478-04cc-4995-9787-47794d305a09_f04-032-05_e2fc7a77-2b60-4a40-bebc-a698d42ed134.jpg b e y o n d <space> t h e <space> j u r y <space> h e a r i n g <space> a <space> f o r m a l <space> i d e n t i f i c a t i o n <space> o f
+41b78478-04cc-4995-9787-47794d305a09_f04-032-08_ffda995f-ae71-499d-b963-a54d9ae6a9da.jpg w h i c h <space> t o <space> p r o c e e d <space> , <space> a n d <space> t h e <space> i n <unk> u e s t <space> w a s
\ No newline at end of file
diff --git a/tests/data/pylaia/val_ids.txt b/tests/data/pylaia/val_ids.txt
index 17e39f5792d22e90fe3187d9738ee459317cf6b0..5b2a6e55fc8eb8b53672f44629035c098d0bfd2f 100644
--- a/tests/data/pylaia/val_ids.txt
+++ b/tests/data/pylaia/val_ids.txt
@@ -1,13 +1,3 @@
-41b78478-04cc-4995-9787-47794d305a09_f04-032-00_07b7efc7-27bd-4939-8ef6-30daf8ee9fe2.jpg
-41b78478-04cc-4995-9787-47794d305a09_f04-032-07_27b27def-1480-4995-a1fc-e83aa92b669f.jpg
-41b78478-04cc-4995-9787-47794d305a09_f04-032-06_6b2139e4-3568-44c4-ac03-e5fef565fa5c.jpg
-41b78478-04cc-4995-9787-47794d305a09_f04-032-09_920f2165-85d8-4f0d-8182-0c430c0c28f5.jpg
-41b78478-04cc-4995-9787-47794d305a09_f04-032-03_bdb9728f-5d82-4d69-b21b-016c2590c74b.jpg
-41b78478-04cc-4995-9787-47794d305a09_f04-032-04_c724af5d-d78c-4041-8e5f-1548298428c4.jpg
-41b78478-04cc-4995-9787-47794d305a09_f04-032-02_d0045984-87fc-4675-9ba4-3732943b2dc1.jpg
-41b78478-04cc-4995-9787-47794d305a09_f04-032-01_da7e3e4f-8dc5-4a22-b650-28006c981691.jpg
-41b78478-04cc-4995-9787-47794d305a09_f04-032-05_e2fc7a77-2b60-4a40-bebc-a698d42ed134.jpg
-41b78478-04cc-4995-9787-47794d305a09_f04-032-08_ffda995f-ae71-499d-b963-a54d9ae6a9da.jpg
 cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-07_025b5770-f876-43d6-9466-7acd58ee3463.jpg
 cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-05_36ca3b5a-626c-4b1d-9c77-53ee91db5632.jpg
 cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-00_5110fb10-7122-4d2f-9245-6f50ec81ec13.jpg
@@ -18,4 +8,14 @@ cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-10_8ffd0c0f-188c-48d9-abf2-d1f09b1
 cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-02_be65ccfb-d3f4-4446-959c-3be85b554c8a.jpg
 cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-06_cffe26ef-d0d2-4761-9d8f-471bc8b2432f.jpg
 cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-03_dafccdac-2359-4f10-a958-aa901598c4ae.jpg
-cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-04_ffc1422b-f551-4748-999a-86eede57cab2.jpg
\ No newline at end of file
+cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-04_ffc1422b-f551-4748-999a-86eede57cab2.jpg
+41b78478-04cc-4995-9787-47794d305a09_f04-032-00_07b7efc7-27bd-4939-8ef6-30daf8ee9fe2.jpg
+41b78478-04cc-4995-9787-47794d305a09_f04-032-07_27b27def-1480-4995-a1fc-e83aa92b669f.jpg
+41b78478-04cc-4995-9787-47794d305a09_f04-032-06_6b2139e4-3568-44c4-ac03-e5fef565fa5c.jpg
+41b78478-04cc-4995-9787-47794d305a09_f04-032-09_920f2165-85d8-4f0d-8182-0c430c0c28f5.jpg
+41b78478-04cc-4995-9787-47794d305a09_f04-032-03_bdb9728f-5d82-4d69-b21b-016c2590c74b.jpg
+41b78478-04cc-4995-9787-47794d305a09_f04-032-04_c724af5d-d78c-4041-8e5f-1548298428c4.jpg
+41b78478-04cc-4995-9787-47794d305a09_f04-032-02_d0045984-87fc-4675-9ba4-3732943b2dc1.jpg
+41b78478-04cc-4995-9787-47794d305a09_f04-032-01_da7e3e4f-8dc5-4a22-b650-28006c981691.jpg
+41b78478-04cc-4995-9787-47794d305a09_f04-032-05_e2fc7a77-2b60-4a40-bebc-a698d42ed134.jpg
+41b78478-04cc-4995-9787-47794d305a09_f04-032-08_ffda995f-ae71-499d-b963-a54d9ae6a9da.jpg
\ No newline at end of file
diff --git a/tests/data/pylaia/val_no_space.txt b/tests/data/pylaia/val_no_space.txt
index 4932651c7f667cb43d2e1bca20d35729dd98ead2..e433eb379a58b8ca53128c3e887cf7938bfb2960 100644
--- a/tests/data/pylaia/val_no_space.txt
+++ b/tests/data/pylaia/val_no_space.txt
@@ -1,13 +1,3 @@
-41b78478-04cc-4995-9787-47794d305a09_f04-032-00_07b7efc7-27bd-4939-8ef6-30daf8ee9fe2.jpg They also had to cope with the usual flood of rumours , so-
-41b78478-04cc-4995-9787-47794d305a09_f04-032-07_27b27def-1480-4995-a1fc-e83aa92b669f.jpg in which she had died , there was nothing on
-41b78478-04cc-4995-9787-47794d305a09_f04-032-06_6b2139e4-3568-44c4-ac03-e5fef565fa5c.jpg the victim and inspecting at Waterloo the carriage
-41b78478-04cc-4995-9787-47794d305a09_f04-032-09_920f2165-85d8-4f0d-8182-0c430c0c28f5.jpg adjourned .
-41b78478-04cc-4995-9787-47794d305a09_f04-032-03_bdb9728f-5d82-4d69-b21b-016c2590c74b.jpg the Thursday evening , with blood actually dripping from
-41b78478-04cc-4995-9787-47794d305a09_f04-032-04_c724af5d-d78c-4041-8e5f-1548298428c4.jpg his hands . The inquest was opened on February 17 , but ,
-41b78478-04cc-4995-9787-47794d305a09_f04-032-02_d0045984-87fc-4675-9ba4-3732943b2dc1.jpg man had been seen fleeing from Vauxhall station on
-41b78478-04cc-4995-9787-47794d305a09_f04-032-01_da7e3e4f-8dc5-4a22-b650-28006c981691.jpg me well meant , some mischievous , including one that a
-41b78478-04cc-4995-9787-47794d305a09_f04-032-05_e2fc7a77-2b60-4a40-bebc-a698d42ed134.jpg beyond the jury hearing a formal identification of
-41b78478-04cc-4995-9787-47794d305a09_f04-032-08_ffda995f-ae71-499d-b963-a54d9ae6a9da.jpg which to proceed , and the inquest was
 cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-07_025b5770-f876-43d6-9466-7acd58ee3463.jpg article in their festal preparations ; and it is the
 cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-05_36ca3b5a-626c-4b1d-9c77-53ee91db5632.jpg was a subject of no little importance to the Romans .
 cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-00_5110fb10-7122-4d2f-9245-6f50ec81ec13.jpg By delicate application of odours and richly-distilled
@@ -18,4 +8,14 @@ cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-10_8ffd0c0f-188c-48d9-abf2-d1f09b1
 cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-02_be65ccfb-d3f4-4446-959c-3be85b554c8a.jpg fainting appetite and added a more exquisite and
 cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-06_cffe26ef-d0d2-4761-9d8f-471bc8b2432f.jpg They considered flowers as forming a very essential
 cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-03_dafccdac-2359-4f10-a958-aa901598c4ae.jpg ethereal enjoyment to the grosser pleasures of the
-cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-04_ffc1422b-f551-4748-999a-86eede57cab2.jpg board . The gratification of the sense of smelling
\ No newline at end of file
+cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-04_ffc1422b-f551-4748-999a-86eede57cab2.jpg board . The gratification of the sense of smelling
+41b78478-04cc-4995-9787-47794d305a09_f04-032-00_07b7efc7-27bd-4939-8ef6-30daf8ee9fe2.jpg They also had to cope with the usual flood of rumours , so-
+41b78478-04cc-4995-9787-47794d305a09_f04-032-07_27b27def-1480-4995-a1fc-e83aa92b669f.jpg in which she had died , there was nothing on
+41b78478-04cc-4995-9787-47794d305a09_f04-032-06_6b2139e4-3568-44c4-ac03-e5fef565fa5c.jpg the victim and inspecting at Waterloo the carriage
+41b78478-04cc-4995-9787-47794d305a09_f04-032-09_920f2165-85d8-4f0d-8182-0c430c0c28f5.jpg adjourned .
+41b78478-04cc-4995-9787-47794d305a09_f04-032-03_bdb9728f-5d82-4d69-b21b-016c2590c74b.jpg the Thursday evening , with blood actually dripping from
+41b78478-04cc-4995-9787-47794d305a09_f04-032-04_c724af5d-d78c-4041-8e5f-1548298428c4.jpg his hands . The inquest was opened on February 17 , but ,
+41b78478-04cc-4995-9787-47794d305a09_f04-032-02_d0045984-87fc-4675-9ba4-3732943b2dc1.jpg man had been seen fleeing from Vauxhall station on
+41b78478-04cc-4995-9787-47794d305a09_f04-032-01_da7e3e4f-8dc5-4a22-b650-28006c981691.jpg me well meant , some mischievous , including one that a
+41b78478-04cc-4995-9787-47794d305a09_f04-032-05_e2fc7a77-2b60-4a40-bebc-a698d42ed134.jpg beyond the jury hearing a formal identification of
+41b78478-04cc-4995-9787-47794d305a09_f04-032-08_ffda995f-ae71-499d-b963-a54d9ae6a9da.jpg which to proceed , and the inquest was
\ No newline at end of file
diff --git a/tests/data/test_db.sqlite b/tests/data/test_db.sqlite
index 112afe1aeddbd2af2e32a9e0ce6ff2c201cf4c86..fa5669e340f3858fc10c915931dee38c06105ff6 100644
Binary files a/tests/data/test_db.sqlite and b/tests/data/test_db.sqlite differ
diff --git a/tests/extract/test_base.py b/tests/extract/test_base.py
index a7b95ff4170aa34d6bd4978ce7af27abad93d4ae..3e56a6894ef4581dbc170370f04d51247ba8da0a 100644
--- a/tests/extract/test_base.py
+++ b/tests/extract/test_base.py
@@ -13,7 +13,7 @@ from atr_data_generator.extract.arguments import (
     SelectArgs,
 )
 from atr_data_generator.extract.base import EXPORT_PATH, DataGenerator
-from atr_data_generator.split.arguments import Partition, SplitArgs
+from tests.conftest import DATASET_ID
 
 
 @pytest.mark.parametrize(
@@ -23,46 +23,27 @@ from atr_data_generator.split.arguments import Partition, SplitArgs
         ([MANUAL]),  # only manual transcriptions
     ),
 )
-@pytest.mark.parametrize(
-    "folders, expected_trans_lines",
-    (
-        (["a0c4522d-2d80-4766-a01c-b9d686f41f6a"], [17]),
-        (
-            [
-                "a0c4522d-2d80-4766-a01c-b9d686f41f6a",
-                "39b9ac5c-89ab-4258-8116-965bf0ca0419",
-            ],
-            [17, 21],
-        ),
-    ),
-)
 def test_run(
     database,
-    folders,
     image_cache,
     worker_version_ids,
-    expected_trans_lines,
     tmp_path,
 ):
-    train_folder, val_folder = folders if len(folders) == 2 else (folders[0], None)
     atr_data_gen = DataGenerator(
         common=CommonArgs(
             dataset_name="test", output_dir=tmp_path, cache_dir=image_cache
         ),
         image=ImageArgs(),
-        select=SelectArgs(),
+        select=SelectArgs(dataset=DATASET_ID),
         filter=FilterArgs(accepted_worker_version_ids=worker_version_ids),
-        split=SplitArgs(train_folder=train_folder, validation_folder=val_folder),
     )
 
     atr_data_gen.run(database)
 
     # Read json transcription file
     data = json.loads((atr_data_gen.common.output_dir / EXPORT_PATH).read_text())
-
-    for partition, count in zip(Partition, expected_trans_lines):
-        assert len(data[partition.value]) == count
+    assert {key: len(value) for key, value in data.items()} == {"train": 17, "val": 21}
 
     # each image file should have one transcription file
     img_files = list(atr_data_gen.common.output_dir.rglob("*.jpg"))
-    assert len(img_files) == sum(expected_trans_lines)
+    assert len(img_files) == 38
diff --git a/tests/extract/test_pylaia.py b/tests/extract/test_pylaia.py
index 71be6e5494a80112fc97662df34394c9ee192a26..81944ccada4d3868e0a62c5426ce79072766cf85 100644
--- a/tests/extract/test_pylaia.py
+++ b/tests/extract/test_pylaia.py
@@ -12,8 +12,7 @@ from atr_data_generator.extract.arguments import (
 )
 from atr_data_generator.extract.pylaia.arguments import PylaiaArgs
 from atr_data_generator.extract.pylaia.main import PylaiaDataGenerator
-from atr_data_generator.split.arguments import SplitArgs
-from tests.conftest import FIXTURES
+from tests.conftest import DATASET_ID, FIXTURES
 
 FIXTURE_DIR = FIXTURES / "pylaia"
 
@@ -25,38 +24,21 @@ FIXTURE_DIR = FIXTURES / "pylaia"
         ([MANUAL]),  # only manual transcriptions
     ),
 )
-@pytest.mark.parametrize(
-    "folders, expected_trans_lines",
-    (
-        (["a0c4522d-2d80-4766-a01c-b9d686f41f6a"], [17]),
-        (
-            [
-                "a0c4522d-2d80-4766-a01c-b9d686f41f6a",
-                "39b9ac5c-89ab-4258-8116-965bf0ca0419",
-            ],
-            [17, 21],
-        ),
-    ),
-)
 @pytest.mark.parametrize("syms_path", ((None), (FIXTURE_DIR / "syms.txt")))
 def test_run(
     database,
-    folders,
     image_cache,
     worker_version_ids,
-    expected_trans_lines,
     tmp_path,
     syms_path,
 ):
-    train_folder, val_folder = folders if len(folders) == 2 else (folders[0], None)
     atr_data_gen = PylaiaDataGenerator(
         common=CommonArgs(
             dataset_name="test", output_dir=tmp_path, cache_dir=image_cache
         ),
         image=ImageArgs(),
-        select=SelectArgs(),
+        select=SelectArgs(dataset=DATASET_ID),
         filter=FilterArgs(accepted_worker_version_ids=worker_version_ids),
-        split=SplitArgs(train_folder=train_folder, validation_folder=val_folder),
         pylaia=PylaiaArgs(syms_path=syms_path),
     )
 
@@ -68,4 +50,4 @@ def test_run(
 
     # each image file should have one transcription file
     img_files = list(atr_data_gen.common.output_dir.rglob("*.jpg"))
-    assert len(img_files) == sum(expected_trans_lines)
+    assert len(img_files) == 38