diff --git a/README.md b/README.md index 8d4984bc37c45467e09777306843513fd9d75f6f..f68b0a46e3f4bb6f3afc09ede164b4dc668887ca 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,6 @@ This script downloads pages with transcriptions from Arkindex and converts data to ATR format. -It also generates reproducible train, val and test splits. +It also generates reproducible splits. A documentation is available at https://atr.pages.teklia.com/data-generator/. diff --git a/atr_data_generator/extract/__init__.py b/atr_data_generator/extract/__init__.py index d5af7c91749d057e5236405f2d67945843142756..f45e4484d109c1e676522b9ebafc38d1e16edef9 100644 --- a/atr_data_generator/extract/__init__.py +++ b/atr_data_generator/extract/__init__.py @@ -2,7 +2,6 @@ """ Data extraction """ -import uuid from pathlib import Path from typing import Optional @@ -20,7 +19,6 @@ from atr_data_generator.extract.base import DataGenerator from atr_data_generator.extract.pylaia.arguments import PylaiaArgs from atr_data_generator.extract.pylaia.main import PylaiaDataGenerator from atr_data_generator.extract.utils import ListedEnum -from atr_data_generator.split.arguments import SplitArgs class Generators(ListedEnum): @@ -72,15 +70,9 @@ def get_parser(): # Select select = parser.add_subparser("select", default={}) - select.add_option("parent_type", type=str, default=None) + select.add_option("dataset", type=str, default=None) select.add_option("element_type", type=str, default=None) - # Split - split = parser.add_subparser("split", default={}) - split.add_option("train_folder", type=uuid.UUID, default=None) - split.add_option("validation_folder", type=uuid.UUID, default=None) - split.add_option("test_folder", type=uuid.UUID, default=None) - # Format specific # Pylaia pylaia = parser.add_subparser("pylaia", default={}) @@ -96,7 +88,6 @@ def config_parser(configuration_path: Path): - ImageArgs - FilterArgs - SelectArgs - - SplitArgs # Format specific args if provided """ config_data = get_parser().parse(configuration_path) @@ -105,7 +96,6 @@ def config_parser(configuration_path: Path): "image": ImageArgs(**config_data["image"]), "filter": FilterArgs(**config_data["filter"]), "select": SelectArgs(**config_data["select"]), - "split": SplitArgs(**config_data["split"]), # Format specific "pylaia": PylaiaArgs(**config_data["pylaia"]), } diff --git a/atr_data_generator/extract/arguments.py b/atr_data_generator/extract/arguments.py index 96790d8f996f377201978621cfe449a074fb55aa..9b91f1eb7b522ecdb9e05c454673c99a6347dc73 100644 --- a/atr_data_generator/extract/arguments.py +++ b/atr_data_generator/extract/arguments.py @@ -44,12 +44,15 @@ class SelectArgs(BaseArgs): Arguments to select elements from Arkindex Args: + dataset (str): Filter dataset to process element_type (str): Filter elements to process by type - parent_type (str): Filter elements parents to process by type """ + dataset: str element_type: Optional[str] = None - parent_type: Optional[str] = None + + def __post_init__(self): + assert UUID(self.dataset) @dataclass diff --git a/atr_data_generator/extract/base.py b/atr_data_generator/extract/base.py index 30ee6be4ee8fd0eaf154502c62ef37025700e867..4c671056044b5fa7f97440d0c4256facbf3432b0 100644 --- a/atr_data_generator/extract/base.py +++ b/atr_data_generator/extract/base.py @@ -7,7 +7,7 @@ from pathlib import Path from typing import Any, Dict import numpy as np -from arkindex_export import Element, open_database +from arkindex_export import Dataset, Element, open_database from line_image_extractor.extractor import extract, read_img, save_img from line_image_extractor.image_utils import polygon_to_bbox, resize from PIL import Image @@ -15,9 +15,8 @@ from tqdm import tqdm from atr_data_generator.arguments import CommonArgs from atr_data_generator.extract.arguments import FilterArgs, ImageArgs, SelectArgs -from atr_data_generator.extract.db import get_children, get_children_info +from atr_data_generator.extract.db import get_children_info, get_dataset_elements from atr_data_generator.extract.utils import _is_vertical, resize_image_height -from atr_data_generator.split.arguments import Partition, SplitArgs from atr_data_generator.utils import download_image, export_parameters logger = logging.getLogger(__name__) @@ -33,7 +32,6 @@ class DataGenerator: image: ImageArgs filter: FilterArgs select: SelectArgs - split: SplitArgs data: Dict[str, dict] = field(default_factory=lambda: defaultdict(dict)) @@ -43,14 +41,12 @@ class DataGenerator: image: ImageArgs, filter: FilterArgs, select: SelectArgs, - split: SplitArgs, **kwargs, ) -> None: self.common = common self.image = image self.filter = filter self.select = select - self.split = split self.data: Dict[str, dict] = defaultdict(dict) def __post_init__(self): @@ -87,7 +83,7 @@ class DataGenerator: return read_img(cached_img_path, self.image.grayscale) - def parse_transcription(self, transcription: str, **kwargs): + def parse_transcription(self, transcription: str, *args, **kwargs): return transcription def get_image(self, child: Element, destination: Path) -> None: @@ -138,7 +134,7 @@ class DataGenerator: def parse_image_path(self, image_path: Path): return str(image_path) - def process_parent(self, parent: Element, partition: Partition): + def process_parent(self, parent: Element, split: str): """ Process every children under this parent element. """ @@ -148,7 +144,7 @@ class DataGenerator: type=self.select.element_type, sources=self.filter.accepted_worker_version_ids, ), - desc=f"Extracting data from {parent.type} {parent.name} for split ({partition.value})", + desc=f"Extracting data from {parent.type} {parent.name} for split ({split})", ): image_path = ( self.common.output_dir @@ -156,9 +152,9 @@ class DataGenerator: / f"{parent.id}_{child.element.name.split('_')[-1]}_{child.element_id}.jpg" ) # Store transcription - self.data[partition.value][ + self.data[split][ self.parse_image_path(image_path) - ] = self.parse_transcription(child.text, partition=partition) + ] = self.parse_transcription(child.text, split=split) # Extract the image self.get_image(child.element, image_path) @@ -173,28 +169,26 @@ class DataGenerator: def run(self, db_path: Path): """ - Extract data from folders of elements with selected type + Extract data from the selected dataset """ - # Either provide folders or an existing split - assert ( - self.split.train_folder - or self.split.validation_folder - or self.split.test_folder - ), "Please provide at least one folder." - self.connect_db(db_path) - # Iterate over folders - for partition, folder_id in zip(Partition, self.split.folders): - if folder_id is None: - continue - # Find the parent elements - for parent_element in get_children(folder_id, type=self.select.parent_type): - self.process_parent(parent_element, partition) + try: + dataset = Dataset.get(id=self.select.dataset) + except Exception: + raise Exception(f"{self.select.dataset} is not a valid dataset ID") + + # Iterate over sets + for split in dataset.sets.split(","): + # Find the dataset elements + for parent in get_dataset_elements( + dataset, split, self.select.element_type + ): + self.process_parent(parent.element, split) assert sum( - len(partition.values()) for partition in self.data.values() - ), "No data was extracted from all three train/validation/test folders." + len(split.values()) for split in self.data.values() + ), "No data was extracted from all the splits." self.export() @@ -203,5 +197,4 @@ class DataGenerator: image=self.image, select=self.select, filter=self.filter, - split=self.split, ) diff --git a/atr_data_generator/extract/db.py b/atr_data_generator/extract/db.py index 6bd29287b26d2f3be24a5e936ef3ecf9eb1606ae..20f8d1b6731b7d7728147b7199e44aeeda7bc6fd 100644 --- a/atr_data_generator/extract/db.py +++ b/atr_data_generator/extract/db.py @@ -1,24 +1,34 @@ # -*- coding: utf-8 -*- +from operator import attrgetter from typing import List, Optional -from uuid import UUID -from arkindex_export import Element, Transcription +from arkindex_export import Dataset, DatasetElement, Element, Transcription from arkindex_export.queries import list_children from atr_data_generator.extract.arguments import MANUAL -def get_children(parent_id: UUID, type: Optional[str]): - """Recursively list children elements. +def get_dataset_elements(dataset: Dataset, split: str, type: Optional[str]): + """ + Retrieve dataset elements in a specific split from an SQLite export of an Arkindex corpus - :param parent_id: ID of the parent element. + :param dataset: Dataset object from which the elements come. + :param split: Set name of the dataset to use. :param type: Optionally filter by element type. - :return: The filtered list of children. + :return: The filtered list of dataset elements. """ - query = list_children(parent_id) + query = ( + DatasetElement.select(DatasetElement.element) + .join(Element) + .where( + DatasetElement.dataset == dataset, + DatasetElement.set_name == split, + ) + ) if type: query = query.where(Element.type == type) - return query.order_by(Element.name) + + return query def parse_sources(sources: List[str]): @@ -56,6 +66,11 @@ def get_children_info( elements = list_children(parent_id) + # Insert parent in the query to allow to process it + elements = Element.select().where( + Element.id.in_(list(map(attrgetter("id"), elements)) + [parent_id]) + ) + # Filter by type if type: elements = elements.where(Element.type == type) diff --git a/atr_data_generator/extract/pylaia/main.py b/atr_data_generator/extract/pylaia/main.py index 237beb77a59f690b2f3ec3b2226b0a39b164daf9..9d9fb786c3c1fc46238a1107e973cbc739b79117 100644 --- a/atr_data_generator/extract/pylaia/main.py +++ b/atr_data_generator/extract/pylaia/main.py @@ -5,15 +5,18 @@ from dataclasses import dataclass from pathlib import Path from typing import Optional +from arkindex_export import Dataset + from atr_data_generator.extract.base import DataGenerator from atr_data_generator.extract.pylaia import LM_FILENAME from atr_data_generator.extract.pylaia.arguments import PylaiaArgs from atr_data_generator.extract.pylaia.syms import Syms from atr_data_generator.extract.pylaia.utils import _merge -from atr_data_generator.split.arguments import Partition logger = logging.getLogger(__name__) +TRAIN_SPLIT = "train" + @dataclass class PylaiaDataGenerator(DataGenerator): @@ -23,7 +26,7 @@ class PylaiaDataGenerator(DataGenerator): def __post_init__(self): super().__post_init__() if self.pylaia.syms_path: - # Load pre existing syms for all partitions + # Load pre existing syms for all splits assert self.pylaia.syms_path.exists() self.syms = Syms.from_disk(self.pylaia.syms_path) logger.info(f"Loaded symbols from {self.pylaia.syms_path}") @@ -35,48 +38,47 @@ class PylaiaDataGenerator(DataGenerator): """ return str(image_path.relative_to(image_path.parent)) - def parse_transcription(self, transcription: str, partition: Partition, **kwargs): + def parse_transcription(self, transcription: str, split: str, *args, **kwargs): return transcription, self.syms.process_line( - transcription.strip(), read_only=partition != Partition.Train + transcription.strip(), read_only=split != TRAIN_SPLIT ) - def export_partition(self, partition): + def export_split(self, split): """Exports 3 files - - <partition_name>.txt, with path_to_image and tokenized transcription - - <partition_name>_ids.txt with path_to_image only - - text_<partition_name>.txt with path_to_image and not tokenized transcription + - <split_name>.txt, with path_to_image and tokenized transcription + - <split_name>_ids.txt with path_to_image only + - text_<split_name>.txt with path_to_image and not tokenized transcription The tokenized transcriptions are also exported in a separate file for LM training. """ - data = [(path, *value) for path, value in self.data[partition.value].items()] + data = [(path, *value) for path, value in self.data[split].items()] if not data: return paths, transcriptions, tokenized = zip(*data) # Path + tokenized - (self.common.output_dir / partition.value).with_suffix(".txt").write_text( + (self.common.output_dir / split).with_suffix(".txt").write_text( "\n".join(_merge(paths, tokenized)) ) # Path + not tokenized - (self.common.output_dir / f"{partition.value}_no_space.txt").write_text( + (self.common.output_dir / f"{split}_no_space.txt").write_text( "\n".join(_merge(paths, transcriptions)) ) # Paths - (self.common.output_dir / f"{partition.value}_ids.txt").write_text( - "\n".join(paths) - ) + (self.common.output_dir / f"{split}_ids.txt").write_text("\n".join(paths)) # Export data for LM training - if partition == Partition.Train: + if split == TRAIN_SPLIT: (self.common.output_dir / LM_FILENAME).write_text("\n".join(tokenized)) def export(self): - # Export syms of training partition + # Export syms of training split self.syms.export(self.common.output_dir) - # Export each partition - for partition in Partition: - self.export_partition(partition) + # Export each split + dataset = Dataset.get(id=self.select.dataset) + for split in dataset.sets.split(","): + self.export_split(split) diff --git a/atr_data_generator/split/__init__.py b/atr_data_generator/split/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/atr_data_generator/split/arguments.py b/atr_data_generator/split/arguments.py deleted file mode 100644 index 976fc8325173f0b6aa8d146a8a379394c20aad2a..0000000000000000000000000000000000000000 --- a/atr_data_generator/split/arguments.py +++ /dev/null @@ -1,54 +0,0 @@ -# -*- coding: utf-8 -*- - -from dataclasses import dataclass, fields -from enum import Enum -from typing import Optional -from uuid import UUID - -from atr_data_generator.arguments import BaseArgs - - -class Partition(Enum): - Train = "train" - Validation = "val" - Test = "test" - - -@dataclass -class SplitArgs(BaseArgs): - """ - Arguments related to data splitting into training, validation and test subsets. - - Args: - train_ratio (float): Ratio of data to be used in the training set. Should be between 0 and 1. - test_ratio (float): Ratio of data to be used in the testing set. Should be between 0 and 1. - val_ratio (float): Ratio of data to be used in the validation set. The sum of three variables should equal 1. - train_folder (float): ID of the training folder. - validation_folder (float): ID of the validation folder. - test_folder (float): ID of the testing folder. - """ - - train_ratio: float = 0.8 - test_ratio: float = 0.1 - val_ratio: float = 1 - train_ratio - test_ratio - - # Existing split from Arkindex - train_folder: Optional[UUID] = None - validation_folder: Optional[UUID] = None - test_folder: Optional[UUID] = None - - @property - def folders(self): - """Get the folders.""" - return (self.train_folder, self.validation_folder, self.test_folder) - - def json(self): - data = super().json() - - # String formatting for UUIDs - for field in fields(self): - if "folder" not in field.name: - continue - if (value := getattr(self, field.name)) and value is not None: - data[field.name] = str(value) - return data diff --git a/atr_data_generator/utils.py b/atr_data_generator/utils.py index d8cd1ba0e05bfded9d672370717bfd5c00ad0005..b2715db781d004f3dbc67730905abdff2d197d27 100644 --- a/atr_data_generator/utils.py +++ b/atr_data_generator/utils.py @@ -22,7 +22,6 @@ from tenacity import ( if TYPE_CHECKING: from atr_data_generator.arguments import CommonArgs from atr_data_generator.extract.arguments import FilterArgs, ImageArgs, SelectArgs - from atr_data_generator.split.arguments import SplitArgs logger = logging.getLogger(__name__) @@ -73,7 +72,6 @@ def download_image(url): def export_parameters( common: CommonArgs, image: Optional[ImageArgs] = None, - split: Optional[SplitArgs] = None, select: Optional[SelectArgs] = None, filter: Optional[FilterArgs] = None, ): @@ -83,8 +81,6 @@ def export_parameters( config = {"common": common.json()} if image: config["image"] = image.json() - if split: - config["split"] = split.json() if select: config["select"] = select.json() if filter: diff --git a/docs/extract/configuration.md b/docs/extract/configuration.md index f9c4705178f6bccf110c58bbdf875104518e02ca..6358d9f44681ee184ff12b501588514f302830b0 100644 --- a/docs/extract/configuration.md +++ b/docs/extract/configuration.md @@ -6,7 +6,6 @@ The YAML configuration for the `extract` subcommand has 5 sections: - `image` (optional), - `filter` (optional), - `select` (optional). -- `split`. An example configuration file, filled with the default values when there is one, is available at `examples/extraction.yml`. @@ -25,7 +24,3 @@ The full list of fields as well as their default values is available in the [Pyt ## Select The full list of fields as well as their default values is available in the [Python reference](../ref/extract/arguments.md#atr_data_generator.extract.arguments.SelectArgs). - -## Split - -The full list of fields as well as their default values is available in the [Python reference](../ref/extract/arguments.md#atr_data_generator.split.arguments.SplitArgs). diff --git a/docs/ref/split/arguments.md b/docs/ref/split/arguments.md deleted file mode 100644 index dc08ad5399310f7450b98c8945c4e70336ef0f14..0000000000000000000000000000000000000000 --- a/docs/ref/split/arguments.md +++ /dev/null @@ -1,3 +0,0 @@ -# Arguments - -::: atr_data_generator.split.arguments diff --git a/examples/extraction.yml b/examples/extraction.yml index c63bf451699b92725a880861f20f5dc045de55eb..9254764a94d29e49bf70ad455ad015ecace5da2d 100644 --- a/examples/extraction.yml +++ b/examples/extraction.yml @@ -18,9 +18,5 @@ image: should_rotate: false skew_angle: 0 select: + dataset: # Fill me element_type: null - parent_type: null -split: - test_folder: null # Fill me or other folders - train_folder: null # Fill me or other folders - validation_folder: null # Fill me or other folders diff --git a/examples/split.yml b/examples/split.yml deleted file mode 100644 index 5eaebf1fd8af6e4e3e451087eeb3ae83b1c83624..0000000000000000000000000000000000000000 --- a/examples/split.yml +++ /dev/null @@ -1,13 +0,0 @@ -common: - cache_dir: .cache - dataset_name: # Fill me - log_parameters: true - output_dir: # Fill me -split: - test_ratio: 0.1 - train_ratio: 0.8 - use_existing_split: false - val_ratio: 0.1 - train_folder: null - validation_folder: null - test_folder: null diff --git a/mkdocs.yml b/mkdocs.yml index aa3d13453e6caf8d262d0f27b24ef2290e699e7b..fa0183982c7d6eaa6872219771062f5b8adbb511 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -58,8 +58,6 @@ nav: - Dataset extraction: - Arguments: ref/extract/arguments.md - PyLaia-specific arguments: ref/extract/pylaia/arguments.md - - Dataset splitting: - - Arguments: ref/split/arguments.md - Development: dev.md markdown_extensions: diff --git a/tests/conftest.py b/tests/conftest.py index 82e88e6db5cc326a817fa52fad30514a8e360597..46482cb8c4de4fa22cdd4853c8c5753ff2c60917 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,6 +5,7 @@ import pytest from PIL import Image FIXTURES = Path(__file__).resolve().parent / "data" +DATASET_ID = "aa30fea9-3b12-497f-ac6b-eb4c2ee22c48" @pytest.fixture(autouse=True) diff --git a/tests/data/pylaia/corpus_lm.txt b/tests/data/pylaia/corpus_lm.txt index d8eac54947bfcb16aa1dcc89113aa26522bcb447..c7bf28d014366a2289a14803f6a141e456ad8463 100644 --- a/tests/data/pylaia/corpus_lm.txt +++ b/tests/data/pylaia/corpus_lm.txt @@ -1,3 +1,11 @@ +p l a c e <space> a g r e e <space> w i t h <space> m e <space> b e t t e r <space> t h a n <space> N a p l e s <space> . <space> T h e <space> j o u r n e y +w i n t e r <space> . <space> I n <space> l a t e <space> N o v e m b e r <space> , <space> h e <space> w a s <space> ' <space> s u f f e r i n g <space> a s +u s u a l <space> ' <space> , <space> b u t <space> h o p e d <space> , <space> h e <space> t o l d <space> A r t h u r <space> , <space> ' <space> t o <space> f i n d <space> t h i s +W h e n <space> t h e <space> s a i l i n g <space> s e a s o n <space> w a s <space> p a s t <space> , <space> h e <space> s e n t <space> P e a r l +& <space> I <space> h a v e <space> n o t <space> y e t <space> t h o u g h t <space> o f <space> a <space> f i r e <space> . . . <space> . +h a s <space> b e e n <space> a g a i n s t <space> m e <space> , <space> a s <space> t h e r e <space> h a s <space> b e e n <space> m u c h +r a i n <space> a n d <space> d a m p <space> , <space> b u t <space> t h e <space> t e m p e r a t u r e <space> i s <space> h i g h +b a c k <space> t o <space> E n g l a n d <space> , <space> a n d <space> r e t u r n e d <space> t o <space> R o m e <space> f o r <space> t h e t h e <space> g l a r i n g <space> b e a c o n <space> o f <space> a <space> l i g h t h o u s e <space> . t e l l s <space> h i s <space> s t o r y <space> b e s t <space> i n <space> t h e <space> f a c e s <space> o f <space> h i s <space> c r o w d s <space> , t h e <space> w i l d l y <space> f l a p p i n g <space> w h i t e <space> s a i l s <space> s l a s h e d <space> b y @@ -6,12 +14,4 @@ H e r e <space> , <space> t h e <space> g u e s t s <space> a r r i v e <space> A <space> s u m m a r y <space> o f <space> t h e <space> s t o r y <space> c a n <space> g i v e <space> a l m o s t t r e m e n d o u s <space> c l i m a x <space> i n <space> t h e <space> i s l a n d <space> o r g y <space> . r e c o r d i n g <space> e v e r y <space> w r i n k l e <space> a n d <space> d r o p <space> o f <space> s w e a t -i n d i c a t i o n <space> o f <space> t h e <space> s c o p e <space> o f <space> W i c k i ' s <space> a r t i s t r y <space> . <space> H e -p l a c e <space> a g r e e <space> w i t h <space> m e <space> b e t t e r <space> t h a n <space> N a p l e s <space> . <space> T h e <space> j o u r n e y -w i n t e r <space> . <space> I n <space> l a t e <space> N o v e m b e r <space> , <space> h e <space> w a s <space> ' <space> s u f f e r i n g <space> a s -u s u a l <space> ' <space> , <space> b u t <space> h o p e d <space> , <space> h e <space> t o l d <space> A r t h u r <space> , <space> ' <space> t o <space> f i n d <space> t h i s -W h e n <space> t h e <space> s a i l i n g <space> s e a s o n <space> w a s <space> p a s t <space> , <space> h e <space> s e n t <space> P e a r l -& <space> I <space> h a v e <space> n o t <space> y e t <space> t h o u g h t <space> o f <space> a <space> f i r e <space> . . . <space> . -h a s <space> b e e n <space> a g a i n s t <space> m e <space> , <space> a s <space> t h e r e <space> h a s <space> b e e n <space> m u c h -r a i n <space> a n d <space> d a m p <space> , <space> b u t <space> t h e <space> t e m p e r a t u r e <space> i s <space> h i g h -b a c k <space> t o <space> E n g l a n d <space> , <space> a n d <space> r e t u r n e d <space> t o <space> R o m e <space> f o r <space> t h e \ No newline at end of file +i n d i c a t i o n <space> o f <space> t h e <space> s c o p e <space> o f <space> W i c k i ' s <space> a r t i s t r y <space> . <space> H e \ No newline at end of file diff --git a/tests/data/pylaia/train.txt b/tests/data/pylaia/train.txt index 4a0d3bfd1e81605a09c0cf9959b4cb4baa56fbe7..15e03a2addca014e9e438640cc15562bb0153e6e 100644 --- a/tests/data/pylaia/train.txt +++ b/tests/data/pylaia/train.txt @@ -1,3 +1,11 @@ +e26e6803-18da-4768-be30-a0a68132107c_g06-018m-04_19e23d82-b2de-4482-94af-7f2a59fd786d.jpg p l a c e <space> a g r e e <space> w i t h <space> m e <space> b e t t e r <space> t h a n <space> N a p l e s <space> . <space> T h e <space> j o u r n e y +e26e6803-18da-4768-be30-a0a68132107c_g06-018m-02_2c887901-c9ff-4d5b-a242-d7a5503e4421.jpg w i n t e r <space> . <space> I n <space> l a t e <space> N o v e m b e r <space> , <space> h e <space> w a s <space> ' <space> s u f f e r i n g <space> a s +e26e6803-18da-4768-be30-a0a68132107c_g06-018m-03_34bb1d7e-657d-404d-b602-54d8e78eeed4.jpg u s u a l <space> ' <space> , <space> b u t <space> h o p e d <space> , <space> h e <space> t o l d <space> A r t h u r <space> , <space> ' <space> t o <space> f i n d <space> t h i s +e26e6803-18da-4768-be30-a0a68132107c_g06-018m-00_6411b293-dee0-4002-ac82-ffc5cdcb499d.jpg W h e n <space> t h e <space> s a i l i n g <space> s e a s o n <space> w a s <space> p a s t <space> , <space> h e <space> s e n t <space> P e a r l +e26e6803-18da-4768-be30-a0a68132107c_g06-018m-07_c32df3a4-6af6-4bd7-9743-fbee03f74c48.jpg & <space> I <space> h a v e <space> n o t <space> y e t <space> t h o u g h t <space> o f <space> a <space> f i r e <space> . . . <space> . +e26e6803-18da-4768-be30-a0a68132107c_g06-018m-05_cf904505-2414-495a-9ca0-7ff63f604e39.jpg h a s <space> b e e n <space> a g a i n s t <space> m e <space> , <space> a s <space> t h e r e <space> h a s <space> b e e n <space> m u c h +e26e6803-18da-4768-be30-a0a68132107c_g06-018m-06_e9987625-0940-4286-bff9-01156b946cf0.jpg r a i n <space> a n d <space> d a m p <space> , <space> b u t <space> t h e <space> t e m p e r a t u r e <space> i s <space> h i g h +e26e6803-18da-4768-be30-a0a68132107c_g06-018m-01_f613f337-ed74-4035-91c0-cd173336c49e.jpg b a c k <space> t o <space> E n g l a n d <space> , <space> a n d <space> r e t u r n e d <space> t o <space> R o m e <space> f o r <space> t h e c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-08_2a1405cc-c8ad-4478-858b-d8f80902dd08.jpg t h e <space> g l a r i n g <space> b e a c o n <space> o f <space> a <space> l i g h t h o u s e <space> . c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-02_4509e20f-6ea4-42ce-98a9-2d31f7c1217a.jpg t e l l s <space> h i s <space> s t o r y <space> b e s t <space> i n <space> t h e <space> f a c e s <space> o f <space> h i s <space> c r o w d s <space> , c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-07_4dafcf5b-4ffc-4416-a03f-800bcfc6ea32.jpg t h e <space> w i l d l y <space> f l a p p i n g <space> w h i t e <space> s a i l s <space> s l a s h e d <space> b y @@ -6,12 +14,4 @@ c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-06_9085a7f8-d52a-4bd9-955c-7ea52a80 c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-00_96f12fde-ce73-4f02-8fc8-ad952e9b3db6.jpg A <space> s u m m a r y <space> o f <space> t h e <space> s t o r y <space> c a n <space> g i v e <space> a l m o s t c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-05_9b7a03bd-d5de-426d-b743-8aebc3fdc960.jpg t r e m e n d o u s <space> c l i m a x <space> i n <space> t h e <space> i s l a n d <space> o r g y <space> . c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-03_e21e731c-ca1b-4584-8d86-d585e5b749d3.jpg r e c o r d i n g <space> e v e r y <space> w r i n k l e <space> a n d <space> d r o p <space> o f <space> s w e a t -c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-01_e78fc32e-d2d4-4bf1-b684-2e3fc1983893.jpg i n d i c a t i o n <space> o f <space> t h e <space> s c o p e <space> o f <space> W i c k i ' s <space> a r t i s t r y <space> . <space> H e -e26e6803-18da-4768-be30-a0a68132107c_g06-018m-04_19e23d82-b2de-4482-94af-7f2a59fd786d.jpg p l a c e <space> a g r e e <space> w i t h <space> m e <space> b e t t e r <space> t h a n <space> N a p l e s <space> . <space> T h e <space> j o u r n e y -e26e6803-18da-4768-be30-a0a68132107c_g06-018m-02_2c887901-c9ff-4d5b-a242-d7a5503e4421.jpg w i n t e r <space> . <space> I n <space> l a t e <space> N o v e m b e r <space> , <space> h e <space> w a s <space> ' <space> s u f f e r i n g <space> a s -e26e6803-18da-4768-be30-a0a68132107c_g06-018m-03_34bb1d7e-657d-404d-b602-54d8e78eeed4.jpg u s u a l <space> ' <space> , <space> b u t <space> h o p e d <space> , <space> h e <space> t o l d <space> A r t h u r <space> , <space> ' <space> t o <space> f i n d <space> t h i s -e26e6803-18da-4768-be30-a0a68132107c_g06-018m-00_6411b293-dee0-4002-ac82-ffc5cdcb499d.jpg W h e n <space> t h e <space> s a i l i n g <space> s e a s o n <space> w a s <space> p a s t <space> , <space> h e <space> s e n t <space> P e a r l -e26e6803-18da-4768-be30-a0a68132107c_g06-018m-07_c32df3a4-6af6-4bd7-9743-fbee03f74c48.jpg & <space> I <space> h a v e <space> n o t <space> y e t <space> t h o u g h t <space> o f <space> a <space> f i r e <space> . . . <space> . -e26e6803-18da-4768-be30-a0a68132107c_g06-018m-05_cf904505-2414-495a-9ca0-7ff63f604e39.jpg h a s <space> b e e n <space> a g a i n s t <space> m e <space> , <space> a s <space> t h e r e <space> h a s <space> b e e n <space> m u c h -e26e6803-18da-4768-be30-a0a68132107c_g06-018m-06_e9987625-0940-4286-bff9-01156b946cf0.jpg r a i n <space> a n d <space> d a m p <space> , <space> b u t <space> t h e <space> t e m p e r a t u r e <space> i s <space> h i g h -e26e6803-18da-4768-be30-a0a68132107c_g06-018m-01_f613f337-ed74-4035-91c0-cd173336c49e.jpg b a c k <space> t o <space> E n g l a n d <space> , <space> a n d <space> r e t u r n e d <space> t o <space> R o m e <space> f o r <space> t h e \ No newline at end of file +c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-01_e78fc32e-d2d4-4bf1-b684-2e3fc1983893.jpg i n d i c a t i o n <space> o f <space> t h e <space> s c o p e <space> o f <space> W i c k i ' s <space> a r t i s t r y <space> . <space> H e \ No newline at end of file diff --git a/tests/data/pylaia/train_ids.txt b/tests/data/pylaia/train_ids.txt index 24a4df5bb934821da6de30ff5dabc50717753998..3459181dbf3c2860765575b417613774edff732e 100644 --- a/tests/data/pylaia/train_ids.txt +++ b/tests/data/pylaia/train_ids.txt @@ -1,3 +1,11 @@ +e26e6803-18da-4768-be30-a0a68132107c_g06-018m-04_19e23d82-b2de-4482-94af-7f2a59fd786d.jpg +e26e6803-18da-4768-be30-a0a68132107c_g06-018m-02_2c887901-c9ff-4d5b-a242-d7a5503e4421.jpg +e26e6803-18da-4768-be30-a0a68132107c_g06-018m-03_34bb1d7e-657d-404d-b602-54d8e78eeed4.jpg +e26e6803-18da-4768-be30-a0a68132107c_g06-018m-00_6411b293-dee0-4002-ac82-ffc5cdcb499d.jpg +e26e6803-18da-4768-be30-a0a68132107c_g06-018m-07_c32df3a4-6af6-4bd7-9743-fbee03f74c48.jpg +e26e6803-18da-4768-be30-a0a68132107c_g06-018m-05_cf904505-2414-495a-9ca0-7ff63f604e39.jpg +e26e6803-18da-4768-be30-a0a68132107c_g06-018m-06_e9987625-0940-4286-bff9-01156b946cf0.jpg +e26e6803-18da-4768-be30-a0a68132107c_g06-018m-01_f613f337-ed74-4035-91c0-cd173336c49e.jpg c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-08_2a1405cc-c8ad-4478-858b-d8f80902dd08.jpg c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-02_4509e20f-6ea4-42ce-98a9-2d31f7c1217a.jpg c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-07_4dafcf5b-4ffc-4416-a03f-800bcfc6ea32.jpg @@ -6,12 +14,4 @@ c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-06_9085a7f8-d52a-4bd9-955c-7ea52a80 c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-00_96f12fde-ce73-4f02-8fc8-ad952e9b3db6.jpg c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-05_9b7a03bd-d5de-426d-b743-8aebc3fdc960.jpg c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-03_e21e731c-ca1b-4584-8d86-d585e5b749d3.jpg -c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-01_e78fc32e-d2d4-4bf1-b684-2e3fc1983893.jpg -e26e6803-18da-4768-be30-a0a68132107c_g06-018m-04_19e23d82-b2de-4482-94af-7f2a59fd786d.jpg -e26e6803-18da-4768-be30-a0a68132107c_g06-018m-02_2c887901-c9ff-4d5b-a242-d7a5503e4421.jpg -e26e6803-18da-4768-be30-a0a68132107c_g06-018m-03_34bb1d7e-657d-404d-b602-54d8e78eeed4.jpg -e26e6803-18da-4768-be30-a0a68132107c_g06-018m-00_6411b293-dee0-4002-ac82-ffc5cdcb499d.jpg -e26e6803-18da-4768-be30-a0a68132107c_g06-018m-07_c32df3a4-6af6-4bd7-9743-fbee03f74c48.jpg -e26e6803-18da-4768-be30-a0a68132107c_g06-018m-05_cf904505-2414-495a-9ca0-7ff63f604e39.jpg -e26e6803-18da-4768-be30-a0a68132107c_g06-018m-06_e9987625-0940-4286-bff9-01156b946cf0.jpg -e26e6803-18da-4768-be30-a0a68132107c_g06-018m-01_f613f337-ed74-4035-91c0-cd173336c49e.jpg \ No newline at end of file +c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-01_e78fc32e-d2d4-4bf1-b684-2e3fc1983893.jpg \ No newline at end of file diff --git a/tests/data/pylaia/train_no_space.txt b/tests/data/pylaia/train_no_space.txt index 0ba886525dfccb4ef7164011af8d981aa1f3de70..085561b4d6f56a180982b7e8de218bd21566bd32 100644 --- a/tests/data/pylaia/train_no_space.txt +++ b/tests/data/pylaia/train_no_space.txt @@ -1,3 +1,11 @@ +e26e6803-18da-4768-be30-a0a68132107c_g06-018m-04_19e23d82-b2de-4482-94af-7f2a59fd786d.jpg place agree with me better than Naples . The journey +e26e6803-18da-4768-be30-a0a68132107c_g06-018m-02_2c887901-c9ff-4d5b-a242-d7a5503e4421.jpg winter . In late November , he was ' suffering as +e26e6803-18da-4768-be30-a0a68132107c_g06-018m-03_34bb1d7e-657d-404d-b602-54d8e78eeed4.jpg usual ' , but hoped , he told Arthur , ' to find this +e26e6803-18da-4768-be30-a0a68132107c_g06-018m-00_6411b293-dee0-4002-ac82-ffc5cdcb499d.jpg When the sailing season was past , he sent Pearl +e26e6803-18da-4768-be30-a0a68132107c_g06-018m-07_c32df3a4-6af6-4bd7-9743-fbee03f74c48.jpg & I have not yet thought of a fire ... . +e26e6803-18da-4768-be30-a0a68132107c_g06-018m-05_cf904505-2414-495a-9ca0-7ff63f604e39.jpg has been against me , as there has been much +e26e6803-18da-4768-be30-a0a68132107c_g06-018m-06_e9987625-0940-4286-bff9-01156b946cf0.jpg rain and damp , but the temperature is high +e26e6803-18da-4768-be30-a0a68132107c_g06-018m-01_f613f337-ed74-4035-91c0-cd173336c49e.jpg back to England , and returned to Rome for the c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-08_2a1405cc-c8ad-4478-858b-d8f80902dd08.jpg the glaring beacon of a lighthouse . c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-02_4509e20f-6ea4-42ce-98a9-2d31f7c1217a.jpg tells his story best in the faces of his crowds , c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-07_4dafcf5b-4ffc-4416-a03f-800bcfc6ea32.jpg the wildly flapping white sails slashed by @@ -6,12 +14,4 @@ c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-06_9085a7f8-d52a-4bd9-955c-7ea52a80 c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-00_96f12fde-ce73-4f02-8fc8-ad952e9b3db6.jpg A summary of the story can give almost c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-05_9b7a03bd-d5de-426d-b743-8aebc3fdc960.jpg tremendous climax in the island orgy . c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-03_e21e731c-ca1b-4584-8d86-d585e5b749d3.jpg recording every wrinkle and drop of sweat -c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-01_e78fc32e-d2d4-4bf1-b684-2e3fc1983893.jpg indication of the scope of Wicki's artistry . He -e26e6803-18da-4768-be30-a0a68132107c_g06-018m-04_19e23d82-b2de-4482-94af-7f2a59fd786d.jpg place agree with me better than Naples . The journey -e26e6803-18da-4768-be30-a0a68132107c_g06-018m-02_2c887901-c9ff-4d5b-a242-d7a5503e4421.jpg winter . In late November , he was ' suffering as -e26e6803-18da-4768-be30-a0a68132107c_g06-018m-03_34bb1d7e-657d-404d-b602-54d8e78eeed4.jpg usual ' , but hoped , he told Arthur , ' to find this -e26e6803-18da-4768-be30-a0a68132107c_g06-018m-00_6411b293-dee0-4002-ac82-ffc5cdcb499d.jpg When the sailing season was past , he sent Pearl -e26e6803-18da-4768-be30-a0a68132107c_g06-018m-07_c32df3a4-6af6-4bd7-9743-fbee03f74c48.jpg & I have not yet thought of a fire ... . -e26e6803-18da-4768-be30-a0a68132107c_g06-018m-05_cf904505-2414-495a-9ca0-7ff63f604e39.jpg has been against me , as there has been much -e26e6803-18da-4768-be30-a0a68132107c_g06-018m-06_e9987625-0940-4286-bff9-01156b946cf0.jpg rain and damp , but the temperature is high -e26e6803-18da-4768-be30-a0a68132107c_g06-018m-01_f613f337-ed74-4035-91c0-cd173336c49e.jpg back to England , and returned to Rome for the \ No newline at end of file +c673bd94-96b1-4a2e-8662-a4d806940b5f_c02-026-01_e78fc32e-d2d4-4bf1-b684-2e3fc1983893.jpg indication of the scope of Wicki's artistry . He \ No newline at end of file diff --git a/tests/data/pylaia/val.txt b/tests/data/pylaia/val.txt index 68c76fc1718b9ef4f72d1936e3f9eb102b958824..8b84e5012a8ea1efa8c07849291819d6a606b059 100644 --- a/tests/data/pylaia/val.txt +++ b/tests/data/pylaia/val.txt @@ -1,13 +1,3 @@ -41b78478-04cc-4995-9787-47794d305a09_f04-032-00_07b7efc7-27bd-4939-8ef6-30daf8ee9fe2.jpg T h e y <space> a l s o <space> h a d <space> t o <space> c o p e <space> w i t h <space> t h e <space> u s u a l <space> f l o o d <space> o f <space> r u m o u r s <space> , <space> s o - -41b78478-04cc-4995-9787-47794d305a09_f04-032-07_27b27def-1480-4995-a1fc-e83aa92b669f.jpg i n <space> w h i c h <space> s h e <space> h a d <space> d i e d <space> , <space> t h e r e <space> w a s <space> n o t h i n g <space> o n -41b78478-04cc-4995-9787-47794d305a09_f04-032-06_6b2139e4-3568-44c4-ac03-e5fef565fa5c.jpg t h e <space> v i c t i m <space> a n d <space> i n s p e c t i n g <space> a t <space> W a t e r l o o <space> t h e <space> c a r r i a g e -41b78478-04cc-4995-9787-47794d305a09_f04-032-09_920f2165-85d8-4f0d-8182-0c430c0c28f5.jpg a d j o u r n e d <space> . -41b78478-04cc-4995-9787-47794d305a09_f04-032-03_bdb9728f-5d82-4d69-b21b-016c2590c74b.jpg t h e <space> T h u r s d a y <space> e v e n i n g <space> , <space> w i t h <space> b l o o d <space> a c t u a l l y <space> d r i p p i n g <space> f r o m -41b78478-04cc-4995-9787-47794d305a09_f04-032-04_c724af5d-d78c-4041-8e5f-1548298428c4.jpg h i s <space> h a n d s <space> . <space> T h e <space> i n <unk> u e s t <space> w a s <space> o p e n e d <space> o n <space> <unk> e b r u a r y <space> <unk> <unk> <space> , <space> b u t <space> , -41b78478-04cc-4995-9787-47794d305a09_f04-032-02_d0045984-87fc-4675-9ba4-3732943b2dc1.jpg m a n <space> h a d <space> b e e n <space> s e e n <space> f l e e i n g <space> f r o m <space> <unk> a u x h a l l <space> s t a t i o n <space> o n -41b78478-04cc-4995-9787-47794d305a09_f04-032-01_da7e3e4f-8dc5-4a22-b650-28006c981691.jpg m e <space> w e l l <space> m e a n t <space> , <space> s o m e <space> m i s c h i e v o u s <space> , <space> i n c l u d i n g <space> o n e <space> t h a t <space> a -41b78478-04cc-4995-9787-47794d305a09_f04-032-05_e2fc7a77-2b60-4a40-bebc-a698d42ed134.jpg b e y o n d <space> t h e <space> j u r y <space> h e a r i n g <space> a <space> f o r m a l <space> i d e n t i f i c a t i o n <space> o f -41b78478-04cc-4995-9787-47794d305a09_f04-032-08_ffda995f-ae71-499d-b963-a54d9ae6a9da.jpg w h i c h <space> t o <space> p r o c e e d <space> , <space> a n d <space> t h e <space> i n <unk> u e s t <space> w a s cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-07_025b5770-f876-43d6-9466-7acd58ee3463.jpg a r t i c l e <space> i n <space> t h e i r <space> f e s t a l <space> p r e p a r a t i o n s <space> <unk> <space> a n d <space> i t <space> i s <space> t h e cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-05_36ca3b5a-626c-4b1d-9c77-53ee91db5632.jpg w a s <space> a <space> s u b j e c t <space> o f <space> n o <space> l i t t l e <space> i m p o r t a n c e <space> t o <space> t h e <space> R o m a n s <space> . cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-00_5110fb10-7122-4d2f-9245-6f50ec81ec13.jpg <unk> y <space> d e l i c a t e <space> a p p l i c a t i o n <space> o f <space> o d o u r s <space> a n d <space> r i c h l y - d i s t i l l e d @@ -18,4 +8,14 @@ cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-10_8ffd0c0f-188c-48d9-abf2-d1f09b1 cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-02_be65ccfb-d3f4-4446-959c-3be85b554c8a.jpg f a i n t i n g <space> a p p e t i t e <space> a n d <space> a d d e d <space> a <space> m o r e <space> e x <unk> u i s i t e <space> a n d cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-06_cffe26ef-d0d2-4761-9d8f-471bc8b2432f.jpg T h e y <space> c o n s i d e r e d <space> f l o w e r s <space> a s <space> f o r m i n g <space> a <space> v e r y <space> e s s e n t i a l cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-03_dafccdac-2359-4f10-a958-aa901598c4ae.jpg e t h e r e a l <space> e n j o y m e n t <space> t o <space> t h e <space> g r o s s e r <space> p l e a s u r e s <space> o f <space> t h e -cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-04_ffc1422b-f551-4748-999a-86eede57cab2.jpg b o a r d <space> . <space> T h e <space> g r a t i f i c a t i o n <space> o f <space> t h e <space> s e n s e <space> o f <space> s m e l l i n g \ No newline at end of file +cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-04_ffc1422b-f551-4748-999a-86eede57cab2.jpg b o a r d <space> . <space> T h e <space> g r a t i f i c a t i o n <space> o f <space> t h e <space> s e n s e <space> o f <space> s m e l l i n g +41b78478-04cc-4995-9787-47794d305a09_f04-032-00_07b7efc7-27bd-4939-8ef6-30daf8ee9fe2.jpg T h e y <space> a l s o <space> h a d <space> t o <space> c o p e <space> w i t h <space> t h e <space> u s u a l <space> f l o o d <space> o f <space> r u m o u r s <space> , <space> s o - +41b78478-04cc-4995-9787-47794d305a09_f04-032-07_27b27def-1480-4995-a1fc-e83aa92b669f.jpg i n <space> w h i c h <space> s h e <space> h a d <space> d i e d <space> , <space> t h e r e <space> w a s <space> n o t h i n g <space> o n +41b78478-04cc-4995-9787-47794d305a09_f04-032-06_6b2139e4-3568-44c4-ac03-e5fef565fa5c.jpg t h e <space> v i c t i m <space> a n d <space> i n s p e c t i n g <space> a t <space> W a t e r l o o <space> t h e <space> c a r r i a g e +41b78478-04cc-4995-9787-47794d305a09_f04-032-09_920f2165-85d8-4f0d-8182-0c430c0c28f5.jpg a d j o u r n e d <space> . +41b78478-04cc-4995-9787-47794d305a09_f04-032-03_bdb9728f-5d82-4d69-b21b-016c2590c74b.jpg t h e <space> T h u r s d a y <space> e v e n i n g <space> , <space> w i t h <space> b l o o d <space> a c t u a l l y <space> d r i p p i n g <space> f r o m +41b78478-04cc-4995-9787-47794d305a09_f04-032-04_c724af5d-d78c-4041-8e5f-1548298428c4.jpg h i s <space> h a n d s <space> . <space> T h e <space> i n <unk> u e s t <space> w a s <space> o p e n e d <space> o n <space> <unk> e b r u a r y <space> <unk> <unk> <space> , <space> b u t <space> , +41b78478-04cc-4995-9787-47794d305a09_f04-032-02_d0045984-87fc-4675-9ba4-3732943b2dc1.jpg m a n <space> h a d <space> b e e n <space> s e e n <space> f l e e i n g <space> f r o m <space> <unk> a u x h a l l <space> s t a t i o n <space> o n +41b78478-04cc-4995-9787-47794d305a09_f04-032-01_da7e3e4f-8dc5-4a22-b650-28006c981691.jpg m e <space> w e l l <space> m e a n t <space> , <space> s o m e <space> m i s c h i e v o u s <space> , <space> i n c l u d i n g <space> o n e <space> t h a t <space> a +41b78478-04cc-4995-9787-47794d305a09_f04-032-05_e2fc7a77-2b60-4a40-bebc-a698d42ed134.jpg b e y o n d <space> t h e <space> j u r y <space> h e a r i n g <space> a <space> f o r m a l <space> i d e n t i f i c a t i o n <space> o f +41b78478-04cc-4995-9787-47794d305a09_f04-032-08_ffda995f-ae71-499d-b963-a54d9ae6a9da.jpg w h i c h <space> t o <space> p r o c e e d <space> , <space> a n d <space> t h e <space> i n <unk> u e s t <space> w a s \ No newline at end of file diff --git a/tests/data/pylaia/val_ids.txt b/tests/data/pylaia/val_ids.txt index 17e39f5792d22e90fe3187d9738ee459317cf6b0..5b2a6e55fc8eb8b53672f44629035c098d0bfd2f 100644 --- a/tests/data/pylaia/val_ids.txt +++ b/tests/data/pylaia/val_ids.txt @@ -1,13 +1,3 @@ -41b78478-04cc-4995-9787-47794d305a09_f04-032-00_07b7efc7-27bd-4939-8ef6-30daf8ee9fe2.jpg -41b78478-04cc-4995-9787-47794d305a09_f04-032-07_27b27def-1480-4995-a1fc-e83aa92b669f.jpg -41b78478-04cc-4995-9787-47794d305a09_f04-032-06_6b2139e4-3568-44c4-ac03-e5fef565fa5c.jpg -41b78478-04cc-4995-9787-47794d305a09_f04-032-09_920f2165-85d8-4f0d-8182-0c430c0c28f5.jpg -41b78478-04cc-4995-9787-47794d305a09_f04-032-03_bdb9728f-5d82-4d69-b21b-016c2590c74b.jpg -41b78478-04cc-4995-9787-47794d305a09_f04-032-04_c724af5d-d78c-4041-8e5f-1548298428c4.jpg -41b78478-04cc-4995-9787-47794d305a09_f04-032-02_d0045984-87fc-4675-9ba4-3732943b2dc1.jpg -41b78478-04cc-4995-9787-47794d305a09_f04-032-01_da7e3e4f-8dc5-4a22-b650-28006c981691.jpg -41b78478-04cc-4995-9787-47794d305a09_f04-032-05_e2fc7a77-2b60-4a40-bebc-a698d42ed134.jpg -41b78478-04cc-4995-9787-47794d305a09_f04-032-08_ffda995f-ae71-499d-b963-a54d9ae6a9da.jpg cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-07_025b5770-f876-43d6-9466-7acd58ee3463.jpg cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-05_36ca3b5a-626c-4b1d-9c77-53ee91db5632.jpg cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-00_5110fb10-7122-4d2f-9245-6f50ec81ec13.jpg @@ -18,4 +8,14 @@ cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-10_8ffd0c0f-188c-48d9-abf2-d1f09b1 cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-02_be65ccfb-d3f4-4446-959c-3be85b554c8a.jpg cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-06_cffe26ef-d0d2-4761-9d8f-471bc8b2432f.jpg cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-03_dafccdac-2359-4f10-a958-aa901598c4ae.jpg -cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-04_ffc1422b-f551-4748-999a-86eede57cab2.jpg \ No newline at end of file +cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-04_ffc1422b-f551-4748-999a-86eede57cab2.jpg +41b78478-04cc-4995-9787-47794d305a09_f04-032-00_07b7efc7-27bd-4939-8ef6-30daf8ee9fe2.jpg +41b78478-04cc-4995-9787-47794d305a09_f04-032-07_27b27def-1480-4995-a1fc-e83aa92b669f.jpg +41b78478-04cc-4995-9787-47794d305a09_f04-032-06_6b2139e4-3568-44c4-ac03-e5fef565fa5c.jpg +41b78478-04cc-4995-9787-47794d305a09_f04-032-09_920f2165-85d8-4f0d-8182-0c430c0c28f5.jpg +41b78478-04cc-4995-9787-47794d305a09_f04-032-03_bdb9728f-5d82-4d69-b21b-016c2590c74b.jpg +41b78478-04cc-4995-9787-47794d305a09_f04-032-04_c724af5d-d78c-4041-8e5f-1548298428c4.jpg +41b78478-04cc-4995-9787-47794d305a09_f04-032-02_d0045984-87fc-4675-9ba4-3732943b2dc1.jpg +41b78478-04cc-4995-9787-47794d305a09_f04-032-01_da7e3e4f-8dc5-4a22-b650-28006c981691.jpg +41b78478-04cc-4995-9787-47794d305a09_f04-032-05_e2fc7a77-2b60-4a40-bebc-a698d42ed134.jpg +41b78478-04cc-4995-9787-47794d305a09_f04-032-08_ffda995f-ae71-499d-b963-a54d9ae6a9da.jpg \ No newline at end of file diff --git a/tests/data/pylaia/val_no_space.txt b/tests/data/pylaia/val_no_space.txt index 4932651c7f667cb43d2e1bca20d35729dd98ead2..e433eb379a58b8ca53128c3e887cf7938bfb2960 100644 --- a/tests/data/pylaia/val_no_space.txt +++ b/tests/data/pylaia/val_no_space.txt @@ -1,13 +1,3 @@ -41b78478-04cc-4995-9787-47794d305a09_f04-032-00_07b7efc7-27bd-4939-8ef6-30daf8ee9fe2.jpg They also had to cope with the usual flood of rumours , so- -41b78478-04cc-4995-9787-47794d305a09_f04-032-07_27b27def-1480-4995-a1fc-e83aa92b669f.jpg in which she had died , there was nothing on -41b78478-04cc-4995-9787-47794d305a09_f04-032-06_6b2139e4-3568-44c4-ac03-e5fef565fa5c.jpg the victim and inspecting at Waterloo the carriage -41b78478-04cc-4995-9787-47794d305a09_f04-032-09_920f2165-85d8-4f0d-8182-0c430c0c28f5.jpg adjourned . -41b78478-04cc-4995-9787-47794d305a09_f04-032-03_bdb9728f-5d82-4d69-b21b-016c2590c74b.jpg the Thursday evening , with blood actually dripping from -41b78478-04cc-4995-9787-47794d305a09_f04-032-04_c724af5d-d78c-4041-8e5f-1548298428c4.jpg his hands . The inquest was opened on February 17 , but , -41b78478-04cc-4995-9787-47794d305a09_f04-032-02_d0045984-87fc-4675-9ba4-3732943b2dc1.jpg man had been seen fleeing from Vauxhall station on -41b78478-04cc-4995-9787-47794d305a09_f04-032-01_da7e3e4f-8dc5-4a22-b650-28006c981691.jpg me well meant , some mischievous , including one that a -41b78478-04cc-4995-9787-47794d305a09_f04-032-05_e2fc7a77-2b60-4a40-bebc-a698d42ed134.jpg beyond the jury hearing a formal identification of -41b78478-04cc-4995-9787-47794d305a09_f04-032-08_ffda995f-ae71-499d-b963-a54d9ae6a9da.jpg which to proceed , and the inquest was cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-07_025b5770-f876-43d6-9466-7acd58ee3463.jpg article in their festal preparations ; and it is the cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-05_36ca3b5a-626c-4b1d-9c77-53ee91db5632.jpg was a subject of no little importance to the Romans . cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-00_5110fb10-7122-4d2f-9245-6f50ec81ec13.jpg By delicate application of odours and richly-distilled @@ -18,4 +8,14 @@ cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-10_8ffd0c0f-188c-48d9-abf2-d1f09b1 cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-02_be65ccfb-d3f4-4446-959c-3be85b554c8a.jpg fainting appetite and added a more exquisite and cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-06_cffe26ef-d0d2-4761-9d8f-471bc8b2432f.jpg They considered flowers as forming a very essential cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-03_dafccdac-2359-4f10-a958-aa901598c4ae.jpg ethereal enjoyment to the grosser pleasures of the -cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-04_ffc1422b-f551-4748-999a-86eede57cab2.jpg board . The gratification of the sense of smelling \ No newline at end of file +cdfb799a-3b30-4ed4-b514-793778b2ed6e_f07-032a-04_ffc1422b-f551-4748-999a-86eede57cab2.jpg board . The gratification of the sense of smelling +41b78478-04cc-4995-9787-47794d305a09_f04-032-00_07b7efc7-27bd-4939-8ef6-30daf8ee9fe2.jpg They also had to cope with the usual flood of rumours , so- +41b78478-04cc-4995-9787-47794d305a09_f04-032-07_27b27def-1480-4995-a1fc-e83aa92b669f.jpg in which she had died , there was nothing on +41b78478-04cc-4995-9787-47794d305a09_f04-032-06_6b2139e4-3568-44c4-ac03-e5fef565fa5c.jpg the victim and inspecting at Waterloo the carriage +41b78478-04cc-4995-9787-47794d305a09_f04-032-09_920f2165-85d8-4f0d-8182-0c430c0c28f5.jpg adjourned . +41b78478-04cc-4995-9787-47794d305a09_f04-032-03_bdb9728f-5d82-4d69-b21b-016c2590c74b.jpg the Thursday evening , with blood actually dripping from +41b78478-04cc-4995-9787-47794d305a09_f04-032-04_c724af5d-d78c-4041-8e5f-1548298428c4.jpg his hands . The inquest was opened on February 17 , but , +41b78478-04cc-4995-9787-47794d305a09_f04-032-02_d0045984-87fc-4675-9ba4-3732943b2dc1.jpg man had been seen fleeing from Vauxhall station on +41b78478-04cc-4995-9787-47794d305a09_f04-032-01_da7e3e4f-8dc5-4a22-b650-28006c981691.jpg me well meant , some mischievous , including one that a +41b78478-04cc-4995-9787-47794d305a09_f04-032-05_e2fc7a77-2b60-4a40-bebc-a698d42ed134.jpg beyond the jury hearing a formal identification of +41b78478-04cc-4995-9787-47794d305a09_f04-032-08_ffda995f-ae71-499d-b963-a54d9ae6a9da.jpg which to proceed , and the inquest was \ No newline at end of file diff --git a/tests/data/test_db.sqlite b/tests/data/test_db.sqlite index 112afe1aeddbd2af2e32a9e0ce6ff2c201cf4c86..fa5669e340f3858fc10c915931dee38c06105ff6 100644 Binary files a/tests/data/test_db.sqlite and b/tests/data/test_db.sqlite differ diff --git a/tests/extract/test_base.py b/tests/extract/test_base.py index a7b95ff4170aa34d6bd4978ce7af27abad93d4ae..3e56a6894ef4581dbc170370f04d51247ba8da0a 100644 --- a/tests/extract/test_base.py +++ b/tests/extract/test_base.py @@ -13,7 +13,7 @@ from atr_data_generator.extract.arguments import ( SelectArgs, ) from atr_data_generator.extract.base import EXPORT_PATH, DataGenerator -from atr_data_generator.split.arguments import Partition, SplitArgs +from tests.conftest import DATASET_ID @pytest.mark.parametrize( @@ -23,46 +23,27 @@ from atr_data_generator.split.arguments import Partition, SplitArgs ([MANUAL]), # only manual transcriptions ), ) -@pytest.mark.parametrize( - "folders, expected_trans_lines", - ( - (["a0c4522d-2d80-4766-a01c-b9d686f41f6a"], [17]), - ( - [ - "a0c4522d-2d80-4766-a01c-b9d686f41f6a", - "39b9ac5c-89ab-4258-8116-965bf0ca0419", - ], - [17, 21], - ), - ), -) def test_run( database, - folders, image_cache, worker_version_ids, - expected_trans_lines, tmp_path, ): - train_folder, val_folder = folders if len(folders) == 2 else (folders[0], None) atr_data_gen = DataGenerator( common=CommonArgs( dataset_name="test", output_dir=tmp_path, cache_dir=image_cache ), image=ImageArgs(), - select=SelectArgs(), + select=SelectArgs(dataset=DATASET_ID), filter=FilterArgs(accepted_worker_version_ids=worker_version_ids), - split=SplitArgs(train_folder=train_folder, validation_folder=val_folder), ) atr_data_gen.run(database) # Read json transcription file data = json.loads((atr_data_gen.common.output_dir / EXPORT_PATH).read_text()) - - for partition, count in zip(Partition, expected_trans_lines): - assert len(data[partition.value]) == count + assert {key: len(value) for key, value in data.items()} == {"train": 17, "val": 21} # each image file should have one transcription file img_files = list(atr_data_gen.common.output_dir.rglob("*.jpg")) - assert len(img_files) == sum(expected_trans_lines) + assert len(img_files) == 38 diff --git a/tests/extract/test_pylaia.py b/tests/extract/test_pylaia.py index 71be6e5494a80112fc97662df34394c9ee192a26..81944ccada4d3868e0a62c5426ce79072766cf85 100644 --- a/tests/extract/test_pylaia.py +++ b/tests/extract/test_pylaia.py @@ -12,8 +12,7 @@ from atr_data_generator.extract.arguments import ( ) from atr_data_generator.extract.pylaia.arguments import PylaiaArgs from atr_data_generator.extract.pylaia.main import PylaiaDataGenerator -from atr_data_generator.split.arguments import SplitArgs -from tests.conftest import FIXTURES +from tests.conftest import DATASET_ID, FIXTURES FIXTURE_DIR = FIXTURES / "pylaia" @@ -25,38 +24,21 @@ FIXTURE_DIR = FIXTURES / "pylaia" ([MANUAL]), # only manual transcriptions ), ) -@pytest.mark.parametrize( - "folders, expected_trans_lines", - ( - (["a0c4522d-2d80-4766-a01c-b9d686f41f6a"], [17]), - ( - [ - "a0c4522d-2d80-4766-a01c-b9d686f41f6a", - "39b9ac5c-89ab-4258-8116-965bf0ca0419", - ], - [17, 21], - ), - ), -) @pytest.mark.parametrize("syms_path", ((None), (FIXTURE_DIR / "syms.txt"))) def test_run( database, - folders, image_cache, worker_version_ids, - expected_trans_lines, tmp_path, syms_path, ): - train_folder, val_folder = folders if len(folders) == 2 else (folders[0], None) atr_data_gen = PylaiaDataGenerator( common=CommonArgs( dataset_name="test", output_dir=tmp_path, cache_dir=image_cache ), image=ImageArgs(), - select=SelectArgs(), + select=SelectArgs(dataset=DATASET_ID), filter=FilterArgs(accepted_worker_version_ids=worker_version_ids), - split=SplitArgs(train_folder=train_folder, validation_folder=val_folder), pylaia=PylaiaArgs(syms_path=syms_path), ) @@ -68,4 +50,4 @@ def test_run( # each image file should have one transcription file img_files = list(atr_data_gen.common.output_dir.rglob("*.jpg")) - assert len(img_files) == sum(expected_trans_lines) + assert len(img_files) == 38