diff --git a/dan/datasets/analyze/__init__.py b/dan/datasets/analyze/__init__.py index 1ce275ef509604af1ac902778b536d520a01fb25..d83e3e479ab0b8fd8e13e90316f5e1a86097cb6a 100644 --- a/dan/datasets/analyze/__init__.py +++ b/dan/datasets/analyze/__init__.py @@ -6,7 +6,7 @@ Analyze dataset and display statistics in markdown format. from pathlib import Path from dan.datasets.analyze.statistics import run -from dan.utils import read_json, read_yaml +from dan.utils import read_yaml def add_analyze_parser(subcommands) -> None: @@ -17,7 +17,7 @@ def add_analyze_parser(subcommands) -> None: ) parser.add_argument( "--labels", - type=read_json, + type=Path, help="Path to the formatted labels in JSON format.", required=True, ) diff --git a/dan/datasets/analyze/statistics.py b/dan/datasets/analyze/statistics.py index e937ab659ab5d7aedb2d79ab44c55f5648f5d32f..d1cc4dcd805abd063a4eb8594aed2311c6dabdd0 100644 --- a/dan/datasets/analyze/statistics.py +++ b/dan/datasets/analyze/statistics.py @@ -10,6 +10,8 @@ import numpy as np from mdutils.mdutils import MdUtils from prettytable import MARKDOWN, PrettyTable +from dan.utils import read_json + logger = logging.getLogger(__name__) METRIC_COLUMN = "Metric" @@ -157,14 +159,18 @@ class Statistics: level=3, ) - def run(self, labels: Dict, tokens: Dict | None): + def run(self, labels_path: Path, tokens: Dict | None): + labels = read_json(labels_path) + # Iterate over each split for split_name, split_data in labels.items(): self.document.new_header(level=1, title=split_name.capitalize()) # Image statistics # Path to the images are the key of the dict - self.create_image_statistics(images=split_data.keys()) + self.create_image_statistics( + images=[labels_path.parent / image_path for image_path in split_data] + ) labels = list(split_data.values()) # Text statistics @@ -175,8 +181,8 @@ class Statistics: self.document.create_md_file() -def run(labels: Dict, tokens: Dict | None, output: Path) -> None: +def run(labels: Path, tokens: Dict | None, output: Path) -> None: """ Compute and save a dataset statistics. """ - Statistics(filename=str(output)).run(labels=labels, tokens=tokens) + Statistics(filename=str(output)).run(labels_path=labels, tokens=tokens) diff --git a/dan/datasets/download/images.py b/dan/datasets/download/images.py index c1702e51525c1ed0e00c3a3e641c291ebcdb912a..b0eabdd9b1684d0e61b9354cc743285702cb1d7f 100644 --- a/dan/datasets/download/images.py +++ b/dan/datasets/download/images.py @@ -49,7 +49,7 @@ class ImageDownloader: self.image_extension = image_extension # Load split file - split_file = output / "split.json" if output else None + split_file = self.output / "split.json" if self.output else None self.split: Dict = ( json.loads(split_file.read_text()) if split_file and split_file.is_file() @@ -127,7 +127,10 @@ class ImageDownloader: image_path = destination / values["dataset_id"] / filename image_path.parent.mkdir(parents=True, exist_ok=True) - self.data[split][str(image_path)] = values["text"] + # Store a relative path to the label file in case we need to move the data elsewhere + self.data[split][str(image_path.relative_to(self.output))] = values[ + "text" + ] # Create task for multithreading pool if image does not exist yet if image_path.exists(): diff --git a/dan/ocr/manager/dataset.py b/dan/ocr/manager/dataset.py index af90b10c4299b88dd2dcc0440279731962fd2107..46b14072af0c1375b8d38281e620db1c17f98adb 100644 --- a/dan/ocr/manager/dataset.py +++ b/dan/ocr/manager/dataset.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- import copy import json -from pathlib import Path import numpy as np from torch.utils.data import Dataset @@ -95,7 +94,7 @@ class OCRDataset(Dataset): set_name = path_and_set["set_name"] gt = gt_per_set[set_name] for filename in natural_sort(gt): - filepath = Path(filename) + filepath = path / filename samples.append( { "name": filepath.name, @@ -105,7 +104,7 @@ class OCRDataset(Dataset): ) if self.load_in_memory: samples[-1]["img"] = self.preprocessing_transforms( - read_image(filename) + read_image(str(filepath)) ) return samples diff --git a/tests/data/prediction/labels.json b/tests/data/prediction/labels.json index 6efebc38e665e2431142f5b577d4854dc538880a..23a1ee411692b4ef2c1b1b997c3cf5ea874e5b06 100644 --- a/tests/data/prediction/labels.json +++ b/tests/data/prediction/labels.json @@ -1,12 +1,12 @@ { "train": { - "tests/data/prediction/images/0a56e8b3-95cd-4fa5-a17b-5b0ff9e6ea84.png": "ⓈBellisson â’»Georges â’·91 â“P â’¸M â“€Ch â“„Plombier â“…12241", - "tests/data/prediction/images/0dfe8bcd-ed0b-453e-bf19-cc697012296e.png": "ⓈTemplié â’»Marcelle â’·93 â“J â“€ch â“„E dachyle" + "images/0a56e8b3-95cd-4fa5-a17b-5b0ff9e6ea84.png": "ⓈBellisson â’»Georges â’·91 â“P â’¸M â“€Ch â“„Plombier â“…12241", + "images/0dfe8bcd-ed0b-453e-bf19-cc697012296e.png": "ⓈTemplié â’»Marcelle â’·93 â“J â“€ch â“„E dachyle" }, "val": { - "tests/data/prediction/images/2c242f5c-e979-43c4-b6f2-a6d4815b651d.png": "ⓈA â’»Charles â’·11 â“P â’¸C â“€F â“„A â“…14331" + "images/2c242f5c-e979-43c4-b6f2-a6d4815b651d.png": "ⓈA â’»Charles â’·11 â“P â’¸C â“€F â“„A â“…14331" }, "test": { - "tests/data/prediction/images/ffdec445-7f14-4f5f-be44-68d0844d0df1.png": "ⓈNaudin â’»Marie â’·53 â“S â’¸V â“€Belle mère" + "images/ffdec445-7f14-4f5f-be44-68d0844d0df1.png": "ⓈNaudin â’»Marie â’·53 â“S â’¸V â“€Belle mère" } } diff --git a/tests/data/training/training_dataset/labels.json b/tests/data/training/training_dataset/labels.json index da3b3df76ef9b53773aebc1a23305510494e57db..75d28b2f546fadab7ef498c42da34df3fbe5c7d2 100644 --- a/tests/data/training/training_dataset/labels.json +++ b/tests/data/training/training_dataset/labels.json @@ -1,14 +1,14 @@ { "train": { - "tests/data/training/training_dataset/images/0a34e13a-4ab0-4a91-8d7c-b1d8fee32628.png": "The latter do not regard", - "tests/data/training/training_dataset/images/0a70e14f-feda-4607-989c-36cf581ddff5.png": "At the beginning of" + "images/0a34e13a-4ab0-4a91-8d7c-b1d8fee32628.png": "The latter do not regard", + "images/0a70e14f-feda-4607-989c-36cf581ddff5.png": "At the beginning of" }, "val": { - "tests/data/training/training_dataset/images/0a576062-303c-4893-a729-c09c92865d31.png": "One can remember with", - "tests/data/training/training_dataset/images/0b2457c8-81f1-4600-84d9-f8bf2822a991.png": "The play was no more" + "images/0a576062-303c-4893-a729-c09c92865d31.png": "One can remember with", + "images/0b2457c8-81f1-4600-84d9-f8bf2822a991.png": "The play was no more" }, "test": { - "tests/data/training/training_dataset/images/fb3edb59-3678-49f8-8e16-8e32e3b0f051.png": "Both her wrists bore", - "tests/data/training/training_dataset/images/fe498de2-ece4-4fbe-8b53-edfce1b820f0.png": "SOME years ago a contemporary" + "images/fb3edb59-3678-49f8-8e16-8e32e3b0f051.png": "Both her wrists bore", + "images/fe498de2-ece4-4fbe-8b53-edfce1b820f0.png": "SOME years ago a contemporary" } } diff --git a/tests/test_analyze.py b/tests/test_analyze.py index 82d18fe1608df485f4d50688b423465443d40d0b..43e23823fd434e785883dae6cff9cbb7c528cb5d 100644 --- a/tests/test_analyze.py +++ b/tests/test_analyze.py @@ -3,7 +3,7 @@ import pytest from mdutils.mdutils import MdUtils -from dan.datasets.analyze import read_json, read_yaml +from dan.datasets.analyze import read_yaml from dan.datasets.analyze.statistics import Statistics from tests.conftest import FIXTURES @@ -74,7 +74,7 @@ def test_run(full_statistics, tmp_path): output_file = tmp_path / "stats.md" stats = Statistics(filename=str(output_file)) stats.run( - labels=read_json(FIXTURES / "training" / "training_dataset" / "labels.json"), + labels_path=FIXTURES / "training" / "training_dataset" / "labels.json", tokens=read_yaml(FIXTURES / "training" / "training_dataset" / "tokens.yaml"), ) assert output_file.read_text() == full_statistics diff --git a/tests/test_download.py b/tests/test_download.py index 174e69a449dac1c173dd3896c475e90572a81084..9b018d8b0a1f5d3974d117bbe973bf40d2530cbf 100644 --- a/tests/test_download.py +++ b/tests/test_download.py @@ -90,26 +90,26 @@ def test_download(split_content, monkeypatch, tmp_path): # Check "labels.json" expected_labels = { "test": { - str(TEST_DIR / "test-page_1-line_1.jpg"): "â“¢Couâ‡e⇠ⓕBouis â“‘â‡.12.14", - str(TEST_DIR / "test-page_1-line_2.jpg"): "â“¢â‡outrain â“•Aâ‡olâ‡â‡e â“‘9.4.13", - str(TEST_DIR / "test-page_1-line_3.jpg"): "â“¢â‡abale â“•â‡ranâ‡ais â“‘26.3.11", - str(TEST_DIR / "test-page_2-line_1.jpg"): "â“¢â‡urosoy â“•Bouis â“‘22â‡4â‡18", - str(TEST_DIR / "test-page_2-line_2.jpg"): "â“¢Colaiani â“•Anâ‡els â“‘28.11.1â‡", - str(TEST_DIR / "test-page_2-line_3.jpg"): "â“¢Renouar⇠ⓕMaurice â“‘2â‡.â‡.04", + "images/test/dataset_id/test-page_1-line_1.jpg": "â“¢Couâ‡e⇠ⓕBouis â“‘â‡.12.14", + "images/test/dataset_id/test-page_1-line_2.jpg": "â“¢â‡outrain â“•Aâ‡olâ‡â‡e â“‘9.4.13", + "images/test/dataset_id/test-page_1-line_3.jpg": "â“¢â‡abale â“•â‡ranâ‡ais â“‘26.3.11", + "images/test/dataset_id/test-page_2-line_1.jpg": "â“¢â‡urosoy â“•Bouis â“‘22â‡4â‡18", + "images/test/dataset_id/test-page_2-line_2.jpg": "â“¢Colaiani â“•Anâ‡els â“‘28.11.1â‡", + "images/test/dataset_id/test-page_2-line_3.jpg": "â“¢Renouar⇠ⓕMaurice â“‘2â‡.â‡.04", }, "train": { - str(TRAIN_DIR / "train-page_1-line_1.jpg"): "â“¢Caillet â“•Maurice â“‘28.9.06", - str(TRAIN_DIR / "train-page_1-line_2.jpg"): "â“¢Reboul â“•Jean â“‘30.9.02", - str(TRAIN_DIR / "train-page_1-line_3.jpg"): "â“¢Bareyre â“•Jean â“‘28.3.11", - str(TRAIN_DIR / "train-page_1-line_4.jpg"): "â“¢Roussy â“•Jean â“‘4.11.14", - str(TRAIN_DIR / "train-page_2-line_1.jpg"): "â“¢Marin â“•Marcel â“‘10.8.06", - str(TRAIN_DIR / "train-page_2-line_2.jpg"): "â“¢Amical â“•Eloi â“‘11.10.04", - str(TRAIN_DIR / "train-page_2-line_3.jpg"): "â“¢Biros â“•Mael â“‘30.10.10", + "images/train/dataset_id/train-page_1-line_1.jpg": "â“¢Caillet â“•Maurice â“‘28.9.06", + "images/train/dataset_id/train-page_1-line_2.jpg": "â“¢Reboul â“•Jean â“‘30.9.02", + "images/train/dataset_id/train-page_1-line_3.jpg": "â“¢Bareyre â“•Jean â“‘28.3.11", + "images/train/dataset_id/train-page_1-line_4.jpg": "â“¢Roussy â“•Jean â“‘4.11.14", + "images/train/dataset_id/train-page_2-line_1.jpg": "â“¢Marin â“•Marcel â“‘10.8.06", + "images/train/dataset_id/train-page_2-line_2.jpg": "â“¢Amical â“•Eloi â“‘11.10.04", + "images/train/dataset_id/train-page_2-line_3.jpg": "â“¢Biros â“•Mael â“‘30.10.10", }, "val": { - str(VAL_DIR / "val-page_1-line_1.jpg"): "â“¢Monar⇠ⓕBouis â“‘29â‡â‡â‡04", - str(VAL_DIR / "val-page_1-line_2.jpg"): "â“¢Astier â“•Artâ‡ur â“‘11â‡2â‡13", - str(VAL_DIR / "val-page_1-line_3.jpg"): "â“¢â‡e â‡lieâ‡er â“•Jules â“‘21â‡11â‡11", + "images/val/dataset_id/val-page_1-line_1.jpg": "â“¢Monar⇠ⓕBouis â“‘29â‡â‡â‡04", + "images/val/dataset_id/val-page_1-line_2.jpg": "â“¢Astier â“•Artâ‡ur â“‘11â‡2â‡13", + "images/val/dataset_id/val-page_1-line_3.jpg": "â“¢â‡e â‡lieâ‡er â“•Jules â“‘21â‡11â‡11", }, }