Skip to content
Snippets Groups Projects
Commit e5080e4c authored by Manon Blanco
Browse files

Relative paths in `labels.json`

parent cb9dd9bc
No related branches found
No related tags found
1 merge request: !386 "Relative paths in `labels.json`"
...@@ -6,7 +6,7 @@ Analyze dataset and display statistics in markdown format. ...@@ -6,7 +6,7 @@ Analyze dataset and display statistics in markdown format.
from pathlib import Path from pathlib import Path
from dan.datasets.analyze.statistics import run from dan.datasets.analyze.statistics import run
from dan.utils import read_json, read_yaml from dan.utils import read_yaml
def add_analyze_parser(subcommands) -> None: def add_analyze_parser(subcommands) -> None:
...@@ -17,7 +17,7 @@ def add_analyze_parser(subcommands) -> None: ...@@ -17,7 +17,7 @@ def add_analyze_parser(subcommands) -> None:
) )
parser.add_argument( parser.add_argument(
"--labels", "--labels",
type=read_json, type=Path,
help="Path to the formatted labels in JSON format.", help="Path to the formatted labels in JSON format.",
required=True, required=True,
) )
......
...@@ -10,6 +10,8 @@ import numpy as np ...@@ -10,6 +10,8 @@ import numpy as np
from mdutils.mdutils import MdUtils from mdutils.mdutils import MdUtils
from prettytable import MARKDOWN, PrettyTable from prettytable import MARKDOWN, PrettyTable
from dan.utils import read_json
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
METRIC_COLUMN = "Metric" METRIC_COLUMN = "Metric"
...@@ -157,14 +159,18 @@ class Statistics: ...@@ -157,14 +159,18 @@ class Statistics:
level=3, level=3,
) )
def run(self, labels: Dict, tokens: Dict | None): def run(self, labels_path: Path, tokens: Dict | None):
labels = read_json(labels_path)
# Iterate over each split # Iterate over each split
for split_name, split_data in labels.items(): for split_name, split_data in labels.items():
self.document.new_header(level=1, title=split_name.capitalize()) self.document.new_header(level=1, title=split_name.capitalize())
# Image statistics # Image statistics
# Path to the images are the key of the dict # Path to the images are the key of the dict
self.create_image_statistics(images=split_data.keys()) self.create_image_statistics(
images=[labels_path.parent / image_path for image_path in split_data]
)
labels = list(split_data.values()) labels = list(split_data.values())
# Text statistics # Text statistics
...@@ -175,8 +181,8 @@ class Statistics: ...@@ -175,8 +181,8 @@ class Statistics:
self.document.create_md_file() self.document.create_md_file()
def run(labels: Dict, tokens: Dict | None, output: Path) -> None: def run(labels: Path, tokens: Dict | None, output: Path) -> None:
""" """
Compute and save a dataset statistics. Compute and save a dataset statistics.
""" """
Statistics(filename=str(output)).run(labels=labels, tokens=tokens) Statistics(filename=str(output)).run(labels_path=labels, tokens=tokens)
...@@ -49,7 +49,7 @@ class ImageDownloader: ...@@ -49,7 +49,7 @@ class ImageDownloader:
self.image_extension = image_extension self.image_extension = image_extension
# Load split file # Load split file
split_file = output / "split.json" if output else None split_file = self.output / "split.json" if self.output else None
self.split: Dict = ( self.split: Dict = (
json.loads(split_file.read_text()) json.loads(split_file.read_text())
if split_file and split_file.is_file() if split_file and split_file.is_file()
...@@ -127,7 +127,10 @@ class ImageDownloader: ...@@ -127,7 +127,10 @@ class ImageDownloader:
image_path = destination / values["dataset_id"] / filename image_path = destination / values["dataset_id"] / filename
image_path.parent.mkdir(parents=True, exist_ok=True) image_path.parent.mkdir(parents=True, exist_ok=True)
self.data[split][str(image_path)] = values["text"] # Store a relative path to the label file in case we need to move the data elsewhere
self.data[split][str(image_path.relative_to(self.output))] = values[
"text"
]
# Create task for multithreading pool if image does not exist yet # Create task for multithreading pool if image does not exist yet
if image_path.exists(): if image_path.exists():
......
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import copy import copy
import json import json
from pathlib import Path
import numpy as np import numpy as np
from torch.utils.data import Dataset from torch.utils.data import Dataset
...@@ -95,7 +94,7 @@ class OCRDataset(Dataset): ...@@ -95,7 +94,7 @@ class OCRDataset(Dataset):
set_name = path_and_set["set_name"] set_name = path_and_set["set_name"]
gt = gt_per_set[set_name] gt = gt_per_set[set_name]
for filename in natural_sort(gt): for filename in natural_sort(gt):
filepath = Path(filename) filepath = path / filename
samples.append( samples.append(
{ {
"name": filepath.name, "name": filepath.name,
...@@ -105,7 +104,7 @@ class OCRDataset(Dataset): ...@@ -105,7 +104,7 @@ class OCRDataset(Dataset):
) )
if self.load_in_memory: if self.load_in_memory:
samples[-1]["img"] = self.preprocessing_transforms( samples[-1]["img"] = self.preprocessing_transforms(
read_image(filename) read_image(str(filepath))
) )
return samples return samples
......
{ {
"train": { "train": {
"tests/data/prediction/images/0a56e8b3-95cd-4fa5-a17b-5b0ff9e6ea84.png": "ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier Ⓟ12241", "images/0a56e8b3-95cd-4fa5-a17b-5b0ff9e6ea84.png": "ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier Ⓟ12241",
"tests/data/prediction/images/0dfe8bcd-ed0b-453e-bf19-cc697012296e.png": "ⓈTemplié ⒻMarcelle Ⓑ93 ⓁJ Ⓚch ⓄE dachyle" "images/0dfe8bcd-ed0b-453e-bf19-cc697012296e.png": "ⓈTemplié ⒻMarcelle Ⓑ93 ⓁJ Ⓚch ⓄE dachyle"
}, },
"val": { "val": {
"tests/data/prediction/images/2c242f5c-e979-43c4-b6f2-a6d4815b651d.png": "ⓈA ⒻCharles Ⓑ11 ⓁP ⒸC ⓀF ⓄA Ⓟ14331" "images/2c242f5c-e979-43c4-b6f2-a6d4815b651d.png": "ⓈA ⒻCharles Ⓑ11 ⓁP ⒸC ⓀF ⓄA Ⓟ14331"
}, },
"test": { "test": {
"tests/data/prediction/images/ffdec445-7f14-4f5f-be44-68d0844d0df1.png": "ⓈNaudin ⒻMarie Ⓑ53 ⓁS ⒸV ⓀBelle mère" "images/ffdec445-7f14-4f5f-be44-68d0844d0df1.png": "ⓈNaudin ⒻMarie Ⓑ53 ⓁS ⒸV ⓀBelle mère"
} }
} }
{ {
"train": { "train": {
"tests/data/training/training_dataset/images/0a34e13a-4ab0-4a91-8d7c-b1d8fee32628.png": "The latter do not regard", "images/0a34e13a-4ab0-4a91-8d7c-b1d8fee32628.png": "The latter do not regard",
"tests/data/training/training_dataset/images/0a70e14f-feda-4607-989c-36cf581ddff5.png": "At the beginning of" "images/0a70e14f-feda-4607-989c-36cf581ddff5.png": "At the beginning of"
}, },
"val": { "val": {
"tests/data/training/training_dataset/images/0a576062-303c-4893-a729-c09c92865d31.png": "One can remember with", "images/0a576062-303c-4893-a729-c09c92865d31.png": "One can remember with",
"tests/data/training/training_dataset/images/0b2457c8-81f1-4600-84d9-f8bf2822a991.png": "The play was no more" "images/0b2457c8-81f1-4600-84d9-f8bf2822a991.png": "The play was no more"
}, },
"test": { "test": {
"tests/data/training/training_dataset/images/fb3edb59-3678-49f8-8e16-8e32e3b0f051.png": "Both her wrists bore", "images/fb3edb59-3678-49f8-8e16-8e32e3b0f051.png": "Both her wrists bore",
"tests/data/training/training_dataset/images/fe498de2-ece4-4fbe-8b53-edfce1b820f0.png": "SOME years ago a contemporary" "images/fe498de2-ece4-4fbe-8b53-edfce1b820f0.png": "SOME years ago a contemporary"
} }
} }
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
import pytest import pytest
from mdutils.mdutils import MdUtils from mdutils.mdutils import MdUtils
from dan.datasets.analyze import read_json, read_yaml from dan.datasets.analyze import read_yaml
from dan.datasets.analyze.statistics import Statistics from dan.datasets.analyze.statistics import Statistics
from tests.conftest import FIXTURES from tests.conftest import FIXTURES
...@@ -74,7 +74,7 @@ def test_run(full_statistics, tmp_path): ...@@ -74,7 +74,7 @@ def test_run(full_statistics, tmp_path):
output_file = tmp_path / "stats.md" output_file = tmp_path / "stats.md"
stats = Statistics(filename=str(output_file)) stats = Statistics(filename=str(output_file))
stats.run( stats.run(
labels=read_json(FIXTURES / "training" / "training_dataset" / "labels.json"), labels_path=FIXTURES / "training" / "training_dataset" / "labels.json",
tokens=read_yaml(FIXTURES / "training" / "training_dataset" / "tokens.yaml"), tokens=read_yaml(FIXTURES / "training" / "training_dataset" / "tokens.yaml"),
) )
assert output_file.read_text() == full_statistics assert output_file.read_text() == full_statistics
...@@ -90,26 +90,26 @@ def test_download(split_content, monkeypatch, tmp_path): ...@@ -90,26 +90,26 @@ def test_download(split_content, monkeypatch, tmp_path):
# Check "labels.json" # Check "labels.json"
expected_labels = { expected_labels = {
"test": { "test": {
str(TEST_DIR / "test-page_1-line_1.jpg"): "ⓢCou⁇e⁇ ⓕBouis ⓑ⁇.12.14", "images/test/dataset_id/test-page_1-line_1.jpg": "ⓢCou⁇e⁇ ⓕBouis ⓑ⁇.12.14",
str(TEST_DIR / "test-page_1-line_2.jpg"): "ⓢ⁇outrain ⓕA⁇ol⁇⁇e ⓑ9.4.13", "images/test/dataset_id/test-page_1-line_2.jpg": "ⓢ⁇outrain ⓕA⁇ol⁇⁇e ⓑ9.4.13",
str(TEST_DIR / "test-page_1-line_3.jpg"): "ⓢ⁇abale ⓕ⁇ran⁇ais ⓑ26.3.11", "images/test/dataset_id/test-page_1-line_3.jpg": "ⓢ⁇abale ⓕ⁇ran⁇ais ⓑ26.3.11",
str(TEST_DIR / "test-page_2-line_1.jpg"): "ⓢ⁇urosoy ⓕBouis ⓑ22⁇4⁇18", "images/test/dataset_id/test-page_2-line_1.jpg": "ⓢ⁇urosoy ⓕBouis ⓑ22⁇4⁇18",
str(TEST_DIR / "test-page_2-line_2.jpg"): "ⓢColaiani ⓕAn⁇els ⓑ28.11.1⁇", "images/test/dataset_id/test-page_2-line_2.jpg": "ⓢColaiani ⓕAn⁇els ⓑ28.11.1⁇",
str(TEST_DIR / "test-page_2-line_3.jpg"): "ⓢRenouar⁇ ⓕMaurice ⓑ2⁇.⁇.04", "images/test/dataset_id/test-page_2-line_3.jpg": "ⓢRenouar⁇ ⓕMaurice ⓑ2⁇.⁇.04",
}, },
"train": { "train": {
str(TRAIN_DIR / "train-page_1-line_1.jpg"): "ⓢCaillet ⓕMaurice ⓑ28.9.06", "images/train/dataset_id/train-page_1-line_1.jpg": "ⓢCaillet ⓕMaurice ⓑ28.9.06",
str(TRAIN_DIR / "train-page_1-line_2.jpg"): "ⓢReboul ⓕJean ⓑ30.9.02", "images/train/dataset_id/train-page_1-line_2.jpg": "ⓢReboul ⓕJean ⓑ30.9.02",
str(TRAIN_DIR / "train-page_1-line_3.jpg"): "ⓢBareyre ⓕJean ⓑ28.3.11", "images/train/dataset_id/train-page_1-line_3.jpg": "ⓢBareyre ⓕJean ⓑ28.3.11",
str(TRAIN_DIR / "train-page_1-line_4.jpg"): "ⓢRoussy ⓕJean ⓑ4.11.14", "images/train/dataset_id/train-page_1-line_4.jpg": "ⓢRoussy ⓕJean ⓑ4.11.14",
str(TRAIN_DIR / "train-page_2-line_1.jpg"): "ⓢMarin ⓕMarcel ⓑ10.8.06", "images/train/dataset_id/train-page_2-line_1.jpg": "ⓢMarin ⓕMarcel ⓑ10.8.06",
str(TRAIN_DIR / "train-page_2-line_2.jpg"): "ⓢAmical ⓕEloi ⓑ11.10.04", "images/train/dataset_id/train-page_2-line_2.jpg": "ⓢAmical ⓕEloi ⓑ11.10.04",
str(TRAIN_DIR / "train-page_2-line_3.jpg"): "ⓢBiros ⓕMael ⓑ30.10.10", "images/train/dataset_id/train-page_2-line_3.jpg": "ⓢBiros ⓕMael ⓑ30.10.10",
}, },
"val": { "val": {
str(VAL_DIR / "val-page_1-line_1.jpg"): "ⓢMonar⁇ ⓕBouis ⓑ29⁇⁇⁇04", "images/val/dataset_id/val-page_1-line_1.jpg": "ⓢMonar⁇ ⓕBouis ⓑ29⁇⁇⁇04",
str(VAL_DIR / "val-page_1-line_2.jpg"): "ⓢAstier ⓕArt⁇ur ⓑ11⁇2⁇13", "images/val/dataset_id/val-page_1-line_2.jpg": "ⓢAstier ⓕArt⁇ur ⓑ11⁇2⁇13",
str(VAL_DIR / "val-page_1-line_3.jpg"): "ⓢ⁇e ⁇lie⁇er ⓕJules ⓑ21⁇11⁇11", "images/val/dataset_id/val-page_1-line_3.jpg": "ⓢ⁇e ⁇lie⁇er ⓕJules ⓑ21⁇11⁇11",
}, },
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment