
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.



Commits on Source: 2

28 files changed: +68 −68

Files

@@ -9,9 +9,9 @@
                 ["training", "train"]
             ]
         },
-        "val": {
-            "training-val": [
-                ["training", "val"]
+        "dev": {
+            "training-dev": [
+                ["training", "dev"]
             ]
         },
         "test": {
@@ -84,7 +84,7 @@
         "validation": {
             "eval_on_valid": true,
             "eval_on_valid_interval": 2,
-            "set_name_focus_metric": "training-val",
+            "set_name_focus_metric": "training-dev",
             "font": "fonts/LinuxLibertine.ttf",
             "maximum_font_size": 32,
             "nb_logged_images": 5
@@ -19,9 +19,9 @@
                 ["$dataset_name", "train"]
             ]
         },
-        "val": {
-            "$dataset_name-val": [
-                ["$dataset_name", "val"]
+        "dev": {
+            "$dataset_name-dev": [
+                ["$dataset_name", "dev"]
             ]
         },
         "test": {
@@ -96,7 +96,7 @@
             "eval_on_valid": true,
             "eval_on_valid_interval": 5,
             "eval_on_valid_start": 0,
-            "set_name_focus_metric": "$dataset_name-val",
+            "set_name_focus_metric": "$dataset_name-dev",
             "font": "fonts/LinuxLibertine.ttf",
             "maximum_font_size": 32,
             "nb_logged_images": 5,
@@ -9,9 +9,9 @@
                 ["training", "train"]
             ]
         },
-        "val": {
-            "training-val": [
-                ["training", "val"]
+        "dev": {
+            "training-dev": [
+                ["training", "dev"]
             ]
         },
         "test": {
@@ -85,7 +85,7 @@
             "eval_on_valid": true,
             "eval_on_valid_interval": 2,
             "eval_on_valid_start": 0,
-            "set_name_focus_metric": "training-val",
+            "set_name_focus_metric": "training-dev",
             "font": "fonts/LinuxLibertine.ttf",
             "maximum_font_size": 32,
             "nb_logged_images": 1
@@ -10,6 +10,6 @@ logging.basicConfig(
 )

 TRAIN_NAME = "train"
-VAL_NAME = "val"
+VAL_NAME = "dev"
 TEST_NAME = "test"
 SPLIT_NAMES = [TRAIN_NAME, VAL_NAME, TEST_NAME]
@@ -83,7 +83,7 @@ def logging_metrics(
     Log dictionary metrics in the Metrics section of MLflow

     :param display_values: dict, the dictionary containing the metrics to publish on MLflow
-    :param step: str, the step for which the metrics are to be published on Metrics section (ex: train, val, test). This will allow a better display on MLflow.
+    :param step: str, the step for which the metrics are to be published on Metrics section (ex: train, dev, test). This will allow a better display on MLflow.
     :param epoch: int, the current epoch.
     :param mlflow_logging: bool, allows you to verify that you have the authorization to log on MLflow, defaults to False
     :param is_master: bool, makes sure you're on the right thread, defaults to False
@@ -107,7 +107,7 @@ def logging_tags_metrics(
     Log dictionary metrics in the Tags section of MLflow

     :param display_values: dict, the dictionary containing the metrics to publish on MLflow
-    :param step: str, the step for which the metrics are to be published on Tags section (ex: train, val, test). This will allow a better display on MLflow.
+    :param step: str, the step for which the metrics are to be published on Tags section (ex: train, dev, test). This will allow a better display on MLflow.
     :param mlflow_logging: bool, allows you to verify that you have the authorization to log on MLflow, defaults to False
     :param is_master: bool, makes sure you're on the right thread, defaults to False
     """
@@ -6,7 +6,7 @@ There are a several steps to follow when training a DAN model.

 To extract the data, DAN uses an Arkindex export database in SQLite format. You will need to:

-1. Structure the data into splits (`train` / `val` / `test`) in a project dataset in [Arkindex](https://demo.arkindex.org/).
+1. Structure the data into splits (`train` / `dev` / `test`) in a project dataset in [Arkindex](https://demo.arkindex.org/).
 1. [Export the project](https://doc.arkindex.org/howto/export/) in SQLite format.
 1. Extract the data with the [extract command](../usage/datasets/extract.md).
 1. Download images with the [download command](../usage/datasets/download.md).
@@ -22,7 +22,7 @@ output/
 ├── labels.json
 ├── images
 │   ├── train
-│   ├── val
+│   ├── dev
 │   └── test
 └── language_model
     ├── corpus_characters.txt
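
As a quick sanity check of the renamed layout shown in the tree above, a minimal sketch; the `check_extraction_layout` helper is hypothetical (not part of DAN) and assumes the extraction wrote to `output/`, checking only the paths visible in the tree.

```python
from pathlib import Path


def check_extraction_layout(output: Path) -> None:
    """Raise if the expected extraction/download layout (with the dev split) is missing."""
    expected = [
        output / "labels.json",
        output / "language_model" / "corpus_characters.txt",
        *(output / "images" / split for split in ("train", "dev", "test")),
    ]
    missing = [str(path) for path in expected if not path.exists()]
    if missing:
        raise FileNotFoundError(f"Missing expected paths: {missing}")


check_extraction_layout(Path("output"))
```
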
@@ -38,7 +38,7 @@ The `--output` directory should have a `split.json` JSON-formatted file with a s
       "text": "ⓢCoufet ⓕBouis ⓑ07.12.14"
     },
   },
-  "val": {},
+  "dev": {},
   "test": {}
 }
 ```
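
A minimal sketch of loading such a `split.json` and checking that it uses the new split names; `load_split` is a hypothetical helper, not part of the DAN code base.

```python
import json
from pathlib import Path

# "dev" replaces the former "val" key; "train" and "test" are unchanged.
EXPECTED_SPLITS = {"train", "dev", "test"}


def load_split(path: Path) -> dict:
    split = json.loads(path.read_text())
    missing = EXPECTED_SPLITS - split.keys()
    if missing:
        raise ValueError(f"split.json is missing splits: {sorted(missing)}")
    return split


split = load_split(Path("output/split.json"))
print({name: len(entries) for name, entries in split.items()})
```
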
@@ -25,7 +25,7 @@ These files can be generated by the `teklia-dan dataset download` command. More
   "train": {
     "<image_path>": "\u24e2Coufet \u24d5Bouis \u24d107.12.14"
   },
-  "val": {},
+  "dev": {},
   "test": {}
 }
 ```
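
And a short sketch of reading the corresponding `labels.json`, counting labelled images per split; the `output/labels.json` path is assumed from the directory tree shown earlier.

```python
import json
from pathlib import Path

labels = json.loads(Path("output/labels.json").read_text())

# After the rename, the middle split is "dev" rather than "val".
for split in ("train", "dev", "test"):
    print(f"{split}: {len(labels.get(split, {}))} labelled images")
```
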
@@ -25,7 +25,7 @@ This will, for each evaluated split:
 | `--config`           | Path to the configuration file.                                                                                                                                                                          | `pathlib.Path` |                            |
 | `--nerval-threshold` | Distance threshold for the match between gold and predicted entity during Nerval evaluation. `0` would impose perfect matches, `1` would allow completely different strings to be considered as a match. | `float`        | `0.3`                      |
 | `--output-json`      | Where to save evaluation results in JSON format.                                                                                                                                                         | `pathlib.Path` | `None`                     |
-| `--sets`             | Sets to evaluate. Defaults to `train`, `val`, `test`.                                                                                                                                                    | `list[str]`    | `["train", "val", "test"]` |
+| `--sets`             | Sets to evaluate. Defaults to `train`, `dev`, `test`.                                                                                                                                                    | `list[str]`    | `["train", "dev", "test"]` |

 ## Examples

@@ -37,7 +37,7 @@ This will, for each evaluated split:
 | Split | CER (HTR-NER) | CER (HTR) | WER (HTR-NER) | WER (HTR) | WER (HTR no punct) |
 | :---: | :-----------: | :-------: | :-----------: | :-------: | :----------------: |
 | train |       x       |     x     |       x       |     x     |         x          |
-|  val  |       x       |     x     |       x       |     x     |         x          |
+|  dev  |       x       |     x     |       x       |     x     |         x          |
 | test  |       x       |     x     |       x       |     x     |         x          |

 #### 5 worst prediction(s)
@@ -57,7 +57,7 @@ This will, for each evaluated split:
 | Split | CER (HTR-NER) | CER (HTR) | WER (HTR-NER) | WER (HTR) | WER (HTR no punct) | NER |
 | :---: | :-----------: | :-------: | :-----------: | :-------: | :----------------: | :-: |
 | train |       x       |     x     |       x       |     x     |         x          |  x  |
-|  val  |       x       |     x     |       x       |     x     |         x          |  x  |
+|  dev  |       x       |     x     |       x       |     x     |         x          |  x  |
 | test  |       x       |     x     |       x       |     x     |         x          |  x  |

 #### Nerval evaluation
@@ -69,7 +69,7 @@ This will, for each evaluated split:
 | Surname |     x     |    x    |     x     |   x    |  x  |    x    |
 |   All   |     x     |    x    |     x     |   x    |  x  |    x    |

-##### val
+##### dev

 |   tag   | predicted | matched | Precision | Recall | F1  | Support |
 | :-----: | :-------: | :-----: | :-------: | :----: | :-: | :-----: |
@@ -39,12 +39,12 @@ Alternatively, you can find them in the `Time series` tab.

 The same metrics are computed on the validation set, except for the loss function:

-- `val/{dataset}-val_cer`: the CER.
-- `val/{dataset}-val_cer_no_token`: the CER ignoring punctuation marks.
-- `val/{dataset}-val_ner`: the CER ignoring characters (only NE tokens are considered).
-- `val/{dataset}-val_wer`. the WER.
-- `val/{dataset}-val_wer_no_punct`: the WER ignoring punctuation marks.
-- `val/{dataset}-val_wer_no_token`: the WER ignoring Named Entity (NE) tokens (only characters are considered).
+- `dev/{dataset}-dev_cer`: the CER.
+- `dev/{dataset}-dev_cer_no_token`: the CER ignoring punctuation marks.
+- `dev/{dataset}-dev_ner`: the CER ignoring characters (only NE tokens are considered).
+- `dev/{dataset}-dev_wer`. the WER.
+- `dev/{dataset}-dev_wer_no_punct`: the WER ignoring punctuation marks.
+- `dev/{dataset}-dev_wer_no_token`: the WER ignoring Named Entity (NE) tokens (only characters are considered).

 These metrics can be visualized in the `Scalars` tab in Tensorboard, under the `valid` section.
 <img src="../../../assets/tensorboard/example_scalars_val.png" />
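
A minimal sketch of how tags following this `dev/{dataset}-dev_*` pattern could be written with `torch.utils.tensorboard`; the dataset name, log directory and metric values are placeholders, and only `SummaryWriter.add_scalar` is a real API call here.

```python
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir="outputs/tensorboard")  # hypothetical log directory
dataset, epoch = "training", 0  # placeholders

# Tag pattern matches the renamed metrics listed above (formerly val/{dataset}-val_*).
for name, value in {"cer": 0.12, "wer": 0.34, "wer_no_punct": 0.30}.items():
    writer.add_scalar(f"dev/{dataset}-dev_{name}", value, epoch)
writer.close()
```
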
@@ -55,7 +55,7 @@ Statistics
 | Median |   0.0   |    0.0    | 0.0 |
 | Total  |    0    |     0     |  0  |

-# Val
+# Dev

 ## Images statistics

@@ -15,7 +15,7 @@
             0.4286
         ]
     ],
-    "val": [
+    "dev": [
         [
             "2c242f5c-e979-43c4-b6f2-a6d4815b651d.png",
             "\u24c8A \u24bbCharles \u24b711 \u24c1P \u24b8C \u24c0F \u24c4A \u24c514331",
@@ -3,7 +3,7 @@
 | Split | CER (HTR-NER) | CER (HTR) | WER (HTR-NER) | WER (HTR) | WER (HTR no punct) | NER  |
 |:-----:|:-------------:|:---------:|:-------------:|:---------:|:------------------:|:----:|
 | train |     18.89     |   21.05   |     26.67     |   26.67   |       26.67        | 7.14 |
-|  val  |      8.82     |   11.54   |      50.0     |    50.0   |        50.0        | 0.0  |
+|  dev  |      8.82     |   11.54   |      50.0     |    50.0   |        50.0        | 0.0  |
 |  test |      2.78     |    3.33   |     14.29     |   14.29   |       14.29        | 0.0  |

 #### Nerval evaluation
@@ -22,7 +22,7 @@
 | Surname   |         2 |       2 |     100.0 |  100.0 | 100.0 |       2 |
 | ALL       |        15 |      12 |      80.0 |  85.71 | 82.76 |      14 |

-##### val
+##### dev

 | tag       | predicted | matched | Precision | Recall |    F1 | Support |
 |:----------|----------:|--------:|----------:|-------:|------:|--------:|
@@ -380,11 +380,11 @@
             "text": "ⓢPressonet  ⓕMarie  ⓑ12"
         }
     },
-    "val": {
-        "val-page_1-line_1": {
+    "dev": {
+        "dev-page_1-line_1": {
             "dataset_id": "dataset_id",
             "image": {
-                "iiif_url": "{FIXTURES}/extraction/images/val-page_1-line_1.jpg",
+                "iiif_url": "{FIXTURES}/extraction/images/dev-page_1-line_1.jpg",
                 "polygon": [
                     [
                         0,
@@ -410,10 +410,10 @@
             },
             "text": "ⓢCiraud  ⓕAntoine  ⓑ34"
         },
-        "val-page_1-line_2": {
+        "dev-page_1-line_2": {
             "dataset_id": "dataset_id",
             "image": {
-                "iiif_url": "{FIXTURES}/extraction/images/val-page_1-line_2.jpg",
+                "iiif_url": "{FIXTURES}/extraction/images/dev-page_1-line_2.jpg",
                 "polygon": [
                     [
                         0,
@@ -439,10 +439,10 @@
             },
             "text": "ⓢCiraud  ⓕPriser  ⓑ34"
         },
-        "val-page_1-line_3": {
+        "dev-page_1-line_3": {
             "dataset_id": "dataset_id",
             "image": {
-                "iiif_url": "{FIXTURES}/extraction/images/val-page_1-line_3.jpg",
+                "iiif_url": "{FIXTURES}/extraction/images/dev-page_1-line_3.jpg",
                 "polygon": [
                     [
                         0,
@@ -3,7 +3,7 @@
         "images/0a56e8b3-95cd-4fa5-a17b-5b0ff9e6ea84.png": "ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier Ⓟ12241",
         "images/0dfe8bcd-ed0b-453e-bf19-cc697012296e.png": "ⓈTemplié ⒻMarcelle Ⓑ93 ⓁJ Ⓚch ⓄE dachyle"
     },
-    "val": {
+    "dev": {
         "images/2c242f5c-e979-43c4-b6f2-a6d4815b651d.png": "ⓈA ⒻCharles Ⓑ11 ⓁP ⒸC ⓀF ⓄA Ⓟ14331"
     },
     "test": {
@@ -3,7 +3,7 @@
         "images/0a34e13a-4ab0-4a91-8d7c-b1d8fee32628.png": "The latter do not regard",
         "images/0a70e14f-feda-4607-989c-36cf581ddff5.png": "At the beginning of"
     },
-    "val": {
+    "dev": {
         "images/0a576062-303c-4893-a729-c09c92865d31.png": "One can remember with",
         "images/0b2457c8-81f1-4600-84d9-f8bf2822a991.png": "The play was no more"
     },
@@ -96,10 +96,10 @@ def test_download(
                 "images/train/dataset_id/train-page_2-line_2.jpg": "ⓢTerontussieux  ⓕJean  ⓑ2",
                 "images/train/dataset_id/train-page_2-line_3.jpg": "ⓢPressonet  ⓕMarie  ⓑ12",
             },
-            "val": {
-                "images/val/dataset_id/val-page_1-line_1.jpg": "ⓢCirau⁇  ⓕAntoine  ⓑ⁇⁇",
-                "images/val/dataset_id/val-page_1-line_2.jpg": "ⓢCirau⁇  ⓕPriser  ⓑ⁇⁇",
-                "images/val/dataset_id/val-page_1-line_3.jpg": "ⓢCirau⁇  ⓕElisa⁇et⁇  ⓑ⁇⁇",
+            "dev": {
+                "images/dev/dataset_id/dev-page_1-line_1.jpg": "ⓢCirau⁇  ⓕAntoine  ⓑ⁇⁇",
+                "images/dev/dataset_id/dev-page_1-line_2.jpg": "ⓢCirau⁇  ⓕPriser  ⓑ⁇⁇",
+                "images/dev/dataset_id/dev-page_1-line_3.jpg": "ⓢCirau⁇  ⓕElisa⁇et⁇  ⓑ⁇⁇",
             },
         },
     )
@@ -126,10 +126,14 @@ def test_download(
     IMAGE_DIR = output / "images"
     TEST_DIR = IMAGE_DIR / "test" / "dataset_id"
     TRAIN_DIR = IMAGE_DIR / "train" / "dataset_id"
-    VAL_DIR = IMAGE_DIR / "val" / "dataset_id"
+    VAL_DIR = IMAGE_DIR / "dev" / "dataset_id"

     expected_paths = [
         output / "charset.pkl",
+        # Images of dev folder
+        VAL_DIR / "dev-page_1-line_1.jpg",
+        VAL_DIR / "dev-page_1-line_2.jpg",
+        VAL_DIR / "dev-page_1-line_3.jpg",
         # Images of test folder
         TEST_DIR / "test-page_1-line_1.jpg",
         TEST_DIR / "test-page_1-line_2.jpg",
@@ -145,10 +149,6 @@ def test_download(
         TRAIN_DIR / "train-page_2-line_1.jpg",
         TRAIN_DIR / "train-page_2-line_2.jpg",
         TRAIN_DIR / "train-page_2-line_3.jpg",
-        # Images of val folder
-        VAL_DIR / "val-page_1-line_1.jpg",
-        VAL_DIR / "val-page_1-line_2.jpg",
-        VAL_DIR / "val-page_1-line_3.jpg",
         output / "labels.json",
         output / "split.json",
     ]
@@ -149,7 +149,7 @@ def test_eval_nerval(capsys, evaluate_config):


 @pytest.mark.parametrize(
-    "training_res, val_res, test_res",
+    "training_res, dev_res, test_res",
     (
         (
             {
@@ -202,7 +202,7 @@ def test_eval_nerval(capsys, evaluate_config):
 )
 @pytest.mark.parametrize("is_output_json", ((True, False)))
 def test_evaluate(
-    capsys, training_res, val_res, test_res, is_output_json, evaluate_config, tmp_path
+    capsys, training_res, dev_res, test_res, is_output_json, evaluate_config, tmp_path
 ):
     evaluate_path = FIXTURES / "evaluate"

@@ -224,7 +224,7 @@ def test_evaluate(

     # Check that the evaluation results are correct
     for split_name, expected_res in zip(
-        ["train", "val", "test"], [training_res, val_res, test_res]
+        ["train", "dev", "test"], [training_res, dev_res, test_res]
     ):
         filename = (
             evaluate_config["training"]["output_folder"]
@@ -409,7 +409,7 @@ def test_evaluate_language_model(
             },
         ),
         (
-            "val",
+            "dev",
             {
                 "nb_chars": 34,
                 "cer": 0.0882,
@@ -114,10 +114,10 @@ def test_language_model(
                 "images/train/dataset_id/train-page_2-line_2.jpg": "ⓢTerontussieux  ⓕJean  ⓑ2",
                 "images/train/dataset_id/train-page_2-line_3.jpg": "ⓢPressonet  ⓕMarie  ⓑ12",
             },
-            "val": {
-                "images/val/dataset_id/val-page_1-line_1.jpg": "ⓢCirau⁇  ⓕAntoine  ⓑ⁇⁇",
-                "images/val/dataset_id/val-page_1-line_2.jpg": "ⓢCirau⁇  ⓕPriser  ⓑ⁇⁇",
-                "images/val/dataset_id/val-page_1-line_3.jpg": "ⓢCirau⁇  ⓕElisa⁇et⁇  ⓑ⁇⁇",
+            "dev": {
+                "images/dev/dataset_id/dev-page_1-line_1.jpg": "ⓢCirau⁇  ⓕAntoine  ⓑ⁇⁇",
+                "images/dev/dataset_id/dev-page_1-line_2.jpg": "ⓢCirau⁇  ⓕPriser  ⓑ⁇⁇",
+                "images/dev/dataset_id/dev-page_1-line_3.jpg": "ⓢCirau⁇  ⓕElisa⁇et⁇  ⓑ⁇⁇",
             },
         },
     )
@@ -318,10 +318,10 @@ def test_language_model_subword_vocab_size(
                 "images/train/dataset_id/train-page_2-line_2.jpg": "ⓢTerontussieux  ⓕJean  ⓑ2",
                 "images/train/dataset_id/train-page_2-line_3.jpg": "ⓢPressonet  ⓕMarie  ⓑ12",
             },
-            "val": {
-                "images/val/dataset_id/val-page_1-line_1.jpg": "ⓢCirau⁇  ⓕAntoine  ⓑ⁇⁇",
-                "images/val/dataset_id/val-page_1-line_2.jpg": "ⓢCirau⁇  ⓕPriser  ⓑ⁇⁇",
-                "images/val/dataset_id/val-page_1-line_3.jpg": "ⓢCirau⁇  ⓕElisa⁇et⁇  ⓑ⁇⁇",
+            "dev": {
+                "images/dev/dataset_id/dev-page_1-line_1.jpg": "ⓢCirau⁇  ⓕAntoine  ⓑ⁇⁇",
+                "images/dev/dataset_id/dev-page_1-line_2.jpg": "ⓢCirau⁇  ⓕPriser  ⓑ⁇⁇",
+                "images/dev/dataset_id/dev-page_1-line_3.jpg": "ⓢCirau⁇  ⓕElisa⁇et⁇  ⓑ⁇⁇",
             },
         },
     )