Compare revisions
Commits on Source (2)
Showing changes with 472 additions and 246 deletions
......@@ -46,3 +46,10 @@ repos:
- repo: meta
hooks:
- id: check-useless-excludes
- repo: https://github.com/executablebooks/mdformat
rev: 0.7.16
hooks:
- id: mdformat
# Optionally add plugins
additional_dependencies:
- mdformat-mkdocs[recommended]
......@@ -10,19 +10,20 @@ To use DAN in your own scripts, install it using pip:
pip install -e .
```
For more details about this package, make sure to see the documentation available at https://teklia.gitlab.io/atr/dan/.
For more details about this package, make sure to see the documentation available at <https://teklia.gitlab.io/atr/dan/>.
## Development
For development and testing purposes, it may be useful to install the project as an editable package with pip.
* Use a virtualenv (e.g. with virtualenvwrapper `mkvirtualenv -a . dan`)
* Install `dan` as a package (e.g. `pip install -e .`)
- Use a virtualenv (e.g. with virtualenvwrapper `mkvirtualenv -a . dan`)
- Install `dan` as a package (e.g. `pip install -e .`)
### Linter
Code syntax is analyzed before the code is submitted.\
To run the linter tools suite, you may use pre-commit.
```shell
pip install pre-commit
pre-commit run -a
......@@ -32,6 +33,7 @@ pre-commit run -a
Tests are executed with `tox` using [pytest](https://pytest.org).
To install and run `tox`:
```shell
pip install tox
tox
......@@ -48,16 +50,18 @@ The tests use a large file stored via [Git-LFS](https://docs.gitlab.com/ee/topic
Please keep the documentation updated when modifying or adding features.
It's pretty easy to do:
```shell
pip install -r doc-requirements.txt
mkdocs serve
```
You can then write in Markdown in the relevant `docs/*.md` files, and see live output on http://localhost:8000.
You can then write in Markdown in the relevant `docs/*.md` files, and see live output on <http://localhost:8000>.
## Inference
To apply DAN to an image, one first needs to add a few imports and load an image. Note that the image should be in RGB.
```python
import cv2
from dan.predict import DAN
......@@ -66,16 +70,18 @@ image = cv2.cvtColor(cv2.imread(IMAGE_PATH), cv2.COLOR_BGR2RGB)
```
Then one can initialize and load the trained model with the parameters used during training.
```python
model_path = 'model.pt'
params_path = 'parameters.yml'
charset_path = 'charset.pkl'
model_path = "model.pt"
params_path = "parameters.yml"
charset_path = "charset.pkl"
model = DAN('cpu')
model = DAN("cpu")
model.load(model_path, params_path, charset_path, mode="eval")
```
To run the inference on a GPU, one can replace `cpu` with the name of the GPU device. Finally, one can run the prediction:
```python
text, confidence_scores = model.predict(image, confidences=True)
```
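Putting the snippets above together, a minimal end-to-end sketch is shown below. It only recombines the code already presented; the device string `"cuda:0"`, the image path and the model file names are assumptions, so replace them with your own (or use `"cpu"` to run without a GPU).
```python
import cv2

from dan.predict import DAN

# Load the image and convert it to RGB, as expected by the model
image = cv2.cvtColor(cv2.imread("page.jpg"), cv2.COLOR_BGR2RGB)

# Initialize the model on a GPU (use "cpu" if no GPU is available)
model = DAN("cuda:0")
model.load("model.pt", "parameters.yml", "charset.pkl", mode="eval")

# Run the prediction and retrieve confidence scores
text, confidence_scores = model.predict(image, confidences=True)
print(text, confidence_scores)
```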
......
......@@ -140,4 +140,12 @@ def add_predict_parser(subcommands) -> None:
type=int,
required=False,
)
parser.add_argument(
"--batch-size",
help="Size of prediction batches.",
type=int,
default=1,
required=False,
)
parser.set_defaults(func=run)
......@@ -4,6 +4,7 @@ import re
import cv2
import numpy as np
from PIL import Image
from torchvision.transforms.functional import to_pil_image
from dan import logger
......@@ -179,7 +180,7 @@ def blend_coverage(coverage_vector, image, mask, scale):
blend = Image.composite(image, coverage_vector, mask)
# Resize to save time
blend = blend.resize((int(width * scale), int(height * scale)), Image.ANTIALIAS)
blend = blend.resize((int(width * scale), int(height * scale)), Image.LANCZOS)
return blend
......@@ -292,7 +293,7 @@ def plot_attention(
):
"""
Create a gif by blending attention maps to the image for each text piece (char, word or line)
:param image: Input image in PIL format
:param image: Input image as torch.Tensor
:param text: Text predicted by DAN
:param weights: Attention weights of size (n_char, feature_height, feature_width)
:param level: Level to display (must be in [char, word, line])
......@@ -303,12 +304,11 @@ def plot_attention(
:param display_polygons: Whether to plot extracted polygons
"""
height, width, _ = image.shape
image = to_pil_image(image)
attention_map = []
# Convert to PIL Image and create mask
mask = Image.new("L", (width, height), color=(110))
image = Image.fromarray(image)
mask = Image.new("L", (image.width, image.height), color=(110))
# Split text into characters, words or lines
text_list, offset = split_text(text, level, word_separators, line_separators)
......@@ -320,7 +320,7 @@ def plot_attention(
for text_piece in text_list:
# Accumulate weights for the current word/line and resize to original image size
coverage_vector = compute_coverage(
text_piece, max_value, tot_len, weights, (width, height)
text_piece, max_value, tot_len, weights, (image.width, image.height)
)
# Get polygons if flag is set:
......@@ -333,7 +333,7 @@ def plot_attention(
weights,
threshold_method=threshold_method,
threshold_value=threshold_value,
size=(width, height),
size=(image.width, image.height),
)
if contour is not None:
......
......@@ -20,7 +20,7 @@ from dan.predict.attention import (
split_text_and_confidences,
)
from dan.transforms import get_normalization_transforms, get_preprocessing_transforms
from dan.utils import ind_to_token, read_image
from dan.utils import ind_to_token, list_to_batches, pad_images, read_image
class DAN:
......@@ -248,8 +248,8 @@ class DAN:
return out
def process_image(
image_path,
def process_batch(
image_batch,
dan_model,
device,
output,
......@@ -264,20 +264,25 @@ def process_image(
threshold_method,
threshold_value,
):
# Load image and pre-process it
image = dan_model.preprocess(str(image_path))
logger.info("Image loaded.")
input_images, input_sizes = [], []
logger.info("Loading images...")
for image_path in image_batch:
# Load image and pre-process it
image = dan_model.preprocess(str(image_path))
input_images.append(image)
input_sizes.append(image.shape[1:])
# Convert to tensor of size (batch_size, channel, height, width) with batch_size=1
input_tensor = image.unsqueeze(0)
input_tensor = input_tensor.to(device)
input_sizes = [image.shape[1:]]
input_tensor = pad_images(input_images).to(device)
logger.info("Images preprocessed!")
# Parse delimiters to regex
word_separators = parse_delimiters(word_separators)
line_separators = parse_delimiters(line_separators)
# Predict
logger.info("Predicting...")
prediction = dan_model.predict(
input_tensor,
input_sizes,
......@@ -290,70 +295,78 @@ def process_image(
threshold_method=threshold_method,
threshold_value=threshold_value,
)
logger.info("Prediction parsing...")
for idx, image_path in enumerate(image_batch):
predicted_text = prediction["text"][idx]
result = {"text": predicted_text}
# Return extracted objects (coordinates, text, confidence)
if predict_objects:
result["objects"] = prediction["objects"][idx]
# Return mean confidence score
if confidence_score:
result["confidences"] = {}
char_confidences = prediction["confidences"][idx]
# retrieve the index of the token ner
index = [
pos
for pos, char in enumerate(predicted_text)
if char in ["", "", "", ""]
]
result = {}
result["text"] = prediction["text"][0]
# Return extracted objects (coordinates, text, confidence)
if predict_objects:
result["objects"] = prediction["objects"][0]
# Return mean confidence score
if confidence_score:
result["confidences"] = {}
char_confidences = prediction["confidences"][0]
text = result["text"]
# retrieve the index of the token ner
index = [pos for pos, char in enumerate(text) if char in ["", "", "", ""]]
# calculates scores by token
result["confidences"]["by ner token"] = [
{
"text": f"{text[current: next_token]}".replace("\n", " "),
"confidence_ner": f"{np.around(np.mean(char_confidences[current : next_token]), 2)}",
}
# We go up to -1 so that the last token matches until the end of the text
for current, next_token in pairwise(index + [-1])
]
result["confidences"]["total"] = np.around(np.mean(char_confidences), 2)
for level in confidence_score_levels:
result["confidences"][level] = []
texts, confidences, _ = split_text_and_confidences(
prediction["text"][0],
char_confidences,
level,
word_separators,
line_separators,
)
# calculates scores by token
for text, conf in zip(texts, confidences):
result["confidences"][level].append({"text": text, "confidence": conf})
# Save gif with attention map
if attention_map:
gif_filename = f"{output}/{image_path.stem}_{attention_map_level}.gif"
logger.info(f"Creating attention GIF in {gif_filename}")
# this returns polygons but unused for now.
plot_attention(
image=image,
text=prediction["text"][0],
weights=prediction["attentions"][0],
level=attention_map_level,
scale=attention_map_scale,
word_separators=word_separators,
line_separators=line_separators,
display_polygons=predict_objects,
threshold_method=threshold_method,
threshold_value=threshold_value,
outname=gif_filename,
)
result["attention_gif"] = gif_filename
result["confidences"]["by ner token"] = [
{
"text": f"{predicted_text[current: next_token]}".replace("\n", " "),
"confidence_ner": f"{np.around(np.mean(char_confidences[current : next_token]), 2)}",
}
# We go up to -1 so that the last token matches until the end of the text
for current, next_token in pairwise(index + [-1])
]
result["confidences"]["total"] = np.around(np.mean(char_confidences), 2)
for level in confidence_score_levels:
result["confidences"][level] = []
texts, confidences, _ = split_text_and_confidences(
predicted_text,
char_confidences,
level,
word_separators,
line_separators,
)
for text, conf in zip(texts, confidences):
result["confidences"][level].append(
{"text": text, "confidence": conf}
)
# Save gif with attention map
if attention_map:
attentions = prediction["attentions"][idx]
gif_filename = f"{output}/{image_path.stem}_{attention_map_level}.gif"
logger.info(f"Creating attention GIF in {gif_filename}")
# this returns polygons but unused for now.
plot_attention(
image=input_tensor[idx],
text=predicted_text,
weights=attentions,
level=attention_map_level,
scale=attention_map_scale,
word_separators=word_separators,
line_separators=line_separators,
display_polygons=predict_objects,
threshold_method=threshold_method,
threshold_value=threshold_value,
outname=gif_filename,
)
result["attention_gif"] = gif_filename
json_filename = f"{output}/{image_path.stem}.json"
logger.info(f"Saving JSON prediction in {json_filename}")
save_json(Path(json_filename), result)
json_filename = f"{output}/{image_path.stem}.json"
logger.info(f"Saving JSON prediction in {json_filename}")
save_json(Path(json_filename), result)
def run(
......@@ -376,6 +389,7 @@ def run(
threshold_value,
image_extension,
gpu_device,
batch_size,
):
"""
Predict a single image and save the output
......@@ -395,6 +409,7 @@ def run(
:param threshold_method: Thresholding method. Should be in ["otsu", "simple"].
:param threshold_value: Thresholding value to use for the "simple" thresholding method.
:param gpu_device: Use a specific GPU if available.
:param batch_size: Size of the batches for prediction.
"""
# Create output directory if necessary
if not os.path.exists(output):
......@@ -407,9 +422,9 @@ def run(
dan_model.load(model, parameters, charset, mode="eval")
images = image_dir.rglob(f"*{image_extension}") if not image else [image]
for image_name in images:
process_image(
image_name,
for image_batch in list_to_batches(images, n=batch_size):
process_batch(
image_batch,
dan_model,
device,
output,
......
# -*- coding: utf-8 -*-
from itertools import islice
import torch
import torchvision.io as torchvision
......@@ -72,3 +74,13 @@ def ind_to_token(labels, ind, oov_symbol=None):
else:
res = [labels[i] for i in ind]
return "".join(res)
def list_to_batches(iterable, n):
"Batch data into tuples of length n. The last batch may be shorter."
# list_to_batches('ABCDEFG', 3) --> ABC DEF G
if n < 1:
raise ValueError("n must be at least one")
it = iter(iterable)
while batch := tuple(islice(it, n)):
yield batch
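The prediction code above also imports a `pad_images` helper alongside `list_to_batches`, but its implementation is not part of this excerpt. Below is a minimal sketch of how such a helper could behave, together with a usage note for `list_to_batches`; the name `pad_images_sketch` and its exact behaviour are assumptions for illustration, not the project's actual code.
```python
import torch


def pad_images_sketch(images, padding_value=0.0):
    """Pad a list of (C, H, W) tensors to a common size and stack them into a batch."""
    max_height = max(image.shape[1] for image in images)
    max_width = max(image.shape[2] for image in images)
    batch = torch.full(
        (len(images), images[0].shape[0], max_height, max_width), padding_value
    )
    for idx, image in enumerate(images):
        # Copy each image into the top-left corner of its padded slot
        batch[idx, :, : image.shape[1], : image.shape[2]] = image
    return batch


# list_to_batches groups any iterable into fixed-size tuples:
# list(list_to_batches("ABCDEFG", n=3)) == [("A", "B", "C"), ("D", "E", "F"), ("G",)]
```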
......@@ -7,11 +7,12 @@ pip install -e .
```
To learn more about the newly installed `teklia-dan` command, make sure to run:
```shell
teklia-dan --help
```
Get started with:
* [Developments](development.md)
* [Training workflow](training.md)
- [Developments](development.md)
- [Training workflow](training.md)
......@@ -7,11 +7,12 @@ There are several steps to follow when training a DAN model.
The data must be extracted and formatted for training. To extract the data, DAN uses an Arkindex export database in SQLite format. You will need to:
1. Structure the data into folders (`train` / `val` / `test`) in [Arkindex](https://arkindex.teklia.com/).
2. [Export the project](https://doc.arkindex.org/howto/export/) in SQLite format.
3. Extract the data with the [extract command](../usage/datasets/extract.md).
4. Format the data with the [format command](../usage/datasets/format.md).
1. [Export the project](https://doc.arkindex.org/howto/export/) in SQLite format.
1. Extract the data with the [extract command](../usage/datasets/extract.md).
1. Format the data with the [format command](../usage/datasets/format.md).
At the end, you should have a tree structure like this:
```
output/
├── charset.pkl
......@@ -33,16 +34,12 @@ The training command does not take any input parameters for now. To train a DAN
1. Update the parameters from those listed in the [dedicated page](../usage/train/parameters.md). You will always need to update at least these variables:
- `dataset_name`, `dataset_level`, `dataset_variant` and `dataset_path`,
- `model_params.transfer_learning.*[checkpoint_path]` to finetune an existing model,
- `training_params.output_folder`.
- `dataset_name`, `dataset_level`, `dataset_variant` and `dataset_path`,
- `model_params.transfer_learning.*[checkpoint_path]` to finetune an existing model,
- `training_params.output_folder`.
2. Train a DAN model with the [train command](../usage/train/index.md).
1. Train a DAN model with the [train command](../usage/train/index.md).
## 3. Predict
Once the training is complete, you can apply a trained DAN model on an image.
To do this, you will need to:
1. Apply a trained DAN model on an image using the [predict command](../usage/predict.md).
Once the training is complete, you can apply a trained DAN model on an image using the [predict command](../usage/predict.md).
......@@ -17,5 +17,4 @@ The following results were published:
| READ 2016 (single page) | 3.53 | 13.33 | 5.94 | 92.57 |
| READ 2016 (double page) | 3.69 | 14.20 | 4.60 | 93.92 |
Pretrained model weights are available [here](https://git.litislab.fr/dcoquenet/dan).
# Exceptions
::: dan.datasets.extract.exceptions
options:
show_source: false
options:
show_source: false
......@@ -4,29 +4,29 @@
Use the `teklia-dan dataset extract` command to extract a dataset from an Arkindex export database (SQLite format). This will generate the images and the labels needed to train a DAN model.
| Parameter | Description | Type | Default |
| ------------------------------ | ----------------------------------------------------------------------------------- | -------- | ------- |
| `database` | Path to an Arkindex export database in SQLite format. | `Path` | |
| `--parent` | UUID of the folder to import from Arkindex. You may specify multiple UUIDs. | `str|uuid` | |
| `--element-type` | Type of the elements to extract. You may specify multiple types. | `str` | |
| `--parent-element-type` | Type of the parent element containing the data. | `str` | `page` |
| `--output` | Folder where the data will be generated. | `Path` | |
| `--load-entities` | Extract text with their entities. Needed for NER tasks. | `bool` | `False` |
| `--tokens` | Mapping between starting tokens and end tokens. Needed for NER tasks. | `Path` | |
| `--use-existing-split` | Use the specified folder IDs for the dataset split. | `bool` | |
| `--train-folder` | ID of the training folder to import from Arkindex. | `uuid` | |
| `--val-folder` | ID of the validation folder to import from Arkindex. | `uuid` | |
| `--test-folder` | ID of the training folder to import from Arkindex. | `uuid` | |
| `--transcription-worker-version` | Filter transcriptions by worker_version. Use `manual` for manual filtering. | `str|uuid` | |
| `--entity-worker-version` | Filter transcriptions entities by worker_version. Use `manual` for manual filtering | `str|uuid` | |
| `--train-prob` | Training set split size | `float` | `0.7` |
| `--val-prob` | Validation set split size | `float` | `0.15` |
| `--max-width` | Images larger than this width will be resized to this width. | `int` | |
| `--max-height` | Images larger than this height will be resized to this height. | `int` | |
| Parameter | Description | Type | Default |
| -------------------------------- | ----------------------------------------------------------------------------------- | --------------- | ------- |
| `database` | Path to an Arkindex export database in SQLite format. | `Path` | |
| `--parent` | UUID of the folder to import from Arkindex. You may specify multiple UUIDs. | `str` or `uuid` | |
| `--element-type` | Type of the elements to extract. You may specify multiple types. | `str` | |
| `--parent-element-type` | Type of the parent element containing the data. | `str` | `page` |
| `--output` | Folder where the data will be generated. | `Path` | |
| `--load-entities` | Extract text with their entities. Needed for NER tasks. | `bool` | `False` |
| `--tokens` | Mapping between starting tokens and end tokens. Needed for NER tasks. | `Path` | |
| `--use-existing-split` | Use the specified folder IDs for the dataset split. | `bool` | |
| `--train-folder` | ID of the training folder to import from Arkindex. | `uuid` | |
| `--val-folder` | ID of the validation folder to import from Arkindex. | `uuid` | |
| `--test-folder` | ID of the testing folder to import from Arkindex. | `uuid` | |
| `--transcription-worker-version` | Filter transcriptions by worker_version. Use `manual` for manual filtering. | `str` or `uuid` | |
| `--entity-worker-version` | Filter transcription entities by worker_version. Use `manual` for manual filtering. | `str` or `uuid` | |
| `--train-prob` | Training set split size | `float` | `0.7` |
| `--val-prob` | Validation set split size | `float` | `0.15` |
| `--max-width` | Images larger than this width will be resized to this width. | `int` | |
| `--max-height` | Images larger than this height will be resized to this height. | `int` | |
The `--tokens` argument expects a YAML-formatted file with a specific format: a list of entries, each describing a NER entity. The label of the entity is the key to a dict mapping its starting and ending tokens.
```yaml
---
INTITULE: # Type of the entity on Arkindex
start: # Starting token for this entity
end: # Optional ending token for this entity
......@@ -50,11 +50,12 @@ CLASSEMENT:
end:
```
## Examples
### HTR and NER data from one source
To extract HTR+NER data from **pages** from a folder, use the following command:
```shell
teklia-dan dataset extract \
database.sqlite \
......@@ -64,10 +65,13 @@ teklia-dan dataset extract \
--load-entities \
--tokens tokens.yml
```
with `tokens.yml` compliant with the format described before.
### HTR and NER data from multiple sources
To do the same but only use the data from three folders, the command becomes:
```shell
teklia-dan dataset extract \
database.sqlite \
......@@ -79,7 +83,9 @@ teklia-dan dataset extract \
```
### HTR and NER data with an existing split
To use the data from three folders as **training**, **validation** and **testing** datasets respectively, the command becomes:
```shell
teklia-dan dataset extract \
database.sqlite \
......@@ -94,7 +100,9 @@ teklia-dan dataset extract \
```
### HTR from multiple element types with some parent filtering
To extract HTR data from **annotations** and **text_zones** from a folder, but only keep those that are children of **single_pages**, use the following command:
```shell
teklia-dan dataset extract \
database.sqlite \
......
......@@ -11,18 +11,20 @@ Use the `teklia-dan dataset format` command to format a dataset. This will gener
The available arguments are
| Parameter | Description | Type | Default |
| ------------------------------ | ----------------------------------------------------------------------------------- | -------- | ------- |
| `--dataset` | Path to the folder containing the dataset. | `str|uuid` | |
| `--image-format` | Format under which the images were generated. | `str` | |
| `--keep-spaces` | Transcriptions are trimmed by default. Use this flag to disable this behaviour. | `str` | |
| Parameter | Description | Type | Default |
| ---------------- | ------------------------------------------------------------------------------- | --------------- | ------- |
| `--dataset` | Path to the folder containing the dataset. | `str` or `uuid` | |
| `--image-format` | Format under which the images were generated. | `str` | |
| `--keep-spaces` | Transcriptions are trimmed by default. Use this flag to disable this behaviour. | `str` | |
## Examples
### Format dataset with PNG images
```shell
teklia-dan dataset format \
--dataset path/to/dataset \
--image-format png
```
The created files will be stored at the root of your dataset.
......@@ -4,31 +4,32 @@ Use the `teklia-dan predict` command to apply a trained DAN model on an image.
## Description of parameters
| Parameter | Description | Type | Default |
| --------------------------- | -------------------------------------------------------------------------------------------- | ------- | ------------- |
| `--image` | Path to the image to predict. Must not be provided with `--image-dir`. | `Path` | |
| `--image-dir` | Path to the folder where the images to predict are stored. Must not be provided with `--image`. | `Path` | |
| `--image-extension` | The extension of the images in the folder. Ignored if `--image-dir` is not provided. | `str` | .jpg |
| `--model` | Path to the model to use for prediction | `Path` | |
| `--parameters` | Path to the YAML parameters file. | `Path` | |
| `--charset` | Path to the charset file. | `Path` | |
| `--output` | Path to the output folder. Results will be saved in this directory. | `Path` | |
| `--confidence-score` | Whether to return confidence scores. | `bool` | `False` |
| `--confidence-score-levels` | Level to return confidence scores. Should be any combination of `["line", "word", "char"]`. | `str` | |
| `--attention-map` | Whether to plot attention maps. | `bool` | `False` |
| `--attention-map-scale` | Image scaling factor before creating the GIF. | `float` | `0.5` |
| `--attention-map-level` | Level to plot the attention maps. Should be in `["line", "word", "char"]`. | `str` | `"line"` |
| `--predict-objects` | Whether to return polygons coordinates. | `bool` | `False` |
| `--word-separators` | List of word separators. | `list` | `[" ", "\n"]` |
| `--line-separators` | List of line separators. | `list` | `["\n"]` |
| `--threshold-method` | Method to use for attention mask thresholding. Should be in `["otsu", "simple"]`. | `str` | `"otsu"` |
| `--threshold-value ` | Threshold to use for the "simple" thresholding method. | `int` | `0` |
| Parameter | Description | Type | Default |
| --------------------------- | ----------------------------------------------------------------------------------------------- | ------- | ------------- |
| `--image` | Path to the image to predict. Must not be provided with `--image-dir`. | `Path` | |
| `--image-dir` | Path to the folder where the images to predict are stored. Must not be provided with `--image`. | `Path` | |
| `--image-extension` | The extension of the images in the folder. Ignored if `--image-dir` is not provided. | `str` | .jpg |
| `--model` | Path to the model to use for prediction | `Path` | |
| `--parameters` | Path to the YAML parameters file. | `Path` | |
| `--charset` | Path to the charset file. | `Path` | |
| `--output` | Path to the output folder. Results will be saved in this directory. | `Path` | |
| `--confidence-score` | Whether to return confidence scores. | `bool` | `False` |
| `--confidence-score-levels` | Level to return confidence scores. Should be any combination of `["line", "word", "char"]`. | `str` | |
| `--attention-map` | Whether to plot attention maps. | `bool` | `False` |
| `--attention-map-scale` | Image scaling factor before creating the GIF. | `float` | `0.5` |
| `--attention-map-level` | Level to plot the attention maps. Should be in `["line", "word", "char"]`. | `str` | `"line"` |
| `--predict-objects` | Whether to return polygons coordinates. | `bool` | `False` |
| `--word-separators` | List of word separators. | `list` | `[" ", "\n"]` |
| `--line-separators` | List of line separators. | `list` | `["\n"]` |
| `--threshold-method` | Method to use for attention mask thresholding. Should be in `["otsu", "simple"]`. | `str` | `"otsu"` |
| `--threshold-value ` | Threshold to use for the "simple" thresholding method. | `int` | `0` |
## Examples
### Predict with confidence scores
To run a prediction with confidence scores, run this command:
```shell
teklia-dan predict \
--image dan_humu_page/example.jpg \
......@@ -38,12 +39,13 @@ teklia-dan predict \
--output dan_humu_page/predict/ \
--confidence-score
```
It will create the following JSON file named `dan_humu_page/predict/example.json`
```json
{
"text": "Hansteensgt. 2 IV 28/4 - 19\nKj\u00e6re Gerhard.\nTak for Brevet om Boken og Haven\nog Crokus og Blaaveis og tak fordi\nDu vilde be mig derut sammen\nmed Kris og Ragna. Men vet Du\nda ikke, at Kris reiste med sin S\u00f8-\nster Fru Cr\u00f8ger til Lillehammer\nnogle Dage efter Begravelsen? Hen\ndes Address er Amtsingeni\u00f8r\nCr\u00f8ger. Hun skriver at de blir\nder til lidt ut i Mai. Nu er hun\nnoksaa medtat skj\u00f8nner jeg af Sorg\nog af L\u00e6ngsel, skriver saameget r\u00f8-\nrende om Oluf. Ragna har det\nherligt, skriver hun. Hun er bare\ngla, og det vet jeg, at \"Oluf er gla over,\nder hvor han nu er. Jeg har saa in-\nderlig ondt af hende, og om Du skrev\net Par Ord tror jeg det vilde gj\u00f8re\nhende godt. - Jeg gl\u00e6der mig over,\nat Du har skrevet en Bok, og\njeg er vis paa, at den er god.",
"confidence": 0.99
"text": "Hansteensgt. 2 IV 28/4 - 19\nKj\u00e6re Gerhard.\nTak for Brevet om Boken og Haven\nog Crokus og Blaaveis og tak fordi\nDu vilde be mig derut sammen\nmed Kris og Ragna. Men vet Du\nda ikke, at Kris reiste med sin S\u00f8-\nster Fru Cr\u00f8ger til Lillehammer\nnogle Dage efter Begravelsen? Hen\ndes Address er Amtsingeni\u00f8r\nCr\u00f8ger. Hun skriver at de blir\nder til lidt ut i Mai. Nu er hun\nnoksaa medtat skj\u00f8nner jeg af Sorg\nog af L\u00e6ngsel, skriver saameget r\u00f8-\nrende om Oluf. Ragna har det\nherligt, skriver hun. Hun er bare\ngla, og det vet jeg, at \"Oluf er gla over,\nder hvor han nu er. Jeg har saa in-\nderlig ondt af hende, og om Du skrev\net Par Ord tror jeg det vilde gj\u00f8re\nhende godt. - Jeg gl\u00e6der mig over,\nat Du har skrevet en Bok, og\njeg er vis paa, at den er god.",
"confidence": 0.99
}
```
......@@ -66,9 +68,9 @@ It will create the following JSON file named `dan_humu_page/predict/example.json
```json
{
"text": "Hansteensgt. 2 IV 28/4 - 19\nKj\u00e6re Gerhard.\nTak for Brevet om Boken og Haven\nog Crokus og Blaaveis og tak fordi\nDu vilde be mig derut sammen\nmed Kris og Ragna. Men vet Du\nda ikke, at Kris reiste med sin S\u00f8-\nster Fru Cr\u00f8ger til Lillehammer\nnogle Dage efter Begravelsen? Hen\ndes Address er Amtsingeni\u00f8r\nCr\u00f8ger. Hun skriver at de blir\nder til lidt ut i Mai. Nu er hun\nnoksaa medtat skj\u00f8nner jeg af Sorg\nog af L\u00e6ngsel, skriver saameget r\u00f8-\nrende om Oluf. Ragna har det\nherligt, skriver hun. Hun er bare\ngla, og det vet jeg, at \"Oluf er gla over,\nder hvor han nu er. Jeg har saa in-\nderlig ondt af hende, og om Du skrev\net Par Ord tror jeg det vilde gj\u00f8re\nhende godt. - Jeg gl\u00e6der mig over,\nat Du har skrevet en Bok, og\njeg er vis paa, at den er god.",
"confidence": 0.99,
"attention_gif": "dan_humu_page/predict/example_line.gif"
"text": "Hansteensgt. 2 IV 28/4 - 19\nKj\u00e6re Gerhard.\nTak for Brevet om Boken og Haven\nog Crokus og Blaaveis og tak fordi\nDu vilde be mig derut sammen\nmed Kris og Ragna. Men vet Du\nda ikke, at Kris reiste med sin S\u00f8-\nster Fru Cr\u00f8ger til Lillehammer\nnogle Dage efter Begravelsen? Hen\ndes Address er Amtsingeni\u00f8r\nCr\u00f8ger. Hun skriver at de blir\nder til lidt ut i Mai. Nu er hun\nnoksaa medtat skj\u00f8nner jeg af Sorg\nog af L\u00e6ngsel, skriver saameget r\u00f8-\nrende om Oluf. Ragna har det\nherligt, skriver hun. Hun er bare\ngla, og det vet jeg, at \"Oluf er gla over,\nder hvor han nu er. Jeg har saa in-\nderlig ondt af hende, og om Du skrev\net Par Ord tror jeg det vilde gj\u00f8re\nhende godt. - Jeg gl\u00e6der mig over,\nat Du har skrevet en Bok, og\njeg er vis paa, at den er god.",
"confidence": 0.99,
"attention_gif": "dan_humu_page/predict/example_line.gif"
}
```
......@@ -95,13 +97,13 @@ It will create the following JSON file named `dan_humu_page/predict/example.json
```json
{
"text": "Hansteensgt. 2 IV 28/4 - 19\nKj\u00e6re Gerhard.\nTak for Brevet om Boken og Haven\nog Crokus og Blaaveis og tak fordi\nDu vilde be mig derut sammen\nmed Kris og Ragna. Men vet Du\nda ikke, at Kris reiste med sin S\u00f8-\nster Fru Cr\u00f8ger til Lillehammer\nnogle Dage efter Begravelsen? Hen\ndes Address er Amtsingeni\u00f8r\nCr\u00f8ger. Hun skriver at de blir\nder til lidt ut i Mai. Nu er hun\nnoksaa medtat skj\u00f8nner jeg af Sorg\nog af L\u00e6ngsel, skriver saameget r\u00f8-\nrende om Oluf. Ragna har det\nherligt, skriver hun. Hun er bare\ngla, og det vet jeg, at \"Oluf er gla over,\nder hvor han nu er. Jeg har saa in-\nderlig ondt af hende, og om Du skrev\net Par Ord tror jeg det vilde gj\u00f8re\nhende godt. - Jeg gl\u00e6der mig over,\nat Du har skrevet en Bok, og\njeg er vis paa, at den er god.",
"confidence": 0.99,
"attention_gif": "dan_humu_page/predict/example_word.gif"
"text": "Hansteensgt. 2 IV 28/4 - 19\nKj\u00e6re Gerhard.\nTak for Brevet om Boken og Haven\nog Crokus og Blaaveis og tak fordi\nDu vilde be mig derut sammen\nmed Kris og Ragna. Men vet Du\nda ikke, at Kris reiste med sin S\u00f8-\nster Fru Cr\u00f8ger til Lillehammer\nnogle Dage efter Begravelsen? Hen\ndes Address er Amtsingeni\u00f8r\nCr\u00f8ger. Hun skriver at de blir\nder til lidt ut i Mai. Nu er hun\nnoksaa medtat skj\u00f8nner jeg af Sorg\nog af L\u00e6ngsel, skriver saameget r\u00f8-\nrende om Oluf. Ragna har det\nherligt, skriver hun. Hun er bare\ngla, og det vet jeg, at \"Oluf er gla over,\nder hvor han nu er. Jeg har saa in-\nderlig ondt af hende, og om Du skrev\net Par Ord tror jeg det vilde gj\u00f8re\nhende godt. - Jeg gl\u00e6der mig over,\nat Du har skrevet en Bok, og\njeg er vis paa, at den er god.",
"confidence": 0.99,
"attention_gif": "dan_humu_page/predict/example_word.gif"
}
```
<img src="../../assets/example_word.gif" >
<img src="../../assets/example_word.gif" >
### Predict with line-level attention maps and extract polygons
......@@ -123,33 +125,33 @@ It will create the following JSON file named `dan_humu_page/predict/example.json
```json
{
"text": "Oslo\n39 \nOresden den 24te Rasser!\nH\u00f8jst\u00e6redesherr Hartvig - assert!\nUllereder fra den f\u00f8rste tide da\njeg havder den tilfredsstillelser at vide den ar-\ndistiske ledelser af Kristiania theater i Deres\nhronder, har jeg g\u00e5t hernede med et stille\nh\u00e5b om fra Dem at modtage et forelag, sig -\nsende tils at lade \"K\u00e6rlighedens \u00abKomedie\u00bb\nopf\u00f8re fore det norske purblikum.\nEt s\u00e5dant forslag er imidlertid, imod\nforventning; ikke fremkommet, og jeg n\u00f8des der-\nfor tils self at grivbe initiativet, hvilket hervede\nsker, idet jeg\nbeder\nbet\nragte stigkket some ved denne\nskrivelse officielde indleveret til theatret. No-\nget exemplar af bogen vedlagger jeg ikke da\ndenne (i 2den udgave) med Lethed kan er -\nholdet deroppe.\nDe bet\u00e6nkeligheder, jeg i sin tid n\u00e6-\nrede mod stykkets opf\u00f8relse, er for l\u00e6nge si -\ndem forsvundne. Af mange begn er jeg kom-\nmen til den overbevisning at almenlreden\naru har f\u00e5tt sine \u00f8gne opladte for den sand -\nMed at dette arbejde i sin indersten id\u00e9 hviler\np\u00e5 et ubedinget meralsk grundlag, og brad\nstykkets hele kunstneriske struktuve ang\u00e5r,",
"objects": [
{
"confidence": 0.68,
"polygon": [
[
264,
118
],
[
410,
118
],
[
410,
185
],
[
264,
185
]
],
"text": "Oslo",
"text_confidence": 0.8
},
...
"attention_gif": "dan_humu_page/predict/example_line.gif"
"text": "Oslo\n39 \nOresden den 24te Rasser!\nH\u00f8jst\u00e6redesherr Hartvig - assert!\nUllereder fra den f\u00f8rste tide da\njeg havder den tilfredsstillelser at vide den ar-\ndistiske ledelser af Kristiania theater i Deres\nhronder, har jeg g\u00e5t hernede med et stille\nh\u00e5b om fra Dem at modtage et forelag, sig -\nsende tils at lade \"K\u00e6rlighedens \u00abKomedie\u00bb\nopf\u00f8re fore det norske purblikum.\nEt s\u00e5dant forslag er imidlertid, imod\nforventning; ikke fremkommet, og jeg n\u00f8des der-\nfor tils self at grivbe initiativet, hvilket hervede\nsker, idet jeg\nbeder\nbet\nragte stigkket some ved denne\nskrivelse officielde indleveret til theatret. No-\nget exemplar af bogen vedlagger jeg ikke da\ndenne (i 2den udgave) med Lethed kan er -\nholdet deroppe.\nDe bet\u00e6nkeligheder, jeg i sin tid n\u00e6-\nrede mod stykkets opf\u00f8relse, er for l\u00e6nge si -\ndem forsvundne. Af mange begn er jeg kom-\nmen til den overbevisning at almenlreden\naru har f\u00e5tt sine \u00f8gne opladte for den sand -\nMed at dette arbejde i sin indersten id\u00e9 hviler\np\u00e5 et ubedinget meralsk grundlag, og brad\nstykkets hele kunstneriske struktuve ang\u00e5r,",
"objects": [
{
"confidence": 0.68,
"polygon": [
[
264,
118
],
[
410,
118
],
[
410,
185
],
[
264,
185
]
],
"text": "Oslo",
"text_confidence": 0.8
}
],
"attention_gif": "dan_humu_page/predict/example_line.gif"
}
```
......
......@@ -15,6 +15,7 @@ This page lists data augmentation transforms used in DAN.
| CPU time (seconds/10 images) | 0.44 (3013x128 pixels) / 0.86 (1116x581 pixels) |
### PieceWise Affine
:warning: This transform is temporarily removed from the pipeline until [this issue](https://github.com/albumentations-team/albumentations/issues/1442) is fixed.
| | PieceWise Affine |
......@@ -127,10 +128,10 @@ This page lists data augmentation transforms used in DAN.
## Full augmentation pipeline
* Data augmentation is applied with a probability of 0.9.
* In this case, two transformations are randomly selected to be applied.
* Reproducibility is possible by setting `random.seed` and `np.random.seed` (already done in `dan/ocr/document/train.py`)
* Examples with new pipeline:
- Data augmentation is applied with a probability of 0.9.
- In this case, two transformations are randomly selected to be applied.
- Reproducibility is possible by setting `random.seed` and `np.random.seed` (already done in `dan/ocr/document/train.py`)
- Examples with new pipeline:
![](../../assets/augmentations/line_full_pipeline.png)
![](../../assets/augmentations/document_full_pipeline.png)
......
......@@ -9,10 +9,10 @@ Use the `teklia-dan train document` command to train a new DAN model. It is able
To train DAN on documents:
1. Set your training configuration in `dan/ocr/document/train.py`. Refer to the [dedicated section](parameters.md) for a description of parameters.
2. Run `teklia-dan train document`.
3. Look into evaluation results in the `output` folder:
* `checkpoints` contains model weights for the last trained epoch and for the epoch giving the best valid CER.
* `results` contains the tensorboard log file, the parameters file, and the evaluation results for the best epoch.
1. Run `teklia-dan train document`.
1. Look into evaluation results in the `output` folder:
- `checkpoints` contains model weights for the last trained epoch and for the epoch giving the best valid CER.
- `results` contains the tensorboard log file, the parameters file, and the evaluation results for the best epoch.
### Line
......@@ -20,5 +20,5 @@ To train DAN on lines, run `teklia-dan train document` with a line dataset.
## Additional pages
* [Jean Zay tutorial](jeanzay.md)
* [Data augmentation](augmentation.md)
- [Jean Zay tutorial](jeanzay.md)
- [Data augmentation](augmentation.md)
......@@ -3,13 +3,15 @@
See the [wiki](https://redmine.teklia.com/projects/research/wiki/Jean_Zay) for more details.
## Run a training job
Warning: there is no HTTP connection during a job.
You can debug using an interactive job. The following command will get you a new terminal with 1 gpu for 1 hour: `srun --ntasks=1 --cpus-per-task=40 --gres=gpu:1 --time=01:00:00 --qos=qos_gpu-dev --pty bash -i`.
You should run the actual training using a passive/batch job:
* Run `sbatch train_dan.sh`.
* The `train_dan.sh` file should look like the example below.
- Run `sbatch train_dan.sh`.
- The `train_dan.sh` file should look like the example below.
```sh
#!/bin/bash
......@@ -39,7 +41,9 @@ teklia-dan train document
```
## Supervise a job
* Use `squeue -u $USER`. This command should give an output similar to the one presented below.
- Use `squeue -u $USER`. This command should give an output similar to the one presented below.
```
(base) [ubz97wr@jean-zay1: ubz97wr]$ squeue -u $USER
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
......@@ -48,5 +52,6 @@ teklia-dan train document
```
## Delete a job
* Use `scancel $JOBID` to cancel a specific job.
* Use `scancel -u $USER` to cancel all your jobs.
- Use `scancel $JOBID` to cancel a specific job.
- Use `scancel -u $USER` to cancel all your jobs.
......@@ -4,20 +4,20 @@ All hyperparameters are specified and editable in the training scripts `dan/ocr/
## Dataset parameters
| Parameter | Description | Type | Default |
| --------------------------------------- | -------------------------------------------------------------------------------------- | ------------ | ---------------------------------------------------- |
| `dataset_name` | Name of the dataset. | `str` | |
| `dataset_level` | Level of the dataset. Should be named after the element type. | `str` | |
| `dataset_variant` | Variant of the dataset. Usually empty for HTR datasets, `"_sem"` for HTR+NER datasets. | `str` | |
| `dataset_path` | Path to the dataset. | `str` | |
| `dataset_params.config.load_in_memory` | Load all images in CPU memory. | `bool` | `True` |
| `dataset_params.config.worker_per_gpu` | Number of parallel processes per gpu for data loading. | `int` | `4` |
| `dataset_params.config.preprocessings` | List of pre-processing functions to apply to input images. | `list` | (see [dedicated section](#data-preprocessing)) |
| `dataset_params.config.augmentation` | Whether to use data augmentation on the training set. | `bool` | `True` (see [dedicated section](#data-augmentation)) |
| Parameter | Description | Type | Default |
| -------------------------------------- | -------------------------------------------------------------------------------------- | ------ | ---------------------------------------------------- |
| `dataset_name` | Name of the dataset. | `str` | |
| `dataset_level` | Level of the dataset. Should be named after the element type. | `str` | |
| `dataset_variant` | Variant of the dataset. Usually empty for HTR datasets, `"_sem"` for HTR+NER datasets. | `str` | |
| `dataset_path` | Path to the dataset. | `str` | |
| `dataset_params.config.load_in_memory` | Load all images in CPU memory. | `bool` | `True` |
| `dataset_params.config.worker_per_gpu` | Number of parallel processes per gpu for data loading. | `int` | `4` |
| `dataset_params.config.preprocessings` | List of pre-processing functions to apply to input images. | `list` | (see [dedicated section](#data-preprocessing)) |
| `dataset_params.config.augmentation` | Whether to use data augmentation on the training set. | `bool` | `True` (see [dedicated section](#data-augmentation)) |
!!! warning
The variables `dataset_name`, `dataset_level`, `dataset_variant` and `dataset_path` must have values such that the data is located in `{dataset_path}/{dataset_name}_{dataset_level}{dataset_variant}`.
The variables `dataset_name`, `dataset_level`, `dataset_variant` and `dataset_path` must have values such that the data is located in `{dataset_path}/{dataset_name}_{dataset_level}{dataset_variant}`.
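For example, with hypothetical values (a sketch, not configuration taken from this repository), the expected dataset location is composed as follows:
```py
# Hypothetical example values, for illustration only
dataset_name = "my_dataset"
dataset_level = "page"
dataset_variant = "_sem"
dataset_path = "/data"

# The extracted and formatted data must then be located in /data/my_dataset_page_sem
data_folder = f"{dataset_path}/{dataset_name}_{dataset_level}{dataset_variant}"
```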
### Data preprocessing
......@@ -35,7 +35,7 @@ class Preprocessing(Enum):
Usage:
* Resize to a fixed height
- Resize to a fixed height
```py
[
......@@ -46,7 +46,7 @@ Usage:
]
```
* Resize to a fixed width
- Resize to a fixed width
```py
[
......@@ -57,7 +57,7 @@ Usage:
]
```
* Resize to a maximum size (only if the image is bigger than the given size)
- Resize to a maximum size (only if the image is bigger than the given size)
```py
[
......@@ -69,7 +69,7 @@ Usage:
]
```
* Combine these pre-processings
- Combine these pre-processings
```py
[
......@@ -115,30 +115,29 @@ For a detailed description of all augmentation transforms, see the [dedicated pa
## Model parameters
| Name | Description | Type | Default |
| ----------------------------------------- | ------------------------------------------------------------------------------------ | ------------- | ----------------------------------------------------------------- |
| `model_params.models.encoder` | Encoder class. | custom class | `FCN_encoder` |
| `model_params.models.decoder` | Decoder class. | custom class | `GlobalHTADecoder` |
| `model_params.transfer_learning.encoder` | Model to load for the encoder [state_dict_name, checkpoint_path, learnable, strict]. | `list` | `["encoder", "pretrained_models/dan_rimes_page.pt", True, True]` |
| `model_params.transfer_learning.decoder` | Model to load for the decoder [state_dict_name, checkpoint_path, learnable, strict]. | `list` | `["encoder", "pretrained_models/dan_rimes_page.pt", True, False]` |
| `model_params.transfered_charset` | Transfer learning of the decision layer based on charset of the model to transfer. | `bool` | `True` |
| `model_params.additional_tokens` | For decision layer = [<eot>, ], only for transferred charset. | `int` | `1` |
| `model_params.dropout` | Dropout probability in the encoder. | `float` | `0.5` |
| `model_params.enc_dim` | Dimension of features extracted by the encoder. | `int` | `256` |
| `model_params.nb_layers` | Number of layers in the encoder. | `int` | `5` |
| `model_params.h_max` | Maximum height for encoder output (for 2D positional embedding). | `int` | `500` |
| `model_params.w_max` | Maximum width for encoder output (for 2D positional embedding). | `int` | `1000` |
| `model_params.l_max` | Maximum predicted sequence length (for 1D positional embedding). | `int` | `15000` |
| `model_params.dec_num_layers` | Number of transformer decoder layers. | `int` | `8` |
| `model_params.dec_num_heads` | Number of heads in transformer decoder layers. | `int` | `4` |
| `model_params.dec_res_dropout` | Dropout probability in transformer decoder layers. | `int` | `0.1` |
| `model_params.dec_pred_dropout` | Dropout rate before decision layer. | `float` | `0.1` |
| `model_params.dec_att_dropout` | Dropout rate in multi head attention. | `float` | `0.1` |
| `model_params.dec_dim_feedforward` | Number of dimensions for feedforward layer in transformer decoder layers. | `int` | `256` |
| `model_params.attention_win` | Length of attention window. | `int` | `100` |
| `model_params.dropout_scheduler.function` | Curriculum dropout scheduler. | custom class | `exponential_dropout_scheduler` |
| `model_params.dropout_scheduler.T` | Exponential factor. | `float` | `5e4` |
| Name | Description | Type | Default |
| ----------------------------------------- | -------------------------------------------------------------------------------------- | ------------ | ----------------------------------------------------------------- |
| `model_params.models.encoder` | Encoder class. | custom class | `FCN_encoder` |
| `model_params.models.decoder` | Decoder class. | custom class | `GlobalHTADecoder` |
| `model_params.transfer_learning.encoder` | Model to load for the encoder \[state_dict_name, checkpoint_path, learnable, strict\]. | `list` | `["encoder", "pretrained_models/dan_rimes_page.pt", True, True]` |
| `model_params.transfer_learning.decoder` | Model to load for the decoder \[state_dict_name, checkpoint_path, learnable, strict\]. | `list` | `["encoder", "pretrained_models/dan_rimes_page.pt", True, False]` |
| `model_params.transfered_charset` | Transfer learning of the decision layer based on charset of the model to transfer. | `bool` | `True` |
| `model_params.additional_tokens` | For decision layer = \[<eot>, \], only for transferred charset. | `int` | `1` |
| `model_params.dropout` | Dropout probability in the encoder. | `float` | `0.5` |
| `model_params.enc_dim` | Dimension of features extracted by the encoder. | `int` | `256` |
| `model_params.nb_layers` | Number of layers in the encoder. | `int` | `5` |
| `model_params.h_max` | Maximum height for encoder output (for 2D positional embedding). | `int` | `500` |
| `model_params.w_max` | Maximum width for encoder output (for 2D positional embedding). | `int` | `1000` |
| `model_params.l_max` | Maximum predicted sequence length (for 1D positional embedding). | `int` | `15000` |
| `model_params.dec_num_layers` | Number of transformer decoder layers. | `int` | `8` |
| `model_params.dec_num_heads` | Number of heads in transformer decoder layers. | `int` | `4` |
| `model_params.dec_res_dropout` | Dropout probability in transformer decoder layers. | `int` | `0.1` |
| `model_params.dec_pred_dropout` | Dropout rate before decision layer. | `float` | `0.1` |
| `model_params.dec_att_dropout` | Dropout rate in multi head attention. | `float` | `0.1` |
| `model_params.dec_dim_feedforward` | Number of dimensions for feedforward layer in transformer decoder layers. | `int` | `256` |
| `model_params.attention_win` | Length of attention window. | `int` | `100` |
| `model_params.dropout_scheduler.function` | Curriculum dropout scheduler. | custom class | `exponential_dropout_scheduler` |
| `model_params.dropout_scheduler.T` | Exponential factor. | `float` | `5e4` |
## Training parameters
......@@ -151,11 +150,11 @@ For a detailed description of all augmentation transforms, see the [dedicated pa
| `training_params.batch_size` | Mini-batch size for the training loop. | `int` | `2` |
| `training_params.use_ddp` | Whether to use DistributedDataParallel. | `bool` | `False` |
| `training_params.ddp_port` | DDP port. | `int` | `20027` |
| `training_params.use_amp` | Whether to enable automatic mix-precision. | `bool` | `True` |
| `training_params.nb_gpu` | Number of GPUs to train DAN. | `int` | `torch.cuda.device_count()` |
| `training_params.use_amp` | Whether to enable automatic mixed precision. | `bool` | `True` |
| `training_params.nb_gpu` | Number of GPUs to train DAN. | `int` | `torch.cuda.device_count()` |
| `training_params.optimizers.all.class` | Optimizer class. | custom class | `Adam` |
| `training_params.optimizers.all.args.lr` | Learning rate for the optimizer. | `float` | `0.0001` |
| `training_params.optimizers.all.args.amsgrad` | Whether to use AMSGrad optimization. | `bool` | `False` |
| `training_params.optimizers.all.args.amsgrad` | Whether to use AMSGrad optimization. | `bool` | `False` |
| `training_params.lr_schedulers` | Learning rate schedulers. | custom class | `None` |
| `training_params.eval_on_valid` | Whether to evaluate and log metrics on the validation set during training. | `bool` | `True` |
| `training_params.eval_on_valid_interval` | Interval (in epochs) to evaluate during training. | `int` | `5` |
......
# -*- coding: utf-8 -*-
import json
import shutil
import pytest
......@@ -270,9 +271,172 @@ def test_run_prediction(
threshold_value=0,
image_extension=None,
gpu_device=None,
batch_size=1,
)
with (tmp_path / image_name).with_suffix(".json").open("r") as json_file:
prediction = json.load(json_file)
prediction = json.loads((tmp_path / image_name).with_suffix(".json").read_text())
assert prediction == expected_prediction
@pytest.mark.parametrize(
"image_names, confidence_score, temperature, expected_predictions",
(
(
["0a56e8b3-95cd-4fa5-a17b-5b0ff9e6ea84"],
None,
1.0,
[{"text": "ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier ⓅPatron?12241"}],
),
(
["0a56e8b3-95cd-4fa5-a17b-5b0ff9e6ea84"],
["word"],
1.0,
[
{
"text": "ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier ⓅPatron?12241",
"confidences": {
"by ner token": [],
"total": 1.0,
"word": [
{"text": "ⓈBellisson", "confidence": 1.0},
{"text": "ⒻGeorges", "confidence": 1.0},
{"text": "Ⓑ91", "confidence": 1.0},
{"text": "ⓁP", "confidence": 1.0},
{"text": "ⒸM", "confidence": 1.0},
{"text": "ⓀCh", "confidence": 1.0},
{"text": "ⓄPlombier", "confidence": 1.0},
{"text": "ⓅPatron?12241", "confidence": 1.0},
],
},
}
],
),
(
[
"0a56e8b3-95cd-4fa5-a17b-5b0ff9e6ea84",
"0a56e8b3-95cd-4fa5-a17b-5b0ff9e6ea84",
],
["word"],
1.0,
[
{
"text": "ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier ⓅPatron?12241",
"confidences": {
"by ner token": [],
"total": 1.0,
"word": [
{"text": "ⓈBellisson", "confidence": 1.0},
{"text": "ⒻGeorges", "confidence": 1.0},
{"text": "Ⓑ91", "confidence": 1.0},
{"text": "ⓁP", "confidence": 1.0},
{"text": "ⒸM", "confidence": 1.0},
{"text": "ⓀCh", "confidence": 1.0},
{"text": "ⓄPlombier", "confidence": 1.0},
{"text": "ⓅPatron?12241", "confidence": 1.0},
],
},
},
{
"text": "ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier ⓅPatron?12241",
"confidences": {
"by ner token": [],
"total": 1.0,
"word": [
{"text": "ⓈBellisson", "confidence": 1.0},
{"text": "ⒻGeorges", "confidence": 1.0},
{"text": "Ⓑ91", "confidence": 1.0},
{"text": "ⓁP", "confidence": 1.0},
{"text": "ⒸM", "confidence": 1.0},
{"text": "ⓀCh", "confidence": 1.0},
{"text": "ⓄPlombier", "confidence": 1.0},
{"text": "ⓅPatron?12241", "confidence": 1.0},
],
},
},
],
),
(
["0a56e8b3-95cd-4fa5-a17b-5b0ff9e6ea84"],
["word"],
1.0,
[
{
"text": "ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier ⓅPatron?12241",
"confidences": {
"by ner token": [],
"total": 1.0,
"word": [
{"text": "ⓈBellisson", "confidence": 1.0},
{"text": "ⒻGeorges", "confidence": 1.0},
{"text": "Ⓑ91", "confidence": 1.0},
{"text": "ⓁP", "confidence": 1.0},
{"text": "ⒸM", "confidence": 1.0},
{"text": "ⓀCh", "confidence": 1.0},
{"text": "ⓄPlombier", "confidence": 1.0},
{"text": "ⓅPatron?12241", "confidence": 1.0},
],
},
}
],
),
(
[
"2c242f5c-e979-43c4-b6f2-a6d4815b651d",
"ffdec445-7f14-4f5f-be44-68d0844d0df1",
],
False,
1.0,
[
{"text": "Ⓢd ⒻCharles Ⓑ11 ⓁP ⒸC ⓀF Ⓞd Ⓟ14 31"},
{"text": "ⓈNaudin ⒻMarie Ⓑ53 ⓁS Ⓒv ⓀBelle mère"},
],
),
),
)
@pytest.mark.parametrize("batch_size", [1, 2])
def test_run_prediction_batch(
image_names,
confidence_score,
temperature,
expected_predictions,
prediction_data_path,
batch_size,
tmp_path,
):
# Make tmpdir and copy needed images inside
image_dir = tmp_path / "images"
image_dir.mkdir()
for image_name in image_names:
shutil.copyfile(
(prediction_data_path / "images" / image_name).with_suffix(".png"),
(image_dir / image_name).with_suffix(".png"),
)
run_prediction(
image=None,
image_dir=image_dir,
model=prediction_data_path / "popp_line_model.pt",
parameters=prediction_data_path / "parameters.yml",
charset=prediction_data_path / "charset.pkl",
output=tmp_path,
confidence_score=True if confidence_score else False,
confidence_score_levels=confidence_score if confidence_score else [],
attention_map=False,
attention_map_level=None,
attention_map_scale=0.5,
word_separators=[" ", "\n"],
line_separators=["\n"],
temperature=temperature,
predict_objects=False,
threshold_method="otsu",
threshold_value=0,
image_extension=".png",
gpu_device=None,
batch_size=batch_size,
)
for image_name, expected_prediction in zip(image_names, expected_predictions):
prediction = json.loads(
(tmp_path / image_name).with_suffix(".json").read_text()
)
assert prediction == expected_prediction