Compare revisions (atr/dan)

Changes are shown as if the source revision was being merged into the target revision.
Commits on Source (4)
......@@ -50,10 +50,18 @@ def download_image(url):
Download an image and open it with Pillow
"""
assert url.startswith("http"), "Image URL must be HTTP(S)"
# Download the image
# Cannot use stream=True as urllib's responses do not support the seek(int) method,
# which is explicitly required by Image.open on file-like objects
resp = _retried_request(url)
try:
resp = _retried_request(url)
except requests.HTTPError as e:
if "/full/" in url and 400 <= e.response.status_code < 500:
# Retry with max instead of full as IIIF size
resp = _retried_request(url.replace("/full/", "/max/"))
else:
raise e
# Preprocess the image and prepare it for classification
image = Image.open(BytesIO(resp.content)).convert("RGB")
......
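For readers skimming the diff, here is a self-contained sketch of the fallback this hunk introduces. The body of `_retried_request` below is a hypothetical stand-in for the project's retrying helper; only the `/full/` to `/max/` substitution mirrors the change above.

```python
import requests


def _retried_request(url: str) -> requests.Response:
    # Hypothetical stand-in for the project's retrying helper: any
    # implementation that raises requests.HTTPError on failure fits here.
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    return resp


def fetch_iiif_image(url: str) -> bytes:
    """Download an IIIF image, falling back from /full/ to /max/ on 4xx errors."""
    try:
        resp = _retried_request(url)
    except requests.HTTPError as e:
        # Some IIIF servers reject the legacy "full" size keyword with a client
        # error; the Image API 3.0 spells the same request "max".
        if "/full/" in url and 400 <= e.response.status_code < 500:
            resp = _retried_request(url.replace("/full/", "/max/"))
        else:
            raise
    return resp.content
```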
......@@ -224,6 +224,8 @@ class GenericTrainingManager:
self.best = checkpoint["best"]
if "scaler_state_dict" in checkpoint:
self.scaler.load_state_dict(checkpoint["scaler_state_dict"])
if "dropout_scheduler_step" in checkpoint:
self.dropout_scheduler.resume(checkpoint["dropout_scheduler_step"])
# Load model weights from past training
for model_name in self.models:
# Transform to DDP/from DDP model
......@@ -412,6 +414,7 @@ class GenericTrainingManager:
"scaler_state_dict": self.scaler.state_dict(),
"best": self.best,
"charset": self.dataset.charset,
"dropout_scheduler_step": self.dropout_scheduler.step_num,
}
for model_name in self.optimizers:
......
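The two hunks above form a save/resume pair: the dropout scheduler's position is written into the checkpoint and restored on load, and the `in checkpoint` guard keeps older checkpoints loadable. A minimal sketch of that round-trip, using a toy scheduler with the same `step_num`/`resume` interface as the `DropoutScheduler` change further below:

```python
import torch


class TinyDropoutScheduler:
    # Toy stand-in exposing the same step_num / resume interface.
    def __init__(self):
        self.step_num = 0

    def resume(self, step_num):
        self.step_num = step_num


scheduler = TinyDropoutScheduler()
scheduler.step_num = 500  # pretend training advanced by 500 steps

# Save: the scheduler position travels inside the checkpoint dict.
torch.save({"dropout_scheduler_step": scheduler.step_num}, "checkpoint.pt")

# Load: the guard keeps checkpoints written before this change loadable.
checkpoint = torch.load("checkpoint.pt")
restored = TinyDropoutScheduler()
if "dropout_scheduler_step" in checkpoint:
    restored.resume(checkpoint["dropout_scheduler_step"])
assert restored.step_num == 500
```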
......@@ -84,7 +84,7 @@ def add_predict_parser(subcommands) -> None:
default=[],
type=Level,
nargs="+",
help="Levels of confidence scores. Should be a list of any combinaison of ['char', 'word', 'line'].",
help=f"Levels of confidence scores. Should be a list of any combinaison of {list(map(str, Level))}.",
required=False,
)
parser.add_argument(
......@@ -97,7 +97,7 @@ def add_predict_parser(subcommands) -> None:
"--attention-map-level",
type=Level,
default=Level.Line,
help="Level to plot the attention maps. Should be in ['line', 'word', 'char'].",
help=f"Level to plot the attention maps. Should be in {list(map(str, Level))}.",
required=False,
)
parser.add_argument(
......
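Both help strings now interpolate `list(map(str, Level))`, which depends on the `__str__` override added to `Level` below. A quick illustration of why that override matters for the rendered help text (the enum definition mirrors the one in this diff):

```python
from enum import Enum


class Level(str, Enum):
    Char = "char"
    Word = "word"
    Line = "line"
    NER = "ner"

    def __str__(self):
        return self.value


# Without the override, str(Level.Char) falls back to the default enum
# rendering ("Level.Char"); with it, the help text lists the actual CLI
# values, including the new "ner" level.
print(list(map(str, Level)))  # ['char', 'word', 'line', 'ner']
```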
......@@ -2,8 +2,8 @@
import logging
import re
from enum import Enum
from operator import attrgetter
from typing import List, Tuple
from itertools import pairwise
from typing import Dict, List, Tuple
import cv2
import numpy as np
......@@ -11,6 +11,8 @@ import torch
from PIL import Image
from torchvision.transforms.functional import to_pil_image
from dan.utils import EntityType
logger = logging.getLogger(__name__)
......@@ -18,12 +20,87 @@ class Level(str, Enum):
Char = "char"
Word = "word"
Line = "line"
NER = "ner"
def __str__(self):
return self.value
def parse_delimiters(delimiters: List[str]) -> re.Pattern:
return re.compile(r"|".join(delimiters))
def build_ner_indices(
text: str, tokens: Dict[str, EntityType]
) -> List[Tuple[int, int]]:
"""
Compute the positions of NER tokens in the text.
:param text: Text to search for NER tokens.
:param tokens: NER tokens used.
Returns a list of indices where the tokens are located.
"""
start_tokens, end_tokens = zip(*list(tokens.values()))
end_tokens = list(filter(bool, end_tokens))
if len(end_tokens):
assert len(start_tokens) == len(
end_tokens
), "You don't have the same number of starting tokens and ending tokens"
return [
[pos_start, pos_end] for pos_start, pos_end in zip(start_tokens, end_tokens)
]
return list(
pairwise(
[pos for pos, char in enumerate(text) if char in start_tokens] + [None]
)
)
def compute_offsets_by_level(
level: Level, text_list: List[str], indices: List[Tuple[int, int]]
):
"""
Compute and return the list of offsets between consecutive text parts.
:param level: Level to use from [char, word, line, ner].
:param text_list: list of text to use.
:param indices: list of indices where tokens are located for NER computation.
Returns a list of offsets.
"""
if level == Level.NER:
return (
[
current - next_token
for (_, next_token), (current, _) in pairwise(indices)
]
# Pad the list to match the length of the text list
+ [0]
)
return [int(level != Level.Char)] * len(text_list)
def compute_prob_by_ner(
characters: str, probabilities: List[float], indices: List[Tuple[int, int]]
) -> Tuple[List[str], List[np.float64]]:
"""
Split text and confidences using indices and average the confidence scores per piece.
:param characters: characters of the predicted text.
:param probabilities: list of character probabilities.
:param indices: list of indices where tokens are located.
Returns the text pieces with their average confidence scores.
"""
return zip(
*[
(
f"{characters[current: next_token]}".replace("\n", " "),
np.mean(probabilities[current:next_token]),
)
for current, next_token in indices
]
)
def compute_prob_by_separator(
characters: str, probabilities: List[float], separator: re.Pattern
) -> Tuple[List[str], List[np.float64]]:
......@@ -45,15 +122,22 @@ def compute_prob_by_separator(
def split_text(
text: str, level: Level, word_separators: re.Pattern, line_separators: re.Pattern
) -> Tuple[List[str], int]:
text: str,
level: Level,
word_separators: re.Pattern,
line_separators: re.Pattern,
tokens: Dict[str, EntityType],
) -> Tuple[List[str], List[int]]:
"""
Split text into a list of characters, words, or lines.
:param text: Text prediction from DAN
:param level: Level to visualize from [char, word, line]
:param level: Level to visualize from [char, word, line, ner]
:param word_separators: List of word separators
:param line_separators: List of line separators
:param tokens: NER tokens used
"""
indices = []
match level:
case Level.Char:
text_split = list(text)
......@@ -63,12 +147,22 @@ def split_text(
# split into lines
case Level.Line:
text_split = re.split(line_separators, text)
# split into entities
case Level.NER:
if not tokens:
logger.error("Cannot compute NER level: tokens not found")
return [], []
indices = build_ner_indices(text, tokens)
text_split = [
f"{text[current: next_token]}".replace("\n", " ")
for current, next_token in indices
]
case _:
choices = ", ".join(list(map(attrgetter("value"), Level)))
logger.error(f"Level should be either {choices}")
logger.error(f"Level should be either {list(map(str, Level))}")
return [], []
offset = int(level != Level.Char)
return text_split, offset
return text_split, compute_offsets_by_level(level, text_split, indices)
def split_text_and_confidences(
......@@ -77,15 +171,19 @@ def split_text_and_confidences(
level: Level,
word_separators: re.Pattern,
line_separators: re.Pattern,
) -> Tuple[List[str], List[np.float64], int]:
tokens: Dict[str, EntityType],
) -> Tuple[List[str], List[np.float64], List[int]]:
"""
Split text into a list of characters, words, or lines with their corresponding confidence scores
:param text: Text prediction from DAN
:param confidences: Character confidences
:param level: Level to visualize from [char, word, line]
:param level: Level to visualize from [char, word, line, ner]
:param word_separators: List of word separators
:param line_separators: List of line separators
:param tokens: NER tokens used
"""
indices = []
match level:
case Level.Char:
texts = list(text)
......@@ -97,12 +195,22 @@ def split_text_and_confidences(
texts, confidences = compute_prob_by_separator(
text, confidences, line_separators
)
case Level.NER:
if not tokens:
logger.error("Cannot compute NER level: tokens not found")
return [], [], []
indices = build_ner_indices(text, tokens)
texts, confidences = compute_prob_by_ner(text, confidences, indices)
case _:
choices = ", ".join(list(map(attrgetter("value"), Level)))
logger.error(f"Level should be either {choices}")
logger.error(f"Level should be either {list(map(str, Level))}")
return [], [], []
offset = int(level != Level.Char)
return texts, [np.around(num, 2) for num in confidences], offset
return (
texts,
[np.around(num, 2) for num in confidences],
compute_offsets_by_level(level, texts, indices),
)
def get_predicted_polygons_with_confidence(
......@@ -117,13 +225,14 @@ def get_predicted_polygons_with_confidence(
max_object_height: int = 50,
word_separators: re.Pattern = parse_delimiters(["\n", " "]),
line_separators: re.Pattern = parse_delimiters(["\n"]),
tokens: Dict[str, EntityType] = {},
) -> List[dict]:
"""
Returns the polygons of each object of the current prediction
:param text: Text predicted by DAN
:param weights: Attention weights of size (n_char, feature_height, feature_width)
:param confidences: Character confidences
:param level: Level to display (must be in [char, word, line])
:param level: Level to display (must be in [char, word, line, ner])
:param height: Original image height
:param width: Original image width
:param threshold_method: Thresholding method. Should be in ["otsu", "simple"]
......@@ -131,16 +240,17 @@ def get_predicted_polygons_with_confidence(
:param max_object_height: Maximum height of predicted objects.
:param word_separators: List of word separators
:param line_separators: List of line separators
:param tokens: NER tokens used
"""
# Split text into characters, words or lines
text_list, confidence_list, offset = split_text_and_confidences(
text, confidences, level, word_separators, line_separators
text_list, confidence_list, offsets = split_text_and_confidences(
text, confidences, level, word_separators, line_separators, tokens
)
max_value = weights.sum(0).max()
polygons = []
start_index = 0
for text_piece, confidence in zip(text_list, confidence_list):
for text_piece, confidence, offset in zip(text_list, confidence_list, offsets):
polygon, _ = get_polygon(
text_piece,
max_value,
......@@ -370,6 +480,7 @@ def plot_attention(
max_object_height: int = 50,
word_separators: re.Pattern = parse_delimiters(["\n", " "]),
line_separators: re.Pattern = parse_delimiters(["\n"]),
tokens: Dict[str, EntityType] = {},
display_polygons: bool = False,
) -> None:
"""
......@@ -377,12 +488,13 @@ def plot_attention(
:param image: Input image as torch.Tensor
:param text: Text predicted by DAN
:param weights: Attention weights of size (n_char, feature_height, feature_width)
:param level: Level to display (must be in [char, word, line])
:param level: Level to display (must be in [char, word, line, ner])
:param scale: Scaling factor for the output gif image
:param outname: Name of the gif image
:param max_object_height: Maximum height of predicted objects.
:param word_separators: List of word separators
:param line_separators: List of line separators
:param tokens: NER tokens used
:param display_polygons: Whether to plot extracted polygons
"""
image = to_pil_image(image)
......@@ -392,13 +504,15 @@ def plot_attention(
mask = Image.new("L", (image.width, image.height), color=(110))
# Split text into characters, words or lines
text_list, offset = split_text(text, level, word_separators, line_separators)
text_list, offsets = split_text(
text, level, word_separators, line_separators, tokens
)
# Iterate on characters, words or lines
tot_len = 0
max_value = weights.sum(0).max()
for text_piece in text_list:
for text_piece, offset in zip(text_list, offsets):
# Accumulate weights for the current word/line and resize to original image size
coverage_vector = compute_coverage(
text_piece, max_value, tot_len, weights, (image.width, image.height)
......@@ -428,6 +542,9 @@ def plot_attention(
# Blend coverage vector with original image
attention_map.append(blend_coverage(coverage_vector, image, mask, scale))
if not attention_map:
return
attention_map[0].save(
outname,
save_all=True,
......
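To make the `build_ner_indices` and `compute_offsets_by_level` logic above concrete, here is a worked example for the start-token-only case. The entity markers are borrowed from the test fixtures further below and are illustrative only:

```python
from itertools import pairwise

text = "ⓈBellisson ⒻGeorges Ⓑ91"
start_tokens = ("Ⓢ", "Ⓕ", "Ⓑ")

# pairwise() turns the token positions, padded with a trailing None, into
# half-open [start, next_start) spans, one per entity.
positions = [pos for pos, char in enumerate(text) if char in start_tokens]
indices = list(pairwise(positions + [None]))
print(indices)  # [(0, 11), (11, 20), (20, None)]

# Slicing with these pairs recovers one entity per span; the final
# text[20:None] slice runs to the end of the string.
print([text[start:stop] for start, stop in indices])
# ['ⓈBellisson ', 'ⒻGeorges ', 'Ⓑ91']

# The NER offsets are the gaps between consecutive spans (zero here, since
# the spans abut), padded with a final 0 to match the number of pieces.
offsets = [start - prev_stop for (_, prev_stop), (start, _) in pairwise(indices)] + [0]
print(offsets)  # [0, 0, 0]
```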
......@@ -4,9 +4,8 @@ import json
import logging
import pickle
import re
from itertools import pairwise
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple
import numpy as np
import torch
......@@ -137,6 +136,7 @@ class DAN:
extract_objects: bool = False,
word_separators: re.Pattern = parse_delimiters(["\n", " "]),
line_separators: re.Pattern = parse_delimiters(["\n"]),
tokens: Dict[str, EntityType] = {},
start_token: str = None,
threshold_method: str = "otsu",
threshold_value: int = 0,
......@@ -149,7 +149,7 @@ class DAN:
:param input_sizes: The original images sizes.
:param confidences: Return the characters probabilities.
:param attentions: Return characters attention weights.
:param attention_level: Level of text pieces (must be in [char, word, line])
:param attention_level: Level of text pieces (must be in [char, word, line, ner])
:param extract_objects: Whether to extract polygons' coordinates.
:param threshold_method: Thresholding method. Should be in ["otsu", "simple"].
:param threshold_value: Thresholding value to use for the "simple" thresholding method.
......@@ -289,24 +289,13 @@ class DAN:
max_object_height=max_object_height,
word_separators=word_separators,
line_separators=line_separators,
tokens=tokens,
)
for i in range(batch_size)
]
return out
def parse_ner_predictions(
text: str, char_confidences: List[float], predictions: Iterable[Tuple[int, int]]
) -> List[dict]:
return [
{
"text": f"{text[current: next_token]}".replace("\n", " "),
"confidence": np.round(np.mean(char_confidences[current:next_token]), 2),
}
for current, next_token in predictions
]
def process_batch(
image_batch: List[Path],
dan_model: DAN,
......@@ -356,6 +345,7 @@ def process_batch(
extract_objects=predict_objects,
word_separators=word_separators,
line_separators=line_separators,
tokens=tokens,
threshold_method=threshold_method,
threshold_value=threshold_value,
max_object_height=max_object_height,
......@@ -383,28 +373,6 @@ def process_batch(
if confidence_score:
result["confidences"] = {}
char_confidences = prediction["confidences"][idx]
text = result["text"]
start_tokens, end_tokens = zip(*list(tokens.values()))
end_tokens = list(filter(bool, end_tokens))
if len(end_tokens):
assert len(start_tokens) == len(
end_tokens
), "You don't have the same number of starting tokens and ending tokens"
indices = [
[pos_start, pos_end]
for pos_start, pos_end in zip(start_tokens, end_tokens)
]
else:
indices = pairwise(
[pos for pos, char in enumerate(text) if char in start_tokens]
+ [None]
)
result["confidences"]["ner"] = parse_ner_predictions(
text, char_confidences, indices
)
result["confidences"]["total"] = np.around(np.mean(char_confidences), 2)
for level in confidence_score_levels:
......@@ -415,6 +383,7 @@ def process_batch(
level,
word_separators,
line_separators,
tokens,
)
for text, conf in zip(texts, confidences):
......@@ -427,7 +396,6 @@ def process_batch(
attentions = prediction["attentions"][idx]
gif_filename = f"{output}/{image_path.stem}_{attention_map_level}.gif"
logger.info(f"Creating attention GIF in {gif_filename}")
# this returns polygons but unused for now.
plot_attention(
image=visu_tensor[idx],
text=predicted_text,
......@@ -436,6 +404,7 @@ def process_batch(
scale=attention_map_scale,
word_separators=word_separators,
line_separators=line_separators,
tokens=tokens,
display_polygons=predict_objects,
threshold_method=threshold_method,
threshold_value=threshold_value,
......
......@@ -17,6 +17,9 @@ class DropoutScheduler:
def step(self, num):
self.step_num += num
def resume(self, step_num):
self.step_num = step_num
def init_teta_list(self, models):
for model_name in models:
self.init_teta_list_module(models[model_name])
......
......@@ -19,7 +19,9 @@ It will create the following JSON file named `dan_humu_page/predict/example.json
```json
{
"text": "Hansteensgt. 2 IV 28/4 - 19\nKj\u00e6re Gerhard.\nTak for Brevet om Boken og Haven\nog Crokus og Blaaveis og tak fordi\nDu vilde be mig derut sammen\nmed Kris og Ragna. Men vet Du\nda ikke, at Kris reiste med sin S\u00f8-\nster Fru Cr\u00f8ger til Lillehammer\nnogle Dage efter Begravelsen? Hen\ndes Address er Amtsingeni\u00f8r\nCr\u00f8ger. Hun skriver at de blir\nder til lidt ut i Mai. Nu er hun\nnoksaa medtat skj\u00f8nner jeg af Sorg\nog af L\u00e6ngsel, skriver saameget r\u00f8-\nrende om Oluf. Ragna har det\nherligt, skriver hun. Hun er bare\ngla, og det vet jeg, at \"Oluf er gla over,\nder hvor han nu er. Jeg har saa in-\nderlig ondt af hende, og om Du skrev\net Par Ord tror jeg det vilde gj\u00f8re\nhende godt. - Jeg gl\u00e6der mig over,\nat Du har skrevet en Bok, og\njeg er vis paa, at den er god.",
"confidence": 0.99
"confidences": {
"total": 0.99
}
}
```
......@@ -43,7 +45,9 @@ It will create the following JSON file named `dan_humu_page/predict/example.json
```json
{
"text": "Hansteensgt. 2 IV 28/4 - 19\nKj\u00e6re Gerhard.\nTak for Brevet om Boken og Haven\nog Crokus og Blaaveis og tak fordi\nDu vilde be mig derut sammen\nmed Kris og Ragna. Men vet Du\nda ikke, at Kris reiste med sin S\u00f8-\nster Fru Cr\u00f8ger til Lillehammer\nnogle Dage efter Begravelsen? Hen\ndes Address er Amtsingeni\u00f8r\nCr\u00f8ger. Hun skriver at de blir\nder til lidt ut i Mai. Nu er hun\nnoksaa medtat skj\u00f8nner jeg af Sorg\nog af L\u00e6ngsel, skriver saameget r\u00f8-\nrende om Oluf. Ragna har det\nherligt, skriver hun. Hun er bare\ngla, og det vet jeg, at \"Oluf er gla over,\nder hvor han nu er. Jeg har saa in-\nderlig ondt af hende, og om Du skrev\net Par Ord tror jeg det vilde gj\u00f8re\nhende godt. - Jeg gl\u00e6der mig over,\nat Du har skrevet en Bok, og\njeg er vis paa, at den er god.",
"confidence": 0.99,
"confidences": {
"total": 0.99
},
"attention_gif": "dan_humu_page/predict/example_line.gif"
}
```
......@@ -72,7 +76,9 @@ It will create the following JSON file named `dan_humu_page/predict/example.json
```json
{
"text": "Hansteensgt. 2 IV 28/4 - 19\nKj\u00e6re Gerhard.\nTak for Brevet om Boken og Haven\nog Crokus og Blaaveis og tak fordi\nDu vilde be mig derut sammen\nmed Kris og Ragna. Men vet Du\nda ikke, at Kris reiste med sin S\u00f8-\nster Fru Cr\u00f8ger til Lillehammer\nnogle Dage efter Begravelsen? Hen\ndes Address er Amtsingeni\u00f8r\nCr\u00f8ger. Hun skriver at de blir\nder til lidt ut i Mai. Nu er hun\nnoksaa medtat skj\u00f8nner jeg af Sorg\nog af L\u00e6ngsel, skriver saameget r\u00f8-\nrende om Oluf. Ragna har det\nherligt, skriver hun. Hun er bare\ngla, og det vet jeg, at \"Oluf er gla over,\nder hvor han nu er. Jeg har saa in-\nderlig ondt af hende, og om Du skrev\net Par Ord tror jeg det vilde gj\u00f8re\nhende godt. - Jeg gl\u00e6der mig over,\nat Du har skrevet en Bok, og\njeg er vis paa, at den er god.",
"confidence": 0.99,
"confidences": {
"total": 0.99
},
"attention_gif": "dan_humu_page/predict/example_word.gif"
}
```
......
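The three examples above all reflect the same schema change: the flat `"confidence"` key becomes a `"confidences"` mapping whose `"total"` entry carries the overall score, with per-level lists (`word`, `line`, `ner`) sitting beside it when requested. A short sketch of consuming the new output, reusing the file path from the examples:

```python
import json
from pathlib import Path

result = json.loads(Path("dan_humu_page/predict/example.json").read_text())

# The overall score now lives under confidences["total"] instead of the
# former top-level "confidence" key.
print(result["confidences"]["total"])  # 0.99
```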
# Description of parameters
| Parameter | Description | Type | Default |
| --------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | ------------- |
| `--image` | Path to the image to predict. Must not be provided with `--image-dir`. | `Path` | |
| `--image-dir` | Path to the folder where the images to predict are stored. Must not be provided with `--image`. | `Path` | |
| `--image-extension` | The extension of the images in the folder. Ignored if `--image-dir` is not provided. | `str` | .jpg |
| `--model` | Path to the model to use for prediction | `Path` | |
| `--parameters` | Path to the YAML parameters file. | `Path` | |
| `--charset` | Path to the charset file. | `Path` | |
| `--output` | Path to the output folder. Results will be saved in this directory. | `Path` | |
| `--confidence-score` | Whether to return confidence scores. | `bool` | `False` |
| `--confidence-score-levels` | Level to return confidence scores. Should be any combination of `["line", "word", "char"]`. | `str` | |
| `--attention-map` | Whether to plot attention maps. | `bool` | `False` |
| `--attention-map-scale` | Image scaling factor before creating the GIF. | `float` | `0.5` |
| `--attention-map-level` | Level to plot the attention maps. Should be in `["line", "word", "char"]`. | `str` | `"line"` |
| `--predict-objects` | Whether to return polygons coordinates. | `bool` | `False` |
| `--word-separators` | List of word separators. | `list` | `[" ", "\n"]` |
| `--line-separators` | List of line separators. | `list` | `["\n"]` |
| `--threshold-method` | Method to use for attention mask thresholding. Should be in `["otsu", "simple"]`. | `str` | `"otsu"` |
| `--threshold-value ` | Threshold to use for the "simple" thresholding method. | `int` | `0` |
| `--batch-size ` | Size of the batches for prediction. | `int` | `1` |
| `--start-token ` | Use a specific starting token at the beginning of the prediction. Useful when making predictions on different single pages. | `str` | `None` |
| `--use-language-model` | Whether to use an external n-gram language model to rescore hypotheses. See [the dedicated example](#predict-with-an-external-n-gram-language-model) for details. | `bool` | `False` |
| Parameter | Description | Type | Default |
| --------------------------- | --------------------------------------------------------------------------------------------------------------------------- | ------- | ------------- |
| `--image` | Path to the image to predict. Must not be provided with `--image-dir`. | `Path` | |
| `--image-dir` | Path to the folder where the images to predict are stored. Must not be provided with `--image`. | `Path` | |
| `--image-extension` | The extension of the images in the folder. Ignored if `--image-dir` is not provided. | `str` | .jpg |
| `--model` | Path to the model to use for prediction | `Path` | |
| `--parameters` | Path to the YAML parameters file. | `Path` | |
| `--charset` | Path to the charset file. | `Path` | |
| `--output` | Path to the output folder. Results will be saved in this directory. | `Path` | |
| `--confidence-score` | Whether to return confidence scores. | `bool` | `False` |
| `--confidence-score-levels` | Level to return confidence scores. Should be any combination of `["line", "word", "char", "ner"]`. | `str` | |
| `--attention-map` | Whether to plot attention maps. | `bool` | `False` |
| `--attention-map-scale` | Image scaling factor before creating the GIF. | `float` | `0.5` |
| `--attention-map-level` | Level to plot the attention maps. Should be in `["line", "word", "char", "ner"]`. | `str` | `"line"` |
| `--predict-objects` | Whether to return polygons coordinates. | `bool` | `False` |
| `--word-separators` | List of word separators. | `list` | `[" ", "\n"]` |
| `--line-separators` | List of line separators. | `list` | `["\n"]` |
| `--threshold-method` | Method to use for attention mask thresholding. Should be in `["otsu", "simple"]`. | `str` | `"otsu"` |
| `--threshold-value ` | Threshold to use for the "simple" thresholding method. | `int` | `0` |
| `--batch-size ` | Size of the batches for prediction. | `int` | `1` |
| `--start-token ` | Use a specific starting token at the beginning of the prediction. Useful when making predictions on different single pages. | `str` | `None` |
......@@ -2,6 +2,7 @@ albumentations==1.3.1
arkindex-export==0.1.7
boto3==1.26.124
editdistance==0.6.2
flashlight-text==0.0.4
imageio==2.26.1
imagesize==1.4.1
mdutils==1.6.0
......
......@@ -4,7 +4,7 @@ import json
import logging
import pickle
import re
from operator import methodcaller
from operator import attrgetter, methodcaller
from typing import NamedTuple
from unittest.mock import patch
......@@ -20,6 +20,7 @@ from dan.datasets.extract.exceptions import (
from dan.datasets.extract.extract import IIIF_FULL_SIZE, ArkindexExtractor
from dan.datasets.extract.utils import (
EntityType,
download_image,
insert_token,
normalize_linebreaks,
normalize_spaces,
......@@ -566,6 +567,40 @@ def test_download_image_error(iiif_url, caplog, capsys):
assert captured.out == "deadbeef: Image URL must be HTTP(S)\n"
def test_download_image_error_try_max(responses):
# An image URL using the IIIF "full" size
url = (
"https://blabla.com/iiif/2/image_path.jpg/231,699,2789,3659/full/0/default.jpg"
)
fixed_url = (
"https://blabla.com/iiif/2/image_path.jpg/231,699,2789,3659/max/0/default.jpg"
)
# Register a fake 400 error for the original URL
responses.add(
responses.GET,
url,
status=400,
)
# Correct response with max
responses.add(
responses.GET,
fixed_url,
status=200,
body=next((FIXTURES / "prediction" / "images").iterdir()).read_bytes(),
)
image = download_image(url)
assert image
# We try 3 times with the first URL
# Then the first try with the new URL is successful
assert len(responses.calls) == 4
assert list(map(attrgetter("request.url"), responses.calls)) == [url] * 3 + [
fixed_url
]
@pytest.mark.parametrize("allow_empty", (True, False))
def test_empty_transcription(allow_empty, mock_database):
extractor = ArkindexExtractor(
......
......@@ -76,16 +76,6 @@ def test_predict(image_name, expected_prediction):
{
"text": "ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier ⓅPatron?12241",
"confidences": {
"ner": [
{"text": "ⓈBellisson ", "confidence": 1.0},
{"text": "ⒻGeorges ", "confidence": 1.0},
{"text": "Ⓑ91 ", "confidence": 1.0},
{"text": "ⓁP ", "confidence": 1.0},
{"text": "ⒸM ", "confidence": 1.0},
{"text": "ⓀCh ", "confidence": 1.0},
{"text": "ⓄPlombier ", "confidence": 1.0},
{"text": "ⓅPatron?12241", "confidence": 1.0},
],
"total": 1.0,
"word": [
{"text": "ⓈBellisson", "confidence": 1.0},
......@@ -102,11 +92,12 @@ def test_predict(image_name, expected_prediction):
),
(
"0a56e8b3-95cd-4fa5-a17b-5b0ff9e6ea84",
[Level.Word],
[Level.NER, Level.Word],
3.5,
{
"text": "ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier ⓅPatron?12241",
"confidences": {
"total": 0.93,
"ner": [
{"text": "ⓈBellisson ", "confidence": 0.92},
{"text": "ⒻGeorges ", "confidence": 0.94},
......@@ -117,7 +108,6 @@ def test_predict(image_name, expected_prediction):
{"text": "ⓄPlombier ", "confidence": 0.93},
{"text": "ⓅPatron?12241", "confidence": 0.93},
],
"total": 0.93,
"word": [
{"text": "ⓈBellisson", "confidence": 0.93},
{"text": "ⒻGeorges", "confidence": 0.94},
......@@ -138,16 +128,6 @@ def test_predict(image_name, expected_prediction):
{
"text": "ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier ⓅPatron?12241",
"confidences": {
"ner": [
{"text": "ⓈBellisson ", "confidence": 1.0},
{"text": "ⒻGeorges ", "confidence": 1.0},
{"text": "Ⓑ91 ", "confidence": 1.0},
{"text": "ⓁP ", "confidence": 1.0},
{"text": "ⒸM ", "confidence": 1.0},
{"text": "ⓀCh ", "confidence": 1.0},
{"text": "ⓄPlombier ", "confidence": 1.0},
{"text": "ⓅPatron?12241", "confidence": 1.0},
],
"total": 1.0,
"line": [
{
......@@ -160,11 +140,12 @@ def test_predict(image_name, expected_prediction):
),
(
"0a56e8b3-95cd-4fa5-a17b-5b0ff9e6ea84",
[Level.Line],
[Level.NER, Level.Line],
3.5,
{
"text": "ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier ⓅPatron?12241",
"confidences": {
"total": 0.93,
"ner": [
{"text": "ⓈBellisson ", "confidence": 0.92},
{"text": "ⒻGeorges ", "confidence": 0.94},
......@@ -175,7 +156,6 @@ def test_predict(image_name, expected_prediction):
{"text": "ⓄPlombier ", "confidence": 0.93},
{"text": "ⓅPatron?12241", "confidence": 0.93},
],
"total": 0.93,
"line": [
{
"text": "ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier ⓅPatron?12241",
......@@ -193,11 +173,12 @@ def test_predict(image_name, expected_prediction):
),
(
"0dfe8bcd-ed0b-453e-bf19-cc697012296e",
[Level.Char, Level.Word, Level.Line],
[Level.NER, Level.Char, Level.Word, Level.Line],
1.0,
{
"text": "ⓈTemplié ⒻMarcelle Ⓑ93 ⓁS Ⓚch ⓄE dactylo Ⓟ18376",
"confidences": {
"total": 1.0,
"ner": [
{"text": "ⓈTemplié ", "confidence": 0.98},
{"text": "ⒻMarcelle ", "confidence": 1.0},
......@@ -207,7 +188,6 @@ def test_predict(image_name, expected_prediction):
{"text": "ⓄE dactylo ", "confidence": 1.0},
{"text": "Ⓟ18376", "confidence": 1.0},
],
"total": 1.0,
"char": [
{"text": "", "confidence": 1.0},
{"text": "T", "confidence": 1.0},
......@@ -345,16 +325,6 @@ def test_run_prediction(
{
"text": "ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier ⓅPatron?12241",
"confidences": {
"ner": [
{"text": "ⓈBellisson ", "confidence": 1.0},
{"text": "ⒻGeorges ", "confidence": 1.0},
{"text": "Ⓑ91 ", "confidence": 1.0},
{"text": "ⓁP ", "confidence": 1.0},
{"text": "ⒸM ", "confidence": 1.0},
{"text": "ⓀCh ", "confidence": 1.0},
{"text": "ⓄPlombier ", "confidence": 1.0},
{"text": "ⓅPatron?12241", "confidence": 1.0},
],
"total": 1.0,
"word": [
{"text": "ⓈBellisson", "confidence": 1.0},
......@@ -375,12 +345,13 @@ def test_run_prediction(
"0a56e8b3-95cd-4fa5-a17b-5b0ff9e6ea84",
"0a56e8b3-95cd-4fa5-a17b-5b0ff9e6ea84",
],
[Level.Word],
[Level.NER, Level.Word],
1.0,
[
{
"text": "ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier ⓅPatron?12241",
"confidences": {
"total": 1.0,
"ner": [
{"text": "ⓈBellisson ", "confidence": 1.0},
{"text": "ⒻGeorges ", "confidence": 1.0},
......@@ -391,7 +362,6 @@ def test_run_prediction(
{"text": "ⓄPlombier ", "confidence": 1.0},
{"text": "ⓅPatron?12241", "confidence": 1.0},
],
"total": 1.0,
"word": [
{"text": "ⓈBellisson", "confidence": 1.0},
{"text": "ⒻGeorges", "confidence": 1.0},
......@@ -407,6 +377,7 @@ def test_run_prediction(
{
"text": "ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier ⓅPatron?12241",
"confidences": {
"total": 1.0,
"ner": [
{"text": "ⓈBellisson ", "confidence": 1.0},
{"text": "ⒻGeorges ", "confidence": 1.0},
......@@ -417,7 +388,6 @@ def test_run_prediction(
{"text": "ⓄPlombier ", "confidence": 1.0},
{"text": "ⓅPatron?12241", "confidence": 1.0},
],
"total": 1.0,
"word": [
{"text": "ⓈBellisson", "confidence": 1.0},
{"text": "ⒻGeorges", "confidence": 1.0},
......@@ -440,16 +410,6 @@ def test_run_prediction(
{
"text": "ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier ⓅPatron?12241",
"confidences": {
"ner": [
{"text": "ⓈBellisson ", "confidence": 1.0},
{"text": "ⒻGeorges ", "confidence": 1.0},
{"text": "Ⓑ91 ", "confidence": 1.0},
{"text": "ⓁP ", "confidence": 1.0},
{"text": "ⒸM ", "confidence": 1.0},
{"text": "ⓀCh ", "confidence": 1.0},
{"text": "ⓄPlombier ", "confidence": 1.0},
{"text": "ⓅPatron?12241", "confidence": 1.0},
],
"total": 1.0,
"word": [
{"text": "ⓈBellisson", "confidence": 1.0},
......
......@@ -10,6 +10,7 @@ wheel_build_env = .pkg
deps =
pytest>=6
pytest-lazy-fixture
pytest-responses
-rrequirements.txt
commands =
pytest {tty:--color=yes} {posargs}
......