From b0bca1a79982cb8c402910ca95b736b4147bfd74 Mon Sep 17 00:00:00 2001 From: Manon Blanco <blanco@teklia.com> Date: Fri, 13 Oct 2023 10:04:14 +0000 Subject: [PATCH] New mode to split prediction by starting tokens --- dan/ocr/predict/__init__.py | 4 +- dan/ocr/predict/attention.py | 161 ++++++++++++++++++++++++++----- dan/ocr/predict/prediction.py | 45 ++------- docs/usage/predict/examples.md | 12 ++- docs/usage/predict/parameters.md | 43 ++++----- tests/test_prediction.py | 58 ++--------- 6 files changed, 187 insertions(+), 136 deletions(-) diff --git a/dan/ocr/predict/__init__.py b/dan/ocr/predict/__init__.py index 0f8c5a0f..dabaeb82 100644 --- a/dan/ocr/predict/__init__.py +++ b/dan/ocr/predict/__init__.py @@ -84,7 +84,7 @@ def add_predict_parser(subcommands) -> None: default=[], type=Level, nargs="+", - help="Levels of confidence scores. Should be a list of any combinaison of ['char', 'word', 'line'].", + help=f"Levels of confidence scores. Should be a list of any combinaison of {list(map(str, Level))}.", required=False, ) parser.add_argument( @@ -97,7 +97,7 @@ def add_predict_parser(subcommands) -> None: "--attention-map-level", type=Level, default=Level.Line, - help="Level to plot the attention maps. Should be in ['line', 'word', 'char'].", + help=f"Level to plot the attention maps. Should be in {list(map(str, Level))}.", required=False, ) parser.add_argument( diff --git a/dan/ocr/predict/attention.py b/dan/ocr/predict/attention.py index de431255..3ddbae9e 100644 --- a/dan/ocr/predict/attention.py +++ b/dan/ocr/predict/attention.py @@ -2,8 +2,8 @@ import logging import re from enum import Enum -from operator import attrgetter -from typing import List, Tuple +from itertools import pairwise +from typing import Dict, List, Tuple import cv2 import numpy as np @@ -11,6 +11,8 @@ import torch from PIL import Image from torchvision.transforms.functional import to_pil_image +from dan.utils import EntityType + logger = logging.getLogger(__name__) @@ -18,12 +20,87 @@ class Level(str, Enum): Char = "char" Word = "word" Line = "line" + NER = "ner" + + def __str__(self): + return self.value def parse_delimiters(delimiters: List[str]) -> re.Pattern: return re.compile(r"|".join(delimiters)) +def build_ner_indices( + text: str, tokens: Dict[str, EntityType] +) -> List[Tuple[int, int]]: + """ + Compute the position of NER tokens in the text and return a list of indices. + :param text: list of characters. + :param tokens: NER tokens used. + Returns a list of indices where tokens are located. + """ + start_tokens, end_tokens = zip(*list(tokens.values())) + end_tokens = list(filter(bool, end_tokens)) + + if len(end_tokens): + assert len(start_tokens) == len( + end_tokens + ), "You don't have the same number of starting tokens and ending tokens" + return [ + [pos_start, pos_end] for pos_start, pos_end in zip(start_tokens, end_tokens) + ] + + return list( + pairwise( + [pos for pos, char in enumerate(text) if char in start_tokens] + [None] + ) + ) + + +def compute_offsets_by_level( + level: Level, text_list: List[str], indices: List[Tuple[int, int]] +): + """ + Compute and return the list of offset between each text part. + :param level: Level to use from [char, word, line, ner]. + :param text_list: list of text to use. + :param indices: list of indices where tokens are located for NER computation. + Returns a list of offsets. 
+ """ + if level == Level.NER: + return ( + [ + current - next_token + for (_, next_token), (current, _) in pairwise(indices) + ] + # Pad the list to match the length of the text list + + [0] + ) + + return [int(level != Level.Char)] * len(text_list) + + +def compute_prob_by_ner( + characters: str, probabilities: List[float], indices: List[Tuple[int, int]] +) -> Tuple[List[str], List[np.float64]]: + """ + Split text and confidences using indices and return a list of average confidence scores. + :param characters: list of characters. + :param probabilities: list of character probabilities. + :param indices: list of indices where tokens are located. + Returns a list confidence scores. + """ + return zip( + *[ + ( + f"{characters[current: next_token]}".replace("\n", " "), + np.mean(probabilities[current:next_token]), + ) + for current, next_token in indices + ] + ) + + def compute_prob_by_separator( characters: str, probabilities: List[float], separator: re.Pattern ) -> Tuple[List[str], List[np.float64]]: @@ -45,15 +122,22 @@ def compute_prob_by_separator( def split_text( - text: str, level: Level, word_separators: re.Pattern, line_separators: re.Pattern -) -> Tuple[List[str], int]: + text: str, + level: Level, + word_separators: re.Pattern, + line_separators: re.Pattern, + tokens: Dict[str, EntityType], +) -> Tuple[List[str], List[int]]: """ Split text into a list of characters, word, or lines. :param text: Text prediction from DAN - :param level: Level to visualize from [char, word, line] + :param level: Level to visualize from [char, word, line, ner] :param word_separators: List of word separators :param line_separators: List of line separators + :param tokens: NER tokens used """ + indices = [] + match level: case Level.Char: text_split = list(text) @@ -63,12 +147,22 @@ def split_text( # split into lines case Level.Line: text_split = re.split(line_separators, text) + # split into entities + case Level.NER: + if not tokens: + logger.error("Cannot compute NER level: tokens not found") + return [], [] + + indices = build_ner_indices(text, tokens) + text_split = [ + f"{text[current: next_token]}".replace("\n", " ") + for current, next_token in indices + ] case _: - choices = ", ".join(list(map(attrgetter("value"), Level))) - logger.error(f"Level should be either {choices}") + logger.error(f"Level should be either {list(map(str, Level))}") + return [], [] - offset = int(level != Level.Char) - return text_split, offset + return text_split, compute_offsets_by_level(level, text_split, indices) def split_text_and_confidences( @@ -77,15 +171,19 @@ def split_text_and_confidences( level: Level, word_separators: re.Pattern, line_separators: re.Pattern, -) -> Tuple[List[str], List[np.float64], int]: + tokens: Dict[str, EntityType], +) -> Tuple[List[str], List[np.float64], List[int]]: """ Split text into a list of characters, words or lines with corresponding confidences scores :param text: Text prediction from DAN :param confidences: Character confidences - :param level: Level to visualize from [char, word, line] + :param level: Level to visualize from [char, word, line, ner] :param word_separators: List of word separators :param line_separators: List of line separators + :param tokens: NER tokens used """ + indices = [] + match level: case Level.Char: texts = list(text) @@ -97,12 +195,22 @@ def split_text_and_confidences( texts, confidences = compute_prob_by_separator( text, confidences, line_separators ) + case Level.NER: + if not tokens: + logger.error("Cannot compute NER level: tokens not found") + 
return [], [], [] + + indices = build_ner_indices(text, tokens) + texts, confidences = compute_prob_by_ner(text, confidences, indices) case _: - choices = ", ".join(list(map(attrgetter("value"), Level))) - logger.error(f"Level should be either {choices}") + logger.error(f"Level should be either {list(map(str, Level))}") + return [], [], [] - offset = int(level != Level.Char) - return texts, [np.around(num, 2) for num in confidences], offset + return ( + texts, + [np.around(num, 2) for num in confidences], + compute_offsets_by_level(level, texts, indices), + ) def get_predicted_polygons_with_confidence( @@ -117,13 +225,14 @@ def get_predicted_polygons_with_confidence( max_object_height: int = 50, word_separators: re.Pattern = parse_delimiters(["\n", " "]), line_separators: re.Pattern = parse_delimiters(["\n"]), + tokens: Dict[str, EntityType] = {}, ) -> List[dict]: """ Returns the polygons of each object of the current prediction :param text: Text predicted by DAN :param weights: Attention weights of size (n_char, feature_height, feature_width) :param confidences: Character confidences - :param level: Level to display (must be in [char, word, line]) + :param level: Level to display (must be in [char, word, line, ner]) :param height: Original image height :param width: Original image width :param threshold_method: Thresholding method. Should be in ["otsu", "simple"] @@ -131,16 +240,17 @@ def get_predicted_polygons_with_confidence( :param max_object_height: Maximum height of predicted objects. :param word_separators: List of word separators :param line_separators: List of line separators + :param tokens: NER tokens used """ # Split text into characters, words or lines - text_list, confidence_list, offset = split_text_and_confidences( - text, confidences, level, word_separators, line_separators + text_list, confidence_list, offsets = split_text_and_confidences( + text, confidences, level, word_separators, line_separators, tokens ) max_value = weights.sum(0).max() polygons = [] start_index = 0 - for text_piece, confidence in zip(text_list, confidence_list): + for text_piece, confidence, offset in zip(text_list, confidence_list, offsets): polygon, _ = get_polygon( text_piece, max_value, @@ -370,6 +480,7 @@ def plot_attention( max_object_height: int = 50, word_separators: re.Pattern = parse_delimiters(["\n", " "]), line_separators: re.Pattern = parse_delimiters(["\n"]), + tokens: Dict[str, EntityType] = {}, display_polygons: bool = False, ) -> None: """ @@ -377,12 +488,13 @@ def plot_attention( :param image: Input image as torch.Tensor :param text: Text predicted by DAN :param weights: Attention weights of size (n_char, feature_height, feature_width) - :param level: Level to display (must be in [char, word, line]) + :param level: Level to display (must be in [char, word, line, ner]) :param scale: Scaling factor for the output gif image :param outname: Name of the gif image :param max_object_height: Maximum height of predicted objects. 
:param word_separators: List of word separators :param line_separators: List of line separators + :param tokens: NER tokens used :param display_polygons: Whether to plot extracted polygons """ image = to_pil_image(image) @@ -392,13 +504,15 @@ def plot_attention( mask = Image.new("L", (image.width, image.height), color=(110)) # Split text into characters, words or lines - text_list, offset = split_text(text, level, word_separators, line_separators) + text_list, offsets = split_text( + text, level, word_separators, line_separators, tokens + ) # Iterate on characters, words or lines tot_len = 0 max_value = weights.sum(0).max() - for text_piece in text_list: + for text_piece, offset in zip(text_list, offsets): # Accumulate weights for the current word/line and resize to original image size coverage_vector = compute_coverage( text_piece, max_value, tot_len, weights, (image.width, image.height) @@ -428,6 +542,9 @@ def plot_attention( # Blend coverage vector with original image attention_map.append(blend_coverage(coverage_vector, image, mask, scale)) + if not attention_map: + return + attention_map[0].save( outname, save_all=True, diff --git a/dan/ocr/predict/prediction.py b/dan/ocr/predict/prediction.py index f88676af..0719f492 100644 --- a/dan/ocr/predict/prediction.py +++ b/dan/ocr/predict/prediction.py @@ -4,9 +4,8 @@ import json import logging import pickle import re -from itertools import pairwise from pathlib import Path -from typing import Dict, Iterable, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple import numpy as np import torch @@ -137,6 +136,7 @@ class DAN: extract_objects: bool = False, word_separators: re.Pattern = parse_delimiters(["\n", " "]), line_separators: re.Pattern = parse_delimiters(["\n"]), + tokens: Dict[str, EntityType] = {}, start_token: str = None, threshold_method: str = "otsu", threshold_value: int = 0, @@ -149,7 +149,7 @@ class DAN: :param input_sizes: The original images sizes. :param confidences: Return the characters probabilities. :param attentions: Return characters attention weights. - :param attention_level: Level of text pieces (must be in [char, word, line]) + :param attention_level: Level of text pieces (must be in [char, word, line, ner]) :param extract_objects: Whether to extract polygons' coordinates. :param threshold_method: Thresholding method. Should be in ["otsu", "simple"]. :param threshold_value: Thresholding value to use for the "simple" thresholding method. 
@@ -289,24 +289,13 @@ class DAN: max_object_height=max_object_height, word_separators=word_separators, line_separators=line_separators, + tokens=tokens, ) for i in range(batch_size) ] return out -def parse_ner_predictions( - text: str, char_confidences: List[float], predictions: Iterable[Tuple[int, int]] -) -> List[dict]: - return [ - { - "text": f"{text[current: next_token]}".replace("\n", " "), - "confidence": np.round(np.mean(char_confidences[current:next_token]), 2), - } - for current, next_token in predictions - ] - - def process_batch( image_batch: List[Path], dan_model: DAN, @@ -356,6 +345,7 @@ def process_batch( extract_objects=predict_objects, word_separators=word_separators, line_separators=line_separators, + tokens=tokens, threshold_method=threshold_method, threshold_value=threshold_value, max_object_height=max_object_height, @@ -383,28 +373,6 @@ def process_batch( if confidence_score: result["confidences"] = {} char_confidences = prediction["confidences"][idx] - text = result["text"] - start_tokens, end_tokens = zip(*list(tokens.values())) - end_tokens = list(filter(bool, end_tokens)) - - if len(end_tokens): - assert len(start_tokens) == len( - end_tokens - ), "You don't have the same number of starting tokens and ending tokens" - indices = [ - [pos_start, pos_end] - for pos_start, pos_end in zip(start_tokens, end_tokens) - ] - else: - indices = pairwise( - [pos for pos, char in enumerate(text) if char in start_tokens] - + [None] - ) - - result["confidences"]["ner"] = parse_ner_predictions( - text, char_confidences, indices - ) - result["confidences"]["total"] = np.around(np.mean(char_confidences), 2) for level in confidence_score_levels: @@ -415,6 +383,7 @@ def process_batch( level, word_separators, line_separators, + tokens, ) for text, conf in zip(texts, confidences): @@ -427,7 +396,6 @@ def process_batch( attentions = prediction["attentions"][idx] gif_filename = f"{output}/{image_path.stem}_{attention_map_level}.gif" logger.info(f"Creating attention GIF in {gif_filename}") - # this returns polygons but unused for now. plot_attention( image=visu_tensor[idx], text=predicted_text, @@ -436,6 +404,7 @@ def process_batch( scale=attention_map_scale, word_separators=word_separators, line_separators=line_separators, + tokens=tokens, display_polygons=predict_objects, threshold_method=threshold_method, threshold_value=threshold_value, diff --git a/docs/usage/predict/examples.md b/docs/usage/predict/examples.md index 7efcdc99..f337603e 100644 --- a/docs/usage/predict/examples.md +++ b/docs/usage/predict/examples.md @@ -19,7 +19,9 @@ It will create the following JSON file named `dan_humu_page/predict/example.json ```json { "text": "Hansteensgt. 2 IV 28/4 - 19\nKj\u00e6re Gerhard.\nTak for Brevet om Boken og Haven\nog Crokus og Blaaveis og tak fordi\nDu vilde be mig derut sammen\nmed Kris og Ragna. Men vet Du\nda ikke, at Kris reiste med sin S\u00f8-\nster Fru Cr\u00f8ger til Lillehammer\nnogle Dage efter Begravelsen? Hen\ndes Address er Amtsingeni\u00f8r\nCr\u00f8ger. Hun skriver at de blir\nder til lidt ut i Mai. Nu er hun\nnoksaa medtat skj\u00f8nner jeg af Sorg\nog af L\u00e6ngsel, skriver saameget r\u00f8-\nrende om Oluf. Ragna har det\nherligt, skriver hun. Hun er bare\ngla, og det vet jeg, at \"Oluf er gla over,\nder hvor han nu er. Jeg har saa in-\nderlig ondt af hende, og om Du skrev\net Par Ord tror jeg det vilde gj\u00f8re\nhende godt. 
- Jeg gl\u00e6der mig over,\nat Du har skrevet en Bok, og\njeg er vis paa, at den er god.", - "confidence": 0.99 + "confidences": { + "total": 0.99 + } } ``` @@ -43,7 +45,9 @@ It will create the following JSON file named `dan_humu_page/predict/example.json ```json { "text": "Hansteensgt. 2 IV 28/4 - 19\nKj\u00e6re Gerhard.\nTak for Brevet om Boken og Haven\nog Crokus og Blaaveis og tak fordi\nDu vilde be mig derut sammen\nmed Kris og Ragna. Men vet Du\nda ikke, at Kris reiste med sin S\u00f8-\nster Fru Cr\u00f8ger til Lillehammer\nnogle Dage efter Begravelsen? Hen\ndes Address er Amtsingeni\u00f8r\nCr\u00f8ger. Hun skriver at de blir\nder til lidt ut i Mai. Nu er hun\nnoksaa medtat skj\u00f8nner jeg af Sorg\nog af L\u00e6ngsel, skriver saameget r\u00f8-\nrende om Oluf. Ragna har det\nherligt, skriver hun. Hun er bare\ngla, og det vet jeg, at \"Oluf er gla over,\nder hvor han nu er. Jeg har saa in-\nderlig ondt af hende, og om Du skrev\net Par Ord tror jeg det vilde gj\u00f8re\nhende godt. - Jeg gl\u00e6der mig over,\nat Du har skrevet en Bok, og\njeg er vis paa, at den er god.", - "confidence": 0.99, + "confidences": { + "total": 0.99 + }, "attention_gif": "dan_humu_page/predict/example_line.gif" } ``` @@ -72,7 +76,9 @@ It will create the following JSON file named `dan_humu_page/predict/example.json ```json { "text": "Hansteensgt. 2 IV 28/4 - 19\nKj\u00e6re Gerhard.\nTak for Brevet om Boken og Haven\nog Crokus og Blaaveis og tak fordi\nDu vilde be mig derut sammen\nmed Kris og Ragna. Men vet Du\nda ikke, at Kris reiste med sin S\u00f8-\nster Fru Cr\u00f8ger til Lillehammer\nnogle Dage efter Begravelsen? Hen\ndes Address er Amtsingeni\u00f8r\nCr\u00f8ger. Hun skriver at de blir\nder til lidt ut i Mai. Nu er hun\nnoksaa medtat skj\u00f8nner jeg af Sorg\nog af L\u00e6ngsel, skriver saameget r\u00f8-\nrende om Oluf. Ragna har det\nherligt, skriver hun. Hun er bare\ngla, og det vet jeg, at \"Oluf er gla over,\nder hvor han nu er. Jeg har saa in-\nderlig ondt af hende, og om Du skrev\net Par Ord tror jeg det vilde gj\u00f8re\nhende godt. - Jeg gl\u00e6der mig over,\nat Du har skrevet en Bok, og\njeg er vis paa, at den er god.", - "confidence": 0.99, + "confidences": { + "total": 0.99 + }, "attention_gif": "dan_humu_page/predict/example_word.gif" } ``` diff --git a/docs/usage/predict/parameters.md b/docs/usage/predict/parameters.md index 1b5496db..14ecde94 100644 --- a/docs/usage/predict/parameters.md +++ b/docs/usage/predict/parameters.md @@ -1,24 +1,23 @@ # Description of parameters -| Parameter | Description | Type | Default | -| --------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | ------------- | -| `--image` | Path to the image to predict. Must not be provided with `--image-dir`. | `Path` | | -| `--image-dir` | Path to the folder where the images to predict are stored. Must not be provided with `--image`. | `Path` | | -| `--image-extension` | The extension of the images in the folder. Ignored if `--image-dir` is not provided. | `str` | .jpg | -| `--model` | Path to the model to use for prediction | `Path` | | -| `--parameters` | Path to the YAML parameters file. | `Path` | | -| `--charset` | Path to the charset file. | `Path` | | -| `--output` | Path to the output folder. Results will be saved in this directory. | `Path` | | -| `--confidence-score` | Whether to return confidence scores. 
| `bool` | `False` | -| `--confidence-score-levels` | Level to return confidence scores. Should be any combination of `["line", "word", "char"]`. | `str` | | -| `--attention-map` | Whether to plot attention maps. | `bool` | `False` | -| `--attention-map-scale` | Image scaling factor before creating the GIF. | `float` | `0.5` | -| `--attention-map-level` | Level to plot the attention maps. Should be in `["line", "word", "char"]`. | `str` | `"line"` | -| `--predict-objects` | Whether to return polygons coordinates. | `bool` | `False` | -| `--word-separators` | List of word separators. | `list` | `[" ", "\n"]` | -| `--line-separators` | List of line separators. | `list` | `["\n"]` | -| `--threshold-method` | Method to use for attention mask thresholding. Should be in `["otsu", "simple"]`. | `str` | `"otsu"` | -| `--threshold-value ` | Threshold to use for the "simple" thresholding method. | `int` | `0` | -| `--batch-size ` | Size of the batches for prediction. | `int` | `1` | -| `--start-token ` | Use a specific starting token at the beginning of the prediction. Useful when making predictions on different single pages. | `str` | `None` | -| `--use-language-model` | Whether to use an external n-gram language model to rescore hypotheses. See [the dedicated example](#predict-with-an-external-n-gram-language-model) for details. | `bool` | `False` | +| Parameter | Description | Type | Default | +| --------------------------- | --------------------------------------------------------------------------------------------------------------------------- | ------- | ------------- | +| `--image` | Path to the image to predict. Must not be provided with `--image-dir`. | `Path` | | +| `--image-dir` | Path to the folder where the images to predict are stored. Must not be provided with `--image`. | `Path` | | +| `--image-extension` | The extension of the images in the folder. Ignored if `--image-dir` is not provided. | `str` | .jpg | +| `--model` | Path to the model to use for prediction | `Path` | | +| `--parameters` | Path to the YAML parameters file. | `Path` | | +| `--charset` | Path to the charset file. | `Path` | | +| `--output` | Path to the output folder. Results will be saved in this directory. | `Path` | | +| `--confidence-score` | Whether to return confidence scores. | `bool` | `False` | +| `--confidence-score-levels` | Level to return confidence scores. Should be any combination of `["line", "word", "char", "ner"]`. | `str` | | +| `--attention-map` | Whether to plot attention maps. | `bool` | `False` | +| `--attention-map-scale` | Image scaling factor before creating the GIF. | `float` | `0.5` | +| `--attention-map-level` | Level to plot the attention maps. Should be in `["line", "word", "char", "ner"]`. | `str` | `"line"` | +| `--predict-objects` | Whether to return polygons coordinates. | `bool` | `False` | +| `--word-separators` | List of word separators. | `list` | `[" ", "\n"]` | +| `--line-separators` | List of line separators. | `list` | `["\n"]` | +| `--threshold-method` | Method to use for attention mask thresholding. Should be in `["otsu", "simple"]`. | `str` | `"otsu"` | +| `--threshold-value ` | Threshold to use for the "simple" thresholding method. | `int` | `0` | +| `--batch-size ` | Size of the batches for prediction. | `int` | `1` | +| `--start-token ` | Use a specific starting token at the beginning of the prediction. Useful when making predictions on different single pages. 
| `str` | `None` | diff --git a/tests/test_prediction.py b/tests/test_prediction.py index 06710724..cc61d2ec 100644 --- a/tests/test_prediction.py +++ b/tests/test_prediction.py @@ -76,16 +76,6 @@ def test_predict(image_name, expected_prediction): { "text": "ⓈBellisson â’»Georges â’·91 â“P â’¸M â“€Ch â“„Plombier â“…Patron?12241", "confidences": { - "ner": [ - {"text": "ⓈBellisson ", "confidence": 1.0}, - {"text": "â’»Georges ", "confidence": 1.0}, - {"text": "â’·91 ", "confidence": 1.0}, - {"text": "â“P ", "confidence": 1.0}, - {"text": "â’¸M ", "confidence": 1.0}, - {"text": "â“€Ch ", "confidence": 1.0}, - {"text": "â“„Plombier ", "confidence": 1.0}, - {"text": "â“…Patron?12241", "confidence": 1.0}, - ], "total": 1.0, "word": [ {"text": "ⓈBellisson", "confidence": 1.0}, @@ -102,11 +92,12 @@ def test_predict(image_name, expected_prediction): ), ( "0a56e8b3-95cd-4fa5-a17b-5b0ff9e6ea84", - [Level.Word], + [Level.NER, Level.Word], 3.5, { "text": "ⓈBellisson â’»Georges â’·91 â“P â’¸M â“€Ch â“„Plombier â“…Patron?12241", "confidences": { + "total": 0.93, "ner": [ {"text": "ⓈBellisson ", "confidence": 0.92}, {"text": "â’»Georges ", "confidence": 0.94}, @@ -117,7 +108,6 @@ def test_predict(image_name, expected_prediction): {"text": "â“„Plombier ", "confidence": 0.93}, {"text": "â“…Patron?12241", "confidence": 0.93}, ], - "total": 0.93, "word": [ {"text": "ⓈBellisson", "confidence": 0.93}, {"text": "â’»Georges", "confidence": 0.94}, @@ -138,16 +128,6 @@ def test_predict(image_name, expected_prediction): { "text": "ⓈBellisson â’»Georges â’·91 â“P â’¸M â“€Ch â“„Plombier â“…Patron?12241", "confidences": { - "ner": [ - {"text": "ⓈBellisson ", "confidence": 1.0}, - {"text": "â’»Georges ", "confidence": 1.0}, - {"text": "â’·91 ", "confidence": 1.0}, - {"text": "â“P ", "confidence": 1.0}, - {"text": "â’¸M ", "confidence": 1.0}, - {"text": "â“€Ch ", "confidence": 1.0}, - {"text": "â“„Plombier ", "confidence": 1.0}, - {"text": "â“…Patron?12241", "confidence": 1.0}, - ], "total": 1.0, "line": [ { @@ -160,11 +140,12 @@ def test_predict(image_name, expected_prediction): ), ( "0a56e8b3-95cd-4fa5-a17b-5b0ff9e6ea84", - [Level.Line], + [Level.NER, Level.Line], 3.5, { "text": "ⓈBellisson â’»Georges â’·91 â“P â’¸M â“€Ch â“„Plombier â“…Patron?12241", "confidences": { + "total": 0.93, "ner": [ {"text": "ⓈBellisson ", "confidence": 0.92}, {"text": "â’»Georges ", "confidence": 0.94}, @@ -175,7 +156,6 @@ def test_predict(image_name, expected_prediction): {"text": "â“„Plombier ", "confidence": 0.93}, {"text": "â“…Patron?12241", "confidence": 0.93}, ], - "total": 0.93, "line": [ { "text": "ⓈBellisson â’»Georges â’·91 â“P â’¸M â“€Ch â“„Plombier â“…Patron?12241", @@ -193,11 +173,12 @@ def test_predict(image_name, expected_prediction): ), ( "0dfe8bcd-ed0b-453e-bf19-cc697012296e", - [Level.Char, Level.Word, Level.Line], + [Level.NER, Level.Char, Level.Word, Level.Line], 1.0, { "text": "ⓈTemplié â’»Marcelle â’·93 â“S â“€ch â“„E dactylo â“…18376", "confidences": { + "total": 1.0, "ner": [ {"text": "ⓈTemplié ", "confidence": 0.98}, {"text": "â’»Marcelle ", "confidence": 1.0}, @@ -207,7 +188,6 @@ def test_predict(image_name, expected_prediction): {"text": "â“„E dactylo ", "confidence": 1.0}, {"text": "â“…18376", "confidence": 1.0}, ], - "total": 1.0, "char": [ {"text": "Ⓢ", "confidence": 1.0}, {"text": "T", "confidence": 1.0}, @@ -345,16 +325,6 @@ def test_run_prediction( { "text": "ⓈBellisson â’»Georges â’·91 â“P â’¸M â“€Ch â“„Plombier â“…Patron?12241", "confidences": { - "ner": [ - {"text": "ⓈBellisson ", "confidence": 1.0}, - 
{"text": "â’»Georges ", "confidence": 1.0}, - {"text": "â’·91 ", "confidence": 1.0}, - {"text": "â“P ", "confidence": 1.0}, - {"text": "â’¸M ", "confidence": 1.0}, - {"text": "â“€Ch ", "confidence": 1.0}, - {"text": "â“„Plombier ", "confidence": 1.0}, - {"text": "â“…Patron?12241", "confidence": 1.0}, - ], "total": 1.0, "word": [ {"text": "ⓈBellisson", "confidence": 1.0}, @@ -375,12 +345,13 @@ def test_run_prediction( "0a56e8b3-95cd-4fa5-a17b-5b0ff9e6ea84", "0a56e8b3-95cd-4fa5-a17b-5b0ff9e6ea84", ], - [Level.Word], + [Level.NER, Level.Word], 1.0, [ { "text": "ⓈBellisson â’»Georges â’·91 â“P â’¸M â“€Ch â“„Plombier â“…Patron?12241", "confidences": { + "total": 1.0, "ner": [ {"text": "ⓈBellisson ", "confidence": 1.0}, {"text": "â’»Georges ", "confidence": 1.0}, @@ -391,7 +362,6 @@ def test_run_prediction( {"text": "â“„Plombier ", "confidence": 1.0}, {"text": "â“…Patron?12241", "confidence": 1.0}, ], - "total": 1.0, "word": [ {"text": "ⓈBellisson", "confidence": 1.0}, {"text": "â’»Georges", "confidence": 1.0}, @@ -407,6 +377,7 @@ def test_run_prediction( { "text": "ⓈBellisson â’»Georges â’·91 â“P â’¸M â“€Ch â“„Plombier â“…Patron?12241", "confidences": { + "total": 1.0, "ner": [ {"text": "ⓈBellisson ", "confidence": 1.0}, {"text": "â’»Georges ", "confidence": 1.0}, @@ -417,7 +388,6 @@ def test_run_prediction( {"text": "â“„Plombier ", "confidence": 1.0}, {"text": "â“…Patron?12241", "confidence": 1.0}, ], - "total": 1.0, "word": [ {"text": "ⓈBellisson", "confidence": 1.0}, {"text": "â’»Georges", "confidence": 1.0}, @@ -440,16 +410,6 @@ def test_run_prediction( { "text": "ⓈBellisson â’»Georges â’·91 â“P â’¸M â“€Ch â“„Plombier â“…Patron?12241", "confidences": { - "ner": [ - {"text": "ⓈBellisson ", "confidence": 1.0}, - {"text": "â’»Georges ", "confidence": 1.0}, - {"text": "â’·91 ", "confidence": 1.0}, - {"text": "â“P ", "confidence": 1.0}, - {"text": "â’¸M ", "confidence": 1.0}, - {"text": "â“€Ch ", "confidence": 1.0}, - {"text": "â“„Plombier ", "confidence": 1.0}, - {"text": "â“…Patron?12241", "confidence": 1.0}, - ], "total": 1.0, "word": [ {"text": "ⓈBellisson", "confidence": 1.0}, -- GitLab