From f7563b0c125455d345eda0f7429d82fcb3f0242a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sol=C3=A8ne=20Tarride?= <starride@teklia.com>
Date: Tue, 26 Sep 2023 12:07:04 +0200
Subject: [PATCH] Document prediction command

---
 dan/ocr/decoder.py       |  10 ++-
 docs/usage/predict.md    | 134 ++++++++++++++++-----------------------
 tests/test_prediction.py |  16 ++---
 3 files changed, 69 insertions(+), 91 deletions(-)

diff --git a/dan/ocr/decoder.py b/dan/ocr/decoder.py
index 8279d205..a975bda5 100644
--- a/dan/ocr/decoder.py
+++ b/dan/ocr/decoder.py
@@ -571,10 +571,14 @@ class CTCLanguageDecoder:
             ).strip()
             for hypothesis in hypotheses
         ]
-        # Normalize confidence score
+        # Normalize the confidence score and round it to two decimals
         out["confidence"] = [
-            np.exp(
-                hypothesis[0].score / ((self.language_model_weight + 1) * length.item())
+            np.around(
+                np.exp(
+                    hypothesis[0].score
+                    / ((self.language_model_weight + 1) * length.item())
+                ),
+                2,
             )
             for hypothesis, length in zip(hypotheses, batch_sizes)
         ]
diff --git a/docs/usage/predict.md b/docs/usage/predict.md
index e3e68385..4a13bd22 100644
--- a/docs/usage/predict.md
+++ b/docs/usage/predict.md
@@ -4,57 +4,28 @@ Use the `teklia-dan predict` command to apply a trained DAN model on an image.
 
 ## Description of parameters
 
-| Parameter                   | Description                                                                                                                                                     | Type    | Default       |
-| --------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | ------------- |
-| `--image`                   | Path to the image to predict. Must not be provided with `--image-dir`.                                                                                          | `Path`  |               |
-| `--image-dir`               | Path to the folder where the images to predict are stored. Must not be provided with `--image`.                                                                 | `Path`  |               |
-| `--image-extension`         | The extension of the images in the folder. Ignored if `--image-dir` is not provided.                                                                            | `str`   | .jpg          |
-| `--model`                   | Path to the model to use for prediction                                                                                                                         | `Path`  |               |
-| `--parameters`              | Path to the YAML parameters file.                                                                                                                               | `Path`  |               |
-| `--charset`                 | Path to the charset file.                                                                                                                                       | `Path`  |               |
-| `--output`                  | Path to the output folder. Results will be saved in this directory.                                                                                             | `Path`  |               |
-| `--confidence-score`        | Whether to return confidence scores.                                                                                                                            | `bool`  | `False`       |
-| `--confidence-score-levels` | Level to return confidence scores. Should be any combination of `["line", "word", "char"]`.                                                                     | `str`   |               |
-| `--attention-map`           | Whether to plot attention maps.                                                                                                                                 | `bool`  | `False`       |
-| `--attention-map-scale`     | Image scaling factor before creating the GIF.                                                                                                                   | `float` | `0.5`         |
-| `--attention-map-level`     | Level to plot the attention maps. Should be in `["line", "word", "char"]`.                                                                                      | `str`   | `"line"`      |
-| `--predict-objects`         | Whether to return polygons coordinates.                                                                                                                         | `bool`  | `False`       |
-| `--word-separators`         | List of word separators.                                                                                                                                        | `list`  | `[" ", "\n"]` |
-| `--line-separators`         | List of line separators.                                                                                                                                        | `list`  | `["\n"]`      |
-| `--threshold-method`        | Method to use for attention mask thresholding. Should be in `["otsu", "simple"]`.                                                                               | `str`   | `"otsu"`      |
-| `--threshold-value `        | Threshold to use for the "simple" thresholding method.                                                                                                          | `int`   | `0`           |
-| `--batch-size `             | Size of the batches for prediction.                                                                                                                             | `int`   | `1`           |
-| `--start-token `            | Use a specific starting token at the beginning of the prediction. Useful when making predictions on different single pages.                                     | `str`   | `None`        |
-| `--use-language-model`      | Whether to use an external n-gram language model to rescore hypotheses. See [the next section](#rescoring-hypotheses-with-a-n-gram-language-model) for details. | `bool`  | `False`       |
-
-## Rescoring hypotheses with a N-gram language model
-
-A dataset extracted with the `teklia-dan dataset extract` command should contain the files required to build a language model (in the `language_model` folder).
-
-To refine DAN's predictions with a language model, follow these steps:
-
-1. Install and build [kenlm](https://github.com/kpu/kenlm)
-1. Build a 6-gram language model using the following command
-
-```sh
-bin/lmplz --order 6 \
-    --text my_dataset/language_model/corpus.txt \
-    --arpa my_dataset/language_model/model.arpa
-```
-
-1. Update `inference_parameters.yml`. The `weight` parameter defines how much weight to give to the language model. It should be set carefully (usually between 0.5 and 2.0) as it will affect the quality of the predictions.
-
-```yaml
-parameters:
-  ...
-  language_model:
-    model: my_dataset/language_model/model.arpa
-    lexicon: my_dataset/language_model/lexicon.txt
-    tokens: my_dataset/language_model/tokens.txt
-    weight: 1.0
-```
-
-1. Predict with the `--use-language-model` argument.
+| Parameter                   | Description                                                                                                                                                       | Type    | Default       |
+| --------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | ------------- |
+| `--image`                   | Path to the image to predict. Must not be provided with `--image-dir`.                                                                                            | `Path`  |               |
+| `--image-dir`               | Path to the folder where the images to predict are stored. Must not be provided with `--image`.                                                                   | `Path`  |               |
+| `--image-extension`         | The extension of the images in the folder. Ignored if `--image-dir` is not provided.                                                                              | `str`   | `.jpg`        |
+| `--model`                   | Path to the model to use for prediction.                                                                                                                          | `Path`  |               |
+| `--parameters`              | Path to the YAML parameters file.                                                                                                                                 | `Path`  |               |
+| `--charset`                 | Path to the charset file.                                                                                                                                         | `Path`  |               |
+| `--output`                  | Path to the output folder. Results will be saved in this directory.                                                                                               | `Path`  |               |
+| `--confidence-score`        | Whether to return confidence scores.                                                                                                                              | `bool`  | `False`       |
+| `--confidence-score-levels` | Level to return confidence scores. Should be any combination of `["line", "word", "char"]`.                                                                       | `str`   |               |
+| `--attention-map`           | Whether to plot attention maps.                                                                                                                                   | `bool`  | `False`       |
+| `--attention-map-scale`     | Image scaling factor before creating the GIF.                                                                                                                     | `float` | `0.5`         |
+| `--attention-map-level`     | Level to plot the attention maps. Should be in `["line", "word", "char"]`.                                                                                        | `str`   | `"line"`      |
+| `--predict-objects`         | Whether to return polygons coordinates.                                                                                                                           | `bool`  | `False`       |
+| `--word-separators`         | List of word separators.                                                                                                                                          | `list`  | `[" ", "\n"]` |
+| `--line-separators`         | List of line separators.                                                                                                                                          | `list`  | `["\n"]`      |
+| `--threshold-method`        | Method to use for attention mask thresholding. Should be in `["otsu", "simple"]`.                                                                                 | `str`   | `"otsu"`      |
+| `--threshold-value`         | Threshold to use for the "simple" thresholding method.                                                                                                            | `int`   | `0`           |
+| `--batch-size`              | Size of the batches for prediction.                                                                                                                               | `int`   | `1`           |
+| `--start-token`             | Use a specific starting token at the beginning of the prediction. Useful when making predictions on different single pages.                                       | `str`   | `None`        |
+| `--use-language-model`      | Whether to use an external n-gram language model to rescore hypotheses. See [the dedicated example](#predict-with-an-external-n-gram-language-model) for details. | `bool`  | `False`       |
 
 ## Examples
 
@@ -191,6 +162,33 @@ It will create the following JSON file named `dan_humu_page/predict/example.json
 
 ### Predict with an external n-gram language model
 
+#### Build the language model
+
+A dataset extracted with the `teklia-dan dataset extract` command should contain the files required to build a language model (in the `language_model` folder). To refine DAN's predictions with a language model, follow these steps:
+
+1. Install and build [kenlm](https://github.com/kpu/kenlm).
+1. Build a 6-gram language model using the following command:
+
+```sh
+bin/lmplz --order 6 \
+    --text my_dataset/language_model/corpus.txt \
+    --arpa my_dataset/language_model/model.arpa
+```
+
+1. Update `inference_parameters.yml`. The `weight` parameter defines how much weight to give to the language model. It should be set carefully (usually between 0.5 and 2.0) as it will affect the quality of the predictions.
+
+```yaml
+parameters:
+  ...
+  language_model:
+    model: my_dataset/language_model/model.arpa
+    lexicon: my_dataset/language_model/lexicon.txt
+    tokens: my_dataset/language_model/tokens.txt
+    weight: 0.5
+```
+
+#### Predict
+
 To run a prediction with the n-gram language model, run this command:
 
 ```shell
@@ -207,34 +205,10 @@ It will create the following JSON file named `dan_humu_page/predict/example.json
 
 ```json
 {
-  "text": "Oslo\n39 \nOresden den 24te Rasser!\nH\u00f8jst\u00e6redesherr Hartvig - assert!\nUllereder fra den f\u00f8rste tide da\njeg havder den tilfredsstillelser at vide den ar-\ndistiske ledelser af Kristiania theater i Deres\nhronder, har jeg g\u00e5t hernede med et stille\nh\u00e5b om fra Dem at modtage et forelag, sig -\nsende tils at lade \"K\u00e6rlighedens \u00abKomedie\u00bb\nopf\u00f8re fore det norske purblikum.\nEt s\u00e5dant forslag er imidlertid, imod\nforventning; ikke fremkommet, og jeg n\u00f8des der-\nfor tils self at grivbe initiativet, hvilket hervede\nsker, idet jeg\nbeder\nbet\nragte stigkket some ved denne\nskrivelse officielde indleveret til theatret. No-\nget exemplar af bogen vedlagger jeg ikke da\ndenne (i 2den udgave) med Lethed kan er -\nholdet deroppe.\nDe bet\u00e6nkeligheder, jeg i sin tid n\u00e6-\nrede mod stykkets opf\u00f8relse, er for l\u00e6nge si -\ndem forsvundne. Af mange begn er jeg kom-\nmen til den overbevisning at almenlreden\naru har f\u00e5tt sine \u00f8gne opladte for den sand -\nMed at dette arbejde i sin indersten id\u00e9 hviler\np\u00e5 et ubedinget meralsk grundlag, og brad\nstykkets hele kunstneriske struktuve ang\u00e5r,",
-  "language_model": [
-    {
-      "confidence": 0.68,
-      "polygon": [
-        [
-          264,
-          118
-        ],
-        [
-          410,
-          118
-        ],
-        [
-          410,
-          185
-        ],
-        [
-          264,
-          185
-        ]
-      ],
-      "text": "Oslo",
-      "text_confidence": 0.8
-    }
-  ],
-  "attention_gif": "dan_humu_page/predict/example_line.gif"
+  "text": "etc., some jeg netop idag\nholder Vask paa.\nLeien af Skj\u00f8rterne\nbestad i at jeg kj\u00f8bte\net Forkl\u00e6de til hver\naf de to Piger, some\nhavde laant os dem.\nResten var Vask af Hardan-\ngerskj\u00f8rter og et Forkl\u00e6de,\nsamt Fragt paa det Gods\n(N\u00f8i) some man sendte\nmig ubet\u00e6lt.\nIdag fik jeg hyggeligt\nFrimarkebrev fra Fosvold\nMed Hilsen\nDeres\nHulda Garborg",
+  "language_model": {
+    "text": "eet., some jeg netop idag\nholder Vask paa.\nLeien af Skj\u00f9rterne\nbestad i at jeg kj\u00f9bte\net Forkl\u00e7de til hver\naf de to Piger, some\nhavde laant os dem.\nResten var Vask af Hardan-\ngerskj\u00f9rter og et Forkl\u00e7de,\nsamt Fragt paa det Gods\n(N\u00f9i) some man sendte\nmig ubetalt.\nIdag fik jeg hyggeligt\nFrimarkebrev fra Fosvold\nMed Hilsen\nDeres\nHulda Garborg",
+    "confidence": 0.87
+  }
 }
 ```
-
-<img src="../../assets/example_line_polygon.gif" >
diff --git a/tests/test_prediction.py b/tests/test_prediction.py
index a9c39232..1d43db8e 100644
--- a/tests/test_prediction.py
+++ b/tests/test_prediction.py
@@ -547,28 +547,28 @@ def test_run_prediction_batch(
                     "text": "ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier ⓅPatron?12241",
                     "language_model": {
                         "text": "ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier ⓅPatron?12241",
-                        "confidence": 0.9226743371961854,
+                        "confidence": 0.92,
                     },
                 },
                 {
                     "text": "ⓈTemplié ⒻMarcelle Ⓑ93 ⓁS Ⓚch ⓄE dactylo Ⓟ18376",
                     "language_model": {
                         "text": "ⓈTemplié ⒻMarcelle Ⓑ93 ⓁS Ⓚch ⓄE dactylo Ⓟ18376",
-                        "confidence": 0.8759829104754289,
+                        "confidence": 0.88,
                     },
                 },
                 {
                     "text": "Ⓢd ⒻCharles Ⓑ11 ⓁP ⒸC ⓀF Ⓞd Ⓟ14 31",
                     "language_model": {
                         "text": "Ⓢd ⒻCharles Ⓑ11 ⓁP ⒸC ⓀF Ⓞd Ⓟ14 31",
-                        "confidence": 0.864021797502254,
+                        "confidence": 0.86,
                     },
                 },
                 {
                     "text": "ⓈNaudin ⒻMarie Ⓑ53 ⓁS Ⓒv ⓀBelle mère",
                     "language_model": {
                         "text": "ⓈNaudin ⒻMarie Ⓑ53 ⓁS Ⓒv ⓀBelle mère",
-                        "confidence": 0.8903665579889012,
+                        "confidence": 0.89,
                     },
                 },
             ],
@@ -586,28 +586,28 @@ def test_run_prediction_batch(
                     "text": "ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier ⓅPatron?12241",
                     "language_model": {
                         "text": "ⓈBellisson ⒻGeorges Ⓑ91 ⓁP ⒸM ⓀCh ⓄPlombier ⓅPatron?12241",
-                        "confidence": 0.8982517863786614,
+                        "confidence": 0.90,
                     },
                 },
                 {
                     "text": "ⓈTemplié ⒻMarcelle Ⓑ93 ⓁS Ⓚch ⓄE dactylo Ⓟ18376",
                     "language_model": {
                         "text": "ⓈTemplié ⒻMarcelle Ⓑ93 ⓁS Ⓚch ⓄE dactylo Ⓟ18376",
-                        "confidence": 0.8386571587822831,
+                        "confidence": 0.84,
                     },
                 },
                 {
                     "text": "Ⓢd ⒻCharles Ⓑ11 ⓁP ⒸC ⓀF Ⓞd Ⓟ14 31",
                     "language_model": {
                         "text": "Ⓢd ⒻCharles Ⓑ11 ⓁP ⒸC ⓀF Ⓞd Ⓟ14331",
-                        "confidence": 0.8334836549049839,
+                        "confidence": 0.83,
                     },
                 },
                 {
                     "text": "ⓈNaudin ⒻMarie Ⓑ53 ⓁS Ⓒv ⓀBelle mère",
                     "language_model": {
                         "text": "ⓈNaudin ⒻMarie Ⓑ53 ⓁS Ⓒv ⓀBelle mère",
-                        "confidence": 0.8565623750166133,
+                        "confidence": 0.86,
                     },
                 },
             ],
-- 
GitLab