From f7563b0c125455d345eda0f7429d82fcb3f0242a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sol=C3=A8ne=20Tarride?= <starride@teklia.com> Date: Tue, 26 Sep 2023 12:07:04 +0200 Subject: [PATCH] Document prediction command --- dan/ocr/decoder.py | 10 ++- docs/usage/predict.md | 134 ++++++++++++++++----------------- tests/test_prediction.py | 16 ++--- 3 files changed, 69 insertions(+), 91 deletions(-) diff --git a/dan/ocr/decoder.py b/dan/ocr/decoder.py index 8279d205..a975bda5 100644 --- a/dan/ocr/decoder.py +++ b/dan/ocr/decoder.py @@ -571,10 +571,14 @@ class CTCLanguageDecoder: ).strip() for hypothesis in hypotheses ] - # Normalize confidence score out["confidence"] = [ - np.exp( - hypothesis[0].score / ((self.language_model_weight + 1) * length.item()) + np.around( + np.exp( + hypothesis[0].score + / ((self.language_model_weight + 1) * length.item()) + ), + 2, ) for hypothesis, length in zip(hypotheses, batch_sizes) ] diff --git a/docs/usage/predict.md b/docs/usage/predict.md index e3e68385..4a13bd22 100644 --- a/docs/usage/predict.md +++ b/docs/usage/predict.md @@ -4,57 +4,28 @@ Use the `teklia-dan predict` command to apply a trained DAN model on an image. ## Description of parameters -| Parameter | Description | Type | Default | -| --------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | ------------- | -| `--image` | Path to the image to predict. Must not be provided with `--image-dir`. | `Path` | | -| `--image-dir` | Path to the folder where the images to predict are stored. Must not be provided with `--image`. | `Path` | | -| `--image-extension` | The extension of the images in the folder. Ignored if `--image-dir` is not provided.
| `str` | .jpg | -| `--model` | Path to the model to use for prediction | `Path` | | -| `--parameters` | Path to the YAML parameters file. | `Path` | | -| `--charset` | Path to the charset file. | `Path` | | -| `--output` | Path to the output folder. Results will be saved in this directory. | `Path` | | -| `--confidence-score` | Whether to return confidence scores. | `bool` | `False` | -| `--confidence-score-levels` | Level to return confidence scores. Should be any combination of `["line", "word", "char"]`. | `str` | | -| `--attention-map` | Whether to plot attention maps. | `bool` | `False` | -| `--attention-map-scale` | Image scaling factor before creating the GIF. | `float` | `0.5` | -| `--attention-map-level` | Level to plot the attention maps. Should be in `["line", "word", "char"]`. | `str` | `"line"` | -| `--predict-objects` | Whether to return polygons coordinates. | `bool` | `False` | -| `--word-separators` | List of word separators. | `list` | `[" ", "\n"]` | -| `--line-separators` | List of line separators. | `list` | `["\n"]` | -| `--threshold-method` | Method to use for attention mask thresholding. Should be in `["otsu", "simple"]`. | `str` | `"otsu"` | -| `--threshold-value ` | Threshold to use for the "simple" thresholding method. | `int` | `0` | -| `--batch-size ` | Size of the batches for prediction. | `int` | `1` | -| `--start-token ` | Use a specific starting token at the beginning of the prediction. Useful when making predictions on different single pages. | `str` | `None` | -| `--use-language-model` | Whether to use an external n-gram language model to rescore hypotheses. See [the next section](#rescoring-hypotheses-with-a-n-gram-language-model) for details. | `bool` | `False` | - -## Rescoring hypotheses with a N-gram language model - -A dataset extracted with the `teklia-dan dataset extract` command should contain the files required to build a language model (in the `language_model` folder). 
- -To refine DAN's predictions with a language model, follow these steps: - -1. Install and build [kenlm](https://github.com/kpu/kenlm) -1. Build a 6-gram language model using the following command - -```sh -bin/lmplz --order 6 \ - --text my_dataset/language_model/corpus.txt \ - --arpa my_dataset/language_model/model.arpa -``` - -1. Update `inference_parameters.yml`. The `weight` parameter defines how much weight to give to the language model. It should be set carefully (usually between 0.5 and 2.0) as it will affect the quality of the predictions. - -```yaml -parameters: - ... - language_model: - model: my_dataset/language_model/model.arpa - lexicon: my_dataset/language_model/lexicon.txt - tokens: my_dataset/language_model/tokens.txt - weight: 1.0 -``` - -1. Predict with the `--use-language-model` argument. +| Parameter | Description | Type | Default | +| --------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | ------------- | +| `--image` | Path to the image to predict. Must not be provided with `--image-dir`. | `Path` | | +| `--image-dir` | Path to the folder where the images to predict are stored. Must not be provided with `--image`. | `Path` | | +| `--image-extension` | The extension of the images in the folder. Ignored if `--image-dir` is not provided. | `str` | .jpg | +| `--model` | Path to the model to use for prediction | `Path` | | +| `--parameters` | Path to the YAML parameters file. | `Path` | | +| `--charset` | Path to the charset file. | `Path` | | +| `--output` | Path to the output folder. Results will be saved in this directory. | `Path` | | +| `--confidence-score` | Whether to return confidence scores. | `bool` | `False` | +| `--confidence-score-levels` | Level to return confidence scores. Should be any combination of `["line", "word", "char"]`. 
| `str` | | +| `--attention-map` | Whether to plot attention maps. | `bool` | `False` | +| `--attention-map-scale` | Image scaling factor before creating the GIF. | `float` | `0.5` | +| `--attention-map-level` | Level to plot the attention maps. Should be in `["line", "word", "char"]`. | `str` | `"line"` | +| `--predict-objects` | Whether to return polygons coordinates. | `bool` | `False` | +| `--word-separators` | List of word separators. | `list` | `[" ", "\n"]` | +| `--line-separators` | List of line separators. | `list` | `["\n"]` | +| `--threshold-method` | Method to use for attention mask thresholding. Should be in `["otsu", "simple"]`. | `str` | `"otsu"` | +| `--threshold-value ` | Threshold to use for the "simple" thresholding method. | `int` | `0` | +| `--batch-size ` | Size of the batches for prediction. | `int` | `1` | +| `--start-token ` | Use a specific starting token at the beginning of the prediction. Useful when making predictions on different single pages. | `str` | `None` | +| `--use-language-model` | Whether to use an external n-gram language model to rescore hypotheses. See [the dedicated example](#predict-with-an-external-n-gram-language-model) for details. | `bool` | `False` | ## Examples @@ -191,6 +162,33 @@ It will create the following JSON file named `dan_humu_page/predict/example.json ### Predict with an external n-gram language model +#### Build the language model + +A dataset extracted with the `teklia-dan dataset extract` command should contain the files required to build a language model (in the `language_model` folder). To refine DAN's predictions with a language model, follow these steps: + +1. Install and build [kenlm](https://github.com/kpu/kenlm) +1. Build a 6-gram language model using the following command + +```sh +bin/lmplz --order 6 \ + --text my_dataset/language_model/corpus.txt \ + --arpa my_dataset/language_model/model.arpa +``` + +1. Update `inference_parameters.yml`. 
The `weight` parameter defines how much weight to give to the language model. It should be set carefully (usually between 0.5 and 2.0) as it will affect the quality of the predictions. + +```yaml +parameters: + ... + language_model: + model: my_dataset/language_model/model.arpa + lexicon: my_dataset/language_model/lexicon.txt + tokens: my_dataset/language_model/tokens.txt + weight: 0.5 +``` + +#### Predict + To run a prediction with the n-gram language model, run this command: ```shell @@ -207,34 +205,10 @@ It will create the following JSON file named `dan_humu_page/predict/example.json ```json { - "text": "Oslo\n39 \nOresden den 24te Rasser!\nH\u00f8jst\u00e6redesherr Hartvig - assert!\nUllereder fra den f\u00f8rste tide da\njeg havder den tilfredsstillelser at vide den ar-\ndistiske ledelser af Kristiania theater i Deres\nhronder, har jeg g\u00e5t hernede med et stille\nh\u00e5b om fra Dem at modtage et forelag, sig -\nsende tils at lade \"K\u00e6rlighedens \u00abKomedie\u00bb\nopf\u00f8re fore det norske purblikum.\nEt s\u00e5dant forslag er imidlertid, imod\nforventning; ikke fremkommet, og jeg n\u00f8des der-\nfor tils self at grivbe initiativet, hvilket hervede\nsker, idet jeg\nbeder\nbet\nragte stigkket some ved denne\nskrivelse officielde indleveret til theatret. No-\nget exemplar af bogen vedlagger jeg ikke da\ndenne (i 2den udgave) med Lethed kan er -\nholdet deroppe.\nDe bet\u00e6nkeligheder, jeg i sin tid n\u00e6-\nrede mod stykkets opf\u00f8relse, er for l\u00e6nge si -\ndem forsvundne. 
Af mange begn er jeg kom-\nmen til den overbevisning at almenlreden\naru har f\u00e5tt sine \u00f8gne opladte for den sand -\nMed at dette arbejde i sin indersten id\u00e9 hviler\np\u00e5 et ubedinget meralsk grundlag, og brad\nstykkets hele kunstneriske struktuve ang\u00e5r,", - "language_model": [ - { - "confidence": 0.68, - "polygon": [ - [ - 264, - 118 - ], - [ - 410, - 118 - ], - [ - 410, - 185 - ], - [ - 264, - 185 - ] - ], - "text": "Oslo", - "text_confidence": 0.8 - } - ], - "attention_gif": "dan_humu_page/predict/example_line.gif" + "text": "etc., some jeg netop idag\nholder Vask paa.\nLeien af Skj\u00f8rterne\nbestad i at jeg kj\u00f8bte\net Forkl\u00e6de til hver\naf de to Piger, some\nhavde laant os dem.\nResten var Vask af Hardan-\ngerskj\u00f8rter og et Forkl\u00e6de,\nsamt Fragt paa det Gods\n(N\u00f8i) some man sendte\nmig ubet\u00e6lt.\nIdag fik jeg hyggeligt\nFrimarkebrev fra Fosvold\nMed Hilsen\nDeres\nHulda Garborg", + "language_model": { + "text": "eet., some jeg netop idag\nholder Vask paa.\nLeien af Skj\u00f9rterne\nbestad i at jeg kj\u00f9bte\net Forkl\u00e7de til hver\naf de to Piger, some\nhavde laant os dem.\nResten var Vask af Hardan-\ngerskj\u00f9rter og et Forkl\u00e7de,\nsamt Fragt paa det Gods\n(N\u00f9i) some man sendte\nmig ubetalt.\nIdag fik jeg hyggeligt\nFrimarkebrev fra Fosvold\nMed Hilsen\nDeres\nHulda Garborg", + "confidence": 0.87 + } } ``` - -<img src="../../assets/example_line_polygon.gif" > diff --git a/tests/test_prediction.py b/tests/test_prediction.py index a9c39232..1d43db8e 100644 --- a/tests/test_prediction.py +++ b/tests/test_prediction.py @@ -547,28 +547,28 @@ def test_run_prediction_batch( "text": "ⓈBellisson â’»Georges â’·91 â“P â’¸M â“€Ch â“„Plombier â“…Patron?12241", "language_model": { "text": "ⓈBellisson â’»Georges â’·91 â“P â’¸M â“€Ch â“„Plombier â“…Patron?12241", - "confidence": 0.9226743371961854, + "confidence": 0.92, }, }, { "text": "ⓈTemplié â’»Marcelle â’·93 â“S â“€ch â“„E dactylo â“…18376", 
"language_model": { "text": "ⓈTemplié â’»Marcelle â’·93 â“S â“€ch â“„E dactylo â“…18376", - "confidence": 0.8759829104754289, + "confidence": 0.88, }, }, { "text": "Ⓢd â’»Charles â’·11 â“P â’¸C â“€F â“„d â“…14 31", "language_model": { "text": "Ⓢd â’»Charles â’·11 â“P â’¸C â“€F â“„d â“…14 31", - "confidence": 0.864021797502254, + "confidence": 0.86, }, }, { "text": "ⓈNaudin â’»Marie â’·53 â“S â’¸v â“€Belle mère", "language_model": { "text": "ⓈNaudin â’»Marie â’·53 â“S â’¸v â“€Belle mère", - "confidence": 0.8903665579889012, + "confidence": 0.89, }, }, ], @@ -586,28 +586,28 @@ def test_run_prediction_batch( "text": "ⓈBellisson â’»Georges â’·91 â“P â’¸M â“€Ch â“„Plombier â“…Patron?12241", "language_model": { "text": "ⓈBellisson â’»Georges â’·91 â“P â’¸M â“€Ch â“„Plombier â“…Patron?12241", - "confidence": 0.8982517863786614, + "confidence": 0.90, }, }, { "text": "ⓈTemplié â’»Marcelle â’·93 â“S â“€ch â“„E dactylo â“…18376", "language_model": { "text": "ⓈTemplié â’»Marcelle â’·93 â“S â“€ch â“„E dactylo â“…18376", - "confidence": 0.8386571587822831, + "confidence": 0.84, }, }, { "text": "Ⓢd â’»Charles â’·11 â“P â’¸C â“€F â“„d â“…14 31", "language_model": { "text": "Ⓢd â’»Charles â’·11 â“P â’¸C â“€F â“„d â“…14331", - "confidence": 0.8334836549049839, + "confidence": 0.83, }, }, { "text": "ⓈNaudin â’»Marie â’·53 â“S â’¸v â“€Belle mère", "language_model": { "text": "ⓈNaudin â’»Marie â’·53 â“S â’¸v â“€Belle mère", - "confidence": 0.8565623750166133, + "confidence": 0.86, }, }, ], -- GitLab