Compare revisions (target project: atr/dan)

Commits on Source (2)
*.gif filter=lfs diff=lfs merge=lfs -text
@@ -56,3 +56,6 @@ See the [dedicated section](https://teklia.gitlab.io/atr/dan/usage/train/) on th
### Synthetic data generation
See the [dedicated section](https://teklia.gitlab.io/atr/dan/usage/generate/) on the official DAN documentation.
### Model prediction
See the [dedicated section](https://teklia.gitlab.io/atr/dan/usage/predict/) on the official DAN documentation.
@@ -5,6 +5,7 @@ import errno
from dan.datasets import add_dataset_parser
from dan.ocr import add_train_parser
from dan.ocr.line import add_generate_parser
from dan.predict import add_predict_parser
def get_parser():
@@ -14,6 +15,7 @@ def get_parser():
    add_dataset_parser(subcommands)
    add_train_parser(subcommands)
    add_generate_parser(subcommands)
    add_predict_parser(subcommands)
    return parser
# -*- coding: utf-8 -*-
"""
Predict on an image using a trained DAN model.
"""
import pathlib
from dan.predict.prediction import run


def add_predict_parser(subcommands) -> None:
    parser = subcommands.add_parser(
        "predict",
        description=__doc__,
        help=__doc__,
    )

    # Required arguments.
    parser.add_argument(
        "--image",
        type=pathlib.Path,
        help="Path to the image to predict.",
        required=True,
    )
    parser.add_argument(
        "--model",
        type=pathlib.Path,
        help="Path to the model to use for prediction.",
        required=True,
    )
    parser.add_argument(
        "--parameters",
        type=pathlib.Path,
        help="Path to the YAML parameters file.",
        required=True,
    )
    parser.add_argument(
        "--charset",
        type=pathlib.Path,
        help="Path to the charset file.",
        required=True,
    )
    parser.add_argument(
        "--output",
        type=pathlib.Path,
        help="Path to the output folder.",
        required=True,
    )

    # Optional arguments.
    parser.add_argument(
        "--scale",
        type=float,
        default=1.0,
        required=False,
        help="Image scaling factor before feeding it to DAN.",
    )
    parser.add_argument(
        "--confidence-score",
        action="store_true",
        help="Whether to return confidence scores.",
        required=False,
    )
    parser.add_argument(
        "--attention-map",
        action="store_true",
        help="Whether to plot attention maps.",
        required=False,
    )
    parser.add_argument(
        "--attention-map-level",
        type=str,
        choices=["line", "word", "char"],
        default="line",
        help="Level of attention maps.",
        required=False,
    )
    parser.add_argument(
        "--attention-map-scale",
        type=float,
        default=0.5,
        help="Image scaling factor before creating the GIF.",
        required=False,
    )

    parser.set_defaults(func=run)
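The `set_defaults(func=run)` line is what lets the top-level CLI route a parsed `predict` invocation to `dan.predict.prediction.run`. A minimal, self-contained sketch of that standard argparse dispatch pattern (illustrative only, not code from this MR):

```python
import argparse


def run(image, model):
    # Stand-in handler: the real CLI dispatches to dan.predict.prediction.run.
    print(f"predicting {image} with {model}")


parser = argparse.ArgumentParser()
subcommands = parser.add_subparsers()
predict = subcommands.add_parser("predict")
predict.add_argument("--image", required=True)
predict.add_argument("--model", required=True)
predict.set_defaults(func=run)

# Parse, pop the handler out of the namespace, and call it with the
# remaining arguments as keyword parameters.
args = vars(parser.parse_args(["predict", "--image", "a.jpg", "--model", "m.pt"]))
func = args.pop("func")
func(**args)
```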
# -*- coding: utf-8 -*-
import cv2
import numpy as np
from PIL import Image
from dan import logger
def split_text(text, level):
"""
Split text into a list of characters, word, or lines.
:param text: Text prediction from DAN
:param level: Level to visualize (char, word, line)
"""
# split into characters
if level == "char":
text_split = list(text)
offset = 0
# split into words
elif level == "word":
text = text.replace("\n", " ")
text_split = text.split(" ")
offset = 1
# split into lines
elif level == "line":
text_split = text.split("\n")
offset = 1
else:
logger.error("Level should be either 'char', 'word', or 'line'")
return text_split, offset

def plot_attention(image, text, weights, level, scale, outname):
    """
    Create a GIF blending attention maps into the image for each text piece (char, word, or line).
    :param image: Input image as a numpy array (RGB)
    :param text: Text predicted by DAN
    :param weights: Attention weights of size (n_char, feature_height, feature_width)
    :param level: Level to display (must be in [char, word, line])
    :param scale: Scaling factor for the output GIF image
    :param outname: Name of the GIF image
    """
    height, width, _ = image.shape
    attention_map = []

    # Convert to PIL Image and create mask
    mask = Image.new("L", (width, height), color=110)
    image = Image.fromarray(image)

    # Split text into characters, words or lines
    text_list, offset = split_text(text, level)

    # Iterate on characters, words or lines
    tot_len = 0
    max_value = weights.sum(0).max()
    for text_piece in text_list:
        # Blank vector to accumulate weights for the current word/line
        coverage_vector = np.zeros((height, width))
        for i in range(len(text_piece)):
            local_weight = weights[i + tot_len]
            local_weight = cv2.resize(local_weight, (width, height))
            coverage_vector = np.clip(coverage_vector + local_weight, 0, 1)

        # Keep track of text length
        tot_len += len(text_piece) + offset

        # Normalize coverage vector
        coverage_vector = (coverage_vector / max_value * 255).astype(np.uint8)

        # Blend coverage vector with original image
        blank_array = np.zeros((height, width)).astype(np.uint8)
        coverage_vector = Image.fromarray(
            np.stack([coverage_vector, blank_array, blank_array], axis=2), "RGB"
        )
        blend = Image.composite(image, coverage_vector, mask)

        # Resize to save time
        blend = blend.resize((int(width * scale), int(height * scale)), Image.LANCZOS)
        attention_map.append(blend)

    attention_map[0].save(
        outname,
        save_all=True,
        format="GIF",
        append_images=attention_map[1:],
        duration=1000,
        loop=0,  # 0 means the GIF loops forever
    )
# -*- coding: utf-8 -*-
import logging
import os
import pickle
import cv2
@@ -8,9 +8,13 @@ import numpy as np
import torch
import yaml
from dan import logger
from dan.datasets.extract.utils import save_json
from dan.decoder import GlobalHTADecoder
from dan.models import FCN_Encoder
from dan.ocr.utils import LM_ind_to_str
from dan.predict.attention import plot_attention
from dan.utils import read_image
class DAN:
@@ -50,7 +54,7 @@ class DAN:
        decoder = GlobalHTADecoder(parameters["decoder"]).to(self.device)
        decoder.load_state_dict(checkpoint["decoder_state_dict"], strict=True)
        logging.debug(f"Loaded model {model_path}")
        logger.debug(f"Loaded model {model_path}")

        if mode == "train":
            encoder.train()
@@ -66,13 +70,11 @@
        self.mean, self.std = parameters["mean"], parameters["std"]
        self.max_chars = parameters["max_char_prediction"]

    def predict(self, input_image, confidences=False):
    def preprocess(self, input_image):
        """
        Run prediction on an input image.
        :param input_image: The image to predict.
        :param confidences: Return the characters probabilities.
        Preprocess an input image.
        :param input_image: The input image to preprocess.
        """
        # Preprocess image.
        assert isinstance(
            input_image, np.ndarray
        ), "Input image must be an np.array in RGB"
@@ -80,12 +82,18 @@
        if len(input_image.shape) < 3:
            input_image = cv2.cvtColor(input_image, cv2.COLOR_GRAY2RGB)
        reduced_size = [input_image.shape[:2]]
        input_image = (input_image - self.mean) / self.std
        input_image = np.expand_dims(input_image.transpose((2, 0, 1)), axis=0)
        input_tensor = torch.from_numpy(input_image).to(self.device)
        logging.debug("Image pre-processed")
        return input_image

    def predict(self, input_tensor, input_sizes, confidences=False, attentions=False):
        """
        Run prediction on an input image.
        :param input_tensor: A batch of images to predict.
        :param input_sizes: The original image sizes.
        :param confidences: Return the characters probabilities.
        :param attentions: Return characters attention weights.
        """
        input_tensor = input_tensor.to(self.device)
        start_token = len(self.charset) + 1
        end_token = len(self.charset)
@@ -102,6 +110,7 @@
        whole_output = list()
        confidence_scores = list()
        attention_maps = list()
        cache = None
        hidden_predict = None
@@ -125,7 +134,7 @@
                features,
                enhanced_features,
                predicted_tokens,
                reduced_size,
                input_sizes,
                predicted_tokens_len,
                features_size,
                start=0,
@@ -134,6 +143,7 @@
                num_pred=1,
            )
            whole_output.append(output)
            attention_maps.append(weights)
            confidence_scores.append(
                torch.max(torch.softmax(pred[:, :], dim=1), dim=1).values
            )
@@ -158,6 +168,8 @@
        confidence_scores = (
            torch.cat(confidence_scores, dim=1).cpu().detach().numpy()
        )
        attention_maps = torch.cat(attention_maps, dim=1).cpu().detach().numpy()
        predicted_tokens = predicted_tokens[:, 1:]
        prediction_len[torch.eq(reached_end, False)] = self.max_chars - 1
        predicted_tokens = [
@@ -169,8 +181,76 @@
        predicted_text = [
            LM_ind_to_str(self.charset, t, oov_symbol="") for t in predicted_tokens
        ]
        logging.info("Image processed")
        logger.info("Images processed")
        out = {"text": predicted_text}
        if confidences:
            return predicted_text[0], confidence_scores[0]
        return predicted_text[0]
            out["confidences"] = confidence_scores
        if attentions:
            out["attentions"] = attention_maps
        return out

def run(
    image,
    model,
    parameters,
    charset,
    output,
    scale,
    confidence_score,
    attention_map,
    attention_map_level,
    attention_map_scale,
):
    # Create output directory if necessary
    if not os.path.exists(output):
        os.mkdir(output)

    # Load model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dan_model = DAN(device)
    dan_model.load(model, parameters, charset, mode="eval")

    # Load image and pre-process it
    im = read_image(image, scale=scale)
    logger.info("Image loaded.")
    im_p = dan_model.preprocess(im)
    logger.debug("Image pre-processed.")

    # Convert to tensor of size (batch_size, channel, height, width) with batch_size=1
    input_tensor = torch.tensor(im_p).permute(2, 0, 1).unsqueeze(0)
    input_tensor = input_tensor.to(device)
    input_sizes = [im.shape[:2]]

    # Predict
    prediction = dan_model.predict(
        input_tensor,
        input_sizes,
        confidences=confidence_score,
        attentions=attention_map,
    )
    result = {"text": prediction["text"][0]}

    # Average character-based confidence scores
    if confidence_score:
        # TODO: select the level for confidence scores (char, word, line, total)
        result["confidence"] = np.around(np.mean(prediction["confidences"][0]), 2)

    # Save gif with attention map
    if attention_map:
        gif_filename = f"{output}/{image.stem}_{attention_map_level}.gif"
        logger.info(f"Creating attention GIF in {gif_filename}")
        plot_attention(
            image=im,
            text=prediction["text"][0],
            weights=prediction["attentions"][0],
            level=attention_map_level,
            scale=attention_map_scale,
            outname=gif_filename,
        )
        result["attention_gif"] = gif_filename

    json_filename = f"{output}/{image.stem}.json"
    logger.info(f"Saving JSON prediction in {json_filename}")
    save_json(json_filename, result)
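For readers who want to call the new API from Python rather than through the CLI, here is a minimal sketch mirroring `run` above. File paths are placeholders, and it assumes the `DAN` class and `read_image` land in `dan.predict.prediction` and `dan.utils` as in this diff:

```python
import torch

from dan.predict.prediction import DAN
from dan.utils import read_image

device = "cuda" if torch.cuda.is_available() else "cpu"
model = DAN(device)
model.load("model.pt", "parameters.yml", "charset.pkl", mode="eval")

# Read, normalize, and batch a single image.
im = read_image("page.jpg", scale=0.5)
im_p = model.preprocess(im)
input_tensor = torch.tensor(im_p).permute(2, 0, 1).unsqueeze(0).to(device)

prediction = model.predict(input_tensor, [im.shape[:2]], confidences=True)
print(prediction["text"][0], prediction["confidences"][0].mean())
```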
@@ -185,3 +185,17 @@ def pad_image_width_random(img, new_width, padding_value, max_pad_left_ratio=1):
    pad_right = np.ones((h, pad_right, c), dtype=img.dtype) * padding_value
    img = np.concatenate([pad_left, img, pad_right], axis=1)
    return img

def read_image(filename, scale=1.0):
    """
    Read an image and rescale it.
    :param filename: Image path
    :param scale: Scaling factor before prediction
    """
    image = cv2.cvtColor(cv2.imread(str(filename)), cv2.COLOR_BGR2RGB)
    if scale != 1.0:
        width = int(image.shape[1] * scale)
        height = int(image.shape[0] * scale)
        image = cv2.resize(image, (width, height), interpolation=cv2.INTER_AREA)
    return image
(Two binary image diffs could not be displayed: the files are too large.)
# Attention
::: dan.predict.attention
# Inference
::: dan.predict
::: dan.predict.prediction
@@ -10,3 +10,6 @@ When `teklia-dan` is installed in your environment, you may use the following co
`teklia-dan generate`
: To generate synthetic data to train DAN models. More details in [the dedicated section](./generate.md).
`teklia-dan predict`
: To predict an image using a trained DAN model. More details in [the dedicated section](./predict.md).
# Predict
## Description
Use the `teklia-dan predict` command to apply a trained DAN model to an image.
| Parameter | Description | Type | Default |
| ------------------------------ | ---------------------------------------------------------------------------- | -------- | ------- |
| `--image` | Path to the image to predict. | `Path` | |
| `--model`                      | Path to the model to use for prediction.                                      | `Path`   |         |
| `--parameters` | Path to the YAML parameters file. | `Path` | |
| `--charset` | Path to the charset file. | `Path` | |
| `--output` | Path to the output folder. Results will be saved in this directory. | `Path` | |
| `--scale` | Image scaling factor before feeding it to DAN. | `float` | 1.0 |
| `--confidence-score` | Whether to return confidence scores. | `bool` | False |
| `--attention-map` | Whether to plot attention maps. | `bool` | False |
| `--attention-map-level` | Level to plot the attention maps. Should be in `["line", "word", "char"]`. | `str` | line |
| `--attention-map-scale` | Image scaling factor before creating the GIF. | `float` | 0.5 |
## Examples
### Predict with confidence scores
To run a prediction with confidence scores, run this command:
```shell
teklia-dan predict \
    --image dan_humu_page/example.jpg \
    --model dan_humu_page/model.pt \
    --parameters dan_humu_page/parameters.yml \
    --charset dan_humu_page/charset.pkl \
    --output dan_humu_page/predict/ \
    --scale 0.5 \
    --confidence-score
```
It will create the following JSON file named `dan_humu_page/predict/example.json`:
```json
{
"text": "Hansteensgt. 2 IV 28/4 - 19\nKj\u00e6re Gerhard.\nTak for Brevet om Boken og Haven\nog Crokus og Blaaveis og tak fordi\nDu vilde be mig derut sammen\nmed Kris og Ragna. Men vet Du\nda ikke, at Kris reiste med sin S\u00f8-\nster Fru Cr\u00f8ger til Lillehammer\nnogle Dage efter Begravelsen? Hen\ndes Address er Amtsingeni\u00f8r\nCr\u00f8ger. Hun skriver at de blir\nder til lidt ut i Mai. Nu er hun\nnoksaa medtat skj\u00f8nner jeg af Sorg\nog af L\u00e6ngsel, skriver saameget r\u00f8-\nrende om Oluf. Ragna har det\nherligt, skriver hun. Hun er bare\ngla, og det vet jeg, at \"Oluf er gla over,\nder hvor han nu er. Jeg har saa in-\nderlig ondt af hende, og om Du skrev\net Par Ord tror jeg det vilde gj\u00f8re\nhende godt. - Jeg gl\u00e6der mig over,\nat Du har skrevet en Bok, og\njeg er vis paa, at den er god.",
"confidence": 0.99
}
```
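If you post-process results programmatically, the output is plain JSON; for example (standard library only, path as in the example above):

```python
import json

# Load the prediction produced by the command above.
with open("dan_humu_page/predict/example.json") as f:
    result = json.load(f)

print(result["text"])
print(result["confidence"])  # mean character confidence, rounded to 2 decimals
```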
### Predict with confidence scores and line-level attention maps
To run a prediction with confidence scores and plot line-level attention maps, run this command:
```shell
teklia-dan predict \
    --image dan_humu_page/example.jpg \
    --model dan_humu_page/model.pt \
    --parameters dan_humu_page/parameters.yml \
    --charset dan_humu_page/charset.pkl \
    --output dan_humu_page/predict/ \
    --scale 0.5 \
    --confidence-score \
    --attention-map
```
It will create the following JSON file named `dan_humu_page/predict/example.json` and a GIF showing a line-level attention map, `dan_humu_page/predict/example_line.gif`.
```json
{
"text": "Hansteensgt. 2 IV 28/4 - 19\nKj\u00e6re Gerhard.\nTak for Brevet om Boken og Haven\nog Crokus og Blaaveis og tak fordi\nDu vilde be mig derut sammen\nmed Kris og Ragna. Men vet Du\nda ikke, at Kris reiste med sin S\u00f8-\nster Fru Cr\u00f8ger til Lillehammer\nnogle Dage efter Begravelsen? Hen\ndes Address er Amtsingeni\u00f8r\nCr\u00f8ger. Hun skriver at de blir\nder til lidt ut i Mai. Nu er hun\nnoksaa medtat skj\u00f8nner jeg af Sorg\nog af L\u00e6ngsel, skriver saameget r\u00f8-\nrende om Oluf. Ragna har det\nherligt, skriver hun. Hun er bare\ngla, og det vet jeg, at \"Oluf er gla over,\nder hvor han nu er. Jeg har saa in-\nderlig ondt af hende, og om Du skrev\net Par Ord tror jeg det vilde gj\u00f8re\nhende godt. - Jeg gl\u00e6der mig over,\nat Du har skrevet en Bok, og\njeg er vis paa, at den er god.",
"confidence": 0.99,
"attention_gif": "dan_humu_page/predict/example_line.gif"
}
```
<img src="../assets/example_line.gif" alt="Line-level attention map example" />
### Predict with confidence scores and word-level attention maps
To run a prediction with confidence scores and plot word-level attention maps, run this command:
```shell
teklia-dan predict \
    --image dan_humu_page/example.jpg \
    --model dan_humu_page/model.pt \
    --parameters dan_humu_page/parameters.yml \
    --charset dan_humu_page/charset.pkl \
    --output dan_humu_page/predict/ \
    --scale 0.5 \
    --confidence-score \
    --attention-map \
    --attention-map-level word \
    --attention-map-scale 0.5
```
It will create the following JSON file named `dan_humu_page/predict/example.json` and a GIF showing a word-level attention map, `dan_humu_page/predict/example_word.gif`.
```json
{
"text": "Hansteensgt. 2 IV 28/4 - 19\nKj\u00e6re Gerhard.\nTak for Brevet om Boken og Haven\nog Crokus og Blaaveis og tak fordi\nDu vilde be mig derut sammen\nmed Kris og Ragna. Men vet Du\nda ikke, at Kris reiste med sin S\u00f8-\nster Fru Cr\u00f8ger til Lillehammer\nnogle Dage efter Begravelsen? Hen\ndes Address er Amtsingeni\u00f8r\nCr\u00f8ger. Hun skriver at de blir\nder til lidt ut i Mai. Nu er hun\nnoksaa medtat skj\u00f8nner jeg af Sorg\nog af L\u00e6ngsel, skriver saameget r\u00f8-\nrende om Oluf. Ragna har det\nherligt, skriver hun. Hun er bare\ngla, og det vet jeg, at \"Oluf er gla over,\nder hvor han nu er. Jeg har saa in-\nderlig ondt af hende, og om Du skrev\net Par Ord tror jeg det vilde gj\u00f8re\nhende godt. - Jeg gl\u00e6der mig over,\nat Du har skrevet en Bok, og\njeg er vis paa, at den er god.",
"confidence": 0.99,
"attention_gif": "dan_humu_page/predict/example_word.gif"
}
```
<img src="../assets/example_word.gif" alt="Word-level attention map example" />
## Remarks
The script plotting attention maps makes two assumptions, illustrated below:

* words are separated by the space character (` `)
* lines are separated by the newline character (`\n`)
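A quick plain-Python illustration of these assumptions (not part of the package):

```python
# Line-level splitting relies on "\n"; a prediction without newlines is a
# single line, so the GIF would contain a single attention frame.
print("no newlines here".split("\n"))  # ['no newlines here']

# Word-level splitting relies on single spaces; doubled spaces yield an
# empty "word" piece whose frame highlights nothing.
print("a  b".split(" "))  # ['a', '', 'b']
```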
@@ -61,6 +61,7 @@ nav:
- Dataset formatting: usage/datasets/format.md
- Training: usage/train.md
- Generate: usage/generate.md
- Predict: usage/predict.md
- Documentation development: dev/build_docs.md
- Python Reference:
- Datasets:
@@ -92,11 +93,13 @@ nav:
- Model utils: ref/ocr/line/model_utils.md
- Training: ref/ocr/line/train.md
- Utils: ref/ocr/line/utils.md
- Prediction:
- Inference: ref/predict/prediction.md
- Attention: ref/predict/attention.md
- Decoders: ref/decoder.md
- Models: ref/models.md
- MLflow: ref/mlflow.md
- Post Processing: ref/post_processing.md
- Inference: ref/predict.md
- Schedulers: ref/schedulers.md
- Transformations: ref/transforms.md
- Utils: ref/utils.md