Commit 04b15f97 authored by Solene Tarride

Predict polygons but preserve old behavior

parent da7e8b3f
1 merge request !76: Add predicted objects to predict command
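
For context, here is a minimal, hypothetical usage sketch of the behaviour this commit introduces: an extract_objects flag on DAN.predict that also returns polygons with text and confidence per character, word or line. It assumes an instantiated DAN model and already preprocessed inputs; the exact call signature may differ from the released API.

# Hypothetical sketch, based only on names appearing in this diff.
prediction = model.predict(
    input_tensor,                  # preprocessed image tensor (assumed prepared elsewhere)
    input_sizes,                   # (height, width) of the original image(s)
    confidences=True,
    attentions=True,
    attention_level="word",
    extract_objects=True,          # new flag introduced by this commit
    word_separators=["\n", " "],
    line_separators=["\n"],
)
# prediction["objects"][0] would then be a list of dicts such as:
# {"confidence": 0.85, "polygon": [[x, y], ...], "text": "word", "text_confidence": 0.92}
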
@@ -6,38 +6,17 @@ import numpy as np
from PIL import Image
from dan import logger
from dan.utils import round_floats
def split_text(text: str, level: str, word_separators, line_separators):
"""
Split text into a list of characters, words, or lines.
:param text: Text prediction from DAN
:param level: Level to visualize from [char, word, line]
"""
# split into characters
if level == "char":
text_split = list(text)
offset = 0
# split into words
elif level == "word":
text_split = re.split(word_separators, text)
offset = 1
# split into lines
elif level == "line":
text_split = re.split(line_separators, text)
offset = 1
else:
logger.error("Level should be either 'char', 'word', or 'line'")
return text_split, offset
def compute_coverage(text: str, max_value: float, offset: int, attentions):
def compute_coverage(text: str, max_value: float, offset: int, attentions, size: tuple):
"""
Aggregates attention maps for the current text piece (char, word, line)
:param text: Text piece selected with offset after splitting DAN prediction
:param max_value: Maximum "attention intensity" for parts of a text piece, used for normalization
:param offset: Offset value to get the relevant part of text piece
:param attentions: Attention weights of size (n_char, feature_height, feature_width)
:param size: Target size (width, height) to resize the coverage vector
"""
_, height, width = attentions.shape
@@ -49,6 +28,11 @@ def compute_coverage(text: str, max_value: float, offset: int, attentions):
# Normalize coverage vector
coverage_vector = (coverage_vector / max_value * 255).astype(np.uint8)
# Resize it
if size:
coverage_vector = cv2.resize(coverage_vector, size)
return coverage_vector
@@ -74,9 +58,82 @@ def blend_coverage(coverage_vector, image, mask, scale):
return blend
def get_predicted_polygons(
def parse_delimiters(delimiters):
return re.compile(r"|".join(delimiters))
def compute_prob_by_separator(characters, probabilities, separator):
"""
Split text and confidences using separators and return a list of average confidence scores.
:param characters: list of characters.
:param probabilities: list of character probabilities.
:param separator: compiled regex of separators. Use parse_delimiters(["\n", " "]) for word confidences and parse_delimiters(["\n"]) for line confidences.
Returns a tuple of (text pieces, confidence scores).
"""
# match anything except separators, get start and end index
pattern = re.compile(f"[^{separator.pattern}]+")
matches = [(m.start(), m.end()) for m in re.finditer(pattern, characters)]
# Iterate over text pieces and compute mean confidence
probs = [np.mean(probabilities[start:end]) for (start, end) in matches]
texts = [characters[start:end] for (start, end) in matches]
return texts, probs
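# Illustrative example (values made up; assumes the parse_delimiters helper above):
# texts, probs = compute_prob_by_separator(
#     "the cat", [0.9, 0.8, 0.9, 1.0, 0.7, 0.8, 0.9], parse_delimiters(["\n", " "])
# )
# matches "the" (indices 0..3) and "cat" (indices 4..7), returning (["the", "cat"], [~0.87, 0.8])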
def split_text(text: str, level: str, word_separators, line_separators):
"""
Split text into a list of characters, words, or lines.
:param text: Text prediction from DAN
:param level: Level to visualize from [char, word, line]
:param word_separators: List of word separators
:param line_separators: List of line separators
"""
if level == "char":
text_split = list(text)
offset = 0
# split into words
elif level == "word":
text_split = re.split(word_separators, text)
offset = 1
# split into lines
elif level == "line":
text_split = re.split(line_separators, text)
offset = 1
else:
logger.error("Level should be either 'char', 'word', or 'line'")
return text_split, offset
def split_text_and_confidences(
text, confidences, level, word_separators, line_separators
):
"""
Split text into a list of characters, words or lines with corresponding confidence scores
:param text: Text prediction from DAN
:param confidences: Character confidences
:param level: Level to visualize from [char, word, line]
:param word_separators: List of word separators
:param line_separators: List of line separators
"""
if level == "char":
texts = list(text)
probs = confidences
offset = 0
elif level == "word":
texts, probs = compute_prob_by_separator(text, confidences, word_separators)
offset = 1
elif level == "line":
texts, probs = compute_prob_by_separator(text, confidences, line_separators)
offset = 1
else:
logger.error("Level should be either 'char', 'word', or 'line'")
return texts, round_floats(probs), offset
def get_predicted_polygons_with_confidence(
text,
weights,
confidences,
level,
height,
width,
@@ -87,63 +144,85 @@ def get_predicted_polygons(
Returns the polygons of each object of the current prediction
:param text: Text predicted by DAN
:param weights: Attention weights of size (n_char, feature_height, feature_width)
:param confidences: Character confidences
:param level: Level to display (must be in [char, word, line])
:param height: Original image height
:param width: Original image width
:param word_separators: List of word separators
:param line_separators: List of line separators
"""
# Split text into characters, words or lines
text_list, offset = split_text(text, level, word_separators, line_separators)
max_value = weights.sum(0).max()
text_list, confidence_list, offset = split_text_and_confidences(
text, confidences, level, word_separators, line_separators
)
# Set offset based on current text_piece to be used.
return [
get_polygon(
text_piece, level, offset * n_offset, max_value, weights, height, width
max_value = weights.sum(0).max()
polygons = []
start_index = 0
for text_piece, confidence in zip(text_list, confidence_list):
start_index += len(text_piece) + offset
polygon = get_polygon(
text_piece, max_value, offset, weights, size=(width, height)
)
for n_offset, text_piece in enumerate(text_list)
]
polygon["text"] = text_piece
polygon["text_confidence"] = confidence
polygons.append(polygon)
return polygons
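# Illustrative shape of the returned list (values made up): one entry per text piece at the
# requested level, e.g. {"confidence": 0.82, "polygon": [[x1, y1], [x2, y2], ...],
# "text": "piece", "text_confidence": 0.9}.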
def compute_contour_metrics(coverage_vector, contour):
"""
Compute the contour's area and the maximum coverage value inside it.
:param coverage_vector: Aggregated attention weights of the current text piece, resized to the image size (image_height, image_width)
:param contour: Contour of the current attention blob
"""
# draw the contour zone
mask = np.zeros(coverage_vector.shape, dtype=np.uint8)
cv2.drawContours(mask, [contour], -1, (255), -1)
max_value = (
np.where(mask > 0, coverage_vector, 0).max() / 255
) # cv2.max(coverage_vector, mask=mask)[0] / 255.
area = cv2.contourArea(contour)
return max_value, max_value * area
def get_polygon(text_piece, level, offset, max_value, weights, height, width):
def get_polygon(text, max_value, offset, weights, size=None, return_contours=False):
"""
Gets polygon associated with element of current text_piece, indexed by offset
:param text_piece: Current text element
:param level: Level to display (must be in [char, word, line])
:param offset: Offset value to get the relevant part of text piece
:param text: Text piece selected with offset after splitting DAN prediction
:param max_value: Maximum "attention intensity" for parts of a text piece, used for normalization
:param weights: Attention weights of size (n_char, feature_height, feature_width)
:param height: Original image height
:param width: Original image width
:param offset: Offset value to get the relevant part of text piece
:param attentions: Attention weights of size (n_char, feature_height, feature_width)
:param size: Target size (width, height) to resize the coverage vector
:param return_contours: Return the contour of the current polygon (used for plotting)
"""
coverage_vector = compute_coverage(text_piece, max_value, offset, weights)
coverage_vector = cv2.resize(coverage_vector, (width, height))
# Compute coverage vector
coverage_vector = compute_coverage(text, max_value, offset, weights, size=size)
# Generate a binary image for the current channel.
bin_img = coverage_vector.copy()
bin_img[bin_img > 0] = 1
bin_mask = np.array(np.where(coverage_vector > 5, 255, 0), dtype=np.uint8)
bin_mask = np.asarray(bin_mask, dtype=np.uint8)
# Detect the objects contours
contours, _ = cv2.findContours(bin_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
# Select best contour
metrics = [compute_contour_metrics(coverage_vector, cnt) for cnt in contours]
confidences, scores = map(list, zip(*metrics))
best_contour = contours[np.argmax(scores)]
confidence = round(confidences[np.argmax(scores)] / max_value, 2)
# Format for JSON
polygon = {
"confidence": confidence,
"polygon": [coordinates[0].tolist() for coordinates in best_contour],
}
# Detect the objects contours.
contours, _ = cv2.findContours(
np.uint8(bin_img), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
)
if return_contours:
return polygon, best_contour
mask = np.zeros(coverage_vector.shape)
cv2.drawContours(mask, contours, -1, 1, -1)
confidence = round((np.sum(mask * coverage_vector) / np.sum(mask)), 2)
# Put together all contours for now.
pre_contours_tojson = [[item.tolist() for item in contours]]
# Quick hack to have better json format:
contours_tojson = []
for contour in pre_contours_tojson[0]:
for coordinate in contour:
contours_tojson.append(coordinate[0])
return {
"confidence": confidence, # average of coverage vector on contours
"polygon": contours_tojson,
"type": level,
}
return polygon
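# Selection logic, summarised: the coverage vector is thresholded into a binary mask,
# cv2.findContours proposes candidate regions, compute_contour_metrics scores each one by
# its peak coverage times its area, the best-scoring contour becomes the polygon, and its
# rescaled peak coverage is reported as "confidence".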
def plot_attention(
@@ -155,7 +234,7 @@
outname,
word_separators=["\n", " "],
line_separators=["\n"],
output_polygons=False,
display_polygons=False,
):
"""
Create a gif by blending attention maps to the image for each text piece (char, word or line)
@@ -165,11 +244,13 @@
:param level: Level to display (must be in [char, word, line])
:param scale: Scaling factor for the output gif image
:param outname: Name of the gif image
:param word_separators: List of word separators
:param line_separators: List of line separators
:param display_polygons: Whether to plot extracted polygons
"""
height, width, _ = image.shape
attention_map = []
polygons = []
# Convert to PIL Image and create mask
mask = Image.new("L", (width, height), color=(110))
@@ -180,21 +261,26 @@
# Iterate on characters, words or lines
tot_len = 0
max_value = weights.sum(0).max()
for text_piece in text_list:
# Accumulate weights for the current word/line and resize to original image size
coverage_vector = compute_coverage(text_piece, max_value, tot_len, weights)
coverage_vector = cv2.resize(coverage_vector, (width, height))
coverage_vector = compute_coverage(
text_piece, max_value, tot_len, weights, (width, height)
)
# Get polygons if flag is set:
if output_polygons:
polygons.append(
get_polygon(
text_piece, level, tot_len, max_value, weights, height, width
)
if display_polygons:
# draw the contour
_, contour = get_polygon(
text_piece,
max_value,
tot_len,
weights,
(width, height),
return_contours=True,
)
cv2.drawContours(coverage_vector, [contour], 0, (255), 3)
# Keep track of text length
tot_len += len(text_piece) + offset
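# Note: tot_len is the running character index into the attention weights, so each text
# piece is aggregated from its own attention frames; offset accounts for the separator
# consumed between consecutive pieces.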
@@ -210,5 +296,3 @@
duration=1000,
loop=True,
)
return polygons
@@ -2,7 +2,6 @@
import os
import pickle
import re
import cv2
import numpy as np
@@ -14,8 +13,13 @@ from dan.datasets.extract.utils import save_json
from dan.decoder import GlobalHTADecoder
from dan.models import FCN_Encoder
from dan.ocr.utils import LM_ind_to_str
from dan.predict.attention import get_predicted_polygons, plot_attention
from dan.utils import read_image, round_floats
from dan.predict.attention import (
get_predicted_polygons_with_confidence,
parse_delimiters,
plot_attention,
split_text_and_confidences,
)
from dan.utils import read_image
class DAN:
@@ -93,6 +97,7 @@
confidences=False,
attentions=False,
attention_level=False,
extract_objects=False,
word_separators=["\n", " "],
line_separators=["\n"],
):
@@ -113,13 +118,20 @@
# Run the prediction.
with torch.no_grad():
b = input_tensor.size(0)
reached_end = torch.zeros((b,), dtype=torch.bool, device=self.device)
prediction_len = torch.zeros((b,), dtype=torch.int, device=self.device)
batch_size = input_tensor.size(0)
reached_end = torch.zeros(
(batch_size,), dtype=torch.bool, device=self.device
)
prediction_len = torch.zeros(
(batch_size,), dtype=torch.int, device=self.device
)
predicted_tokens = (
torch.ones((b, 1), dtype=torch.long, device=self.device) * start_token
torch.ones((batch_size, 1), dtype=torch.long, device=self.device)
* start_token
)
predicted_tokens_len = torch.ones(
(batch_size,), dtype=torch.int, device=self.device
)
predicted_tokens_len = torch.ones((b,), dtype=torch.int, device=self.device)
whole_output = list()
confidence_scores = list()
@@ -188,10 +200,11 @@
predicted_tokens = predicted_tokens[:, 1:]
prediction_len[torch.eq(reached_end, False)] = self.max_chars - 1
predicted_tokens = [
predicted_tokens[i, : prediction_len[i]] for i in range(b)
predicted_tokens[i, : prediction_len[i]] for i in range(batch_size)
]
confidence_scores = [
confidence_scores[i, : prediction_len[i]].tolist() for i in range(b)
confidence_scores[i, : prediction_len[i]].tolist()
for i in range(batch_size)
]
# Transform tokens to characters
@@ -201,44 +214,30 @@
logger.info("Images processed")
out = {"text": predicted_text}
out = {}
out["text"] = predicted_text
if confidences:
out["confidences"] = confidence_scores
if attentions:
out["attentions"] = attention_maps
# Also get information on polygons
out["objects"] = get_predicted_polygons(
predicted_text[0],
attention_maps[0],
attention_level,
input_sizes[0][0],
input_sizes[0][1],
word_separators,
line_separators,
)
if extract_objects:
out["objects"] = [
get_predicted_polygons_with_confidence(
predicted_text[i],
attention_maps[i],
confidence_scores[i],
attention_level,
input_sizes[i][0],
input_sizes[i][1],
word_separators,
line_separators,
)
for i in range(batch_size)
]
return out
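# Illustrative output with all flags enabled (keys depend on the arguments): out would
# look like {"text": ["..."], "confidences": [[0.98, ...]], "attentions": [...],
# "objects": [[{"polygon": [...], "confidence": ..., "text": ..., "text_confidence": ...}], ...]}.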
def parse_delimiters(delimiters):
return re.compile(r"|".join(delimiters))
def compute_prob_by_separator(characters, probabilities, separator):
"""
Split text and confidences using separators and return a list of average confidence scores.
:param characters: list of characters.
:param probabilities: list of probabilities.
:param separator: compiled regex of separators. Use parse_delimiters(["\n", " "]) for word confidences and parse_delimiters(["\n"]) for line confidences.
Returns a list of confidence scores.
"""
# match anything except separators, get start and end index
pattern = re.compile(f"[^{separator.pattern}]+")
matches = [(m.start(), m.end()) for m in re.finditer(pattern, characters)]
# Iterate over text pieces and compute mean confidence
return [np.mean(probabilities[start:end]) for (start, end) in matches]
def run(
image,
model,
@@ -255,6 +254,22 @@
line_separators,
predict_objects,
):
"""
Predict a single image and save the output
:param image: Path to the image to predict.
:param model: Path to the model to use for prediction.
:param parameters: Path to the YAML parameters file.
:param charset: Path to the charset.
:param output: Path to the output folder where the results will be saved.
:param scale: Scaling factor to resize the image.
:param confidence_score: Whether to compute confidence score.
:param attention_map: Whether to plot the attention map.
:param attention_map_level: Level of objects to extract.
:param attention_map_scale: Scaling factor for the attention map.
:param word_separators: List of word separators.
:param line_separators: List of line separators.
:param predict_objects: Whether to extract objects.
"""
# Create output directory if necessary
if not os.path.exists(output):
os.mkdir(output)
@@ -286,30 +301,36 @@
confidences=confidence_score,
attentions=attention_map,
attention_level=attention_map_level,
extract_objects=predict_objects,
word_separators=word_separators,
line_separators=line_separators,
)
text = prediction["text"][0]
result = {"text": text}
result = {}
result["text"] = prediction["text"][0]
result["objects"] = prediction["objects"]
# Return extracted objects (coordinates, text, confidence)
if predict_objects:
result["objects"] = prediction["objects"][0]
# Average character-based confidence scores
# Return mean confidence score
if confidence_score:
result["confidences"] = {}
char_confidences = prediction["confidences"][0]
result["confidences"] = {"total": np.around(np.mean(char_confidences), 2)}
if "word" in confidence_score_levels:
word_probs = compute_prob_by_separator(
text, char_confidences, word_separators
)
result["confidences"].update({"word": round_floats(word_probs)})
if "line" in confidence_score_levels:
line_probs = compute_prob_by_separator(
text, char_confidences, line_separators
result["confidences"]["total"] = np.around(np.mean(char_confidences), 2)
for level in confidence_score_levels:
result["confidences"][level] = []
texts, confidences, _ = split_text_and_confidences(
prediction["text"][0],
char_confidences,
level,
word_separators,
line_separators,
)
result["confidences"].update({"line": round_floats(line_probs)})
if "char" in confidence_score_levels:
result["confidences"].update({"char": round_floats(char_confidences)})
for text, conf in zip(texts, confidences):
result["confidences"][level].append({"text": text, "confidence": conf})
# Save gif with attention map
if attention_map:
@@ -324,7 +345,7 @@
scale=attention_map_scale,
word_separators=word_separators,
line_separators=line_separators,
output_polygons=predict_objects,
display_polygons=predict_objects,
outname=gif_filename,
)
result["attention_gif"] = gif_filename