Skip to content
Snippets Groups Projects

Add predicted objects to predict command

Merged Thibault Lavigne requested to merge 36-add-predicted-objects-to-predict-command into main
All threads resolved!
1 file
+ 2
3
Compare changes
  • Side-by-side
  • Inline
+ 252
16
@@ -6,18 +6,43 @@ import numpy as np
from PIL import Image
from dan import logger
from dan.utils import round_floats
def split_text(text, level, word_separators, line_separators):
def parse_delimiters(delimiters):
    """
    Build a single regular expression matching any of the given delimiters.
    :param delimiters: Iterable of delimiter strings. They are joined with "|" as-is (no escaping is applied).
    Returns the compiled regular expression.
    """
    alternation = r"|".join(delimiters)
    return re.compile(alternation)
def compute_prob_by_separator(characters, probabilities, separator):
    """
    Split text and confidences using separators and return the text pieces with their average confidence scores.
    :param characters: Predicted text, as a string.
    :param probabilities: List of character probabilities, indexed like characters.
    :param separator: Compiled regex for separators. Use parse_delimiters(["\n", " "]) for word confidences and parse_delimiters(["\n"]) for line confidences.
    Returns a tuple (texts, probs): the non-empty text pieces found between separators and the mean probability of each piece.
    """
    # Collect the (start, end) spans located between separator matches.
    # The separator regex is applied directly instead of being embedded in a
    # negated character class: inside "[^...]" the "|" of the alternation is a
    # literal character, so it would also split on "|" and could not handle
    # delimiters longer than one character.
    spans = []
    cursor = 0
    for match in separator.finditer(characters):
        if match.end() == match.start():
            # An empty match does not separate anything
            continue
        if match.start() > cursor:
            spans.append((cursor, match.start()))
        cursor = match.end()
    # Text remaining after the last separator
    if cursor < len(characters):
        spans.append((cursor, len(characters)))

    # Iterate over text pieces and compute mean confidence
    probs = [np.mean(probabilities[start:end]) for start, end in spans]
    texts = [characters[start:end] for start, end in spans]
    return texts, probs
def split_text(text: str, level: str, word_separators, line_separators):
"""
Split text into a list of characters, word, or lines.
:param text: Text prediction from DAN
:param level: Level to visualize (char, word, line)
:param level: Level to visualize from [char, word, line]
:param word_separators: List of word separators
:param line_separators: List of line separators
"""
# split into characters
if level == "char":
text_split = list(text)
offset = 0
# split into words
elif level == "word":
text_split = re.split(word_separators, text)
@@ -31,13 +56,89 @@ def split_text(text, level, word_separators, line_separators):
return text_split, offset
def compute_coverage(text: str, max_value: float, offset: int, attentions):
def split_text_and_confidences(
    text, confidences, level, word_separators, line_separators
):
    """
    Split text into a list of characters, words or lines with corresponding confidences scores
    :param text: Text prediction from DAN
    :param confidences: Character confidences
    :param level: Level to visualize from [char, word, line]
    :param word_separators: Compiled regex of word separators, forwarded to compute_prob_by_separator
    :param line_separators: Compiled regex of line separators, forwarded to compute_prob_by_separator
    Returns a tuple (texts, confidences, offset): the text pieces, their confidences passed through round_floats, and the offset to add after each piece when tracking positions in the text (0 for char, 1 for word and line).
    Raises ValueError if level is not one of the supported values.
    """
    if level == "char":
        texts = list(text)
        # Each character keeps its own confidence; without this assignment
        # the return statement below would fail on an unbound variable.
        probs = list(confidences)
        offset = 0
    elif level == "word":
        texts, probs = compute_prob_by_separator(text, confidences, word_separators)
        offset = 1
    elif level == "line":
        texts, probs = compute_prob_by_separator(text, confidences, line_separators)
        offset = 1
    else:
        # Fail explicitly instead of logging and then crashing on unbound locals
        message = "Level should be either 'char', 'word', or 'line'"
        logger.error(message)
        raise ValueError(message)
    return texts, round_floats(probs), offset
def get_predicted_polygons_with_confidence(
    text,
    weights,
    confidences,
    level,
    height,
    width,
    threshold_method="otsu",
    threshold_value=0,
    word_separators=["\n", " "],
    line_separators=["\n"],
):
    """
    Returns the polygons of each object of the current prediction
    :param text: Text predicted by DAN
    :param weights: Attention weights of size (n_char, feature_height, feature_width)
    :param confidences: Character confidences
    :param level: Level to display (must be in [char, word, line])
    :param height: Original image height
    :param width: Original image width
    :param threshold_method: Thresholding method. Should be in ["otsu", "simple"]
    :param threshold_value: Thresholding value for the "simple" method.
    :param word_separators: Word separators, either a list of strings or a regex compiled with parse_delimiters
    :param line_separators: Line separators, either a list of strings or a regex compiled with parse_delimiters
    Returns a list with one dictionary per text piece, holding the keys produced by get_polygon (none when no contour was found) plus "text" and "text_confidence".
    """
    # The splitting helpers expect compiled patterns, so plain lists of
    # separators (such as the default values) are compiled first.
    if isinstance(word_separators, (list, tuple)):
        word_separators = parse_delimiters(word_separators)
    if isinstance(line_separators, (list, tuple)):
        line_separators = parse_delimiters(line_separators)

    # Split text into characters, words or lines
    text_list, confidence_list, offset = split_text_and_confidences(
        text, confidences, level, word_separators, line_separators
    )

    max_value = weights.sum(0).max()
    polygons = []
    start_index = 0
    for text_piece, confidence in zip(text_list, confidence_list):
        # start_index is the position of the current piece in the text: it
        # selects the attention maps of this piece and must only be advanced
        # once the polygon has been computed.
        polygon, _ = get_polygon(
            text_piece,
            max_value,
            start_index,
            weights,
            threshold_method=threshold_method,
            threshold_value=threshold_value,
            size=(width, height),
        )
        start_index += len(text_piece) + offset
        polygon["text"] = text_piece
        polygon["text_confidence"] = confidence
        polygons.append(polygon)
    return polygons
def compute_coverage(text: str, max_value: float, offset: int, attentions, size: tuple):
"""
Aggregates attention maps for the current text piece (char, word, line)
:param text: Text piece selected with offset after splitting DAN prediction
:param max_value: Maximum "attention intensity" for parts of a text piece, used for normalization
:param offset: Offset value to get the relevant part of text piece
:param attentions: Attention weights of size (n_char, feature_height, feature_width)
:param size: Target size (width, height) to resize the coverage vector
"""
_, height, width = attentions.shape
@@ -49,9 +150,130 @@ def compute_coverage(text: str, max_value: float, offset: int, attentions):
# Normalize coverage vector
coverage_vector = (coverage_vector / max_value * 255).astype(np.uint8)
# Resize it
if size:
coverage_vector = cv2.resize(coverage_vector, size)
return coverage_vector
def blend_coverage(coverage_vector, image, mask, scale):
    """
    Blends current coverage_vector over original image, used to make an attention map.
    :param coverage_vector: Aggregated attention weights of the current text piece, resized to image. size: (image_height, image_width)
    :param image: Input image in PIL format
    :param mask: Mask of the image (of any color)
    :param scale: Scaling factor for the output gif image
    Returns the blended image in PIL format, resized by the scaling factor.
    """
    height, width = coverage_vector.shape

    # Blend coverage vector with original image
    # The coverage fills the first (red) channel, the other two stay empty
    blank_array = np.zeros((height, width), dtype=np.uint8)
    coverage_vector = Image.fromarray(
        np.stack([coverage_vector, blank_array, blank_array], axis=2), "RGB"
    )
    blend = Image.composite(image, coverage_vector, mask)

    # Resize to save time
    # Image.LANCZOS is the same filter as Image.ANTIALIAS, which is no longer
    # available in recent Pillow releases.
    blend = blend.resize((int(width * scale), int(height * scale)), Image.LANCZOS)
    return blend
def compute_contour_metrics(coverage_vector, contour):
    """
    Compute the maximum coverage value inside a contour and a score weighting this value by the contour's area.
    :param coverage_vector: Aggregated attention weights of the current text piece, resized to image
    :param contour: Contour of the current attention blob
    Returns a tuple (max_value, score): the maximum coverage value found inside the contour divided by 255, and this value multiplied by the contour's area.
    """
    # Rasterize the filled contour to know which pixels it covers
    contour_mask = np.zeros(coverage_vector.shape, dtype=np.uint8)
    cv2.drawContours(contour_mask, [contour], -1, (255), -1)

    # Pixels outside the contour are replaced by 0 before taking the maximum
    inside_values = np.where(contour_mask > 0, coverage_vector, 0)
    peak = inside_values.max() / 255
    return peak, peak * cv2.contourArea(contour)
def polygon_to_bbx(polygon):
    """
    Convert a polygon into the four corners of its bounding rectangle.
    :param polygon: Points of the polygon, in a format accepted by cv2.boundingRect
    Returns the corners as a list of [x, y] pairs, in the order (x, y), (x + w, y), (x + w, y + h), (x, y + h).
    """
    left, top, box_width, box_height = cv2.boundingRect(polygon)
    right = left + box_width
    bottom = top + box_height
    return [[left, top], [right, top], [right, bottom], [left, bottom]]
def threshold(mask, threshold_method="otsu", threshold_value=0):
    """
    Threshold a grayscale mask.
    :param mask: a grayscale image (np.array)
    :param threshold_method: method to be used for thresholding. Should be in ["otsu", "simple"].
    :param threshold_value: the threshold value used for binarization (used for the "simple" method).
    Returns the binarized mask as a np.uint8 array with values 0 and 255.
    Raises NotImplementedError if the thresholding method is not supported.
    """
    if threshold_method == "simple":
        # Pixels strictly above the threshold are set to 255, the others to 0
        return np.where(mask > threshold_value, 255, 0).astype(np.uint8)

    if threshold_method == "otsu":
        min_kernel = 1
        # The dilation kernel is 1% of the mask width. It is clamped to at
        # least one pixel: masks narrower than 100 pixels would otherwise ask
        # for a zero-width structuring element.
        max_kernel = max(min_kernel, mask.shape[1] // 100)

        # Blur and apply Otsu thresholding
        blur = cv2.GaussianBlur(mask, (15, 15), 0)
        _, bin_mask = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        # Apply dilation with a horizontal kernel (width max_kernel, height min_kernel)
        kernel_width = cv2.getStructuringElement(
            cv2.MORPH_CROSS, (max_kernel, min_kernel)
        )
        dilated = cv2.dilate(bin_mask, kernel_width, iterations=3)
        return np.asarray(dilated, dtype=np.uint8)

    raise NotImplementedError(f"Method {threshold_method} is not implemented.")
def get_polygon(
    text, max_value, offset, weights, threshold_method, threshold_value, size=None
):
    """
    Gets polygon associated with element of current text_piece, indexed by offset
    :param text: Text piece selected with offset after splitting DAN prediction
    :param max_value: Maximum "attention intensity" for parts of a text piece, used for normalization
    :param offset: Offset value to get the relevant part of text piece
    :param weights: Attention weights, forwarded to compute_coverage
    :param threshold_method: Binarization method to use (should be in ["simple", "otsu"])
    :param threshold_value: Threshold value used for the "simple" binarization method
    :param size: Target size (width, height) to resize the coverage vector
    Returns a tuple (polygon, contour). polygon is a dictionary with the keys "confidence" and "polygon" (the four corners of the bounding box), contour is the same bounding box as an int32 array of shape (4, 1, 2). When no contour is detected, returns ({}, None).
    """
    # Compute coverage vector
    coverage_vector = compute_coverage(text, max_value, offset, weights, size=size)

    # Generate a binary image for the current channel.
    bin_mask = threshold(
        coverage_vector,
        threshold_method=threshold_method,
        threshold_value=threshold_value,
    )

    # Detect the objects contours
    contours, _ = cv2.findContours(bin_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return {}, None

    # Select best contour: the one with the highest score
    metrics = [compute_contour_metrics(coverage_vector, cnt) for cnt in contours]
    confidences, scores = map(list, zip(*metrics))
    best_index = int(np.argmax(scores))
    best_contour = contours[best_index]
    confidence = round(confidences[best_index] / max_value, 2)

    # Format for JSON
    # reshape(-1, 2) always yields a list of (x, y) points, whereas squeezing
    # a contour made of a single point would collapse it to a flat pair.
    coord = polygon_to_bbx(best_contour.reshape(-1, 2))
    polygon = {
        "confidence": confidence,
        "polygon": coord,
    }
    # Put the bounding box back into the (n_points, 1, 2) contour layout
    simplified_contour = np.expand_dims(np.array(coord, dtype=np.int32), axis=1)
    return polygon, simplified_contour
def plot_attention(
image,
text,
@@ -59,8 +281,11 @@ def plot_attention(
level,
scale,
outname,
threshold_method="otsu",
threshold_value=0,
word_separators=["\n", " "],
line_separators=["\n"],
display_polygons=False,
):
"""
Create a gif by blending attention maps to the image for each text piece (char, word or line)
@@ -70,6 +295,9 @@ def plot_attention(
:param level: Level to display (must be in [char, word, line])
:param scale: Scaling factor for the output gif image
:param outname: Name of the gif image
:param word_separators: List of word separators
:param line_separators: List of line separators
:param display_polygons: Whether to plot extracted polygons
"""
height, width, _ = image.shape
@@ -84,27 +312,35 @@ def plot_attention(
# Iterate on characters, words or lines
tot_len = 0
max_value = weights.sum(0).max()
for text_piece in text_list:
# Accumulate weights for the current word/line and resize to original image size
coverage_vector = compute_coverage(text_piece, max_value, tot_len, weights)
coverage_vector = cv2.resize(coverage_vector, (width, height))
coverage_vector = compute_coverage(
text_piece, max_value, tot_len, weights, (width, height)
)
# Get polygons if flag is set:
if display_polygons:
# draw the contour
_, contour = get_polygon(
text_piece,
max_value,
tot_len,
weights,
threshold_method=threshold_method,
threshold_value=threshold_value,
size=(width, height),
)
if contour is not None:
cv2.drawContours(coverage_vector, [contour], 0, (255), 5)
# Keep track of text length
tot_len += len(text_piece) + offset
# Blend coverage vector with original image
blank_array = np.zeros((height, width)).astype(np.uint8)
coverage_vector = Image.fromarray(
np.stack([coverage_vector, blank_array, blank_array], axis=2), "RGB"
)
blend = Image.composite(image, coverage_vector, mask)
# Resize to save time
blend = blend.resize((int(width * scale), int(height * scale)), Image.ANTIALIAS)
attention_map.append(blend)
attention_map.append(blend_coverage(coverage_vector, image, mask, scale))
attention_map[0].save(
outname,
Loading