Thibault Lavigne
--- a/dan/predict/attention.py

+ 252

− 16
+++ b/dan/predict/attention.py

+ 252

− 16
 @@ -6,18 +6,43 @@ import numpy as np
 from PIL import Image

 from dan import logger
+from dan.utils import round_floats


-def split_text(text, level, word_separators, line_separators):
+def parse_delimiters(delimiters):
+    """
+    Build one compiled regex matching any of the given delimiters.
+    :param delimiters: Iterable of delimiter strings. They are joined with "|" without escaping, so each one is interpreted as a regex fragment.
+    Returns the compiled alternation pattern.
+    """
+    alternation = "|".join(delimiters)
+    return re.compile(alternation)
+
+
+def compute_prob_by_separator(characters, probabilities, separator):
+    """
+    Split text and confidences using separators and return the text pieces with their average confidence scores.
+    :param characters: predicted text (string), one character per probability.
+    :param probabilities: list of character probabilities, aligned with `characters`.
+    :param separator: compiled regex for separators. Use parse_delimiters(["\n", " "]) for word confidences and parse_delimiters(["\n"]) for line confidences.
+    Returns a tuple (texts, probs): the non-empty text pieces found between separators and the mean probability of each piece.
+    """
+    # Collect the (start, end) spans lying between separator matches.
+    # The compiled separator is used as-is: embedding its source in a
+    # character class would turn the "|" of the alternation into a separator
+    # and break multi-character delimiters into single characters.
+    spans = []
+    cursor = 0
+    for match in separator.finditer(characters):
+        # Zero-width matches do not separate anything
+        if match.end() == match.start():
+            continue
+        # Skip empty pieces (leading or consecutive separators)
+        if match.start() > cursor:
+            spans.append((cursor, match.start()))
+        cursor = match.end()
+    if cursor < len(characters):
+        spans.append((cursor, len(characters)))
+
+    # Iterate over text pieces and compute mean confidence
+    probs = [np.mean(probabilities[start:end]) for (start, end) in spans]
+    texts = [characters[start:end] for (start, end) in spans]
+    return texts, probs
+
+
+def split_text(text: str, level: str, word_separators, line_separators):
    """
    Split text into a list of characters, word, or lines.
    :param text: Text prediction from DAN
-    :param level: Level to visualize (char, word, line)
+    :param level: Level to visualize from [char, word, line]
+    :param word_separators: List of word separators
+    :param line_separators: List of line separators
    """
-    # split into characters
    if level == "char":
        text_split = list(text)
        offset = 0
+
    # split into words
    elif level == "word":
        text_split = re.split(word_separators, text)
 @@ -31,13 +56,89 @@ def split_text(text, level, word_separators, line_separators):
    return text_split, offset


-def compute_coverage(text: str, max_value: float, offset: int, attentions):
+def split_text_and_confidences(
+    text, confidences, level, word_separators, line_separators
+):
+    """
+    Split text into a list of characters, words or lines with corresponding confidence scores
+    :param text: Text prediction from DAN
+    :param confidences: Character confidences
+    :param level: Level to visualize from [char, word, line]
+    :param word_separators: Compiled regex of word separators, as expected by compute_prob_by_separator
+    :param line_separators: Compiled regex of line separators, as expected by compute_prob_by_separator
+    Returns a tuple (texts, confidences, offset) where offset is 0 at character level and 1 at word and line levels
+    Raises ValueError when level is not one of 'char', 'word' or 'line'
+    """
+    if level == "char":
+        texts = list(text)
+        # One confidence per character: nothing to aggregate
+        probs = list(confidences)
+        offset = 0
+    elif level == "word":
+        texts, probs = compute_prob_by_separator(text, confidences, word_separators)
+        offset = 1
+    elif level == "line":
+        texts, probs = compute_prob_by_separator(text, confidences, line_separators)
+        offset = 1
+    else:
+        # Fail explicitly instead of reaching the return statement with unbound names
+        message = "Level should be either 'char', 'word', or 'line'"
+        logger.error(message)
+        raise ValueError(message)
+    return texts, round_floats(probs), offset
+
+
+def get_predicted_polygons_with_confidence(
+    text,
+    weights,
+    confidences,
+    level,
+    height,
+    width,
+    threshold_method="otsu",
+    threshold_value=0,
+    word_separators=("\n", " "),
+    line_separators=("\n",),
+):
+    """
+    Returns the polygons of each object of the current prediction
+    :param text: Text predicted by DAN
+    :param weights: Attention weights of size (n_char, feature_height, feature_width)
+    :param confidences: Character confidences
+    :param level: Level to display (must be in [char, word, line])
+    :param height: Original image height
+    :param width: Original image width
+    :param threshold_method: Thresholding method. Should be in ["otsu", "simple"]
+    :param threshold_value: Thresholding value for the "simple" method.
+    :param word_separators: List of word separators, or an already compiled regex
+    :param line_separators: List of line separators, or an already compiled regex
+    Returns one dict per text piece with the keys "text" and "text_confidence", plus the keys produced by get_polygon when a contour was found
+    """
+    # The splitting helpers expect compiled regexes: compile plain lists of delimiters first
+    if isinstance(word_separators, (list, tuple)):
+        word_separators = parse_delimiters(word_separators)
+    if isinstance(line_separators, (list, tuple)):
+        line_separators = parse_delimiters(line_separators)
+
+    # Split text into characters, words or lines
+    text_list, confidence_list, offset = split_text_and_confidences(
+        text, confidences, level, word_separators, line_separators
+    )
+
+    max_value = weights.sum(0).max()
+    polygons = []
+    # Index of the first character of the current text piece in the prediction
+    # NOTE(review): the bookkeeping assumes exactly `offset` separator characters
+    # between consecutive pieces; leading or repeated separators would shift it.
+    start_index = 0
+    for text_piece, confidence in zip(text_list, confidence_list):
+        polygon, _ = get_polygon(
+            text_piece,
+            max_value,
+            start_index,
+            weights,
+            threshold_method=threshold_method,
+            threshold_value=threshold_value,
+            size=(width, height),
+        )
+        # Move past this piece (and its separator) only once its polygon is extracted
+        start_index += len(text_piece) + offset
+
+        polygon["text"] = text_piece
+        polygon["text_confidence"] = confidence
+        polygons.append(polygon)
+    return polygons
+
+
+def compute_coverage(text: str, max_value: float, offset: int, attentions, size: tuple):
    """
    Aggregates attention maps for the current text piece (char, word, line)
    :param text: Text piece selected with offset after splitting DAN prediction
    :param max_value: Maximum "attention intensity" for parts of a text piece, used for normalization
    :param offset: Offset value to get the relevant part of text piece
    :param attentions: Attention weights of size (n_char, feature_height, feature_width)
+    :param size: Target size (width, height) to resize the coverage vector
    """
    _, height, width = attentions.shape

 @@ -49,9 +150,130 @@ def compute_coverage(text: str, max_value: float, offset: int, attentions):

    # Normalize coverage vector
    coverage_vector = (coverage_vector / max_value * 255).astype(np.uint8)
+
+    # Resize it
+    if size:
+        coverage_vector = cv2.resize(coverage_vector, size)
+
    return coverage_vector


+def blend_coverage(coverage_vector, image, mask, scale):
+    """
+    Blends current coverage_vector over original image, used to make an attention map.
+    :param coverage_vector: Aggregated attention weights of the current text piece, resized to image. size: (image_height, image_width)
+    :param image: Input image in PIL format
+    :param mask: Mask of the image (of any color)
+    :param scale: Scaling factor for the output gif image
+    Returns the blended PIL image, resized by `scale`
+    """
+    height, width = coverage_vector.shape
+
+    # Blend coverage vector with original image: the coverage goes in the red channel
+    blank_array = np.zeros((height, width), dtype=np.uint8)
+    coverage_vector = Image.fromarray(
+        np.stack([coverage_vector, blank_array, blank_array], axis=2), "RGB"
+    )
+    blend = Image.composite(image, coverage_vector, mask)
+
+    # Resize to save time
+    # Image.ANTIALIAS was an alias of Image.LANCZOS and no longer exists in Pillow >= 10
+    blend = blend.resize((int(width * scale), int(height * scale)), Image.LANCZOS)
+    return blend
+
+
+def compute_contour_metrics(coverage_vector, contour):
+    """
+    Score a contour from the attention it encloses.
+    :param coverage_vector: Aggregated attention weights of the current text piece; the contour mask is created with the same shape
+    :param contour: Contour of the current attention blob
+    Returns a tuple (max_value, score): the highest coverage value inside the contour divided by 255, and that value multiplied by the contour area
+    """
+    # Rasterize the filled contour on a blank canvas shaped like the coverage vector
+    contour_mask = np.zeros(coverage_vector.shape, dtype=np.uint8)
+    cv2.drawContours(contour_mask, [contour], -1, 255, -1)
+
+    contour_area = cv2.contourArea(contour)
+
+    # Pixels outside the contour are zeroed before taking the maximum
+    inside = np.where(contour_mask > 0, coverage_vector, 0)
+    peak = inside.max() / 255
+    return peak, peak * contour_area
+
+
+def polygon_to_bbx(polygon):
+    """
+    Convert a polygon to its upright bounding box.
+    :param polygon: Array of points, passed as-is to cv2.boundingRect
+    Returns the four corners as [x, y] pairs: top-left, top-right, bottom-right, bottom-left
+    """
+    left, top, box_width, box_height = cv2.boundingRect(polygon)
+    right = left + box_width
+    bottom = top + box_height
+    return [[left, top], [right, top], [right, bottom], [left, bottom]]
+
+
+def threshold(mask, threshold_method="otsu", threshold_value=0):
+    """
+    Threshold a grayscale mask.
+    :param mask: a grayscale image (np.array)
+    :param threshold_method: method to be used for thresholding. Should be in ["otsu", "simple"].
+    :param threshold_value: the threshold value used for binarization (used for the "simple" method).
+    Returns the binarized mask as a uint8 array with values 0 and 255
+    Raises NotImplementedError when threshold_method is not supported
+    """
+    if threshold_method == "simple":
+        # Pixels strictly above the threshold become 255, the others 0
+        return np.where(mask > threshold_value, 255, 0).astype(np.uint8)
+
+    if threshold_method == "otsu":
+        # Blur and apply Otsu thresholding
+        blur = cv2.GaussianBlur(mask, (15, 15), 0)
+        _, bin_mask = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+
+        # Apply a horizontal dilation whose kernel width is 1% of the mask width.
+        # The width is clamped to 1: masks narrower than 100 pixels would
+        # otherwise request a zero-width structuring element.
+        min_kernel = 1
+        max_kernel = max(min_kernel, mask.shape[1] // 100)
+        kernel_width = cv2.getStructuringElement(
+            cv2.MORPH_CROSS, (max_kernel, min_kernel)
+        )
+        dilated = cv2.dilate(bin_mask, kernel_width, iterations=3)
+        return np.asarray(dilated, dtype=np.uint8)
+
+    raise NotImplementedError(f"Method {threshold_method} is not implemented.")
+
+
+def get_polygon(
+    text, max_value, offset, weights, threshold_method, threshold_value, size=None
+):
+    """
+    Gets polygon associated with element of current text_piece, indexed by offset
+    :param text: Text piece selected with offset after splitting DAN prediction
+    :param max_value: Maximum "attention intensity" for parts of a text piece, used for normalization
+    :param offset: Offset value to get the relevant part of text piece
+    :param weights: Attention weights, forwarded to compute_coverage
+    :param threshold_method: Binarization method to use (should be in ["simple", "otsu"])
+    :param threshold_value: Threshold value used for the "simple" binarization method
+    :param size: Target size (width, height) to resize the coverage vector
+    Returns a tuple (polygon, contour): a dict with the keys "confidence" and "polygon" (bounding box corners), and the bounding box as a contour array of shape (4, 1, 2). Returns ({}, None) when no contour is detected
+    """
+    # Compute coverage vector
+    coverage_vector = compute_coverage(text, max_value, offset, weights, size=size)
+
+    # Generate a binary image for the current channel.
+    bin_mask = threshold(
+        coverage_vector,
+        threshold_method=threshold_method,
+        threshold_value=threshold_value,
+    )
+
+    # Detect the objects contours
+    contours, _ = cv2.findContours(bin_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+
+    if not contours:
+        return {}, None
+
+    # Select best contour: the one with the highest score
+    metrics = [compute_contour_metrics(coverage_vector, cnt) for cnt in contours]
+    confidences, scores = map(list, zip(*metrics))
+    best_index = np.argmax(scores)
+    best_contour = contours[best_index]
+    confidence = round(confidences[best_index] / max_value, 2)
+
+    # Format for JSON
+    # The contour is passed unsqueezed: squeezing a single-point contour
+    # yields a 1-D array that cv2.boundingRect does not accept.
+    coord = polygon_to_bbx(best_contour)
+    polygon = {
+        "confidence": confidence,
+        "polygon": coord,
+    }
+    simplified_contour = np.expand_dims(np.array(coord, dtype=np.int32), axis=1)
+
+    return polygon, simplified_contour
+
+
 def plot_attention(
    image,
    text,
 @@ -59,8 +281,11 @@ def plot_attention(
    level,
    scale,
    outname,
+    threshold_method="otsu",
+    threshold_value=0,
    word_separators=["\n", " "],
    line_separators=["\n"],
+    display_polygons=False,
 ):
    """
    Create a gif by blending attention maps to the image for each text piece (char, word or line)
 @@ -70,6 +295,9 @@ def plot_attention(
    :param level: Level to display (must be in [char, word, line])
    :param scale: Scaling factor for the output gif image
    :param outname: Name of the gif image
+    :param word_separators: List of word separators
+    :param line_separators: List of line separators
+    :param display_polygons: Whether to plot extracted polygons
    """

    height, width, _ = image.shape
 @@ -84,27 +312,35 @@ def plot_attention(

    # Iterate on characters, words or lines
    tot_len = 0
-
    max_value = weights.sum(0).max()

    for text_piece in text_list:
        # Accumulate weights for the current word/line and resize to original image size
-        coverage_vector = compute_coverage(text_piece, max_value, tot_len, weights)
-        coverage_vector = cv2.resize(coverage_vector, (width, height))
+        coverage_vector = compute_coverage(
+            text_piece, max_value, tot_len, weights, (width, height)
+        )
+
+        # Get polygons if flag is set:
+        if display_polygons:
+            # draw the contour
+            _, contour = get_polygon(
+                text_piece,
+                max_value,
+                tot_len,
+                weights,
+                threshold_method=threshold_method,
+                threshold_value=threshold_value,
+                size=(width, height),
+            )
+
+            if contour is not None:
+                cv2.drawContours(coverage_vector, [contour], 0, (255), 5)

        # Keep track of text length
        tot_len += len(text_piece) + offset

        # Blend coverage vector with original image
-        blank_array = np.zeros((height, width)).astype(np.uint8)
-        coverage_vector = Image.fromarray(
-            np.stack([coverage_vector, blank_array, blank_array], axis=2), "RGB"
-        )
-        blend = Image.composite(image, coverage_vector, mask)
-
-        # Resize to save time
-        blend = blend.resize((int(width * scale), int(height * scale)), Image.ANTIALIAS)
-        attention_map.append(blend)
+        attention_map.append(blend_coverage(coverage_vector, image, mask, scale))

    attention_map[0].save(
        outname,