Commit 04b15f97 authored by Solene Tarride

Predict polygons but preserve old behavior

parent da7e8b3f
1 merge request !76: Add predicted objects to predict command
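
For context, here is a minimal, hypothetical usage sketch of the behaviour this commit introduces: an extract_objects flag on DAN.predict that also returns polygons with text and confidence per character, word or line. It assumes an instantiated DAN model and already preprocessed inputs; the exact call signature may differ from the released API.

# Hypothetical sketch, based only on names appearing in this diff.
prediction = model.predict(
    input_tensor,                  # preprocessed image tensor (assumed prepared elsewhere)
    input_sizes,                   # (height, width) of the original image(s)
    confidences=True,
    attentions=True,
    attention_level="word",
    extract_objects=True,          # new flag introduced by this commit
    word_separators=["\n", " "],
    line_separators=["\n"],
)
# prediction["objects"][0] would then be a list of dicts such as:
# {"confidence": 0.85, "polygon": [[x, y], ...], "text": "word", "text_confidence": 0.92}
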
@@ -6,38 +6,17 @@ import numpy as np
from PIL import Image
from dan import logger
from dan.utils import round_floats
def split_text(text: str, level: str, word_separators, line_separators):
"""
Split text into a list of characters, words, or lines.
:param text: Text prediction from DAN
:param level: Level to visualize from [char, word, line]
"""
# split into characters
if level == "char":
text_split = list(text)
offset = 0
# split into words
elif level == "word":
text_split = re.split(word_separators, text)
offset = 1
# split into lines
elif level == "line":
text_split = re.split(line_separators, text)
offset = 1
else:
logger.error("Level should be either 'char', 'word', or 'line'")
return text_split, offset
def compute_coverage(text: str, max_value: float, offset: int, attentions):
def compute_coverage(text: str, max_value: float, offset: int, attentions, size: tuple):
"""
Aggregates attention maps for the current text piece (char, word, line)
:param text: Text piece selected with offset after splitting DAN prediction
:param max_value: Maximum "attention intensity" for parts of a text piece, used for normalization
:param offset: Offset value to get the relevant part of text piece
:param attentions: Attention weights of size (n_char, feature_height, feature_width)
:param size: Target size (width, height) to resize the coverage vector
"""
_, height, width = attentions.shape
@@ -49,6 +28,11 @@ def compute_coverage(text: str, max_value: float, offset: int, attentions):
# Normalize coverage vector
coverage_vector = (coverage_vector / max_value * 255).astype(np.uint8)
# Resize it
if size:
coverage_vector = cv2.resize(coverage_vector, size)
return coverage_vector
@@ -74,9 +58,82 @@ def blend_coverage(coverage_vector, image, mask, scale):
return blend
def get_predicted_polygons(
def parse_delimiters(delimiters):
return re.compile(r"|".join(delimiters))
def compute_prob_by_separator(characters, probabilities, separator):
"""
Split text and confidences using separators and return a list of average confidence scores.
:param characters: list of characters.
:param probabilities: list of character probabilities.
:param separator: compiled regex of separators. Use parse_delimiters(["\n", " "]) for word confidences and parse_delimiters(["\n"]) for line confidences.
Returns a tuple of (text pieces, confidence scores).
"""
# match anything except separators, get start and end index
pattern = re.compile(f"[^{separator.pattern}]+")
matches = [(m.start(), m.end()) for m in re.finditer(pattern, characters)]
# Iterate over text pieces and compute mean confidence
probs = [np.mean(probabilities[start:end]) for (start, end) in matches]
texts = [characters[start:end] for (start, end) in matches]
return texts, probs
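# Illustrative example (values made up; assumes the parse_delimiters helper above):
# texts, probs = compute_prob_by_separator(
#     "the cat", [0.9, 0.8, 0.9, 1.0, 0.7, 0.8, 0.9], parse_delimiters(["\n", " "])
# )
# matches "the" (indices 0..3) and "cat" (indices 4..7), returning (["the", "cat"], [~0.87, 0.8])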
def split_text(text: str, level: str, word_separators, line_separators):
"""
Split text into a list of characters, words, or lines.
:param text: Text prediction from DAN
:param level: Level to visualize from [char, word, line]
:param word_separators: List of word separators
:param line_separators: List of line separators
"""
if level == "char":
text_split = list(text)
offset = 0
# split into words
elif level == "word":
text_split = re.split(word_separators, text)
offset = 1
# split into lines
elif level == "line":
text_split = re.split(line_separators, text)
offset = 1
else:
logger.error("Level should be either 'char', 'word', or 'line'")
return text_split, offset
def split_text_and_confidences(
text, confidences, level, word_separators, line_separators
):
"""
Split text into a list of characters, words or lines with corresponding confidence scores
:param text: Text prediction from DAN
:param confidences: Character confidences
:param level: Level to visualize from [char, word, line]
:param word_separators: List of word separators
:param line_separators: List of line separators
"""
if level == "char":
texts = list(text)
probs = confidences
offset = 0
elif level == "word":
texts, probs = compute_prob_by_separator(text, confidences, word_separators)
offset = 1
elif level == "line":
texts, probs = compute_prob_by_separator(text, confidences, line_separators)
offset = 1
else:
logger.error("Level should be either 'char', 'word', or 'line'")
return texts, round_floats(probs), offset
def get_predicted_polygons_with_confidence(
text,
weights,
confidences,
level,
height,
width,
@@ -87,63 +144,85 @@ def get_predicted_polygons(
Returns the polygons of each object of the current prediction
:param text: Text predicted by DAN
:param weights: Attention weights of size (n_char, feature_height, feature_width)
:param confidences: Character confidences
:param level: Level to display (must be in [char, word, line])
:param height: Original image height
:param width: Original image width
:param word_separators: List of word separators
:param line_separators: List of line separators
"""
# Split text into characters, words or lines
text_list, offset = split_text(text, level, word_separators, line_separators)
max_value = weights.sum(0).max()
text_list, confidence_list, offset = split_text_and_confidences(
text, confidences, level, word_separators, line_separators
)
# Set offset based on current text_piece to be used.
return [
get_polygon(
text_piece, level, offset * n_offset, max_value, weights, height, width
max_value = weights.sum(0).max()
polygons = []
start_index = 0
for text_piece, confidence in zip(text_list, confidence_list):
start_index += len(text_piece) + offset
polygon = get_polygon(
text_piece, max_value, offset, weights, size=(width, height)
)
for n_offset, text_piece in enumerate(text_list)
]
polygon["text"] = text_piece
polygon["text_confidence"] = confidence
polygons.append(polygon)
return polygons
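# Illustrative shape of the returned list (values made up): one entry per text piece at the
# requested level, e.g. {"confidence": 0.82, "polygon": [[x1, y1], [x2, y2], ...],
# "text": "piece", "text_confidence": 0.9}.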
def compute_contour_metrics(coverage_vector, contour):
"""
Compute the contour's area and the maximum coverage value inside it.
:param coverage_vector: Aggregated attention weights of the current text piece, resized to the image size (image_height, image_width)
:param contour: Contour of the current attention blob
"""
# draw the contour zone
mask = np.zeros(coverage_vector.shape, dtype=np.uint8)
cv2.drawContours(mask, [contour], -1, (255), -1)
max_value = (
np.where(mask > 0, coverage_vector, 0).max() / 255
) # cv2.max(coverage_vector, mask=mask)[0] / 255.
area = cv2.contourArea(contour)
return max_value, max_value * area
def get_polygon(text_piece, level, offset, max_value, weights, height, width):
def get_polygon(text, max_value, offset, weights, size=None, return_contours=False):
"""
Gets polygon associated with element of current text_piece, indexed by offset
:param text_piece: Current text element
:param level: Level to display (must be in [char, word, line])
:param offset: Offset value to get the relevant part of text piece
:param text: Text piece selected with offset after splitting DAN prediction
:param max_value: Maximum "attention intensity" for parts of a text piece, used for normalization
:param weights: Attention weights of size (n_char, feature_height, feature_width)
:param height: Original image height
:param width: Original image width
:param offset: Offset value to get the relevant part of text piece
:param attentions: Attention weights of size (n_char, feature_height, feature_width)
:param size: Target size (width, height) to resize the coverage vector
:param return_contours: Return the contour of the current polygon (used for plotting)
"""
coverage_vector = compute_coverage(text_piece, max_value, offset, weights)
coverage_vector = cv2.resize(coverage_vector, (width, height))
# Compute coverage vector
coverage_vector = compute_coverage(text, max_value, offset, weights, size=size)
# Generate a binary image for the current channel.
bin_img = coverage_vector.copy()
bin_img[bin_img > 0] = 1
bin_mask = np.array(np.where(coverage_vector > 5, 255, 0), dtype=np.uint8)
bin_mask = np.asarray(bin_mask, dtype=np.uint8)
# Detect the objects contours
contours, _ = cv2.findContours(bin_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
# Select best contour
metrics = [compute_contour_metrics(coverage_vector, cnt) for cnt in contours]
confidences, scores = map(list, zip(*metrics))
best_contour = contours[np.argmax(scores)]
confidence = round(confidences[np.argmax(scores)] / max_value, 2)
# Format for JSON
polygon = {
"confidence": confidence,
"polygon": [coordinates[0].tolist() for coordinates in best_contour],
}
# Detect the objects contours.
contours, _ = cv2.findContours(
np.uint8(bin_img), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
)
if return_contours:
return polygon, best_contour
mask = np.zeros(coverage_vector.shape)
cv2.drawContours(mask, contours, -1, 1, -1)
confidence = round((np.sum(mask * coverage_vector) / np.sum(mask)), 2)
# Put together all contours for now.
pre_contours_tojson = [[item.tolist() for item in contours]]
# Quick hack to have better json format:
contours_tojson = []
for contour in pre_contours_tojson[0]:
for coordinate in contour:
contours_tojson.append(coordinate[0])
return {
"confidence": confidence, # average of coverage vector on contours
"polygon": contours_tojson,
"type": level,
}
return polygon
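# Selection logic, summarised: the coverage vector is thresholded into a binary mask,
# cv2.findContours proposes candidate regions, compute_contour_metrics scores each one by
# its peak coverage times its area, the best-scoring contour becomes the polygon, and its
# rescaled peak coverage is reported as "confidence".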
def plot_attention(
@@ -155,7 +234,7 @@
outname,
word_separators=["\n", " "],
line_separators=["\n"],
output_polygons=False,
display_polygons=False,
):
"""
Create a gif by blending attention maps to the image for each text piece (char, word or line)
@@ -165,11 +244,13 @@
:param level: Level to display (must be in [char, word, line])
:param scale: Scaling factor for the output gif image
:param outname: Name of the gif image
:param word_separators: List of word separators
:param line_separators: List of line separators
:param display_polygons: Whether to plot extracted polygons
"""
height, width, _ = image.shape
attention_map = []
polygons = []
# Convert to PIL Image and create mask
mask = Image.new("L", (width, height), color=(110))
@@ -180,21 +261,26 @@
# Iterate on characters, words or lines
tot_len = 0
max_value = weights.sum(0).max()
for text_piece in text_list:
# Accumulate weights for the current word/line and resize to original image size
coverage_vector = compute_coverage(text_piece, max_value, tot_len, weights)
coverage_vector = cv2.resize(coverage_vector, (width, height))
coverage_vector = compute_coverage(
text_piece, max_value, tot_len, weights, (width, height)
)
# Get polygons if flag is set:
if output_polygons:
polygons.append(
get_polygon(
text_piece, level, tot_len, max_value, weights, height, width
)
if display_polygons:
# draw the contour
_, contour = get_polygon(
text_piece,
max_value,
tot_len,
weights,
(width, height),
return_contours=True,
)
cv2.drawContours(coverage_vector, [contour], 0, (255), 3)
# Keep track of text length
tot_len += len(text_piece) + offset
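# Note: tot_len is the running character index into the attention weights, so each text
# piece is aggregated from its own attention frames; offset accounts for the separator
# consumed between consecutive pieces.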
@@ -210,5 +296,3 @@
duration=1000,
loop=True,
)
return polygons
@@ -2,7 +2,6 @@
import os
import pickle
import re
import cv2
import numpy as np
@@ -14,8 +13,13 @@ from dan.datasets.extract.utils import save_json
from dan.decoder import GlobalHTADecoder
from dan.models import FCN_Encoder
from dan.ocr.utils import LM_ind_to_str
from dan.predict.attention import get_predicted_polygons, plot_attention
from dan.utils import read_image, round_floats
from dan.predict.attention import (
get_predicted_polygons_with_confidence,
parse_delimiters,
plot_attention,
split_text_and_confidences,
)
from dan.utils import read_image
class DAN:
@@ -93,6 +97,7 @@
confidences=False,
attentions=False,
attention_level=False,
extract_objects=False,
word_separators=["\n", " "],
line_separators=["\n"],
):
@@ -113,13 +118,20 @@
# Run the prediction.
with torch.no_grad():
b = input_tensor.size(0)
reached_end = torch.zeros((b,), dtype=torch.bool, device=self.device)
prediction_len = torch.zeros((b,), dtype=torch.int, device=self.device)
batch_size = input_tensor.size(0)
reached_end = torch.zeros(
(batch_size,), dtype=torch.bool, device=self.device
)
prediction_len = torch.zeros(
(batch_size,), dtype=torch.int, device=self.device
)
predicted_tokens = (
torch.ones((b, 1), dtype=torch.long, device=self.device) * start_token
torch.ones((batch_size, 1), dtype=torch.long, device=self.device)
* start_token
)
predicted_tokens_len = torch.ones(
(batch_size,), dtype=torch.int, device=self.device
)
predicted_tokens_len = torch.ones((b,), dtype=torch.int, device=self.device)
whole_output = list()
confidence_scores = list()
@@ -188,10 +200,11 @@
predicted_tokens = predicted_tokens[:, 1:]
prediction_len[torch.eq(reached_end, False)] = self.max_chars - 1
predicted_tokens = [
predicted_tokens[i, : prediction_len[i]] for i in range(b)
predicted_tokens[i, : prediction_len[i]] for i in range(batch_size)
]
confidence_scores = [
confidence_scores[i, : prediction_len[i]].tolist() for i in range(b)
confidence_scores[i, : prediction_len[i]].tolist()
for i in range(batch_size)
]
# Transform tokens to characters
@@ -201,44 +214,30 @@
logger.info("Images processed")
out = {"text": predicted_text}
out = {}
out["text"] = predicted_text
if confidences:
out["confidences"] = confidence_scores
if attentions:
out["attentions"] = attention_maps
# Also get information on polygons
out["objects"] = get_predicted_polygons(
predicted_text[0],
attention_maps[0],
attention_level,
input_sizes[0][0],
input_sizes[0][1],
word_separators,
line_separators,
)
if extract_objects:
out["objects"] = [
get_predicted_polygons_with_confidence(
predicted_text[i],
attention_maps[i],
confidence_scores[i],
attention_level,
input_sizes[i][0],
input_sizes[i][1],
word_separators,
line_separators,
)
for i in range(batch_size)
]
return out
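# Illustrative output with all flags enabled (keys depend on the arguments): out would
# look like {"text": ["..."], "confidences": [[0.98, ...]], "attentions": [...],
# "objects": [[{"polygon": [...], "confidence": ..., "text": ..., "text_confidence": ...}], ...]}.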
def parse_delimiters(delimiters):
return re.compile(r"|".join(delimiters))
def compute_prob_by_separator(characters, probabilities, separator):
"""
Split text and confidences using separators and return a list of average confidence scores.
:param characters: list of characters.
:param probabilities: list of probabilities.
:param separator: compiled regex of separators. Use parse_delimiters(["\n", " "]) for word confidences and parse_delimiters(["\n"]) for line confidences.
Returns a list of confidence scores.
"""
# match anything except separators, get start and end index
pattern = re.compile(f"[^{separator.pattern}]+")
matches = [(m.start(), m.end()) for m in re.finditer(pattern, characters)]
# Iterate over text pieces and compute mean confidence
return [np.mean(probabilities[start:end]) for (start, end) in matches]
def run(
image,
model,
@@ -255,6 +254,22 @@
line_separators,
predict_objects,
):
"""
Predict a single image and save the output
:param image: Path to the image to predict.
:param model: Path to the model to use for prediction.
:param parameters: Path to the YAML parameters file.
:param charset: Path to the charset.
:param output: Path to the output folder where the results will be saved.
:param scale: Scaling factor to resize the image.
:param confidence_score: Whether to compute confidence score.
:param attention_map: Whether to plot the attention map.
:param attention_map_level: Level of objects to extract.
:param attention_map_scale: Scaling factor for the attention map.
:param word_separators: List of word separators.
:param line_separators: List of line separators.
:param predict_objects: Whether to extract objects.
"""
# Create output directory if necessary
if not os.path.exists(output):
os.mkdir(output)
@@ -286,30 +301,36 @@
confidences=confidence_score,
attentions=attention_map,
attention_level=attention_map_level,
extract_objects=predict_objects,
word_separators=word_separators,
line_separators=line_separators,
)
text = prediction["text"][0]
result = {"text": text}
result = {}
result["text"] = prediction["text"][0]
result["objects"] = prediction["objects"]
# Return extracted objects (coordinates, text, confidence)
if predict_objects:
result["objects"] = prediction["objects"][0]
# Average character-based confidence scores
# Return mean confidence score
if confidence_score:
result["confidences"] = {}
char_confidences = prediction["confidences"][0]
result["confidences"] = {"total": np.around(np.mean(char_confidences), 2)}
if "word" in confidence_score_levels:
word_probs = compute_prob_by_separator(
text, char_confidences, word_separators
)
result["confidences"].update({"word": round_floats(word_probs)})
if "line" in confidence_score_levels:
line_probs = compute_prob_by_separator(
text, char_confidences, line_separators
result["confidences"]["total"] = np.around(np.mean(char_confidences), 2)
for level in confidence_score_levels:
result["confidences"][level] = []
texts, confidences, _ = split_text_and_confidences(
prediction["text"][0],
char_confidences,
level,
word_separators,
line_separators,
)
result["confidences"].update({"line": round_floats(line_probs)})
if "char" in confidence_score_levels:
result["confidences"].update({"char": round_floats(char_confidences)})
for text, conf in zip(texts, confidences):
result["confidences"][level].append({"text": text, "confidence": conf})
# Save gif with attention map
if attention_map:
@@ -324,7 +345,7 @@
scale=attention_map_scale,
word_separators=word_separators,
line_separators=line_separators,
output_polygons=predict_objects,
display_polygons=predict_objects,
outname=gif_filename,
)
result["attention_gif"] = gif_filename