use regex delimiters

ead5ff4c · Solene Tarride · 96327069 · ead5ff4c · ead5ff4c
Commit ead5ff4c authored 2 years ago by Solene Tarride
--- a/dan/predict/attention.py
+++ b/dan/predict/attention.py
 # -*- coding: utf-8 -*-
+import re
 import cv2
 import numpy as np
 from PIL import Image
@@ -18,17 +20,11 @@ def split_text(text, level, word_separators, line_separators):
        offset = 0
    # split into words
    elif level == "word":
-        main_sep = word_separators[0]
+        text_split = re.split(word_separators, text)
-        for other_sep in word_separators[1:]:
-            text = text.replace(other_sep, main_sep)
-        text_split = text.split(main_sep)
        offset = 1
    # split into lines
    elif level == "line":
-        main_sep = line_separators[0]
+        text_split = re.split(line_separators, text)
-        for other_sep in line_separators[1:]:
-            text = text.replace(other_sep, main_sep)
-        text_split = text.split(main_sep)
        offset = 1
    else:
        logger.error("Level should be either 'char', 'word', or 'line'")

--- a/dan/predict/prediction.py
+++ b/dan/predict/prediction.py
@@ -2,6 +2,7 @@
 import os
 import pickle
+import re
 import cv2
 import numpy as np
@@ -17,30 +18,6 @@ from dan.predict.attention import plot_attention
 from dan.utils import read_image, round_floats
-def compute_prob_by_separator(characters, probabilities, separators=["\n"]):
-    """
-    Split text and confidences using separators and return a list of average confidence scores.
-    :param characters: list of characters.
-    :param probabilities: list of probabilities.
-    :param separators: list of characters to split text. Use ["\n", " "] for word confidences and ["\n"] for line confidences.
-    Returns a list confidence scores.
-    """
-    probs = []
-    prob_split = []
-    text_split = ""
-    for char, prob in zip(characters, probabilities):
-        if char not in separators:
-            prob_split.append(prob)
-            text_split += char
-        elif text_split:
-            probs.append(np.mean(prob_split))
-            prob_split = []
-            text_split = ""
-    if text_split:
-        probs.append(np.mean(prob_split))
-    return probs
 class DAN:
    """
    The DAN class is used to apply a DAN model.
@@ -226,6 +203,31 @@ class DAN:
        return out
+def parse_delimiters(delimiters):
+    """
+    Build a single compiled regex that matches any of the given delimiters.
+    :param delimiters: iterable of delimiter strings. They are joined with "|" without escaping, so each one is interpreted as a regex fragment.
+    Returns the compiled regex pattern.
+    """
+    alternation = r"|".join(delimiters)
+    return re.compile(alternation)
+def compute_prob_by_separator(characters, probabilities, separator):
+    """
+    Split text and confidences on separator matches and return the average confidence of each piece.
+    :param characters: predicted text as a string; the separator regex is run directly on it.
+    :param probabilities: sequence of confidences, one per character, aligned by index with `characters`.
+    :param separator: compiled regex matching the separators. Use parse_delimiters(["\\n", " "]) for word confidences and parse_delimiters(["\\n"]) for line confidences.
+    Returns a list of confidence scores, one per non-empty piece of text found between separators.
+    """
+    probs = []
+    piece_start = 0
+    # Walk over the separator matches themselves instead of wrapping the pattern in a
+    # negated character class: "[^a|b]+" would also treat "|" (and any other regex syntax
+    # in the pattern) as a separator, and cannot express multi-character separators.
+    for match in separator.finditer(characters):
+        # Ignore zero-width matches: they do not consume any separator character
+        if match.end() == match.start():
+            continue
+        # Leading or consecutive separators yield empty pieces, which are skipped
+        if match.start() > piece_start:
+            probs.append(np.mean(probabilities[piece_start : match.start()]))
+        piece_start = match.end()
+    # Trailing piece after the last separator
+    if piece_start < len(characters):
+        probs.append(np.mean(probabilities[piece_start : len(characters)]))
+    return probs
 def run(
    image,
    model,
@@ -271,6 +273,10 @@ def run(
    text = prediction["text"][0]
    result = {"text": text}
+    # Parse delimiters to regex
+    word_separators = parse_delimiters(word_separators)
+    line_separators = parse_delimiters(line_separators)
    # Average character-based confidence scores
    if confidence_score:
        char_confidences = prediction["confidences"][0]