Skip to content
Snippets Groups Projects
Commit ead5ff4c authored by Solene Tarride's avatar Solene Tarride
Browse files

use regex delimiters

parent 96327069
No related branches found
No related tags found
1 merge request: !66 "Compute confidence scores by char, word or line"
This commit is part of merge request !66. Comments created here will be created in the context of that merge request.
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import re
import cv2 import cv2
import numpy as np import numpy as np
from PIL import Image from PIL import Image
...@@ -18,17 +20,11 @@ def split_text(text, level, word_separators, line_separators): ...@@ -18,17 +20,11 @@ def split_text(text, level, word_separators, line_separators):
offset = 0 offset = 0
# split into words # split into words
elif level == "word": elif level == "word":
main_sep = word_separators[0] text_split = re.split(word_separators, text)
for other_sep in word_separators[1:]:
text = text.replace(other_sep, main_sep)
text_split = text.split(main_sep)
offset = 1 offset = 1
# split into lines # split into lines
elif level == "line": elif level == "line":
main_sep = line_separators[0] text_split = re.split(line_separators, text)
for other_sep in line_separators[1:]:
text = text.replace(other_sep, main_sep)
text_split = text.split(main_sep)
offset = 1 offset = 1
else: else:
logger.error("Level should be either 'char', 'word', or 'line'") logger.error("Level should be either 'char', 'word', or 'line'")
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
import os import os
import pickle import pickle
import re
import cv2 import cv2
import numpy as np import numpy as np
...@@ -17,30 +18,6 @@ from dan.predict.attention import plot_attention ...@@ -17,30 +18,6 @@ from dan.predict.attention import plot_attention
from dan.utils import read_image, round_floats from dan.utils import read_image, round_floats
def compute_prob_by_separator(characters, probabilities, separators=("\n",)):
    """
    Split text and confidences using separators and return a list of average confidence scores.
    :param characters: iterable of characters, aligned one-to-one with probabilities.
    :param probabilities: iterable of per-character probabilities.
    :param separators: collection of characters to split text. Use ["\n", " "] for word confidences and ["\n"] for line confidences.
    Returns a list of confidence scores, one per non-empty piece of text found between separators.
    """
    # The default is a tuple rather than a list so that the default value can
    # never be mutated and shared between calls.
    probs = []
    current_piece = []
    for char, prob in zip(characters, probabilities):
        if char not in separators:
            current_piece.append(prob)
        elif current_piece:
            # A separator closes the current piece; consecutive separators are
            # ignored because there is nothing accumulated to average.
            probs.append(np.mean(current_piece))
            current_piece = []
    # Flush the last piece when the text does not end with a separator.
    if current_piece:
        probs.append(np.mean(current_piece))
    return probs
class DAN: class DAN:
""" """
The DAN class is used to apply a DAN model. The DAN class is used to apply a DAN model.
...@@ -226,6 +203,31 @@ class DAN: ...@@ -226,6 +203,31 @@ class DAN:
return out return out
def parse_delimiters(delimiters):
    """
    Build one compiled regex that matches any of the given delimiters.
    :param delimiters: iterable of strings. Each one is inserted into the pattern as-is (no escaping), so regex syntax inside a delimiter is interpreted as regex.
    Returns a compiled pattern made of all delimiters joined as alternatives.
    """
    alternation = "|".join(delimiters)
    return re.compile(alternation)
def compute_prob_by_separator(characters, probabilities, separator):
    """
    Split text and confidences using separators and return a list of average confidence scores.
    :param characters: text to split, as a string (it is searched with a regex).
    :param probabilities: sequence of probabilities, indexed like characters.
    :param separator: compiled regex for separators. Use parse_delimiters(["\n", " "]) for word confidences and parse_delimiters(["\n"]) for line confidences.
    Returns a list of confidence scores, one per non-empty piece of text found between separators.
    """
    # Locate the separators themselves and keep the gaps between them.
    # Wrapping separator.pattern in a negated character class ("[^...]+")
    # would turn the "|" of the alternation into a separator character and
    # would break multi-character delimiters.
    pieces = []
    cursor = 0
    for match in separator.finditer(characters):
        if match.start() == match.end():
            # Zero-width matches do not separate anything.
            continue
        if match.start() > cursor:
            pieces.append((cursor, match.start()))
        cursor = match.end()
    # Trailing piece when the text does not end with a separator.
    if cursor < len(characters):
        pieces.append((cursor, len(characters)))

    # Mean confidence over each piece of text.
    return [np.mean(probabilities[start:end]) for start, end in pieces]
def run( def run(
image, image,
model, model,
...@@ -271,6 +273,10 @@ def run( ...@@ -271,6 +273,10 @@ def run(
text = prediction["text"][0] text = prediction["text"][0]
result = {"text": text} result = {"text": text}
# Parse delimiters to regex
word_separators = parse_delimiters(word_separators)
line_separators = parse_delimiters(line_separators)
# Average character-based confidence scores # Average character-based confidence scores
if confidence_score: if confidence_score:
char_confidences = prediction["confidences"][0] char_confidences = prediction["confidences"][0]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment