Automatic Text Recognition / DAN / Commits

Commit 04b15f97, authored 2 years ago by Solene Tarride

    Predict polygons but preserve old behavior

Parent: da7e8b3f
No related branches found. No related tags found.
Part of 1 merge request: !76 Add predicted objects to predict command

Showing 2 changed files with 240 additions and 135 deletions:
- dan/predict/attention.py: 161 additions, 77 deletions
- dan/predict/prediction.py: 79 additions, 58 deletions
dan/predict/attention.py (+161 −77)
@@ -6,38 +6,17 @@ import numpy as np
 from PIL import Image

 from dan import logger
+from dan.utils import round_floats
-
-
-def split_text(text: str, level: str, word_separators, line_separators):
-    """
-    Split text into a list of characters, words, or lines.
-    :param text: Text prediction from DAN
-    :param level: Level to visualize from [char, word, line]
-    """
-    # Split into characters
-    if level == "char":
-        text_split = list(text)
-        offset = 0
-    # Split into words
-    elif level == "word":
-        text_split = re.split(word_separators, text)
-        offset = 1
-    # Split into lines
-    elif level == "line":
-        text_split = re.split(line_separators, text)
-        offset = 1
-    else:
-        logger.error("Level should be either 'char', 'word', or 'line'")
-    return text_split, offset
-
-
-def compute_coverage(text: str, max_value: float, offset: int, attentions):
+def compute_coverage(text: str, max_value: float, offset: int, attentions, size: tuple):
     """
     Aggregates attention maps for the current text piece (char, word, line)
     :param text: Text piece selected with offset after splitting DAN prediction
     :param max_value: Maximum "attention intensity" for parts of a text piece, used for normalization
     :param offset: Offset value to get the relevant part of text piece
     :param attentions: Attention weights of size (n_char, feature_height, feature_width)
+    :param size: Target size (width, height) to resize the coverage vector
     """
     _, height, width = attentions.shape
@@ -49,6 +28,11 @@ def compute_coverage(text: str, max_value: float, offset: int, attentions):
     # Normalize coverage vector
     coverage_vector = (coverage_vector / max_value * 255).astype(np.uint8)

+    # Resize it
+    if size:
+        coverage_vector = cv2.resize(coverage_vector, size)
+
     return coverage_vector
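
Moving the resize inside compute_coverage lets callers request image-resolution coverage in one call. A minimal sketch of the new size argument, assuming compute_coverage is importable from dan.predict.attention (the weights array below is invented for illustration):

import numpy as np

from dan.predict.attention import compute_coverage

# Hypothetical attention weights: 3 characters over a 4x6 feature map.
weights = np.random.rand(3, 4, 6).astype(np.float32)
max_value = weights.sum(0).max()

# size=None keeps the coverage at feature-map resolution.
coverage = compute_coverage("a", max_value, 0, weights, size=None)

# size=(width, height) resizes to the original image, e.g. 600x400.
coverage = compute_coverage("a", max_value, 0, weights, size=(600, 400))
assert coverage.shape == (400, 600)  # cv2.resize takes (width, height)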
@@ -74,9 +58,82 @@ def blend_coverage(coverage_vector, image, mask, scale):
     return blend


-def get_predicted_polygons(
+def parse_delimiters(delimiters):
+    return re.compile(r"|".join(delimiters))
+
+
+def compute_prob_by_separator(characters, probabilities, separator):
+    """
+    Split text and confidences using separators and return a list of average confidence scores.
+    :param characters: list of characters.
+    :param probabilities: list of character probabilities.
+    :param separator: regex for separators. Use parse_delimiters(["\n", " "]) for word
+        confidences and parse_delimiters(["\n"]) for line confidences.
+    Returns a list of confidence scores.
+    """
+    # Match anything except separators, get start and end index
+    pattern = re.compile(f"[^{separator.pattern}]+")
+    matches = [(m.start(), m.end()) for m in re.finditer(pattern, characters)]
+
+    # Iterate over text pieces and compute mean confidence
+    probs = [np.mean(probabilities[start:end]) for (start, end) in matches]
+    texts = [characters[start:end] for (start, end) in matches]
+    return texts, probs
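
The helper pair makes confidence aggregation self-contained: parse_delimiters builds an alternation regex, and compute_prob_by_separator averages the character probabilities of each run between separators. A toy walk-through under those definitions (the prediction and scores are invented):

import numpy as np

from dan.predict.attention import compute_prob_by_separator, parse_delimiters

characters = "the cat"
probabilities = [0.9, 0.8, 0.95, 0.99, 0.7, 0.85, 0.9]

# parse_delimiters(["\n", " "]) compiles to r"\n| ", so word pieces are
# the runs of characters between newlines and spaces.
separator = parse_delimiters(["\n", " "])
texts, probs = compute_prob_by_separator(characters, probabilities, separator)
# texts == ["the", "cat"]
# probs ≈ [0.883, 0.817]  (mean of each word's character confidences)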
+
+
+def split_text(text: str, level: str, word_separators, line_separators):
+    """
+    Split text into a list of characters, words, or lines.
+    :param text: Text prediction from DAN
+    :param level: Level to visualize from [char, word, line]
+    :param word_separators: List of word separators
+    :param line_separators: List of line separators
+    """
+    if level == "char":
+        text_split = list(text)
+        offset = 0
+    # Split into words
+    elif level == "word":
+        text_split = re.split(word_separators, text)
+        offset = 1
+    # Split into lines
+    elif level == "line":
+        text_split = re.split(line_separators, text)
+        offset = 1
+    else:
+        logger.error("Level should be either 'char', 'word', or 'line'")
+    return text_split, offset
+
+
+def split_text_and_confidences(
+    text, confidences, level, word_separators, line_separators
+):
+    """
+    Split text into a list of characters, words or lines with corresponding confidence scores
+    :param text: Text prediction from DAN
+    :param confidences: Character confidences
+    :param level: Level to visualize from [char, word, line]
+    :param word_separators: List of word separators
+    :param line_separators: List of line separators
+    """
+    if level == "char":
+        texts = list(text)
+        offset = 0
+    elif level == "word":
+        texts, probs = compute_prob_by_separator(text, confidences, word_separators)
+        offset = 1
+    elif level == "line":
+        texts, probs = compute_prob_by_separator(text, confidences, line_separators)
+        offset = 1
+    else:
+        logger.error("Level should be either 'char', 'word', or 'line'")
+    return texts, round_floats(probs), offset
+
+
+def get_predicted_polygons_with_confidence(
+    text,
+    weights,
+    confidences,
+    level,
+    height,
+    width,
@@ -87,63 +144,85 @@ def get_predicted_polygons(
     Returns the polygons of each object of the current prediction
     :param text: Text predicted by DAN
     :param weights: Attention weights of size (n_char, feature_height, feature_width)
+    :param confidences: Character confidences
     :param level: Level to display (must be in [char, word, line])
     :param height: Original image height
     :param width: Original image width
     :param word_separators: List of word separators
     :param line_separators: List of line separators
     """
     # Split text into characters, words or lines
-    text_list, offset = split_text(text, level, word_separators, line_separators)
-    max_value = weights.sum(0).max()
-    # Set offset based on current text_piece to be used.
-    return [
-        get_polygon(
-            text_piece, level, offset * n_offset, max_value, weights, height, width
-        )
-        for n_offset, text_piece in enumerate(text_list)
-    ]
+    text_list, confidence_list, offset = split_text_and_confidences(
+        text, confidences, level, word_separators, line_separators
+    )
+
+    max_value = weights.sum(0).max()
+
+    polygons = []
+    start_index = 0
+    for text_piece, confidence in zip(text_list, confidence_list):
+        start_index += len(text_piece) + offset
+        polygon = get_polygon(
+            text_piece, max_value, offset, weights, size=(width, height)
+        )
+        polygon["text"] = text_piece
+        polygon["text_confidence"] = confidence
+        polygons.append(polygon)
+    return polygons
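
Each list element now couples geometry with the recognised text. A sketch of one entry returned by get_predicted_polygons_with_confidence at word level (every value below is invented):

word_object = {
    "confidence": 0.87,        # attention-blob confidence from get_polygon
    "polygon": [[103, 45], [210, 45], [210, 90], [103, 90]],
    "text": "cat",             # the word this polygon covers
    "text_confidence": 0.82,   # mean character confidence for that word
}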
+
+
+def compute_contour_metrics(coverage_vector, contour):
+    """
+    Compute the contour's area and the mean value inside it.
+    :param coverage_vector: Aggregated attention weights of the current text piece, resized to the image, of size (image_height, image_width)
+    :param contour: Contour of the current attention blob
+    """
+    # Draw the contour zone
+    mask = np.zeros(coverage_vector.shape, dtype=np.uint8)
+    cv2.drawContours(mask, [contour], -1, (255), -1)
+
+    max_value = (
+        np.where(mask > 0, coverage_vector, 0).max() / 255
+    )  # cv2.max(coverage_vector, mask=mask)[0] / 255.
+    area = cv2.contourArea(contour)
+    return max_value, max_value * area
+
+
-def get_polygon(text_piece, level, offset, max_value, weights, height, width):
+def get_polygon(text, max_value, offset, weights, size=None, return_contours=False):
     """
-    Gets polygon associated with element of current text_piece, indexed by offset
-    :param text_piece: Current text element
-    :param level: Level to display (must be in [char, word, line])
-    :param offset: Offset value to get the relevant part of text piece
+    Gets the polygon associated with the element of the current text piece, indexed by offset
+    :param text: Text piece selected with offset after splitting DAN prediction
+    :param max_value: Maximum "attention intensity" for parts of a text piece, used for normalization
     :param weights: Attention weights of size (n_char, feature_height, feature_width)
-    :param height: Original image height
-    :param width: Original image width
+    :param offset: Offset value to get the relevant part of text piece
+    :param size: Target size (width, height) to resize the coverage vector
+    :param return_contours: Whether to return the contour of the current polygon (used for plotting)
     """
-    coverage_vector = compute_coverage(text_piece, max_value, offset, weights)
-    coverage_vector = cv2.resize(coverage_vector, (width, height))
-    # Generate a binary image for the current channel.
-    bin_img = coverage_vector.copy()
-    bin_img[bin_img > 0] = 1
+    # Compute the coverage vector
+    coverage_vector = compute_coverage(text, max_value, offset, weights, size=size)
+
+    # Generate a binary image for the current channel
+    bin_mask = np.array(np.where(coverage_vector > 5, 255, 0), dtype=np.uint8)
+    bin_mask = np.asarray(bin_mask, dtype=np.uint8)
+
+    # Detect the objects contours
+    contours, _ = cv2.findContours(
+        bin_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+    )
+
+    # Select the best contour
+    metrics = [compute_contour_metrics(coverage_vector, cnt) for cnt in contours]
+    confidences, scores = map(list, zip(*metrics))
+    best_contour = contours[np.argmax(scores)]
+    confidence = round(confidences[np.argmax(scores)] / max_value, 2)
+
+    # Format for JSON
+    polygon = {
+        "confidence": confidence,
+        "polygon": [coordinates[0].tolist() for coordinates in best_contour],
+    }
+
+    if return_contours:
+        return polygon, best_contour
+
-    # Detect the objects contours.
-    contours, _ = cv2.findContours(
-        np.uint8(bin_img), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
-    )
-    mask = np.zeros(coverage_vector.shape)
-    cv2.drawContours(mask, contours, -1, 1, -1)
-    confidence = round((np.sum(mask * coverage_vector) / np.sum(mask)), 2)
-    # Put together all contours for now.
-    pre_contours_tojson = [[item.tolist() for item in contours]]
-    # Quick hack to have better json format:
-    contours_tojson = []
-    for contour in pre_contours_tojson[0]:
-        for coordinate in contour:
-            contours_tojson.append(coordinate[0])
-    return {
-        "confidence": confidence,  # average of coverage vector on contours
-        "polygon": contours_tojson,
-        "type": level,
-    }
+    return polygon
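
get_polygon now keeps only the best attention blob, scored by compute_contour_metrics as mean intensity times area, which favours large, strongly attended regions over small bright specks. A toy check of that scoring rule (the coverage map is invented):

import cv2
import numpy as np

# Hypothetical 100x100 coverage map: one large strong blob, one faint speck.
coverage = np.zeros((100, 100), dtype=np.uint8)
coverage[10:40, 10:60] = 200
coverage[70:75, 70:75] = 60

bin_mask = np.array(np.where(coverage > 5, 255, 0), dtype=np.uint8)
contours, _ = cv2.findContours(bin_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

scores = []
for cnt in contours:
    mask = np.zeros(coverage.shape, dtype=np.uint8)
    cv2.drawContours(mask, [cnt], -1, (255), -1)
    max_value = np.where(mask > 0, coverage, 0).max() / 255
    scores.append(max_value * cv2.contourArea(cnt))

best_contour = contours[int(np.argmax(scores))]  # the large blob wins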
@@ -155,7 +234,7 @@ def plot_attention(
     outname,
     word_separators=["\n", " "],
     line_separators=["\n"],
-    output_polygons=False,
+    display_polygons=False,
 ):
     """
     Create a gif by blending attention maps to the image for each text piece (char, word or line)

@@ -165,11 +244,13 @@ def plot_attention(
     :param level: Level to display (must be in [char, word, line])
     :param scale: Scaling factor for the output gif image
     :param outname: Name of the gif image
+    :param word_separators: List of word separators
+    :param line_separators: List of line separators
-    :param output_polygons: Whether to plot extracted polygons
+    :param display_polygons: Whether to plot extracted polygons
     """
     height, width, _ = image.shape
     attention_map = []
-    polygons = []

     # Convert to PIL Image and create mask
     mask = Image.new("L", (width, height), color=(110))

@@ -180,21 +261,26 @@ def plot_attention(
     # Iterate on characters, words or lines
     tot_len = 0
     max_value = weights.sum(0).max()
+
     for text_piece in text_list:
         # Accumulate weights for the current word/line and resize to original image size
-        coverage_vector = compute_coverage(text_piece, max_value, tot_len, weights)
-        coverage_vector = cv2.resize(coverage_vector, (width, height))
+        coverage_vector = compute_coverage(
+            text_piece, max_value, tot_len, weights, (width, height)
+        )

-        # Get polygons if flag is set:
-        if output_polygons:
-            polygons.append(
-                get_polygon(
-                    text_piece, level, tot_len, max_value, weights, height, width
-                )
-            )
+        if display_polygons:
+            # Draw the contour
+            _, contour = get_polygon(
+                text_piece,
+                max_value,
+                tot_len,
+                weights,
+                (width, height),
+                return_contours=True,
+            )
+            cv2.drawContours(coverage_vector, [contour], 0, (255), 3)

         # Keep track of text length
         tot_len += len(text_piece) + offset

@@ -210,5 +296,3 @@ def plot_attention(
         duration=1000,
         loop=True,
     )
-
-    return polygons
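
plot_attention no longer returns polygons; with display_polygons=True it draws each selected contour directly on the gif frames, while callers obtain coordinates through get_predicted_polygons_with_confidence. A hedged usage sketch, assuming image, text and weights come from an earlier DAN prediction and that the remaining parameters match the signature above:

plot_attention(
    image=image,                # (height, width, 3) numpy array
    text=text,                  # predicted transcription
    weights=weights,            # (n_char, feature_height, feature_width)
    level="word",
    scale=0.5,
    outname="attention_word.gif",
    display_polygons=True,      # overlay each word's contour on the frames
)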
dan/predict/prediction.py (+79 −58)
@@ -2,7 +2,6 @@
 import os
 import pickle
-import re

 import cv2
 import numpy as np
@@ -14,8 +13,13 @@ from dan.datasets.extract.utils import save_json
 from dan.decoder import GlobalHTADecoder
 from dan.models import FCN_Encoder
 from dan.ocr.utils import LM_ind_to_str
-from dan.predict.attention import get_predicted_polygons, plot_attention
-from dan.utils import read_image, round_floats
+from dan.predict.attention import (
+    get_predicted_polygons_with_confidence,
+    parse_delimiters,
+    plot_attention,
+    split_text_and_confidences,
+)
+from dan.utils import read_image


 class DAN:
@@ -93,6 +97,7 @@ class DAN:
         confidences=False,
         attentions=False,
         attention_level=False,
+        extract_objects=False,
         word_separators=["\n", " "],
         line_separators=["\n"],
     ):
@@ -113,13 +118,20 @@ class DAN:
         # Run the prediction.
         with torch.no_grad():
-            b = input_tensor.size(0)
-            reached_end = torch.zeros((b,), dtype=torch.bool, device=self.device)
-            prediction_len = torch.zeros((b,), dtype=torch.int, device=self.device)
+            batch_size = input_tensor.size(0)
+            reached_end = torch.zeros(
+                (batch_size,), dtype=torch.bool, device=self.device
+            )
+            prediction_len = torch.zeros(
+                (batch_size,), dtype=torch.int, device=self.device
+            )
             predicted_tokens = (
-                torch.ones((b, 1), dtype=torch.long, device=self.device) * start_token
+                torch.ones((batch_size, 1), dtype=torch.long, device=self.device)
+                * start_token
             )
-            predicted_tokens_len = torch.ones((b,), dtype=torch.int, device=self.device)
+            predicted_tokens_len = torch.ones(
+                (batch_size,), dtype=torch.int, device=self.device
+            )

             whole_output = list()
             confidence_scores = list()
@@ -188,10 +200,11 @@ class DAN:
             predicted_tokens = predicted_tokens[:, 1:]
             prediction_len[torch.eq(reached_end, False)] = self.max_chars - 1
             predicted_tokens = [
-                predicted_tokens[i, : prediction_len[i]] for i in range(b)
+                predicted_tokens[i, : prediction_len[i]] for i in range(batch_size)
             ]
             confidence_scores = [
-                confidence_scores[i, : prediction_len[i]].tolist() for i in range(b)
+                confidence_scores[i, : prediction_len[i]].tolist()
+                for i in range(batch_size)
             ]

             # Transform tokens to characters
@@ -201,44 +214,30 @@ class DAN:
         logger.info("Images processed")

-        out = {"text": predicted_text}
+        out = {}
+        out["text"] = predicted_text
         if confidences:
             out["confidences"] = confidence_scores
         if attentions:
             out["attentions"] = attention_maps
-            # Also get information on polygons
-            out["objects"] = get_predicted_polygons(
-                predicted_text[0],
-                attention_maps[0],
-                attention_level,
-                input_sizes[0][0],
-                input_sizes[0][1],
-                word_separators,
-                line_separators,
-            )
+        if extract_objects:
+            out["objects"] = [
+                get_predicted_polygons_with_confidence(
+                    predicted_text[i],
+                    attention_maps[i],
+                    confidence_scores[i],
+                    attention_level,
+                    input_sizes[i][0],
+                    input_sizes[i][1],
+                    word_separators,
+                    line_separators,
+                )
+                for i in range(batch_size)
+            ]
         return out


-def parse_delimiters(delimiters):
-    return re.compile(r"|".join(delimiters))
-
-
-def compute_prob_by_separator(characters, probabilities, separator):
-    """
-    Split text and confidences using separators and return a list of average confidence scores.
-    :param characters: list of characters.
-    :param probabilities: list of probabilities.
-    :param separators: regex for separators. Use parse_delimiters(["\n", " "]) for word confidences and parse_delimiters(["\n"]) for line confidences.
-    Returns a list confidence scores.
-    """
-    # match anything except separators, get start and end index
-    pattern = re.compile(f"[^{separator.pattern}]+")
-    matches = [(m.start(), m.end()) for m in re.finditer(pattern, characters)]
-    # Iterate over text pieces and compute mean confidence
-    return [np.mean(probabilities[start:end]) for (start, end) in matches]
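
DAN.predict now builds the object list per batch element instead of only for the first image. A sketch of the returned dictionary for a batch of two images with all optional flags enabled (keys follow the code above, values are invented):

out = {
    "text": ["the cat", "sat"],            # one transcription per image
    "confidences": [[0.9, 0.8], [0.95]],   # per-character scores, if confidences=True
    "attentions": [...],                   # attention maps, if attentions=True
    "objects": [                           # if extract_objects=True
        [  # image 0: one entry per char/word/line, set by attention_level
            {"confidence": 0.87, "polygon": [...], "text": "the", "text_confidence": 0.88},
            {"confidence": 0.79, "polygon": [...], "text": "cat", "text_confidence": 0.82},
        ],
        [...],  # image 1
    ],
}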
 def run(
     image,
     model,

@@ -255,6 +254,22 @@ def run(
     line_separators,
     predict_objects,
 ):
+    """
+    Predict a single image and save the output
+    :param image: Path to the image to predict.
+    :param model: Path to the model to use for prediction.
+    :param parameters: Path to the YAML parameters file.
+    :param charset: Path to the charset.
+    :param output: Path to the output folder where the results will be saved.
+    :param scale: Scaling factor to resize the image.
+    :param confidence_score: Whether to compute confidence score.
+    :param attention_map: Whether to plot the attention map.
+    :param attention_map_level: Level of objects to extract.
+    :param attention_map_scale: Scaling factor for the attention map.
+    :param word_separators: List of word separators.
+    :param line_separators: List of line separators.
+    :param predict_objects: Whether to extract objects.
+    """
     # Create output directory if necessary
     if not os.path.exists(output):
         os.mkdir(output)
@@ -286,30 +301,36 @@ def run(
         confidences=confidence_score,
         attentions=attention_map,
         attention_level=attention_map_level,
+        extract_objects=predict_objects,
         word_separators=word_separators,
         line_separators=line_separators,
     )
-    text = prediction["text"][0]
-    result = {"text": text}
-    result["objects"] = prediction["objects"]
+    result = {}
+    result["text"] = prediction["text"][0]
+
+    # Return extracted objects (coordinates, text, confidence)
+    if predict_objects:
+        result["objects"] = prediction["objects"][0]

-    # Average character-based confidence scores
+    # Return mean confidence score
     if confidence_score:
+        result["confidences"] = {}
         char_confidences = prediction["confidences"][0]
-        result["confidences"] = {"total": np.around(np.mean(char_confidences), 2)}
-        if "word" in confidence_score_levels:
-            word_probs = compute_prob_by_separator(
-                text, char_confidences, word_separators
-            )
-            result["confidences"].update({"word": round_floats(word_probs)})
-        if "line" in confidence_score_levels:
-            line_probs = compute_prob_by_separator(
-                text, char_confidences, line_separators
-            )
-            result["confidences"].update({"line": round_floats(line_probs)})
-        if "char" in confidence_score_levels:
-            result["confidences"].update({"char": round_floats(char_confidences)})
+        result["confidences"]["total"] = np.around(np.mean(char_confidences), 2)
+
+        for level in confidence_score_levels:
+            result["confidences"][level] = []
+            texts, confidences, _ = split_text_and_confidences(
+                prediction["text"][0],
+                char_confidences,
+                level,
+                word_separators,
+                line_separators,
+            )
+
+            for text, conf in zip(texts, confidences):
+                result["confidences"][level].append(
+                    {"text": text, "confidence": conf}
+                )
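
Confidence reporting is reorganised accordingly: each requested level now lists the text pieces alongside their scores instead of a bare list of floats. A sketch of result["confidences"] for confidence_score_levels = ["word"] (values invented):

result["confidences"] = {
    "total": 0.85,
    "word": [
        {"text": "the", "confidence": 0.88},
        {"text": "cat", "confidence": 0.82},
    ],
}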
     # Save gif with attention map
     if attention_map:

@@ -324,7 +345,7 @@ def run(
             scale=attention_map_scale,
             word_separators=word_separators,
             line_separators=line_separators,
-            output_polygons=predict_objects,
+            display_polygons=predict_objects,
             outname=gif_filename,
         )
         result["attention_gif"] = gif_filename