Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
D
DAN
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Deploy
Releases
Package registry
Container Registry
Operate
Terraform modules
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Automatic Text Recognition
DAN
Commits
8094104e
Commit
8094104e
authored
1 year ago
by
Yoann Schneider
Committed by
Solene Tarride
1 year ago
Browse files
Options
Downloads
Patches
Plain Diff
Red background
parent
e25de69d
No related branches found
Branches containing commit
No related tags found
Tags containing commit
1 merge request
!281
Red background
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
dan/datasets/extract/extract.py
+27
-13
27 additions, 13 deletions
dan/datasets/extract/extract.py
dan/datasets/extract/utils.py
+1
-1
1 addition, 1 deletion
dan/datasets/extract/utils.py
tests/test_extract.py
+4
-3
4 additions, 3 deletions
tests/test_extract.py
with
32 additions
and
17 deletions
dan/datasets/extract/extract.py
+
27
−
13
View file @
8094104e
...
...
@@ -7,7 +7,7 @@ import random
from
collections
import
defaultdict
from
concurrent.futures
import
Future
,
ThreadPoolExecutor
from
pathlib
import
Path
from
typing
import
Dict
,
List
,
Optional
,
Union
from
typing
import
Dict
,
List
,
Optional
,
Tuple
,
Union
from
uuid
import
UUID
import
cv2
...
...
@@ -35,8 +35,12 @@ from dan.datasets.extract.utils import (
remove_spaces
,
)
from
dan.utils
import
EntityType
,
parse_tokens
from
line_image_extractor.extractor
import
save_img
from
line_image_extractor.image_utils
import
deskew_image
,
polygon_to_bbox
from
line_image_extractor.extractor
import
extract
from
line_image_extractor.image_utils
import
(
BoundingBox
,
Extraction
,
polygon_to_bbox
,
)
IMAGES_DIR
=
"
images
"
# Subpath to the images directory.
...
...
@@ -111,11 +115,13 @@ class ArkindexExtractor:
elif
bigger_height
:
return
f
"
,
{
self
.
max_height
}
"
def
build_iiif_url
(
self
,
polygon
,
image_url
):
def
build_iiif_url
(
self
,
polygon
,
image_url
)
->
Tuple
[
BoundingBox
,
str
]
:
bbox
=
polygon_to_bbox
(
json
.
loads
(
str
(
polygon
)))
size
=
self
.
get_iiif_size_arg
(
width
=
bbox
.
width
,
height
=
bbox
.
height
)
# Rotations are done using the lib
return
IIIF_URL
.
format
(
image_url
=
image_url
,
bbox
=
get_bbox
(
polygon
),
size
=
size
)
return
bbox
,
IIIF_URL
.
format
(
image_url
=
image_url
,
bbox
=
get_bbox
(
polygon
),
size
=
size
)
def
_keep_char
(
self
,
char
:
str
)
->
bool
:
# Keep all text by default if no separator was given
...
...
@@ -214,20 +220,28 @@ class ArkindexExtractor:
:param image_url: Base IIIF URL of the image.
:param destination: Where the image should be saved.
"""
download_url
:
str
=
self
.
build_iiif_url
(
polygon
=
polygon
,
image_url
=
image_url
)
bbox
,
download_url
=
self
.
build_iiif_url
(
polygon
=
polygon
,
image_url
=
image_url
)
try
:
img
:
Image
.
Image
=
download_image
(
download_url
)
# Deskew image
image
=
deskew_image
(
np
.
asarray
(
img
),
polygon
=
np
.
asarray
(
polygon
),
max_deskew_angle
=
45
# The polygon's coordinate are in the referential of the full image
# We need to remove the offset of the bounding rectangle
polygon
=
[(
x
-
bbox
.
x
,
y
-
bbox
.
y
)
for
x
,
y
in
polygon
]
# Normalize bbox
bbox
=
BoundingBox
(
x
=
0
,
y
=
0
,
width
=
bbox
.
width
,
height
=
bbox
.
height
)
image
=
extract
(
img
=
cv2
.
cvtColor
(
np
.
asarray
(
img
),
cv2
.
COLOR_RGB2BGR
),
polygon
=
np
.
asarray
(
polygon
).
clip
(
0
),
bbox
=
bbox
,
extraction_mode
=
Extraction
.
boundingRect
,
max_deskew_angle
=
45
,
)
# Convert to RGB
image
=
cv2
.
cvtColor
(
image
,
cv2
.
COLOR_BGR2RGB
)
destination
.
parent
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
cv2
.
imwrite
(
str
(
destination
),
image
)
# Save the image to disk
save_img
(
path
=
destination
,
img
=
image
)
except
Exception
as
e
:
raise
ImageDownloadError
(
split
=
split
,
path
=
str
(
destination
),
url
=
download_url
,
exc
=
e
...
...
This diff is collapsed.
Click to expand it.
dan/datasets/extract/utils.py
+
1
−
1
View file @
8094104e
...
...
@@ -55,7 +55,7 @@ def download_image(url):
resp
=
_retried_request
(
url
)
# Preprocess the image and prepare it for classification
image
=
Image
.
open
(
BytesIO
(
resp
.
content
))
image
=
Image
.
open
(
BytesIO
(
resp
.
content
))
.
convert
(
"
RGB
"
)
# Do not rotate JPEG images (see https://github.com/python-pillow/Pillow/issues/4703)
image
=
ImageOps
.
exif_transpose
(
image
)
...
...
This diff is collapsed.
Click to expand it.
tests/test_extract.py
+
4
−
3
View file @
8094104e
...
...
@@ -15,6 +15,7 @@ from dan.datasets.extract.exceptions import NoEndTokenError
from
dan.datasets.extract.extract
import
IIIF_FULL_SIZE
,
ArkindexExtractor
from
dan.datasets.extract.utils
import
EntityType
,
insert_token
,
remove_spaces
from
dan.utils
import
parse_tokens
from
line_image_extractor.image_utils
import
BoundingBox
,
polygon_to_bbox
from
tests
import
FIXTURES
EXTRACTION_DATA_PATH
=
FIXTURES
/
"
extraction
"
...
...
@@ -284,9 +285,9 @@ def test_extract(
if
token
]
def
mock_build_image_url
(
image_url
,
*
args
,
**
kwargs
):
def
mock_build_image_url
(
image_url
,
polygon
,
*
args
,
**
kwargs
):
# During tests, the image URL is its local path
return
image_url
return
polygon_to_bbox
(
json
.
loads
(
str
(
polygon
))),
image_url
extractor
=
ArkindexExtractor
(
folders
=
[
"
train
"
,
"
val
"
,
"
test
"
],
...
...
@@ -423,7 +424,7 @@ def test_download_image_error(iiif_url, caplog, capsys):
"
destination
"
:
"
/dev/null
"
,
}
# Make download_image crash
iiif_url
.
return_value
=
task
[
"
image_url
"
]
iiif_url
.
return_value
=
BoundingBox
(
0
,
0
,
0
,
0
),
task
[
"
image_url
"
]
extractor
=
ArkindexExtractor
(
folders
=
[
"
train
"
,
"
val
"
,
"
test
"
],
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment