Commit 8094104e authored by Yoann Schneider, committed by Solene Tarride

Red background

parent e25de69d
1 merge request: !281 Red background
@@ -7,7 +7,7 @@ import random
from collections import defaultdict
from concurrent.futures import Future, ThreadPoolExecutor
from pathlib import Path
from typing import Dict, List, Optional, Union
from typing import Dict, List, Optional, Tuple, Union
from uuid import UUID
import cv2
@@ -35,8 +35,12 @@ from dan.datasets.extract.utils import (
remove_spaces,
)
from dan.utils import EntityType, parse_tokens
from line_image_extractor.extractor import save_img
from line_image_extractor.image_utils import deskew_image, polygon_to_bbox
from line_image_extractor.extractor import extract
from line_image_extractor.image_utils import (
BoundingBox,
Extraction,
polygon_to_bbox,
)
IMAGES_DIR = "images" # Subpath to the images directory.
@@ -111,11 +115,13 @@ class ArkindexExtractor:
elif bigger_height:
return f",{self.max_height}"
def build_iiif_url(self, polygon, image_url):
def build_iiif_url(self, polygon, image_url) -> Tuple[BoundingBox, str]:
bbox = polygon_to_bbox(json.loads(str(polygon)))
size = self.get_iiif_size_arg(width=bbox.width, height=bbox.height)
# Rotations are done using the lib
return IIIF_URL.format(image_url=image_url, bbox=get_bbox(polygon), size=size)
return bbox, IIIF_URL.format(
image_url=image_url, bbox=get_bbox(polygon), size=size
)
def _keep_char(self, char: str) -> bool:
# Keep all text by default if no separator was given
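A minimal sketch of how the new return value of build_iiif_url is meant to be consumed; extractor, polygon and image_url are illustrative placeholders, not values from this diff:

    # build_iiif_url now hands back the crop's bounding box alongside the IIIF URL,
    # so the caller knows where the crop sits inside the full image and can shift
    # the polygon into the crop's frame (see the download hunk below).
    bbox, download_url = extractor.build_iiif_url(polygon=polygon, image_url=image_url)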
@@ -214,20 +220,28 @@
:param image_url: Base IIIF URL of the image.
:param destination: Where the image should be saved.
"""
download_url: str = self.build_iiif_url(polygon=polygon, image_url=image_url)
bbox, download_url = self.build_iiif_url(polygon=polygon, image_url=image_url)
try:
img: Image.Image = download_image(download_url)
# Deskew image
image = deskew_image(
np.asarray(img), polygon=np.asarray(polygon), max_deskew_angle=45
# The polygon's coordinates are in the frame of the full image
# We need to remove the offset of the bounding rectangle
polygon = [(x - bbox.x, y - bbox.y) for x, y in polygon]
# Normalize bbox
bbox = BoundingBox(x=0, y=0, width=bbox.width, height=bbox.height)
image = extract(
img=cv2.cvtColor(np.asarray(img), cv2.COLOR_RGB2BGR),
polygon=np.asarray(polygon).clip(0),
bbox=bbox,
extraction_mode=Extraction.boundingRect,
max_deskew_angle=45,
)
# Convert to RGB
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
destination.parent.mkdir(parents=True, exist_ok=True)
cv2.imwrite(str(destination), image)
# Save the image to disk
save_img(path=destination, img=image)
except Exception as e:
raise ImageDownloadError(
split=split, path=str(destination), url=download_url, exc=e
......
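A worked example of the coordinate handling above, with purely illustrative numbers; the polygon and bounding box do not come from this diff:

    from line_image_extractor.image_utils import BoundingBox

    # Polygon in the full image's frame, and the crop the IIIF server returns.
    polygon = [(110, 220), (310, 220), (310, 320), (110, 240)]
    bbox = BoundingBox(x=110, y=220, width=200, height=100)

    # Shift the polygon so the crop's top-left corner becomes (0, 0).
    local_polygon = [(x - bbox.x, y - bbox.y) for x, y in polygon]
    # -> [(0, 0), (200, 0), (200, 100), (0, 20)]

    # The bounding box handed to extract() is normalised the same way; the
    # .clip(0) in the code above guards against points that land slightly
    # outside the crop and would otherwise go negative.
    local_bbox = BoundingBox(x=0, y=0, width=bbox.width, height=bbox.height)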
@@ -55,7 +55,7 @@ def download_image(url):
resp = _retried_request(url)
# Preprocess the image and prepare it for classification
image = Image.open(BytesIO(resp.content))
image = Image.open(BytesIO(resp.content)).convert("RGB")
# Do not rotate JPEG images (see https://github.com/python-pillow/Pillow/issues/4703)
image = ImageOps.exif_transpose(image)
......
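The .convert("RGB") call guarantees a three-channel image whatever the server returns (grayscale, palette or RGBA crops are all possible), which is what the cv2.cvtColor(..., cv2.COLOR_RGB2BGR) call in the extractor expects. A small self-contained illustration, using a synthetic grayscale image rather than a real IIIF response:

    from io import BytesIO

    import numpy as np
    from PIL import Image

    # Simulate a server responding with a single-channel (grayscale) crop.
    buffer = BytesIO()
    Image.new("L", (4, 4), color=128).save(buffer, format="PNG")
    payload = buffer.getvalue()

    gray = Image.open(BytesIO(payload))
    print(np.asarray(gray).shape)   # (4, 4): no channel axis, cv2.cvtColor(RGB2BGR) would fail

    rgb = Image.open(BytesIO(payload)).convert("RGB")
    print(np.asarray(rgb).shape)    # (4, 4, 3): safe to feed to the extractor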
@@ -15,6 +15,7 @@ from dan.datasets.extract.exceptions import NoEndTokenError
from dan.datasets.extract.extract import IIIF_FULL_SIZE, ArkindexExtractor
from dan.datasets.extract.utils import EntityType, insert_token, remove_spaces
from dan.utils import parse_tokens
from line_image_extractor.image_utils import BoundingBox, polygon_to_bbox
from tests import FIXTURES
EXTRACTION_DATA_PATH = FIXTURES / "extraction"
@@ -284,9 +285,9 @@ def test_extract(
if token
]
def mock_build_image_url(image_url, *args, **kwargs):
def mock_build_image_url(image_url, polygon, *args, **kwargs):
# During tests, the image URL is its local path
return image_url
return polygon_to_bbox(json.loads(str(polygon))), image_url
extractor = ArkindexExtractor(
folders=["train", "val", "test"],
@@ -423,7 +424,7 @@ def test_download_image_error(iiif_url, caplog, capsys):
"destination": "/dev/null",
}
# Make download_image crash
iiif_url.return_value = task["image_url"]
iiif_url.return_value = BoundingBox(0, 0, 0, 0), task["image_url"]
extractor = ArkindexExtractor(
folders=["train", "val", "test"],
......
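Both test updates follow from the new contract: anything standing in for build_iiif_url now has to hand back a (BoundingBox, URL) pair, because download_and_save unpacks the result before doing anything else. A minimal sketch of such a stand-in, with illustrative values:

    from line_image_extractor.image_utils import BoundingBox

    def fake_build_iiif_url(polygon, image_url, *args, **kwargs):
        # A bare URL would break the `bbox, download_url = ...` unpacking in
        # download_and_save before the behaviour under test is even reached.
        return BoundingBox(0, 0, 0, 0), image_url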