Commit f592b055 authored by Manon Blanco, committed by Yoann Schneider

Unit tests for extraction command

parent 5d8a8cf8
1 merge request: !271 Unit tests for extraction command
Showing changed files with 559 additions and 14 deletions
@@ -102,7 +102,6 @@ class ArkindexExtractor:
         Insert tokens delimiting the start/end of each entity on the transcription.
         """
         text, text_offset = "", 0
-        # Keep all text by default if no separator was given
         for entity in entities:
             # Text before entity
             text += "".join(
@@ -133,7 +132,7 @@ class ArkindexExtractor:
         # Remaining text after the last entity
         text += "".join(filter(self._keep_char, full_text[text_offset:]))
-        if not self.entity_separators:
+        if not self.entity_separators or self.keep_spaces:
             return text
         # Add some clean up to avoid several separators between entities
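
With this change, the separator clean-up step is skipped entirely when spaces must be kept, not only when no separator was configured. A minimal standalone sketch of the resulting behaviour (clean_text is a hypothetical helper for illustration, not the project's actual method):

import re

def clean_text(text, entity_separators=(" ", "\n"), keep_spaces=False):
    # New early return: keep the text untouched when no separator is
    # configured or when the "keep_spaces" option is set
    if not entity_separators or keep_spaces:
        return text
    # Otherwise collapse runs of separators between entities into the first one
    pattern = "|".join(map(re.escape, entity_separators))
    return re.sub(f"(?:{pattern})+", entity_separators[0], text)

assert clean_text("Coupez  Louis  7.12.14") == "Coupez Louis 7.12.14"
assert clean_text("Coupez  Louis  7.12.14", keep_spaces=True) == "Coupez  Louis  7.12.14"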
tests/__init__.py:

# -*- coding: utf-8 -*-
from pathlib import Path

FIXTURES = Path(__file__).resolve().parent / "data"
tests/conftest.py:

# -*- coding: utf-8 -*-
import json
import uuid
from operator import itemgetter
from typing import List, Optional, Union

import pytest
from arkindex_export import (
    Element,
    ElementPath,
    Entity,
    EntityType,
    Image,
    ImageServer,
    Transcription,
    TranscriptionEntity,
    WorkerVersion,
    database,
    open_database,
)

from dan.ocr.train import update_config
from tests import FIXTURES


@pytest.fixture(scope="session")
def mock_database(tmp_path_factory):
    def create_transcription_entity(
        transcription: Transcription,
        worker_version: Union[str, None],
        type: str,
        name: str,
        offset: int,
    ) -> None:
        entity_type, _ = EntityType.get_or_create(
            name=type, defaults={"id": f"{type}_id"}
        )
        entity = Entity.create(
            id=str(uuid.uuid4()),
            name=name,
            type=entity_type,
            worker_version=worker_version,
        )
        TranscriptionEntity.create(
            id=str(uuid.uuid4()),
            entity=entity,
            length=len(name),
            offset=offset,
            transcription=transcription,
            worker_version=worker_version,
        )

    def create_transcriptions(element: Element, entities: List[dict]) -> None:
        if not entities:
            return

        # Add a transcription with entities
        entities = sorted(entities, key=itemgetter("offset"))
        # Shift each entity one character further: the names are joined with
        # double spaces below, to test the "keep_spaces" parameter of the
        # "extract" command
        for offset, entity in enumerate(entities[1:], start=1):
            entity["offset"] += offset

        for worker_version in [None, "worker_version_id"]:
            # Use different transcriptions to filter by worker version
            if worker_version == "worker_version_id":
                for entity in entities:
                    entity["name"] = entity["name"].lower()
            transcription = Transcription.create(
                id=element.id + (worker_version or ""),
                # Extra spaces to test the "keep_spaces" parameter of the
                # "extract" command
                text="  ".join(map(itemgetter("name"), entities)),
                element=element,
                worker_version=worker_version,
            )
            for entity in entities:
                create_transcription_entity(
                    transcription=transcription,
                    worker_version=worker_version,
                    **entity,
                )

    def create_element(id: str, parent: Optional[Element] = None) -> None:
        element_path = (FIXTURES / "extraction" / "elements" / id).with_suffix(".json")
        element_json = json.loads(element_path.read_text())

        polygon = element_json.get("polygon")
        # Always use page images because polygons are based on the full image.
        # Reconstruct and reuse the page ID to use the image cache
        # (and avoid downloading through the Arkindex API).
        image_id = "-".join(id.split("-")[:2])
        image, _ = (
            Image.get_or_create(
                id=image_id,
                defaults={
                    "server": image_server,
                    "url": f"http://image/{image_id}/url",
                    "width": 0,
                    "height": 0,
                },
            )
            if polygon
            else (None, False)
        )

        element = Element.create(
            id=id,
            name=id,
            type=element_json["type"],
            image=image,
            polygon=json.dumps(polygon) if polygon else None,
            created=0.0,
            updated=0.0,
        )
        if parent:
            ElementPath.create(id=str(uuid.uuid4()), parent=parent, child=element)

        create_transcriptions(
            element=element,
            entities=element_json.get("transcription_entities", []),
        )

        # Recursively create the children
        for child in element_json.get("children", []):
            create_element(id=child, parent=element)

    MODELS = [
        WorkerVersion,
        ImageServer,
        Image,
        Element,
        ElementPath,
        EntityType,
        Entity,
        Transcription,
        TranscriptionEntity,
    ]

    # Initialisation
    tmp_path = tmp_path_factory.mktemp("data")
    database_path = tmp_path / "db.sqlite"
    database.init(
        database_path,
        pragmas={
            # Recommended settings from peewee
            # http://docs.peewee-orm.com/en/latest/peewee/database.html#recommended-settings
            # Do not set journal mode to WAL as it writes in the database
            "cache_size": -1 * 64000,  # 64MB
            "foreign_keys": 1,
            "ignore_check_constraints": 0,
            "synchronous": 0,
        },
    )
    database.connect()

    # Create tables
    database.create_tables(MODELS)

    image_server = ImageServer.create(
        id="image_server_id",
        url="http://image/server/url",
        display_name="Image server",
    )

    WorkerVersion.create(
        id="worker_version_id",
        slug="worker_version",
        name="Worker version",
        repository_url="http://repository/url",
        revision="main",
        type="worker",
    )

    # Create the element tree, starting from the root folder
    create_element(id="root")


@pytest.fixture
def database_path():
    return FIXTURES / "export.sqlite"


@pytest.fixture(autouse=True)
def demo_db(database_path):
    """
    Open a connection to a known demo database
    """
    open_database(database_path)
    return database_path


@pytest.fixture
......
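
As a usage sketch (a hypothetical test, not part of the commit): since the fixture lowercases entity names only on the worker-version transcriptions, filtering by worker version can be verified directly:

from arkindex_export import Transcription

def test_worker_version_texts(mock_database):
    # Transcriptions created under "worker_version_id" were lowercased by the fixture
    worker_texts = [
        transcription.text
        for transcription in Transcription.select().where(
            Transcription.worker_version == "worker_version_id"
        )
    ]
    assert worker_texts
    assert all(text == text.lower() for text in worker_texts)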

tests/data/extraction/elements/root.json:
{
"type": "folder",
"children": [
"train",
"val",
"test"
]
}
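
Each JSON fixture below describes one element; create_element loads root first, then recurses through the ids listed in children, building the tree root → train/val/test → double pages → text lines. A hypothetical helper that mirrors this traversal:

import json
from pathlib import Path

def walk(elements: Path, id: str, depth: int = 0) -> None:
    # Print the element tree in the order create_element traverses it
    element = json.loads((elements / id).with_suffix(".json").read_text())
    print("  " * depth + f"{id} ({element['type']})")
    for child in element.get("children", []):
        walk(elements, child, depth + 1)

# Example: walk(FIXTURES / "extraction" / "elements", "root")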

tests/data/extraction/elements/test-page_1-line_1.json:
{
"type": "text_line",
"polygon": [
[37, 191],
[37, 339],
[767, 339],
[767, 191],
[37, 191]
],
"transcription_entities": [
{
"name": "Coupez",
"type": "surname",
"offset": 0
},
{
"name": "Louis",
"type": "firstname",
"offset": 7
},
{
"name": "7.12.14",
"type": "birthdate",
"offset": 13
}
]
}

tests/data/extraction/elements/test-page_1-line_2.json:
{
"type": "text_line",
"polygon": [
[28, 339],
[28, 464],
[767, 464],
[767, 339],
[28, 339]
],
"transcription_entities": [
{
"name": "Poutrain",
"type": "surname",
"offset": 0
},
{
"name": "Adolphe",
"type": "firstname",
"offset": 9
},
{
"name": "9.4.13",
"type": "birthdate",
"offset": 17
}
]
}

tests/data/extraction/elements/test-page_1-line_3.json:
{
"type": "text_line",
"polygon": [
[28, 464],
[28, 614],
[767, 614],
[767, 464],
[28, 464]
],
"transcription_entities": [
{
"name": "Gabale",
"type": "surname",
"offset": 0
},
{
"name": "François",
"type": "firstname",
"offset": 7
},
{
"name": "26.3.11",
"type": "birthdate",
"offset": 16
}
]
}

tests/data/extraction/elements/test-page_1.json:
{
"type": "double_page",
"polygon": [
[0, 0],
[0, 1357],
[1900, 1357],
[1900, 0],
[0, 0]
],
"children": [
"test-page_1-line_1",
"test-page_1-line_2",
"test-page_1-line_3"
]
}

tests/data/extraction/elements/test-page_2-line_1.json:
{
"type": "text_line",
"polygon": [
[14, 199],
[14, 330],
[767, 330],
[767, 199],
[14, 199]
],
"transcription_entities": [
{
"name": "Durosoy",
"type": "surname",
"offset": 0
},
{
"name": "Louis",
"type": "firstname",
"offset": 8
},
{
"name": "22-4-18",
"type": "birthdate",
"offset": 14
}
]
}

tests/data/extraction/elements/test-page_2-line_2.json:
{
"type": "text_line",
"polygon": [
[16, 330],
[16, 471],
[765, 471],
[765, 330],
[16, 330]
],
"transcription_entities": [
{
"name": "Colaiani",
"type": "surname",
"offset": 0
},
{
"name": "Angels",
"type": "firstname",
"offset": 9
},
{
"name": "28.11.17",
"type": "birthdate",
"offset": 16
}
]
}

tests/data/extraction/elements/test-page_2-line_3.json:
{
"type": "text_line",
"polygon": [
[11, 473],
[11, 598],
[772, 598],
[772, 473],
[11, 473]
],
"transcription_entities": [
{
"name": "Renouard",
"type": "surname",
"offset": 0
},
{
"name": "Maurice",
"type": "firstname",
"offset": 9
},
{
"name": "25.7.04",
"type": "birthdate",
"offset": 17
}
]
}

tests/data/extraction/elements/test-page_2.json:
{
"type": "double_page",
"polygon": [
[0, 0],
[0, 1334],
[1900, 1334],
[1900, 0],
[0, 0]
],
"children": [
"test-page_2-line_1",
"test-page_2-line_2",
"test-page_2-line_3"
]
}

tests/data/extraction/elements/test.json:
{
"type": "folder",
"children": [
"test-page_1",
"test-page_2"
]
}

tests/data/extraction/elements/train-page_1-line_1.json:
{
"type": "text_line",
"polygon": [
[27, 187],
[27, 327],
[754, 327],
[754, 187],
[27, 187]
],
"transcription_entities": [
{
"name": "Caillet",
"type": "surname",
"offset": 0
},
{
"name": "Maurice",
"type": "firstname",
"offset": 8
},
{
"name": "28.9.06",
"type": "birthdate",
"offset": 16
}
]
}

tests/data/extraction/elements/train-page_1-line_2.json:
{
"type": "text_line",
"polygon": [
[28, 328],
[28, 465],
[755, 465],
[755, 328],
[28, 328]
],
"transcription_entities": [
{
"name": "Reboul",
"type": "surname",
"offset": 0
},
{
"name": "Jean",
"type": "firstname",
"offset": 7
},
{
"name": "30.9.02",
"type": "birthdate",
"offset": 12
}
]
}

tests/data/extraction/elements/train-page_1-line_3.json:
{
"type": "text_line",
"polygon": [
[23, 463],
[23, 604],
[803, 604],
[803, 463],
[23, 463]
],
"transcription_entities": [
{
"name": "Bareyre",
"type": "surname",
"offset": 0
},
{
"name": "Jean",
"type": "firstname",
"offset": 8
},
{
"name": "28.3.11",
"type": "birthdate",
"offset": 13
}
]
}

tests/data/extraction/elements/train-page_1-line_4.json:
{
"type": "text_line",
"polygon": [
[21, 604],
[21, 743],
[812, 743],
[812, 604],
[21, 604]
],
"transcription_entities": [
{
"name": "Roussy",
"type": "surname",
"offset": 0
},
{
"name": "Jean",
"type": "firstname",
"offset": 7
},
{
"name": "4.11.14",
"type": "birthdate",
"offset": 12
}
]
}

tests/data/extraction/elements/train-page_1.json:
{
"type": "double_page",
"polygon": [
[0, 0],
[0, 1334],
[1900, 1334],
[1900, 0],
[0, 0]
],
"children": [
"train-page_1-line_1",
"train-page_1-line_2",
"train-page_1-line_3",
"train-page_1-line_4"
]
}

tests/data/extraction/elements/train-page_2-line_1.json:
{
"type": "text_line",
"polygon": [
[18, 197],
[18, 340],
[751, 340],
[751, 197],
[18, 197]
],
"transcription_entities": [
{
"name": "Marin",
"type": "surname",
"offset": 0
},
{
"name": "Marcel",
"type": "firstname",
"offset": 6
},
{
"name": "10.8.06",
"type": "birthdate",
"offset": 13
}
]
}

tests/data/extraction/elements/train-page_2-line_2.json:
{
"type": "text_line",
"polygon": [
[18, 340],
[18, 476],
[751, 476],
[751, 340],
[18, 340]
],
"transcription_entities": [
{
"name": "Roques",
"type": "surname",
"offset": 0
},
{
"name": "Eloi",
"type": "firstname",
"offset": 7
},
{
"name": "11.10.04",
"type": "birthdate",
"offset": 12
}
]
}
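
Note that the offsets in these files assume the entity names are joined with a single space; the extra spaces are only introduced later by create_transcriptions. A hypothetical sanity check over any line fixture:

import json
from pathlib import Path

def check_offsets(path: Path) -> None:
    # Each offset must equal the length of all previous names plus one
    # separating space per entity
    expected = 0
    for entity in json.loads(path.read_text()).get("transcription_entities", []):
        assert entity["offset"] == expected, path.name
        expected += len(entity["name"]) + 1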