Commit f592b055 authored by Manon Blanco, committed by Yoann Schneider

Unit tests for extraction command

parent 5d8a8cf8
1 merge request: !271 Unit tests for extraction command
Showing changed files with 559 additions and 14 deletions
@@ -102,7 +102,6 @@ class ArkindexExtractor:
         Insert tokens delimiting the start/end of each entity on the transcription.
         """
         text, text_offset = "", 0
-        # Keep all text by default if no separator was given
         for entity in entities:
             # Text before entity
             text += "".join(
@@ -133,7 +132,7 @@ class ArkindexExtractor:
         # Remaining text after the last entity
         text += "".join(filter(self._keep_char, full_text[text_offset:]))
-        if not self.entity_separators:
+        if not self.entity_separators or self.keep_spaces:
             return text
         # Add some clean up to avoid several separators between entities
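
With this change, the separator clean-up step is skipped entirely when spaces must be kept, not only when no separator was configured. A minimal standalone sketch of the resulting behaviour (clean_text is a hypothetical helper for illustration, not the project's actual method):

import re

def clean_text(text, entity_separators=(" ", "\n"), keep_spaces=False):
    # New early return: keep the text untouched when no separator is
    # configured or when the "keep_spaces" option is set
    if not entity_separators or keep_spaces:
        return text
    # Otherwise collapse runs of separators between entities into the first one
    pattern = "|".join(map(re.escape, entity_separators))
    return re.sub(f"(?:{pattern})+", entity_separators[0], text)

assert clean_text("Coupez  Louis  7.12.14") == "Coupez Louis 7.12.14"
assert clean_text("Coupez  Louis  7.12.14", keep_spaces=True) == "Coupez  Louis  7.12.14"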
tests/__init__.py:

# -*- coding: utf-8 -*-
from pathlib import Path

FIXTURES = Path(__file__).resolve().parent / "data"
tests/conftest.py:

# -*- coding: utf-8 -*-
import json
import uuid
from operator import itemgetter
from typing import List, Optional, Union

import pytest
from arkindex_export import (
    Element,
    ElementPath,
    Entity,
    EntityType,
    Image,
    ImageServer,
    Transcription,
    TranscriptionEntity,
    WorkerVersion,
    database,
    open_database,
)

from dan.ocr.train import update_config
from tests import FIXTURES


@pytest.fixture(scope="session")
def mock_database(tmp_path_factory):
    def create_transcription_entity(
        transcription: Transcription,
        worker_version: Union[str, None],
        type: str,
        name: str,
        offset: int,
    ) -> None:
        entity_type, _ = EntityType.get_or_create(
            name=type, defaults={"id": f"{type}_id"}
        )
        entity = Entity.create(
            id=str(uuid.uuid4()),
            name=name,
            type=entity_type,
            worker_version=worker_version,
        )
        TranscriptionEntity.create(
            id=str(uuid.uuid4()),
            entity=entity,
            length=len(name),
            offset=offset,
            transcription=transcription,
            worker_version=worker_version,
        )

    def create_transcriptions(element: Element, entities: List[dict]) -> None:
        if not entities:
            return

        # Add a transcription with entities
        entities = sorted(entities, key=itemgetter("offset"))
        # Shift each entity one character further: the names are joined with
        # double spaces below, to test the "keep_spaces" parameter of the
        # "extract" command
        for offset, entity in enumerate(entities[1:], start=1):
            entity["offset"] += offset

        for worker_version in [None, "worker_version_id"]:
            # Use different transcriptions to filter by worker version
            if worker_version == "worker_version_id":
                for entity in entities:
                    entity["name"] = entity["name"].lower()
            transcription = Transcription.create(
                id=element.id + (worker_version or ""),
                # Extra spaces to test the "keep_spaces" parameter of the
                # "extract" command
                text="  ".join(map(itemgetter("name"), entities)),
                element=element,
                worker_version=worker_version,
            )
            for entity in entities:
                create_transcription_entity(
                    transcription=transcription,
                    worker_version=worker_version,
                    **entity,
                )

    def create_element(id: str, parent: Optional[Element] = None) -> None:
        element_path = (FIXTURES / "extraction" / "elements" / id).with_suffix(".json")
        element_json = json.loads(element_path.read_text())

        polygon = element_json.get("polygon")
        # Always use page images because polygons are based on the full image.
        # Reconstruct and reuse the page ID to use the image cache
        # (and avoid downloading through the Arkindex API).
        image_id = "-".join(id.split("-")[:2])
        image, _ = (
            Image.get_or_create(
                id=image_id,
                defaults={
                    "server": image_server,
                    "url": f"http://image/{image_id}/url",
                    "width": 0,
                    "height": 0,
                },
            )
            if polygon
            else (None, False)
        )

        element = Element.create(
            id=id,
            name=id,
            type=element_json["type"],
            image=image,
            polygon=json.dumps(polygon) if polygon else None,
            created=0.0,
            updated=0.0,
        )
        if parent:
            ElementPath.create(id=str(uuid.uuid4()), parent=parent, child=element)

        create_transcriptions(
            element=element,
            entities=element_json.get("transcription_entities", []),
        )

        # Recursively create the children
        for child in element_json.get("children", []):
            create_element(id=child, parent=element)

    MODELS = [
        WorkerVersion,
        ImageServer,
        Image,
        Element,
        ElementPath,
        EntityType,
        Entity,
        Transcription,
        TranscriptionEntity,
    ]

    # Initialisation
    tmp_path = tmp_path_factory.mktemp("data")
    database_path = tmp_path / "db.sqlite"
    database.init(
        database_path,
        pragmas={
            # Recommended settings from peewee
            # http://docs.peewee-orm.com/en/latest/peewee/database.html#recommended-settings
            # Do not set journal mode to WAL as it writes in the database
            "cache_size": -1 * 64000,  # 64MB
            "foreign_keys": 1,
            "ignore_check_constraints": 0,
            "synchronous": 0,
        },
    )
    database.connect()

    # Create tables
    database.create_tables(MODELS)

    image_server = ImageServer.create(
        id="image_server_id",
        url="http://image/server/url",
        display_name="Image server",
    )

    WorkerVersion.create(
        id="worker_version_id",
        slug="worker_version",
        name="Worker version",
        repository_url="http://repository/url",
        revision="main",
        type="worker",
    )

    # Create the element tree, starting from the root folder
    create_element(id="root")


@pytest.fixture
def database_path():
    return FIXTURES / "export.sqlite"


@pytest.fixture(autouse=True)
def demo_db(database_path):
    """
    Open a connection to a known demo database
    """
    open_database(database_path)
    return database_path


@pytest.fixture
......
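
As a usage sketch (a hypothetical test, not part of the commit): since the fixture lowercases entity names only on the worker-version transcriptions, filtering by worker version can be verified directly:

from arkindex_export import Transcription

def test_worker_version_texts(mock_database):
    # Transcriptions created under "worker_version_id" were lowercased by the fixture
    worker_texts = [
        transcription.text
        for transcription in Transcription.select().where(
            Transcription.worker_version == "worker_version_id"
        )
    ]
    assert worker_texts
    assert all(text == text.lower() for text in worker_texts)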

tests/data/extraction/elements/root.json:
{
"type": "folder",
"children": [
"train",
"val",
"test"
]
}
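
Each JSON fixture below describes one element; create_element loads root first, then recurses through the ids listed in children, building the tree root → train/val/test → double pages → text lines. A hypothetical helper that mirrors this traversal:

import json
from pathlib import Path

def walk(elements: Path, id: str, depth: int = 0) -> None:
    # Print the element tree in the order create_element traverses it
    element = json.loads((elements / id).with_suffix(".json").read_text())
    print("  " * depth + f"{id} ({element['type']})")
    for child in element.get("children", []):
        walk(elements, child, depth + 1)

# Example: walk(FIXTURES / "extraction" / "elements", "root")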

tests/data/extraction/elements/test-page_1-line_1.json:
{
"type": "text_line",
"polygon": [
[37, 191],
[37, 339],
[767, 339],
[767, 191],
[37, 191]
],
"transcription_entities": [
{
"name": "Coupez",
"type": "surname",
"offset": 0
},
{
"name": "Louis",
"type": "firstname",
"offset": 7
},
{
"name": "7.12.14",
"type": "birthdate",
"offset": 13
}
]
}

tests/data/extraction/elements/test-page_1-line_2.json:
{
"type": "text_line",
"polygon": [
[28, 339],
[28, 464],
[767, 464],
[767, 339],
[28, 339]
],
"transcription_entities": [
{
"name": "Poutrain",
"type": "surname",
"offset": 0
},
{
"name": "Adolphe",
"type": "firstname",
"offset": 9
},
{
"name": "9.4.13",
"type": "birthdate",
"offset": 17
}
]
}

tests/data/extraction/elements/test-page_1-line_3.json:
{
"type": "text_line",
"polygon": [
[28, 464],
[28, 614],
[767, 614],
[767, 464],
[28, 464]
],
"transcription_entities": [
{
"name": "Gabale",
"type": "surname",
"offset": 0
},
{
"name": "François",
"type": "firstname",
"offset": 7
},
{
"name": "26.3.11",
"type": "birthdate",
"offset": 16
}
]
}

tests/data/extraction/elements/test-page_1.json:
{
"type": "double_page",
"polygon": [
[0, 0],
[0, 1357],
[1900, 1357],
[1900, 0],
[0, 0]
],
"children": [
"test-page_1-line_1",
"test-page_1-line_2",
"test-page_1-line_3"
]
}

tests/data/extraction/elements/test-page_2-line_1.json:
{
"type": "text_line",
"polygon": [
[14, 199],
[14, 330],
[767, 330],
[767, 199],
[14, 199]
],
"transcription_entities": [
{
"name": "Durosoy",
"type": "surname",
"offset": 0
},
{
"name": "Louis",
"type": "firstname",
"offset": 8
},
{
"name": "22-4-18",
"type": "birthdate",
"offset": 14
}
]
}

tests/data/extraction/elements/test-page_2-line_2.json:
{
"type": "text_line",
"polygon": [
[16, 330],
[16, 471],
[765, 471],
[765, 330],
[16, 330]
],
"transcription_entities": [
{
"name": "Colaiani",
"type": "surname",
"offset": 0
},
{
"name": "Angels",
"type": "firstname",
"offset": 9
},
{
"name": "28.11.17",
"type": "birthdate",
"offset": 16
}
]
}

tests/data/extraction/elements/test-page_2-line_3.json:
{
"type": "text_line",
"polygon": [
[11, 473],
[11, 598],
[772, 598],
[772, 473],
[11, 473]
],
"transcription_entities": [
{
"name": "Renouard",
"type": "surname",
"offset": 0
},
{
"name": "Maurice",
"type": "firstname",
"offset": 9
},
{
"name": "25.7.04",
"type": "birthdate",
"offset": 17
}
]
}

tests/data/extraction/elements/test-page_2.json:
{
"type": "double_page",
"polygon": [
[0, 0],
[0, 1334],
[1900, 1334],
[1900, 0],
[0, 0]
],
"children": [
"test-page_2-line_1",
"test-page_2-line_2",
"test-page_2-line_3"
]
}

tests/data/extraction/elements/test.json:
{
"type": "folder",
"children": [
"test-page_1",
"test-page_2"
]
}

tests/data/extraction/elements/train-page_1-line_1.json:
{
"type": "text_line",
"polygon": [
[27, 187],
[27, 327],
[754, 327],
[754, 187],
[27, 187]
],
"transcription_entities": [
{
"name": "Caillet",
"type": "surname",
"offset": 0
},
{
"name": "Maurice",
"type": "firstname",
"offset": 8
},
{
"name": "28.9.06",
"type": "birthdate",
"offset": 16
}
]
}

tests/data/extraction/elements/train-page_1-line_2.json:
{
"type": "text_line",
"polygon": [
[28, 328],
[28, 465],
[755, 465],
[755, 328],
[28, 328]
],
"transcription_entities": [
{
"name": "Reboul",
"type": "surname",
"offset": 0
},
{
"name": "Jean",
"type": "firstname",
"offset": 7
},
{
"name": "30.9.02",
"type": "birthdate",
"offset": 12
}
]
}

tests/data/extraction/elements/train-page_1-line_3.json:
{
"type": "text_line",
"polygon": [
[23, 463],
[23, 604],
[803, 604],
[803, 463],
[23, 463]
],
"transcription_entities": [
{
"name": "Bareyre",
"type": "surname",
"offset": 0
},
{
"name": "Jean",
"type": "firstname",
"offset": 8
},
{
"name": "28.3.11",
"type": "birthdate",
"offset": 13
}
]
}

tests/data/extraction/elements/train-page_1-line_4.json:
{
"type": "text_line",
"polygon": [
[21, 604],
[21, 743],
[812, 743],
[812, 604],
[21, 604]
],
"transcription_entities": [
{
"name": "Roussy",
"type": "surname",
"offset": 0
},
{
"name": "Jean",
"type": "firstname",
"offset": 7
},
{
"name": "4.11.14",
"type": "birthdate",
"offset": 12
}
]
}

tests/data/extraction/elements/train-page_1.json:
{
"type": "double_page",
"polygon": [
[0, 0],
[0, 1334],
[1900, 1334],
[1900, 0],
[0, 0]
],
"children": [
"train-page_1-line_1",
"train-page_1-line_2",
"train-page_1-line_3",
"train-page_1-line_4"
]
}

tests/data/extraction/elements/train-page_2-line_1.json:
{
"type": "text_line",
"polygon": [
[18, 197],
[18, 340],
[751, 340],
[751, 197],
[18, 197]
],
"transcription_entities": [
{
"name": "Marin",
"type": "surname",
"offset": 0
},
{
"name": "Marcel",
"type": "firstname",
"offset": 6
},
{
"name": "10.8.06",
"type": "birthdate",
"offset": 13
}
]
}

tests/data/extraction/elements/train-page_2-line_2.json:
{
"type": "text_line",
"polygon": [
[18, 340],
[18, 476],
[751, 476],
[751, 340],
[18, 340]
],
"transcription_entities": [
{
"name": "Roques",
"type": "surname",
"offset": 0
},
{
"name": "Eloi",
"type": "firstname",
"offset": 7
},
{
"name": "11.10.04",
"type": "birthdate",
"offset": 12
}
]
}
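
Note that the offsets in these files assume the entity names are joined with a single space; the extra spaces are only introduced later by create_transcriptions. A hypothetical sanity check over any line fixture:

import json
from pathlib import Path

def check_offsets(path: Path) -> None:
    # Each offset must equal the length of all previous names plus one
    # separating space per entity
    expected = 0
    for entity in json.loads(path.read_text()).get("transcription_entities", []):
        assert entity["offset"] == expected, path.name
        expected += len(entity["name"]) + 1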