Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • atr/data-generator
1 result
Show changes
Commits on Source (2)
......@@ -7,7 +7,7 @@ repos:
rev: 22.3.0
hooks:
- id: black
- repo: https://gitlab.com/pycqa/flake8
- repo: https://github.com/pycqa/flake8
rev: 3.9.2
hooks:
- id: flake8
......
......@@ -31,6 +31,7 @@ class TranscriptionType(Enum):
paragraph: str = "paragraph"
act: str = "act"
page: str = "page"
text: str = "text"
@dataclass
......@@ -136,6 +137,7 @@ class FilterArgs:
Use `--accepted_worker_version_ids manual` to get only manual transcriptions
style: Filter line images by style class. 'other' corresponds to line elements that have neither
handwritten or typewritten class : {[s.name for s in Style]}
accepted_metadatas: Key-value dictionary where each entry is a mandatory Arkindex metadata name/value. Filter lines by metadata.
"""
transcription_type: TranscriptionType = TranscriptionType.text_line
......@@ -144,3 +146,4 @@ class FilterArgs:
accepted_worker_version_ids: List[str] = field(default_factory=list)
skip_vertical_lines: bool = False
style: Style = None
accepted_metadatas: dict = field(default_factory=dict)
......@@ -81,6 +81,8 @@ class HTRDataGenerator:
self.should_filter_by_worker = bool(self.accepted_worker_version_ids)
self.style = filter.style
self.should_filter_by_style = bool(self.style)
self.accepted_metadatas = filter.accepted_metadatas
self.should_filter_by_metadatas = bool(self.accepted_metadatas)
self.transcription_type = filter.transcription_type.value
self.skip_vertical_lines = filter.skip_vertical_lines
self.skipped_pages_count = 0
......@@ -150,11 +152,25 @@ class HTRDataGenerator:
img = read_img(cached_img_path, self.grayscale)
return img
def metadata_filtering(self, elt):
    """Tell whether an element carries every accepted metadata name/value pair.

    Args:
        elt: Element dictionary whose "metadata" entry is a list of
            dictionaries, each holding at least a "name" and a "value" key.
            A null "metadata" entry is treated as an empty list.

    Returns:
        True when each name/value entry of `self.accepted_metadatas` is found
        among the element's metadata (always True when the filter is empty),
        False otherwise.
    """
    # Compare (name, value) pairs instead of collapsing the list into a
    # name -> value dict: a dict keeps only the last value of a repeated
    # name, which would wrongly reject an element that does carry the
    # accepted pair under a duplicated metadata name.
    present_pairs = [
        (metadata["name"], metadata["value"]) for metadata in elt["metadata"] or []
    ]
    return all(
        (name, value) in present_pairs
        for name, value in self.accepted_metadatas.items()
    )
def get_accepted_zones(self, page_id: str):
try:
accepted_zones = []
for elt in self.api_client.cached_paginate(
"ListElementChildren", id=page_id, with_classes=True
"ListElementChildren",
id=page_id,
with_classes=self.should_filter_by_class,
with_metadata=self.should_filter_by_metadatas,
):
elem_classes = [c for c in elt["classes"] if c["state"] != "rejected"]
......@@ -193,12 +209,17 @@ class HTRDataGenerator:
raise ValueError(
f"Multiple style classes on the same element! {elt['id']} - {elem_classes}"
)
should_accept = found_class == self.style
if not should_accept:
continue
if self.should_filter_by_metadatas:
if self.metadata_filtering(elt):
if found_class == self.style:
accepted_zones.append(elt["zone"]["id"])
else:
accepted_zones.append(elt["zone"]["id"])
logger.info(
"Number of accepted zone for page {} : {}".format(
page_id, len(accepted_zones)
......@@ -269,9 +290,11 @@ class HTRDataGenerator:
and res["worker_version_id"] not in self.accepted_worker_version_ids
):
continue
if (self.should_filter_by_class or self.should_filter_by_style) and (
res["element"]["zone"]["id"] not in accepted_zones
):
if (
self.should_filter_by_class
or self.should_filter_by_style
or self.should_filter_by_metadatas
) and (res["element"]["zone"]["id"] not in accepted_zones):
continue
if res["element"]["type"] != self.transcription_type:
continue
......@@ -280,7 +303,7 @@ class HTRDataGenerator:
if not text or not text.strip():
continue
if "\n" in text.strip():
if "\n" in text.strip() and not self.transcription_type == "text":
elem_id = res["element"]["id"]
raise ValueError(
f"Newlines are not allowed in line transcriptions - {page_id} - {elem_id} - {text}"
......@@ -396,7 +419,11 @@ class HTRDataGenerator:
)
def extract_lines(self, page_id: str, image_data: dict):
if self.should_filter_by_class or self.should_filter_by_style:
if (
self.should_filter_by_class
or self.should_filter_by_style
or self.should_filter_by_metadatas
):
accepted_zones = self.get_accepted_zones(page_id)
else:
accepted_zones = []
......
......@@ -41,6 +41,23 @@ def fake_image():
return extractor.read_img(img_path)
@pytest.fixture
def fake_run_filter_metadata():
    """Provide a mock API client answering `ListElementChildren` for `fake_page`.

    The response is loaded from the `Maurdor/ListElementChildren/fake_page.json`
    fixture and registered for a call made with `with_classes=False` and
    `with_metadata=True`.

    Returns:
        The configured `MockApiClient`, whose `cached_paginate` is aliased to
        its plain `paginate`.
    """
    api_client = MockApiClient()
    # ignore caching for this test
    api_client.cached_paginate = api_client.paginate
    # Read the JSON fixture as UTF-8 explicitly so the result does not depend
    # on the platform's default encoding.
    with open(
        FIXTURES / "Maurdor/ListElementChildren/fake_page.json", encoding="utf-8"
    ) as f:
        pages_json = json.load(f)
    api_client.add_response(
        "ListElementChildren",
        response=pages_json,
        id="fake_page",
        with_classes=False,
        with_metadata=True,
    )
    return api_client
@pytest.fixture
def fake_run_volume_api_client(fake_volume_id):
api_client = MockApiClient()
......
[
{
"id": "0b719a5a-40c7-47ec-96c8-6c3064df5485", "type": "text", "name": "29", "corpus": {"id": "809222f2-b4a4-444c-a1a8-37667ccbff6b", "name": "Maurdor", "public": false}, "thumbnail_url": null, "zone": {"id": "0b719a5a-40c7-47ec-96c8-6c3064df5485", "polygon": [[190, 1187], [190, 1242], [389, 1242], [389, 1187], [190, 1187]], "image": {"id": "65955cb6-7aeb-4f7c-8531-5d797c135e41", "path": "public%2Fmaurdor%2Fdev_png%2FYXLOKV-00.png", "width": 1700, "height": 2339, "url": "https://europe-gamma.iiif.teklia.com/iiif/2/public%2Fmaurdor%2Fdev_png%2FYXLOKV-00.png", "s3_url": null, "status": "checked", "server": {"display_name": "https://europe-gamma.iiif.teklia.com/iiif/2", "url": "https://europe-gamma.iiif.teklia.com/iiif/2", "max_width": null, "max_height": null}}, "url": "https://europe-gamma.iiif.teklia.com/iiif/2/public%2Fmaurdor%2Fdev_png%2FYXLOKV-00.png/190,1187,199,55/full/0/default.jpg"}, "rotation_angle": 0, "mirrored": false, "created": "2022-09-05T22:15:11.027308Z", "classes": [], "metadata": [{"id": "8b5100cf-98c6-4d0f-8bbe-6df07cbdd02e", "type": "text", "name": "Language", "value": "arabic", "dates": []}, {"id": "bf7de461-1ca2-4160-b1c7-be63e28bd06e", "type": "text", "name": "Script", "value": "typed", "dates": []}], "has_children": null, "worker_version_id": "329f5b6e-78c9-4240-a2cf-78a746c6f897", "confidence": null
},
{"id": "fbf0d8e5-4729-4bd1-988a-49e178f7d0e6", "type": "text", "name": "27", "corpus": {"id": "809222f2-b4a4-444c-a1a8-37667ccbff6b", "name": "Maurdor", "public": false}, "thumbnail_url": null, "zone": {"id": "fbf0d8e5-4729-4bd1-988a-49e178f7d0e6", "polygon": [[676, 2124], [676, 2195], [1070, 2195], [1070, 2124], [676, 2124]], "image": {"id": "65955cb6-7aeb-4f7c-8531-5d797c135e41", "path": "public%2Fmaurdor%2Fdev_png%2FYXLOKV-00.png", "width": 1700, "height": 2339, "url": "https://europe-gamma.iiif.teklia.com/iiif/2/public%2Fmaurdor%2Fdev_png%2FYXLOKV-00.png", "s3_url": null, "status": "checked", "server": {"display_name": "https://europe-gamma.iiif.teklia.com/iiif/2", "url": "https://europe-gamma.iiif.teklia.com/iiif/2", "max_width": null, "max_height": null}}, "url": "https://europe-gamma.iiif.teklia.com/iiif/2/public%2Fmaurdor%2Fdev_png%2FYXLOKV-00.png/676,2124,394,71/full/0/default.jpg"}, "rotation_angle": 0, "mirrored": false, "created": "2022-09-05T22:15:11.027161Z", "classes": [], "metadata": [{"id": "75f36675-92a1-43df-90d0-bccd97b43594", "type": "text", "name": "Function", "value": "reference", "dates": []}, {"id": "e90372ce-db83-47ca-9362-b2e5def497e8", "type": "text", "name": "Language", "value": "english", "dates": []}, {"id": "bd1ad52f-dbb4-4dc8-a865-d6c42abd690e", "type": "text", "name": "Script", "value": "typed", "dates": []}], "has_children": null, "worker_version_id": "329f5b6e-78c9-4240-a2cf-78a746c6f897", "confidence": null},
{"id": "18725942-24f5-4f81-a16f-62c1323c1041", "type": "text", "name": "28", "corpus": {"id": "809222f2-b4a4-444c-a1a8-37667ccbff6b", "name": "Maurdor", "public": false}, "thumbnail_url": null, "zone": {"id": "18725942-24f5-4f81-a16f-62c1323c1041", "polygon": [[1154, 1244], [1154, 1327], [1456, 1327], [1456, 1244], [1154, 1244]], "image": {"id": "65955cb6-7aeb-4f7c-8531-5d797c135e41", "path": "public%2Fmaurdor%2Fdev_png%2FYXLOKV-00.png", "width": 1700, "height": 2339, "url": "https://europe-gamma.iiif.teklia.com/iiif/2/public%2Fmaurdor%2Fdev_png%2FYXLOKV-00.png", "s3_url": null, "status": "checked", "server": {"display_name": "https://europe-gamma.iiif.teklia.com/iiif/2", "url": "https://europe-gamma.iiif.teklia.com/iiif/2", "max_width": null, "max_height": null}}, "url": "https://europe-gamma.iiif.teklia.com/iiif/2/public%2Fmaurdor%2Fdev_png%2FYXLOKV-00.png/1154,1244,302,83/full/0/default.jpg"}, "rotation_angle": 0, "mirrored": false, "created": "2022-09-05T22:15:11.027234Z", "classes": [], "metadata": [{"id": "1b78325e-24fa-452f-b11c-5db2ac52df59", "type": "text", "name": "Language", "value": "arabic", "dates": []}, {"id": "85ce21e0-0521-4e59-bfd0-a2060888be56", "type": "text", "name": "Script", "value": "typed", "dates": []}], "has_children": null, "worker_version_id": "329f5b6e-78c9-4240-a2cf-78a746c6f897", "confidence": null},
{"id": "3ebe7e48-fe2f-4533-92df-9895db05c3f5", "type": "text", "name": "23", "corpus": {"id": "809222f2-b4a4-444c-a1a8-37667ccbff6b", "name": "Maurdor", "public": false}, "thumbnail_url": null, "zone": {"id": "3ebe7e48-fe2f-4533-92df-9895db05c3f5", "polygon": [[1403, 1763], [1403, 1830], [1470, 1830], [1470, 1763], [1403, 1763]], "image": {"id": "65955cb6-7aeb-4f7c-8531-5d797c135e41", "path": "public%2Fmaurdor%2Fdev_png%2FYXLOKV-00.png", "width": 1700, "height": 2339, "url": "https://europe-gamma.iiif.teklia.com/iiif/2/public%2Fmaurdor%2Fdev_png%2FYXLOKV-00.png", "s3_url": null, "status": "checked", "server": {"display_name": "https://europe-gamma.iiif.teklia.com/iiif/2", "url": "https://europe-gamma.iiif.teklia.com/iiif/2", "max_width": null, "max_height": null}}, "url": "https://europe-gamma.iiif.teklia.com/iiif/2/public%2Fmaurdor%2Fdev_png%2FYXLOKV-00.png/1403,1763,67,67/full/0/default.jpg"}, "rotation_angle": 0, "mirrored": false, "created": "2022-09-05T22:15:11.026869Z", "classes": [], "metadata": [{"id": "42034ac4-1ed3-4893-a275-296484b417c5", "type": "text", "name": "Language", "value": "arabic", "dates": []}, {"id": "d20a52bb-28e7-407e-998e-b51f10af330a", "type": "text", "name": "Script", "value": "typed", "dates": []}], "has_children": null, "worker_version_id": "329f5b6e-78c9-4240-a2cf-78a746c6f897", "confidence": null}
]
\ No newline at end of file
......@@ -58,6 +58,23 @@ def test_run_volumes_with_worker_version(
assert api_client.responses == []
def test_get_accepted_zones_filter_metadata(tmpdir, fake_run_filter_metadata):
    """Check that `get_accepted_zones` keeps the zones matching the metadata filter.

    The generator is built with `accepted_metadatas={"Language": "arabic"}` and
    queried for `fake_page` through the mock client supplied by the fixture.
    """
    expected_zone_ids = [
        "0b719a5a-40c7-47ec-96c8-6c3064df5485",
        "18725942-24f5-4f81-a16f-62c1323c1041",
        "3ebe7e48-fe2f-4533-92df-9895db05c3f5",
    ]
    metadata_filter = FilterArgs(accepted_metadatas={"Language": "arabic"})
    generator = HTRDataGenerator(
        format="kaldi",
        filter=metadata_filter,
        out_dir_base=tmpdir,
    )
    # Swap in the mock client so no real API call is made.
    generator.api_client = fake_run_filter_metadata

    assert generator.get_accepted_zones("fake_page") == expected_zone_ids
def test_create_partitions(fake_expected_partitions, tmpdir):
out_dir_base = Path(tmpdir)
splitter = KaldiPartitionSplitter(out_dir_base=out_dir_base)
......