Skip to content
Snippets Groups Projects
Commit 7be2623c authored by Chaza Abdelwahab's avatar Chaza Abdelwahab Committed by Yoann Schneider
Browse files

Implement metadata filtering

parent 9be8a911
No related branches found
No related tags found
1 merge request: !42 "Implement metadata filtering"
Pipeline #74607 passed
...@@ -31,6 +31,7 @@ class TranscriptionType(Enum): ...@@ -31,6 +31,7 @@ class TranscriptionType(Enum):
paragraph: str = "paragraph" paragraph: str = "paragraph"
act: str = "act" act: str = "act"
page: str = "page" page: str = "page"
text: str = "text"
@dataclass @dataclass
...@@ -136,6 +137,7 @@ class FilterArgs: ...@@ -136,6 +137,7 @@ class FilterArgs:
Use `--accepted_worker_version_ids manual` to get only manual transcriptions Use `--accepted_worker_version_ids manual` to get only manual transcriptions
style: Filter line images by style class. 'other' corresponds to line elements that have neither style: Filter line images by style class. 'other' corresponds to line elements that have neither
handwritten or typewritten class : {[s.name for s in Style]} handwritten or typewritten class : {[s.name for s in Style]}
accepted_metadatas: Key-value dictionary where each entry is a mandatory Arkindex metadata name/value. Filter lines by metadata.
""" """
transcription_type: TranscriptionType = TranscriptionType.text_line transcription_type: TranscriptionType = TranscriptionType.text_line
...@@ -144,3 +146,4 @@ class FilterArgs: ...@@ -144,3 +146,4 @@ class FilterArgs:
accepted_worker_version_ids: List[str] = field(default_factory=list) accepted_worker_version_ids: List[str] = field(default_factory=list)
skip_vertical_lines: bool = False skip_vertical_lines: bool = False
style: Style = None style: Style = None
accepted_metadatas: dict = field(default_factory=dict)
...@@ -81,6 +81,8 @@ class HTRDataGenerator: ...@@ -81,6 +81,8 @@ class HTRDataGenerator:
self.should_filter_by_worker = bool(self.accepted_worker_version_ids) self.should_filter_by_worker = bool(self.accepted_worker_version_ids)
self.style = filter.style self.style = filter.style
self.should_filter_by_style = bool(self.style) self.should_filter_by_style = bool(self.style)
self.accepted_metadatas = filter.accepted_metadatas
self.should_filter_by_metadatas = bool(self.accepted_metadatas)
self.transcription_type = filter.transcription_type.value self.transcription_type = filter.transcription_type.value
self.skip_vertical_lines = filter.skip_vertical_lines self.skip_vertical_lines = filter.skip_vertical_lines
self.skipped_pages_count = 0 self.skipped_pages_count = 0
...@@ -150,11 +152,25 @@ class HTRDataGenerator: ...@@ -150,11 +152,25 @@ class HTRDataGenerator:
img = read_img(cached_img_path, self.grayscale) img = read_img(cached_img_path, self.grayscale)
return img return img
def metadata_filtering(self, elt):
    """Tell whether an element satisfies every accepted metadata entry.

    Args:
        elt: Element dictionary whose ``"metadata"`` entry is an iterable of
            dictionaries, each holding a ``"name"`` and a ``"value"`` key.

    Returns:
        True when, for each name/value pair of ``self.accepted_metadatas``,
        the element has at least one metadata with that name and that value;
        False otherwise. An empty ``self.accepted_metadatas`` accepts any
        element.

    Raises:
        KeyError: if ``elt`` has no ``"metadata"`` entry, or a metadata lacks
            ``"name"`` or ``"value"``.
    """
    # Keep every value seen for a name: collapsing the list into a plain
    # name -> value dict would silently keep only the last value when the
    # same metadata name appears several times on one element.
    values_by_name = {}
    for metadata in elt["metadata"]:
        values_by_name.setdefault(metadata["name"], []).append(metadata["value"])

    return all(
        expected in values_by_name.get(name, ())
        for name, expected in self.accepted_metadatas.items()
    )
def get_accepted_zones(self, page_id: str): def get_accepted_zones(self, page_id: str):
try: try:
accepted_zones = [] accepted_zones = []
for elt in self.api_client.cached_paginate( for elt in self.api_client.cached_paginate(
"ListElementChildren", id=page_id, with_classes=True "ListElementChildren",
id=page_id,
with_classes=self.should_filter_by_class,
with_metadata=self.should_filter_by_metadatas,
): ):
elem_classes = [c for c in elt["classes"] if c["state"] != "rejected"] elem_classes = [c for c in elt["classes"] if c["state"] != "rejected"]
...@@ -193,12 +209,17 @@ class HTRDataGenerator: ...@@ -193,12 +209,17 @@ class HTRDataGenerator:
raise ValueError( raise ValueError(
f"Multiple style classes on the same element! {elt['id']} - {elem_classes}" f"Multiple style classes on the same element! {elt['id']} - {elem_classes}"
) )
should_accept = found_class == self.style
if not should_accept:
continue
if self.should_filter_by_metadatas:
if self.metadata_filtering(elt):
if found_class == self.style:
accepted_zones.append(elt["zone"]["id"]) accepted_zones.append(elt["zone"]["id"])
else: else:
accepted_zones.append(elt["zone"]["id"]) accepted_zones.append(elt["zone"]["id"])
logger.info( logger.info(
"Number of accepted zone for page {} : {}".format( "Number of accepted zone for page {} : {}".format(
page_id, len(accepted_zones) page_id, len(accepted_zones)
...@@ -269,9 +290,11 @@ class HTRDataGenerator: ...@@ -269,9 +290,11 @@ class HTRDataGenerator:
and res["worker_version_id"] not in self.accepted_worker_version_ids and res["worker_version_id"] not in self.accepted_worker_version_ids
): ):
continue continue
if (self.should_filter_by_class or self.should_filter_by_style) and ( if (
res["element"]["zone"]["id"] not in accepted_zones self.should_filter_by_class
): or self.should_filter_by_style
or self.should_filter_by_metadatas
) and (res["element"]["zone"]["id"] not in accepted_zones):
continue continue
if res["element"]["type"] != self.transcription_type: if res["element"]["type"] != self.transcription_type:
continue continue
...@@ -280,7 +303,7 @@ class HTRDataGenerator: ...@@ -280,7 +303,7 @@ class HTRDataGenerator:
if not text or not text.strip(): if not text or not text.strip():
continue continue
if "\n" in text.strip(): if "\n" in text.strip() and not self.transcription_type == "text":
elem_id = res["element"]["id"] elem_id = res["element"]["id"]
raise ValueError( raise ValueError(
f"Newlines are not allowed in line transcriptions - {page_id} - {elem_id} - {text}" f"Newlines are not allowed in line transcriptions - {page_id} - {elem_id} - {text}"
...@@ -396,7 +419,11 @@ class HTRDataGenerator: ...@@ -396,7 +419,11 @@ class HTRDataGenerator:
) )
def extract_lines(self, page_id: str, image_data: dict): def extract_lines(self, page_id: str, image_data: dict):
if self.should_filter_by_class or self.should_filter_by_style: if (
self.should_filter_by_class
or self.should_filter_by_style
or self.should_filter_by_metadatas
):
accepted_zones = self.get_accepted_zones(page_id) accepted_zones = self.get_accepted_zones(page_id)
else: else:
accepted_zones = [] accepted_zones = []
......
...@@ -41,6 +41,23 @@ def fake_image(): ...@@ -41,6 +41,23 @@ def fake_image():
return extractor.read_img(img_path) return extractor.read_img(img_path)
@pytest.fixture
def fake_run_filter_metadata():
    """Provide a mock API client that serves the metadata-filtering page fixture.

    The client answers a single ``ListElementChildren`` call for the page
    ``fake_page`` (requested with ``with_classes=False`` and
    ``with_metadata=True``) with the content of the JSON fixture file.
    """
    fixture_path = FIXTURES / "Maurdor/ListElementChildren/fake_page.json"
    with open(fixture_path) as fixture_file:
        children = json.load(fixture_file)

    client = MockApiClient()
    # Caching is irrelevant here: route cached pagination to plain pagination.
    client.cached_paginate = client.paginate
    client.add_response(
        "ListElementChildren",
        response=children,
        id="fake_page",
        with_classes=False,
        with_metadata=True,
    )
    return client
@pytest.fixture @pytest.fixture
def fake_run_volume_api_client(fake_volume_id): def fake_run_volume_api_client(fake_volume_id):
api_client = MockApiClient() api_client = MockApiClient()
......
[
{
"id": "0b719a5a-40c7-47ec-96c8-6c3064df5485", "type": "text", "name": "29", "corpus": {"id": "809222f2-b4a4-444c-a1a8-37667ccbff6b", "name": "Maurdor", "public": false}, "thumbnail_url": null, "zone": {"id": "0b719a5a-40c7-47ec-96c8-6c3064df5485", "polygon": [[190, 1187], [190, 1242], [389, 1242], [389, 1187], [190, 1187]], "image": {"id": "65955cb6-7aeb-4f7c-8531-5d797c135e41", "path": "public%2Fmaurdor%2Fdev_png%2FYXLOKV-00.png", "width": 1700, "height": 2339, "url": "https://europe-gamma.iiif.teklia.com/iiif/2/public%2Fmaurdor%2Fdev_png%2FYXLOKV-00.png", "s3_url": null, "status": "checked", "server": {"display_name": "https://europe-gamma.iiif.teklia.com/iiif/2", "url": "https://europe-gamma.iiif.teklia.com/iiif/2", "max_width": null, "max_height": null}}, "url": "https://europe-gamma.iiif.teklia.com/iiif/2/public%2Fmaurdor%2Fdev_png%2FYXLOKV-00.png/190,1187,199,55/full/0/default.jpg"}, "rotation_angle": 0, "mirrored": false, "created": "2022-09-05T22:15:11.027308Z", "classes": [], "metadata": [{"id": "8b5100cf-98c6-4d0f-8bbe-6df07cbdd02e", "type": "text", "name": "Language", "value": "arabic", "dates": []}, {"id": "bf7de461-1ca2-4160-b1c7-be63e28bd06e", "type": "text", "name": "Script", "value": "typed", "dates": []}], "has_children": null, "worker_version_id": "329f5b6e-78c9-4240-a2cf-78a746c6f897", "confidence": null
},
{"id": "fbf0d8e5-4729-4bd1-988a-49e178f7d0e6", "type": "text", "name": "27", "corpus": {"id": "809222f2-b4a4-444c-a1a8-37667ccbff6b", "name": "Maurdor", "public": false}, "thumbnail_url": null, "zone": {"id": "fbf0d8e5-4729-4bd1-988a-49e178f7d0e6", "polygon": [[676, 2124], [676, 2195], [1070, 2195], [1070, 2124], [676, 2124]], "image": {"id": "65955cb6-7aeb-4f7c-8531-5d797c135e41", "path": "public%2Fmaurdor%2Fdev_png%2FYXLOKV-00.png", "width": 1700, "height": 2339, "url": "https://europe-gamma.iiif.teklia.com/iiif/2/public%2Fmaurdor%2Fdev_png%2FYXLOKV-00.png", "s3_url": null, "status": "checked", "server": {"display_name": "https://europe-gamma.iiif.teklia.com/iiif/2", "url": "https://europe-gamma.iiif.teklia.com/iiif/2", "max_width": null, "max_height": null}}, "url": "https://europe-gamma.iiif.teklia.com/iiif/2/public%2Fmaurdor%2Fdev_png%2FYXLOKV-00.png/676,2124,394,71/full/0/default.jpg"}, "rotation_angle": 0, "mirrored": false, "created": "2022-09-05T22:15:11.027161Z", "classes": [], "metadata": [{"id": "75f36675-92a1-43df-90d0-bccd97b43594", "type": "text", "name": "Function", "value": "reference", "dates": []}, {"id": "e90372ce-db83-47ca-9362-b2e5def497e8", "type": "text", "name": "Language", "value": "english", "dates": []}, {"id": "bd1ad52f-dbb4-4dc8-a865-d6c42abd690e", "type": "text", "name": "Script", "value": "typed", "dates": []}], "has_children": null, "worker_version_id": "329f5b6e-78c9-4240-a2cf-78a746c6f897", "confidence": null},
{"id": "18725942-24f5-4f81-a16f-62c1323c1041", "type": "text", "name": "28", "corpus": {"id": "809222f2-b4a4-444c-a1a8-37667ccbff6b", "name": "Maurdor", "public": false}, "thumbnail_url": null, "zone": {"id": "18725942-24f5-4f81-a16f-62c1323c1041", "polygon": [[1154, 1244], [1154, 1327], [1456, 1327], [1456, 1244], [1154, 1244]], "image": {"id": "65955cb6-7aeb-4f7c-8531-5d797c135e41", "path": "public%2Fmaurdor%2Fdev_png%2FYXLOKV-00.png", "width": 1700, "height": 2339, "url": "https://europe-gamma.iiif.teklia.com/iiif/2/public%2Fmaurdor%2Fdev_png%2FYXLOKV-00.png", "s3_url": null, "status": "checked", "server": {"display_name": "https://europe-gamma.iiif.teklia.com/iiif/2", "url": "https://europe-gamma.iiif.teklia.com/iiif/2", "max_width": null, "max_height": null}}, "url": "https://europe-gamma.iiif.teklia.com/iiif/2/public%2Fmaurdor%2Fdev_png%2FYXLOKV-00.png/1154,1244,302,83/full/0/default.jpg"}, "rotation_angle": 0, "mirrored": false, "created": "2022-09-05T22:15:11.027234Z", "classes": [], "metadata": [{"id": "1b78325e-24fa-452f-b11c-5db2ac52df59", "type": "text", "name": "Language", "value": "arabic", "dates": []}, {"id": "85ce21e0-0521-4e59-bfd0-a2060888be56", "type": "text", "name": "Script", "value": "typed", "dates": []}], "has_children": null, "worker_version_id": "329f5b6e-78c9-4240-a2cf-78a746c6f897", "confidence": null},
{"id": "3ebe7e48-fe2f-4533-92df-9895db05c3f5", "type": "text", "name": "23", "corpus": {"id": "809222f2-b4a4-444c-a1a8-37667ccbff6b", "name": "Maurdor", "public": false}, "thumbnail_url": null, "zone": {"id": "3ebe7e48-fe2f-4533-92df-9895db05c3f5", "polygon": [[1403, 1763], [1403, 1830], [1470, 1830], [1470, 1763], [1403, 1763]], "image": {"id": "65955cb6-7aeb-4f7c-8531-5d797c135e41", "path": "public%2Fmaurdor%2Fdev_png%2FYXLOKV-00.png", "width": 1700, "height": 2339, "url": "https://europe-gamma.iiif.teklia.com/iiif/2/public%2Fmaurdor%2Fdev_png%2FYXLOKV-00.png", "s3_url": null, "status": "checked", "server": {"display_name": "https://europe-gamma.iiif.teklia.com/iiif/2", "url": "https://europe-gamma.iiif.teklia.com/iiif/2", "max_width": null, "max_height": null}}, "url": "https://europe-gamma.iiif.teklia.com/iiif/2/public%2Fmaurdor%2Fdev_png%2FYXLOKV-00.png/1403,1763,67,67/full/0/default.jpg"}, "rotation_angle": 0, "mirrored": false, "created": "2022-09-05T22:15:11.026869Z", "classes": [], "metadata": [{"id": "42034ac4-1ed3-4893-a275-296484b417c5", "type": "text", "name": "Language", "value": "arabic", "dates": []}, {"id": "d20a52bb-28e7-407e-998e-b51f10af330a", "type": "text", "name": "Script", "value": "typed", "dates": []}], "has_children": null, "worker_version_id": "329f5b6e-78c9-4240-a2cf-78a746c6f897", "confidence": null}
]
\ No newline at end of file
...@@ -58,6 +58,23 @@ def test_run_volumes_with_worker_version( ...@@ -58,6 +58,23 @@ def test_run_volumes_with_worker_version(
assert api_client.responses == [] assert api_client.responses == []
def test_get_accepted_zones_filter_metadata(tmpdir, fake_run_filter_metadata):
    """Check that only zones whose element has Language == arabic are accepted."""
    expected_zone_ids = [
        "0b719a5a-40c7-47ec-96c8-6c3064df5485",
        "18725942-24f5-4f81-a16f-62c1323c1041",
        "3ebe7e48-fe2f-4533-92df-9895db05c3f5",
    ]

    metadata_filter = FilterArgs(accepted_metadatas={"Language": "arabic"})
    generator = HTRDataGenerator(
        format="kaldi",
        filter=metadata_filter,
        out_dir_base=tmpdir,
    )
    # Swap in the mocked client so no real API call is made.
    generator.api_client = fake_run_filter_metadata

    assert generator.get_accepted_zones("fake_page") == expected_zone_ids
def test_create_partitions(fake_expected_partitions, tmpdir): def test_create_partitions(fake_expected_partitions, tmpdir):
out_dir_base = Path(tmpdir) out_dir_base = Path(tmpdir)
splitter = KaldiPartitionSplitter(out_dir_base=out_dir_base) splitter = KaldiPartitionSplitter(out_dir_base=out_dir_base)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment