Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • atr/data-generator
1 result
Show changes
Commits on Source (5)
......@@ -3,7 +3,7 @@ stages:
- deploy
test:
image: python:3.8
image: python:3
stage: test
cache:
......
......@@ -4,19 +4,18 @@ repos:
hooks:
- id: isort
- repo: https://github.com/ambv/black
rev: 22.3.0
rev: 22.12.0
hooks:
- id: black
- repo: https://github.com/pycqa/flake8
rev: 3.9.2
rev: 6.0.0
hooks:
- id: flake8
additional_dependencies:
- 'flake8-coding==1.3.1'
- 'flake8-copyright==0.2.2'
- 'flake8-debugger==3.1.0'
- 'flake8-coding==1.3.2'
- 'flake8-debugger==4.1.2'
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.2.0
rev: v4.4.0
hooks:
- id: check-ast
- id: check-docstring-first
......@@ -33,7 +32,7 @@ repos:
- id: check-json
- id: requirements-txt-fixer
- repo: https://github.com/codespell-project/codespell
rev: v2.1.0
rev: v2.2.2
hooks:
- id: codespell
args: ['--write-changes', '--exclude-file', '.codespell_ignore_lines']
......
......@@ -101,7 +101,7 @@ class ImageArgs:
extraction_mode: Mode for extracting the line images: {[e.name for e in Extraction]},
max_deskew_angle: Maximum angle by which deskewing is allowed to rotate the line image.
If the angle determined by the deskew tool is bigger than max, then that line won't be deskewed/rotated.
skew_angle: Angle by which the line image will be rotated. Useful for data augmnetation"
skew_angle: Angle by which the line image will be rotated. Useful for data augmentation"
creating skewed text lines for a more robust model. Only used with skew_* extraction modes.
should_rotate (bool): Use text line rotation class to rotate lines if possible
grayscale (bool): Convert images to grayscale (By default grayscale)
......@@ -147,3 +147,4 @@ class FilterArgs:
skip_vertical_lines: bool = False
style: Style = None
accepted_metadatas: dict = field(default_factory=dict)
filter_parent_metadatas: bool = False
......@@ -82,6 +82,7 @@ class HTRDataGenerator:
self.style = filter.style
self.should_filter_by_style = bool(self.style)
self.accepted_metadatas = filter.accepted_metadatas
self.filter_parent_metadatas = filter.filter_parent_metadatas
self.should_filter_by_metadatas = bool(self.accepted_metadatas)
self.transcription_type = filter.transcription_type.value
self.skip_vertical_lines = filter.skip_vertical_lines
......@@ -153,12 +154,20 @@ class HTRDataGenerator:
return img
def metadata_filtering(self, elt):
metadatas = {
metadata["name"]: metadata["value"] for metadata in elt["metadata"]
}
if self.filter_parent_metadatas:
metadatas = []
parents = self.api_client.paginate(
"ListElementParents", id=elt["id"], with_metadata=True
)
for parent in parents:
metadatas.extend(parent["metadata"])
else:
metadatas = elt["metadata"]
metadatas_dict = {metadata["name"]: metadata["value"] for metadata in metadatas}
for meta in self.accepted_metadatas:
if not (
meta in metadatas and metadatas[meta] == self.accepted_metadatas[meta]
meta in metadatas_dict
and metadatas_dict[meta] == self.accepted_metadatas[meta]
):
return False
return True
......@@ -170,15 +179,20 @@ class HTRDataGenerator:
"ListElementChildren",
id=page_id,
with_classes=self.should_filter_by_class,
with_metadata=self.should_filter_by_metadatas,
with_metadata=self.should_filter_by_metadatas
and not self.filter_parent_metadatas,
recursive=True,
):
elem_classes = [c for c in elt["classes"] if c["state"] != "rejected"]
should_accept = True
if self.should_filter_by_class:
# at first filter to only have elements with accepted classes
# if accepted classes list is empty then should accept all
# except for ignored classes
elem_classes = [
c for c in elt["classes"] if c["state"] != "rejected"
]
should_accept = len(self.accepted_classes) == 0
for classification in elem_classes:
class_name = classification["ml_class"]["name"]
......@@ -193,6 +207,9 @@ class HTRDataGenerator:
continue
if self.should_filter_by_style:
elem_classes = [
c for c in elt["classes"] if c["state"] != "rejected"
]
style_counts = Counter()
for classification in elem_classes:
class_name = classification["ml_class"]["name"]
......@@ -216,7 +233,6 @@ class HTRDataGenerator:
if self.should_filter_by_metadatas:
if self.metadata_filtering(elt):
accepted_zones.append(elt["zone"]["id"])
else:
accepted_zones.append(elt["zone"]["id"])
......
apistar==0.7.2
arkindex-client==1.0.9
jsonargparse==4.13.2
teklia-line-image-extractor==0.2.4
tqdm==4.64.0
tqdm==4.64.1
typesystem==0.2.5
......@@ -54,6 +54,7 @@ def fake_run_filter_metadata():
id="fake_page",
with_classes=False,
with_metadata=True,
recursive=True,
)
return api_client
......