class BBox(NamedTuple):
    """Axis-aligned box described by its top-left and bottom-right points."""

    left: int
    top: int
    right: int
    bottom: int


def compute_polygon_area(polygon: BBox) -> int:
    """Return the area of a box as ``height * width``.

    The result is negative when exactly one axis of the box is reversed
    (e.g. ``right < left``), and zero for a degenerate box.
    """
    return (polygon.bottom - polygon.top) * (polygon.right - polygon.left)


def reorder_polygon(polygon) -> BBox:
    """Convert a list of ``[x, y]`` points to a BBox.

    This function looks for the top-left and the bottom-right corners.
    To do so, we look for the points
    - closest to (0,0) using the cartesian distance -> this is the top-left corner
    - furthest to (0,0) using the cartesian distance -> this is the bottom-right corner

    Raises IndexError when the polygon has no point.
    """
    # Squared distance is enough for ordering; no need for a square root.
    ordered_polygon = sorted(polygon, key=lambda p: p[0] * p[0] + p[1] * p[1])
    top_left, bottom_right = ordered_polygon[0], ordered_polygon[-1]
    return BBox(
        left=top_left[0],
        top=top_left[1],
        right=bottom_right[0],
        bottom=bottom_right[1],
    )


def compute_area_overlap(parent_polygon: BBox, child_polygon: BBox) -> int:
    """Return the area of the intersection of the two boxes.

    This is an absolute area, not a ratio: callers divide it by the child's
    area themselves. Returns 0 when the boxes do not intersect.
    """
    left = max(parent_polygon.left, child_polygon.left)
    top = max(parent_polygon.top, child_polygon.top)
    right = min(parent_polygon.right, child_polygon.right)
    bottom = min(parent_polygon.bottom, child_polygon.bottom)
    height = bottom - top
    width = right - left
    if min(height, width) <= 0:
        # otherwise might get reversed rectangles that are bigger than cells
        return 0
    return height * width


def is_overlapping_enough(parent_polygon: BBox, child_polygon: list, threshold: float) -> bool:
    """Check whether enough of the child lies inside the parent.

    The child's polygon (a list of ``[x, y]`` points) is first converted to a
    BBox; the function then returns True when the share of that box's area
    covered by ``parent_polygon`` is strictly greater than ``threshold``.
    A child whose box has no positive area never overlaps enough.
    """
    # Reorder the child's polygon and convert it to a BBox
    polygon = reorder_polygon(child_polygon)

    child_area = compute_polygon_area(polygon)
    if child_area <= 0:
        # Degenerate (or reversed) box: the intersection is empty, and
        # dividing by a zero area would raise ZeroDivisionError.
        return False

    # Check the intersection value
    area_overlap = compute_area_overlap(parent_polygon=parent_polygon,
                                        child_polygon=polygon)

    return (area_overlap / child_area) > threshold
class Command(BaseCommand):
    """Re-attach children of double pages to the single page they overlap.

    For every double page of the corpus, each direct child that is not itself
    a single page is compared with every single page found under that double
    page; when more than half of the child's box lies inside the single page,
    the child is removed from the double page and added under the single page.
    """
    help = 'Move elements to closest geographical parent.'

    def add_arguments(self, parser):
        """Declare the corpus and the two type slug options."""
        super().add_arguments(parser)
        parser.add_argument(
            '-c',
            '--corpus',
            help='ID of the corpus to find types on.',
            type=CorpusArgument(),
            required=True,
        )
        parser.add_argument(
            '-s',
            '--single-page-type',
            help='Slug of the type corresponding to a Single Page.',
            required=False,
            default="single_page"
        )
        parser.add_argument(
            '-d',
            '--double-page-type',
            help="Slug of the type corresponding to a Double Page.",
            required=False,
            default="double_page"
        )

    @staticmethod
    def _get_type(corpus, slug, label):
        """Return the corpus type with this slug, or raise CommandError.

        ``label`` ("Single" or "Double") only serves to build the error message.
        """
        try:
            return corpus.types.get(slug=slug)
        except ElementType.DoesNotExist:
            raise CommandError(f'{label} page type {slug} not found in {corpus}')

    @transaction.atomic
    def handle(self, corpus, single_page_type, double_page_type, verbosity=0, **options):
        """Move the children of each double page under their overlapping single page.

        Raises CommandError when one of the two type slugs does not exist in
        the corpus. ``verbosity`` is accepted but not used here.
        """
        # Check types; the double page type is checked first
        double_page_element_type = self._get_type(corpus, double_page_type, 'Double')
        single_page_element_type = self._get_type(corpus, single_page_type, 'Single')

        # Retrieve all double pages
        double_pages = corpus.elements.filter(type=double_page_element_type)
        total = double_pages.count()
        self.stdout.write(f"Found {total} double page(s) to process.")

        # Count from 1 so that the progress reads 1/total ... total/total
        for i, double_page in enumerate(double_pages, start=1):
            self.stdout.write(f"Processing double page {double_page.id} ({i}/{total})")
            # Retrieve all children that are not a single page
            moved_children = (
                corpus.elements
                .filter(paths__path__last=double_page.id)
                .exclude(type=single_page_element_type)
                .order_by('name')
            )
            # len() evaluates the queryset once; the cached rows are reused for each single page
            self.stdout.write(f"Found {len(moved_children)} child element that are not single pages.")

            # Retrieve all single pages
            single_pages = (
                corpus.elements
                .filter(paths__path__last=double_page.id, type=single_page_element_type)
                .order_by('name')
            )
            self.stdout.write(f"Found {len(single_pages)} child single page(s).")

            for single_page in single_pages:
                self.stdout.write(f"Processing single page {single_page.id}")
                # Convert polygon to BBox to make computation easier
                parent_polygon = reorder_polygon(single_page.polygon)

                # find all children that are inside that page polygon (intersection over 50%)
                close_enough_children = [
                    child
                    for child in moved_children
                    if is_overlapping_enough(
                        parent_polygon=parent_polygon,
                        child_polygon=child.polygon,
                        threshold=0.5,
                    )
                ]
                self.stdout.write(f"Found {len(close_enough_children)} child(ren) to move to this single page.")

                # Update their path to make the single page their parent
                for child in close_enough_children:
                    double_page.remove_child(child)
                    child.add_parent(single_page)
class TestMoveLinesToParentCommand(FixtureTestCase):
    """Tests for the ``move_lines_to_parent`` management command."""

    @classmethod
    def setUpTestData(cls):
        """Create the page types, one double page and one single page under it."""
        super().setUpTestData()
        cls.single_page_type = cls.corpus.types.create(slug="single_page")
        cls.double_page_type = cls.corpus.types.create(slug="double_page")
        cls.paragraph_type = cls.corpus.types.create(slug="paragraph")
        # Fetched rather than created: presumably provided by the fixtures — TODO confirm
        cls.text_line_type = cls.corpus.types.get(slug="text_line")
        # Double page element, 2000 wide and 1000 high
        cls.double_page = cls.corpus.elements.create(name="1", type=cls.double_page_type, polygon=[[0, 0], [0, 1000], [2000, 1000], [2000, 0], [0, 0]], image=Image.objects.first())
        # A single page element covering the left half, child of the double page
        cls.single_page = cls.corpus.elements.create(name="1", type=cls.single_page_type, polygon=[[0, 0], [0, 1000], [1000, 1000], [1000, 0], [0, 0]], image=Image.objects.first())
        cls.single_page.add_parent(cls.double_page)

    def test_merge_types_missing_single_page(self):
        """An unknown single page type slug makes the command raise CommandError."""
        with self.assertRaisesMessage(CommandError, 'Single page type 404 not found in Unit Tests'):
            call_command('move_lines_to_parent', corpus=self.corpus, single_page_type='404', double_page_type='double_page')

    def test_merge_types_missing_double_page(self):
        """An unknown double page type slug makes the command raise CommandError."""
        with self.assertRaisesMessage(CommandError, 'Double page type 404 not found in Unit Tests'):
            call_command('move_lines_to_parent', corpus=self.corpus, single_page_type='single_page', double_page_type='404')

    def test_move_lines_geographically_to_closest_parent(self):
        """
        This text line overlaps enough with the given single page, so it is moved.
        """
        # Text line close to single page overlapping enough
        # (x from 600 to 1100: most of it lies left of x=1000, inside the single page)
        overlapping_text_line = self.corpus.elements.create(name="1", type=self.text_line_type, polygon=[[600, 0], [1100, 0], [600, 500], [1100, 500], [600, 0]], image=Image.objects.first())
        overlapping_text_line.add_parent(self.double_page)
        # Before the command runs, the only path goes through the double page
        parent_paths = ElementPath.objects.filter(element_id=overlapping_text_line.id)
        self.assertEqual(len(parent_paths), 1)
        self.assertListEqual(list(parent_paths.values('path')), [{"path": [self.double_page.id]}])

        # The expected query count is pinned for the case where one child is moved
        with self.assertNumQueries(24):
            call_command('move_lines_to_parent', corpus=self.corpus, single_page_type='single_page', double_page_type='double_page')

        # Make sure that the line has been moved under the single page
        parent_paths = ElementPath.objects.filter(element_id=overlapping_text_line.id)
        self.assertEqual(len(parent_paths), 1)
        self.assertListEqual(list(parent_paths.values('path')), [{"path": [self.double_page.id, self.single_page.id]}])

    def test_move_paragraph_geographically_to_closest_parent(self):
        """
        This paragraph overlaps enough with the given single page, so it is moved.
        """
        # Paragraph close to single page (entirely inside it)
        paragraph = self.corpus.elements.create(name="1", type=self.paragraph_type, polygon=[[10, 10], [10, 500], [500, 500], [500, 10], [10, 10]], image=Image.objects.first())
        paragraph.add_parent(self.double_page)
        # Before the command runs, the only path goes through the double page
        parent_paths = ElementPath.objects.filter(element_id=paragraph.id)
        self.assertEqual(len(parent_paths), 1)
        self.assertListEqual(list(parent_paths.values('path')), [{"path": [self.double_page.id]}])

        # The expected query count is pinned for the case where one child is moved
        with self.assertNumQueries(24):
            call_command('move_lines_to_parent', corpus=self.corpus, single_page_type='single_page', double_page_type='double_page')

        # Make sure that the paragraph has been moved under the single page
        parent_paths = ElementPath.objects.filter(element_id=paragraph.id)
        self.assertEqual(len(parent_paths), 1)
        self.assertListEqual(list(parent_paths.values('path')), [{"path": [self.double_page.id, self.single_page.id]}])

    def test_do_not_move_line_not_close_enough(self):
        """
        This text line does not overlap enough with the single page, it stays under the double page
        """
        # Text line not close to single page, not overlapping enough
        # (x from 1500 to 2000: entirely right of the single page)
        not_overlapping_text_line = self.corpus.elements.create(name="2", type=self.text_line_type, polygon=[[1500, 0], [1500, 100], [2000, 100], [2000, 0], [1500, 0]], image=Image.objects.first())
        not_overlapping_text_line.add_parent(self.double_page)
        parent_paths = ElementPath.objects.filter(element_id=not_overlapping_text_line.id)
        self.assertEqual(len(parent_paths), 1)
        self.assertListEqual(list(parent_paths.values('path')), [{"path": [self.double_page.id]}])

        # The expected query count is lower here: no child is moved
        with self.assertNumQueries(9):
            call_command('move_lines_to_parent', corpus=self.corpus, single_page_type='single_page', double_page_type='double_page')

        not_overlapping_text_line.refresh_from_db()
        # Make sure that line has not been moved
        parent_paths = ElementPath.objects.filter(element_id=not_overlapping_text_line.id)
        self.assertEqual(len(parent_paths), 1)
        self.assertListEqual(list(parent_paths.values('path')), [{"path": [self.double_page.id]}])