Skip to content
Snippets Groups Projects
Commit c13979a3 authored by Yoann Schneider's avatar Yoann Schneider :tennis: Committed by Bastien Abadie
Browse files

New command to move elements to closest parent

parent 8f0a5fdc
No related branches found
No related tags found
1 merge request!1742New command to move elements to closest parent
......@@ -109,6 +109,7 @@ Aside from the usual Django commands, some custom commands are available via `ma
* `delete_corpus`: Delete a big corpus using a Ponos task;
* `reindex`: Reindex elements into Solr;
* `telegraf`: A special command with InfluxDB-compatible output for Grafana statistics.
* `move_lines_to_parent`: Move the children of an element to their closest geographical parent.
See `manage.py <command> --help` to view more details about a specific command.
......
#!/usr/bin/env python3
from typing import NamedTuple
from django.core.management.base import BaseCommand, CommandError
from django.db import transaction
from arkindex.documents.models import ElementType
from arkindex.project.argparse import CorpusArgument
# Axis-aligned box stored as the x and y of its top left and bottom right points
class BBox(NamedTuple):
    # Top left point
    left: int
    top: int
    # Bottom right point
    right: int
    bottom: int
def compute_polygon_area(polygon: BBox):
    """Return the area of a box, computed as height times width.

    No validation is performed on the corners: when they are not ordered
    (right < left or bottom < top) the result is not guaranteed to be positive.
    """
    width = polygon.right - polygon.left
    height = polygon.bottom - polygon.top
    return height * width
def reorder_polygon(polygon):
    """Convert a list of points into a BBox by picking two of its corners.

    Points are ranked by their squared distance to the origin (0, 0), which
    gives the same order as the cartesian distance:
    - the closest point is used as the top-left corner
    - the furthest point is used as the bottom-right corner

    The sort is stable, so among equidistant points the first one in the
    input wins for the top-left corner and the last one for the bottom-right.
    """
    def squared_distance_to_origin(point):
        return point[0] * point[0] + point[1] * point[1]

    ranked_points = sorted(polygon, key=squared_distance_to_origin)
    top_left_corner = ranked_points[0]
    bottom_right_corner = ranked_points[-1]
    return BBox(
        left=top_left_corner[0],
        top=top_left_corner[1],
        right=bottom_right_corner[0],
        bottom=bottom_right_corner[1],
    )
def compute_area_overlap(parent_polygon: BBox, child_polygon: BBox):
    """Return the area of the intersection between the two boxes.

    The value is an area, not a ratio; 0 is returned when the boxes do not
    intersect or only touch along an edge.
    """
    # The intersection spans from the largest left/top to the smallest right/bottom
    overlap_width = (
        min(parent_polygon.right, child_polygon.right)
        - max(parent_polygon.left, child_polygon.left)
    )
    overlap_height = (
        min(parent_polygon.bottom, child_polygon.bottom)
        - max(parent_polygon.top, child_polygon.top)
    )
    # A non-positive dimension means there is no real intersection; without
    # this guard, two negative dimensions would multiply into a bogus positive area
    if overlap_height <= 0 or overlap_width <= 0:
        return 0
    return overlap_height * overlap_width
def is_overlapping_enough(parent_polygon: BBox, child_polygon: list, threshold: float):
    """
    Check whether a large enough share of the child lies inside the parent.

    :param parent_polygon: Box of the candidate parent.
    :param child_polygon: Raw list of points of the child; it is converted to
        a BBox with reorder_polygon before any computation.
    :param threshold: Minimum ratio (overlap area / child area) that must be
        strictly exceeded.
    :return: True when the ratio is strictly greater than the threshold, and
        False otherwise, including when the child box has no positive area.
    """
    # Reorder the child's polygon and convert it to a BBox
    child_box = reorder_polygon(child_polygon)
    child_area = compute_polygon_area(child_box)

    # A degenerate child (flat or reversed box) cannot be meaningfully compared
    # and a zero area would raise ZeroDivisionError below
    if child_area <= 0:
        return False

    # Check the intersection value
    area_overlap = compute_area_overlap(
        parent_polygon=parent_polygon,
        child_polygon=child_box,
    )
    return (area_overlap / child_area) > threshold
class Command(BaseCommand):
    """
    Move the children of double pages under the single page they overlap with.

    For every double page of the corpus, each child that is not itself a single
    page is compared with every child single page; when more than half of the
    child's box lies inside the single page's box, the child is detached from
    the double page and attached to that single page.
    """
    help = 'Move elements to closest geographical parent.'

    def add_arguments(self, parser):
        """Declare the corpus option and the two page type slug options."""
        super().add_arguments(parser)
        parser.add_argument(
            '-c',
            '--corpus',
            help='ID of the corpus to find types on.',
            type=CorpusArgument(),
            required=True,
        )
        parser.add_argument(
            '-s',
            '--single-page-type',
            help='Slug of the type corresponding to a Single Page.',
            required=False,
            default="single_page"
        )
        parser.add_argument(
            '-d',
            '--double-page-type',
            help="Slug of the type corresponding to a Double Page.",
            required=False,
            default="double_page"
        )

    @transaction.atomic
    def handle(self, corpus, single_page_type, double_page_type, verbosity=0, **options):
        """
        Run the command on every double page of the corpus.

        :param corpus: Corpus holding the types and elements to process.
        :param single_page_type: Slug of the single page type.
        :param double_page_type: Slug of the double page type.
        :param verbosity: Accepted for compatibility, currently unused.
        :raises CommandError: When one of the two type slugs does not exist
            in the corpus.
        """
        # Check types
        try:
            double_page_element_type = corpus.types.get(slug=double_page_type)
        except ElementType.DoesNotExist:
            raise CommandError(f'Double page type {double_page_type} not found in {corpus}')

        try:
            single_page_element_type = corpus.types.get(slug=single_page_type)
        except ElementType.DoesNotExist:
            raise CommandError(f'Single page type {single_page_type} not found in {corpus}')

        # Retrieve all double pages
        double_pages = corpus.elements.filter(type=double_page_element_type)
        total = double_pages.count()
        self.stdout.write(f"Found {len(double_pages)} double page(s) to process.")

        # Count from 1 so that the progress reads 1/total ... total/total
        for i, double_page in enumerate(double_pages, start=1):
            self.stdout.write(f"Processing double page {double_page.id} ({i}/{total})")

            # Retrieve all children that are not a single page
            moved_children = (
                corpus.elements
                .filter(paths__path__last=double_page.id)
                .exclude(type=single_page_element_type)
                .order_by('name')
            )
            self.stdout.write(f"Found {len(moved_children)} child element that are not single pages.")

            # Retrieve all single pages
            single_pages = (
                corpus.elements
                .filter(paths__path__last=double_page.id, type=single_page_element_type)
                .order_by('name')
            )
            self.stdout.write(f"Found {len(single_pages)} child single page(s).")

            for single_page in single_pages:
                self.stdout.write(f"Processing single page {single_page.id}")

                # Convert polygon to BBox to make computation easier
                parent_polygon = reorder_polygon(single_page.polygon)

                # Find all children that are inside that page polygon (intersection over 50%)
                # NOTE(review): a child already moved to a previous single page is
                # still part of moved_children here; confirm this is harmless when
                # single pages overlap each other
                close_enough_children = [
                    child
                    for child in moved_children
                    if is_overlapping_enough(
                        parent_polygon=parent_polygon,
                        child_polygon=child.polygon,
                        threshold=0.5,
                    )
                ]
                self.stdout.write(f"Found {len(close_enough_children)} child(ren) to move to this single page.")

                # Update their path to make the single page their parent
                for child in close_enough_children:
                    double_page.remove_child(child)
                    child.add_parent(single_page)
from django.core.management import CommandError, call_command
from arkindex.documents.models import ElementPath
from arkindex.images.models import Image
from arkindex.project.tests import FixtureTestCase
class TestMoveLinesToParentCommand(FixtureTestCase):
    """Tests for the `move_lines_to_parent` management command."""

    @classmethod
    def setUpTestData(cls):
        """Create the element types, a double page and one single page below it."""
        super().setUpTestData()
        cls.single_page_type = cls.corpus.types.create(slug="single_page")
        cls.double_page_type = cls.corpus.types.create(slug="double_page")
        cls.paragraph_type = cls.corpus.types.create(slug="paragraph")
        # Fetched instead of created: presumably provided by the fixtures (TODO confirm)
        cls.text_line_type = cls.corpus.types.get(slug="text_line")
        # Double page element, spanning x 0..2000 and y 0..1000
        cls.double_page = cls.corpus.elements.create(name="1", type=cls.double_page_type, polygon=[[0, 0], [0, 1000], [2000, 1000], [2000, 0], [0, 0]], image=Image.objects.first())
        # A single page element, child of the double page, covering its left half (x 0..1000)
        cls.single_page = cls.corpus.elements.create(name="1", type=cls.single_page_type, polygon=[[0, 0], [0, 1000], [1000, 1000], [1000, 0], [0, 0]], image=Image.objects.first())
        cls.single_page.add_parent(cls.double_page)

    def test_merge_types_missing_single_page(self):
        """An unknown single page type slug makes the command fail with a CommandError."""
        with self.assertRaisesMessage(CommandError, 'Single page type 404 not found in Unit Tests'):
            call_command('move_lines_to_parent', corpus=self.corpus, single_page_type='404', double_page_type='double_page')

    def test_merge_types_missing_double_page(self):
        """An unknown double page type slug makes the command fail with a CommandError."""
        with self.assertRaisesMessage(CommandError, 'Double page type 404 not found in Unit Tests'):
            call_command('move_lines_to_parent', corpus=self.corpus, single_page_type='single_page', double_page_type='404')

    def test_move_lines_geographically_to_closest_parent(self):
        """
        This text_line overlaps enough with the given single page, it is moved
        """
        # Text line close to single page overlapping enough:
        # it spans x 600..1100 and y 0..500, so 80% of it lies inside the single page
        overlapping_text_line = self.corpus.elements.create(name="1", type=self.text_line_type, polygon=[[600, 0], [1100, 0], [600, 500], [1100, 500], [600, 0]], image=Image.objects.first())
        overlapping_text_line.add_parent(self.double_page)
        # Before the command, the only path goes through the double page
        parent_paths = ElementPath.objects.filter(element_id=overlapping_text_line.id)
        self.assertEqual(len(parent_paths), 1)
        self.assertListEqual(list(parent_paths.values('path')), [{"path": [self.double_page.id]}])
        with self.assertNumQueries(24):
            call_command('move_lines_to_parent', corpus=self.corpus, single_page_type='single_page', double_page_type='double_page')
        # Make sure that lines have been moved
        parent_paths = ElementPath.objects.filter(element_id=overlapping_text_line.id)
        self.assertEqual(len(parent_paths), 1)
        self.assertListEqual(list(parent_paths.values('path')), [{"path": [self.double_page.id, self.single_page.id]}])

    def test_move_paragraph_geographically_to_closest_parent(self):
        """
        This paragraph overlaps enough with the given single page, it is moved
        """
        # Paragraph close to single page: it lies entirely inside the single page polygon
        paragraph = self.corpus.elements.create(name="1", type=self.paragraph_type, polygon=[[10, 10], [10, 500], [500, 500], [500, 10], [10, 10]], image=Image.objects.first())
        paragraph.add_parent(self.double_page)
        # Before the command, the only path goes through the double page
        parent_paths = ElementPath.objects.filter(element_id=paragraph.id)
        self.assertEqual(len(parent_paths), 1)
        self.assertListEqual(list(parent_paths.values('path')), [{"path": [self.double_page.id]}])
        with self.assertNumQueries(24):
            call_command('move_lines_to_parent', corpus=self.corpus, single_page_type='single_page', double_page_type='double_page')
        # Make sure that the paragraph has been moved
        parent_paths = ElementPath.objects.filter(element_id=paragraph.id)
        self.assertEqual(len(parent_paths), 1)
        self.assertListEqual(list(parent_paths.values('path')), [{"path": [self.double_page.id, self.single_page.id]}])

    def test_do_not_move_line_not_close_enough(self):
        """
        This text line does not overlap enough with the single page, it stays under the double page
        """
        # Text line not close to single page not overlapping enough:
        # it spans x 1500..2000, entirely outside the single page (x 0..1000)
        not_overlapping_text_line = self.corpus.elements.create(name="2", type=self.text_line_type, polygon=[[1500, 0], [1500, 100], [2000, 100], [2000, 0], [1500, 0]], image=Image.objects.first())
        not_overlapping_text_line.add_parent(self.double_page)
        # Before the command, the only path goes through the double page
        parent_paths = ElementPath.objects.filter(element_id=not_overlapping_text_line.id)
        self.assertEqual(len(parent_paths), 1)
        self.assertListEqual(list(parent_paths.values('path')), [{"path": [self.double_page.id]}])
        # Fewer queries are expected than in the tests above, where an element is moved
        with self.assertNumQueries(9):
            call_command('move_lines_to_parent', corpus=self.corpus, single_page_type='single_page', double_page_type='double_page')
        not_overlapping_text_line.refresh_from_db()
        # Make sure that line has not been moved
        parent_paths = ElementPath.objects.filter(element_id=not_overlapping_text_line.id)
        self.assertEqual(len(parent_paths), 1)
        self.assertListEqual(list(parent_paths.values('path')), [{"path": [self.double_page.id]}])
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment