diff --git a/document_processing/transcription.py b/document_processing/transcription.py
new file mode 100644
index 0000000000000000000000000000000000000000..49bd8e2ecf2fef4a53f1f98fdb57cffff87eea47
--- /dev/null
+++ b/document_processing/transcription.py
@@ -0,0 +1,130 @@
+# -*- coding: utf-8 -*-
+
+import itertools
+from typing import List
+
+from document_processing.utils import BoundingBox, Point, TextOrientation, bbox_to_polygon, bounding_rect
+
+
+class Transcription:
+    """
+    A transcribed line of text, located by a polygon.
+    """
+
+    def __init__(
+        self,
+        element_id,
+        polygon,
+        text,
+        confidence,
+        orientation=TextOrientation.HorizontalLeftToRight,
+        rotation_class=None,
+        rotation_class_confidence=None,
+    ):
+        self.element_id = element_id
+        self.polygon = polygon
+        self.text = text
+        self.confidence = confidence
+        # Accept either a TextOrientation member or its raw value ("horizontal-lr", ...)
+        if isinstance(orientation, TextOrientation):
+            self.orientation = orientation
+        else:
+            self.orientation = TextOrientation(orientation)
+        self.rotation_class = rotation_class
+        self.rotation_class_confidence = rotation_class_confidence
+
+    @property
+    def rect(self) -> BoundingBox:
+        """Bounding rectangle of this polygon"""
+        return bounding_rect(self.polygon)
+
+    def __repr__(self):
+        return str(vars(self))
+
+    def merge(self, other):
+        """
+        Extend inner data (polygon, text, confidence) with another line data.
+
+        Keep a polygon that contains both lines' polygon.
+        Text is concatenated, starting from the leftmost line.
+        Confidences are averaged when both lines have one. A confidence of None
+        means "unknown", whereas 0 is a real value that takes part in the average.
+
+        :raises TypeError: if other is not a Transcription
+        """
+        if not isinstance(other, Transcription):
+            raise TypeError(f"Cannot merge a Transcription with a {type(other).__name__}")
+
+        if other.rect.x < self.rect.x:
+            self.text = other.text + " " + self.text
+        else:
+            self.text += " " + other.text
+
+        # Compare to None explicitly: a truthiness check would discard a confidence of 0
+        if other.confidence is not None:
+            if self.confidence is None:
+                self.confidence = other.confidence
+            else:
+                self.confidence = (self.confidence + other.confidence) / 2
+
+        self.polygon = get_global_polygon(self.polygon, other.polygon)
+
+    @property
+    def center(self) -> Point:
+        """Compute the center of this polygon's bounding rectangle"""
+        x, y, width, height = self.rect
+        # The middle is the origin plus half the size, not (origin + size) / 2
+        return Point(round(x + width / 2), round(y + height / 2))
+
+
+def get_global_polygon(a: List[List[int]], b: List[List[int]]) -> List[List[int]]:
+    """
+    Build the rectangular polygon that contains every point of both polygons
+    """
+    xa, ya = zip(*a)
+    xb, yb = zip(*b)
+
+    minx = min(xa + xb)
+    miny = min(ya + yb)
+    maxx = max(xa + xb)
+    maxy = max(ya + yb)
+    return bbox_to_polygon(
+        BoundingBox(
+            x=minx,
+            y=miny,
+            width=maxx - minx,
+            height=maxy - miny,
+        )
+    )
+
+
+def merge_close_lines(lines: List[Transcription], threshold=0.5):
+    """
+    Merge lines that are close to each other (on vertical axis)
+
+    Two lines are close when the vertical distance between their centers,
+    divided by the height of the first one, is at most the threshold.
+    Kept lines are updated in place and merged lines are left out of the result.
+    """
+    if threshold is None:
+        threshold = 0.5
+
+    lines = list(lines)
+    # Transcription does not define __eq__: track merged lines by identity
+    removed = set()
+    for a, b in itertools.combinations(lines, r=2):
+        if id(a) in removed or id(b) in removed:
+            continue
+
+        distance = abs(b.center.y - a.center.y)
+        height = a.rect.height
+        if height:
+            is_close = distance / height <= threshold
+        else:
+            # Flat line: do not divide by zero, only merge vertically aligned centers
+            is_close = distance == 0
+        if is_close:
+            a.merge(b)
+            removed.add(id(b))
+
+    return [line for line in lines if id(line) not in removed]
diff --git a/document_processing/utils.py b/document_processing/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2d44d25efc57aa2c40830948233975250d04e45
--- /dev/null
+++ b/document_processing/utils.py
@@ -0,0 +1,79 @@
+# -*- coding: utf-8 -*-
+from enum import Enum
+from typing import List, NamedTuple
+
+
+class BoundingBox(NamedTuple):
+    """Rectangle described by its top left corner, width and height"""
+
+    x: int
+    y: int
+    width: int
+    height: int
+
+
+class Point(NamedTuple):
+    """Point described by its x and y coordinates"""
+
+    x: int
+    y: int
+
+
+class TextOrientation(Enum):
+    """
+    Orientation of a transcription's text.
+    Copied from https://gitlab.com/teklia/workers/base-worker/-/blob/master/arkindex_worker/worker/transcription.py
+    """
+
+    HorizontalLeftToRight = "horizontal-lr"
+    """
+    The text is read from top to bottom then left to right.
+    This is the default when no orientation is specified.
+    """
+
+    HorizontalRightToLeft = "horizontal-rl"
+    """
+    The text is read from top to bottom then right to left.
+    """
+
+    VerticalRightToLeft = "vertical-rl"
+    """
+    The text is read from right to left then top to bottom.
+    """
+
+    VerticalLeftToRight = "vertical-lr"
+    """
+    The text is read from left to right then top to bottom.
+    """
+
+
+def bounding_rect(polygon: list) -> BoundingBox:
+    """Compute the bounding rectangle from polygon.
+
+    Coordinates are converted with int() before being compared.
+
+    :returns x,y of top left corner and width and height
+    :raises ValueError: if the polygon has no points
+    """
+    xs = [int(point[0]) for point in polygon]
+    ys = [int(point[1]) for point in polygon]
+    x, y, x2, y2 = min(xs), min(ys), max(xs), max(ys)
+
+    height = y2 - y
+    width = x2 - x
+    return BoundingBox(x, y, width, height)
+
+
+def bbox_to_polygon(bbox: BoundingBox) -> List[List[int]]:
+    """Convert a bounding box to a closed polygon.
+
+    :returns the four corners starting from (x, y), which is repeated at the end
+    """
+    x, y, width, height = bbox
+    return [
+        [x, y],
+        [x + width, y],
+        [x + width, y + height],
+        [x, y + height],
+        [x, y],
+    ]
diff --git a/tests/test_dummy.py b/tests/test_dummy.py
deleted file mode 100644
index f4f53619168f8993841e5a85193b424a60085554..0000000000000000000000000000000000000000
--- a/tests/test_dummy.py
+++ /dev/null
@@ -1,2 +0,0 @@
-def test_dummy():
-    assert True
diff --git a/tests/test_transcription.py b/tests/test_transcription.py
new file mode 100644
index 0000000000000000000000000000000000000000..258b0b98ff16d4beb250ab1ce498d2baab686ff4
--- /dev/null
+++ b/tests/test_transcription.py
@@ -0,0 +1,189 @@
+# -*- coding: utf-8 -*-
+from document_processing.transcription import Transcription, merge_close_lines
+
+
+def test_simple_merge():
+    """
+    Check if the polygons are merged and if the text is in the correct order
+    """
+
+    # Polygon on the upper left corner
+    left_line = Transcription(
+        element_id="Fake_element_1",
+        confidence=1.0,
+        polygon=[
+            [0, 0],
+            [60, 0],
+            [60, 20],
+            [0, 20],
+        ],
+        text="Hello",
+    )
+
+    # Polygon to the right of the first one
+    right_line = Transcription(
+        element_id="Fake_element_2",
+        confidence=0.5,
+        polygon=[
+            [200, 10],
+            [580, 10],
+            [580, 30],
+            [200, 30],
+        ],
+        text="World !",
+    )
+
+    # And another polygon on another line
+    below_line = Transcription(
+        element_id="Fake_element_3",
+        confidence=1.0,
+        polygon=[
+            [100, 100],
+            [100, 200],
+            [400, 200],
+            [400, 100],
+        ],
+        text="Another line far away",
+    )
+
+    up_paragraph, down_paragraph = merge_close_lines([left_line, right_line, below_line])
+
+    # Top lines are merged
+    assert up_paragraph.polygon == [
+        [0, 0],
+        [580, 0],
+        [580, 30],
+        [0, 30],
+        [0, 0],
+    ]
+    assert up_paragraph.text == "Hello World !"
+    assert up_paragraph.confidence == 0.75
+
+    # Bottom paragraph only has one line
+    assert down_paragraph.polygon == [
+        [100, 100],
+        [100, 200],
+        [400, 200],
+        [400, 100],
+    ]
+    assert down_paragraph.text == "Another line far away"
+    assert down_paragraph.confidence == 1.0
+
+
+def test_reversed_merge():
+    """
+    Check if the polygons are merged and if the merged text used
+    the leftmost line first.
+    """
+    # Polygon on the right compared to the other one
+    right_line = Transcription(
+        element_id="Fake_element_2",
+        confidence=0.5,
+        polygon=[
+            [200, 10],
+            [580, 10],
+            [580, 30],
+            [200, 30],
+        ],
+        text="World !",
+    )
+    # Polygon on the left compared to the other one
+    left_line = Transcription(
+        element_id="Fake_element_1",
+        confidence=1.0,
+        polygon=[
+            [0, 0],
+            [60, 0],
+            [60, 20],
+            [0, 20],
+        ],
+        text="Hello",
+    )
+
+    # And another polygon on another line
+    below_line = Transcription(
+        element_id="Fake_element_3",
+        confidence=1.0,
+        polygon=[
+            [100, 100],
+            [100, 200],
+            [400, 200],
+            [400, 100],
+        ],
+        text="Another line far away",
+    )
+    down_para, up_paragraph = merge_close_lines([below_line, right_line, left_line])
+
+    # Top lines are merged
+    assert up_paragraph.polygon == [
+        [0, 0],
+        [580, 0],
+        [580, 30],
+        [0, 30],
+        [0, 0],
+    ]
+    assert up_paragraph.text == "Hello World !"
+    assert up_paragraph.confidence == 0.75
+
+    # Paragraph below only has one line
+    assert down_para.polygon == [
+        [100, 100],
+        [100, 200],
+        [400, 200],
+        [400, 100],
+    ]
+    assert down_para.text == "Another line far away"
+    assert down_para.confidence == 1.0
+
+
+def test_center():
+    """
+    Check that the center is the middle of the bounding rectangle,
+    even when the polygon does not start at the origin
+    """
+    line = Transcription(
+        element_id="Fake_element_3",
+        confidence=1.0,
+        polygon=[
+            [100, 100],
+            [100, 200],
+            [400, 200],
+            [400, 100],
+        ],
+        text="Another line far away",
+    )
+
+    assert line.center == (250, 150)
+
+
+def test_merge_zero_confidence():
+    """
+    Check that a confidence of zero is averaged like any other value
+    """
+    left_line = Transcription(
+        element_id="Fake_element_1",
+        confidence=0.0,
+        polygon=[
+            [0, 0],
+            [60, 0],
+            [60, 20],
+            [0, 20],
+        ],
+        text="Hello",
+    )
+    right_line = Transcription(
+        element_id="Fake_element_2",
+        confidence=1.0,
+        polygon=[
+            [200, 10],
+            [580, 10],
+            [580, 30],
+            [200, 30],
+        ],
+        text="World !",
+    )
+
+    left_line.merge(right_line)
+
+    assert left_line.text == "Hello World !"
+    assert left_line.confidence == 0.5