def test_merge_multiple_lines():
    """
    Check that close lines are merged from left to right, whatever the
    order they have in the input list.

    On the page the lines sit side by side as line_1, line_2, line_3, line_4,
    but the list passed to merge_close_lines holds them as
    line_1, line_4, line_3, line_2.

    Following the list order would give the wrong merges:
    - 4 into 1
    - 3 into 1+4
    - 2 into 1+4+3

    What is expected instead:
    - 2 into 1
    - 3 into 1+2
    - 4 into 1+2+3
    """
    # (line number, x of the left edge), deliberately not in reading order
    shuffled = [(1, 0), (4, 60), (3, 40), (2, 20)]
    lines = [
        Transcription(
            element_id=f"line_{number}",
            confidence=1.0,
            # 10x10 square whose top-left corner is (left, 0)
            polygon=[
                [left, 0],
                [left + 10, 0],
                [left + 10, 10],
                [left, 10],
            ],
            text=str(number),
        )
        for number, left in shuffled
    ]

    merged = merge_close_lines(lines, threshold=0.5)

    # Everything collapses into a single line, read left to right
    assert len(merged) == 1
    assert merged[0].text == "1 2 3 4"