From 6f2c9dcf8fad21b3850016248422a830c2956fa3 Mon Sep 17 00:00:00 2001
From: Erwan Rouchet <rouchet@teklia.com>
Date: Fri, 16 Nov 2018 14:32:18 +0000
Subject: [PATCH] Compute page text from lines

---
 arkindex/dataimport/tasks.py            |  9 +++---
 arkindex/dataimport/tests/test_tasks.py |  2 ++
 arkindex/documents/models.py            |  9 ++++++
 arkindex/documents/tests/test_page.py   | 42 +++++++++++++++++++++++++
 4 files changed, 58 insertions(+), 4 deletions(-)
 create mode 100644 arkindex/documents/tests/test_page.py

diff --git a/arkindex/dataimport/tasks.py b/arkindex/dataimport/tasks.py
index 9d4a21a233..8a6257d256 100644
--- a/arkindex/dataimport/tasks.py
+++ b/arkindex/dataimport/tasks.py
@@ -214,10 +214,6 @@ def save_ml_results(self, results, **kwargs):
         if result is None:
             continue
 
-        page.classification = result['classification']
-        page.save()
-        self.report_message("Updated ML results for {}".format(page))
-
         tr_items = result['zones']
         # Parse transcription types
         for item in tr_items:
@@ -235,6 +231,11 @@ def save_ml_results(self, results, **kwargs):
             index_transcriptions(transcriptions)
             self.report_message('Indexed transcriptions for {}'.format(page))
 
+        page.build_text()
+        page.classification = result['classification']
+        page.save()
+        self.report_message("Updated ML results for {}".format(page))
+
     return list(map(str, results.keys()))
 
 
diff --git a/arkindex/dataimport/tests/test_tasks.py b/arkindex/dataimport/tests/test_tasks.py
index f32dbcc703..60927626f3 100644
--- a/arkindex/dataimport/tests/test_tasks.py
+++ b/arkindex/dataimport/tests/test_tasks.py
@@ -72,6 +72,7 @@ class TestTasks(FixtureMixin, RedisMockAPITestCase):
             'label': 'dog',
             'probability': 0.9,
         }])
+        self.assertEqual(dog.text, '')
         dog_ts = dog.transcriptions.get()
         self.assertEqual(dog_ts.type, TranscriptionType.Word)
         self.assertEqual(dog_ts.text, 'woof')
@@ -83,6 +84,7 @@ class TestTasks(FixtureMixin, RedisMockAPITestCase):
             'label': 'cat',
             'probability': 0.8,
         }])
+        self.assertEqual(cat.text, 'meow')
         cat_ts = cat.transcriptions.get()
         self.assertEqual(cat_ts.type, TranscriptionType.Line)
         self.assertEqual(cat_ts.text, 'meow')
diff --git a/arkindex/documents/models.py b/arkindex/documents/models.py
index 48937a7061..895c282809 100644
--- a/arkindex/documents/models.py
+++ b/arkindex/documents/models.py
@@ -437,6 +437,15 @@ class Page(Element):
         # Wait for result
         self.text = task.get()
 
+    def build_text(self):
+        """Set self.text to the Line transcriptions joined by newlines; does not save the page."""
+        # Each transcription has a single zone, so select_related fetches it in the same
+        # query (SQL JOIN) instead of the extra query prefetch_related would issue.
+        lines = self.transcriptions.filter(type=TranscriptionType.Line).select_related('zone')
+        # Order top to bottom, then left to right, using the center of each zone polygon.
+        lines = sorted(lines, key=lambda t: (t.zone.polygon.center.y, t.zone.polygon.center.x))
+        self.text = '\n'.join(t.text for t in lines)
+
 
 class Act(Element):
     """
diff --git a/arkindex/documents/tests/test_page.py b/arkindex/documents/tests/test_page.py
new file mode 100644
index 0000000000..ebb16a2767
--- /dev/null
+++ b/arkindex/documents/tests/test_page.py
@@ -0,0 +1,42 @@
+from arkindex.project.tests import FixtureTestCase
+from arkindex.project.polygon import Polygon
+from arkindex.documents.models import Page, TranscriptionType
+
+
+class TestPage(FixtureTestCase):
+    """Tests for text computation on Page elements."""
+
+    def test_build_text(self):
+        """
+        build_text() joins Line transcriptions with newlines: side-by-side lines
+        come left to right, lower lines come after, and words are left out.
+        """
+        image = self.imgsrv.images.create(path='build_text_test')
+        page_zone = image.zones.create(
+            polygon=Polygon.from_coords(0, 0, 1000, 1000),
+        )
+        page = Page.objects.create(
+            corpus=self.corpus,
+            name='Page text test',
+            zone=page_zone,
+        )
+
+        # (transcription type, polygon coordinates, text), in creation order
+        fixtures = (
+            # Two lines next to each other
+            (TranscriptionType.Line, (0, 0, 500, 100), 'Ligne 1'),
+            (TranscriptionType.Line, (600, 0, 100, 100), 'Ligne 2'),
+            # One line under them
+            (TranscriptionType.Line, (0, 200, 42, 42), 'Ligne 3'),
+            # A word that should not appear
+            (TranscriptionType.Word, (0, 0, 42, 42), 'Mot'),
+        )
+        for ts_type, coords, text in fixtures:
+            page.transcriptions.create(
+                type=ts_type,
+                zone=image.zones.create(polygon=Polygon.from_coords(*coords)),
+                text=text,
+            )
+
+        page.build_text()
+        self.assertEqual(page.text, "Ligne 1\nLigne 2\nLigne 3")
-- 
GitLab