Image index importer

88583ad2 · Erwan Rouchet · 06aee500 · 88583ad2
Commit 88583ad2 authored 6 years ago by Erwan Rouchet
--- a/arkindex/images/importer.py
+++ b/arkindex/images/importer.py
 from arkindex.images.models import Zone, Image
-from arkindex.documents.models import Transcription, ElementLink
+from arkindex.documents.models import Transcription, ElementLink, ElementType, Page
 from arkindex.project.tools import BoundingBox
 import os
 import re
 import gzip
 import logging
+import fnmatch

 # One index line: optional "line_<n> " prefix, the word, an ignored integer
 # field, the score, then four integer coordinates.
 # Raw bytes literal so that \d, \- and \. reach the regex engine untouched
 # instead of being (invalid) bytes-literal escapes.
 REGEX_INDEX = re.compile(
    rb'^(?:line_(\d+) )?(.+) \d+ ([\de\-\.]+) (\d+) (\d+) (\d+) (\d+)')
@@ -12,13 +13,14 @@ REGEX_INDEX = re.compile(
 logger = logging.getLogger(__name__)


-def import_indexes(image, index_path, extension='jpg'):
+def import_indexes(image, page, index_path, extension='jpg'):
    """
    Import indexes from file
    One gzipped index file per image
    Format: #line word n/a score-[0,1] x1 y1 x2 y2
    """
    assert isinstance(image, Image)
+    assert isinstance(page, Page)
    assert os.path.exists(index_path), \
        'Missing index {}'.format(index_path)
    assert index_path.endswith('.idx.gz')
@@ -54,7 +56,7 @@ def import_indexes(image, index_path, extension='jpg'):
    logger.info('Created {} zones'.format(len(new_zones)))

    # Create transcriptions
-    new_transcriptions = bulk_transcriptions(image, lines)
+    new_transcriptions = bulk_transcriptions(image, page, lines)
    logger.info('Created {} transcriptions '.format(len(new_transcriptions)))

    # Index all transcriptions into ES
@@ -135,3 +137,36 @@ def bulk_transcriptions(image, page, items):
    )

    return all_ts
+
+
+class IndexImporter(object):
+    """Import index files (.idx.gz) as transcriptions."""
+
+    def __init__(self, path):
+        assert os.path.exists(path)
+        self.path = path
+
+    def get_index_paths(self):
+        # Support single file & directories
+        if os.path.isdir(self.path):
+            for root, _, filenames in os.walk(self.path):
+                for filename in fnmatch.filter(filenames, "*.idx.gz"):
+                    yield os.path.join(root, filename)
+        else:
+            yield os.path.realpath(self.path)
+
+    def get_image(self, path):
+        # Remove path and .idx.gz extension
+        image_id = '.'.join(os.path.basename(path).split('.')[:-2])
+        return Image.objects.get(path__contains=image_id)
+
+    def get_page(self, image):
+        assert isinstance(image, Image)
+        return image.elements.filter(type=ElementType.Page).first().page
+
+    def run(self):
+        for index_path in self.get_index_paths():
+            logger.info("Parsing index file {}".format(index_path))
+            image = self.get_image(index_path)
+            page = self.get_page(image)
+            import_indexes(image, page, index_path)