Skip to content
Snippets Groups Projects
Commit 88583ad2 authored by Erwan Rouchet's avatar Erwan Rouchet
Browse files

Image index importer

parent 06aee500
No related branches found
No related tags found
1 merge request: !22 Add score to transcriptions
from arkindex.images.models import Zone, Image
from arkindex.documents.models import Transcription, ElementLink
from arkindex.documents.models import Transcription, ElementLink, ElementType, Page
from arkindex.project.tools import BoundingBox
import os
import re
import gzip
import logging
import fnmatch
# Matches one line of an index file (as bytes). The line format documented in
# import_indexes is: "#line word n/a score-[0,1] x1 y1 x2 y2".
# Captured groups, in order:
#   1. optional line number (from a leading "line_<n> " prefix, else None)
#   2. the word
#   3. the score (digits, 'e', '-' and '.', so exponent notation is accepted)
#   4-7. four integers, x1 y1 x2 y2 per the documented format
# The integer between the word and the score ("n/a") is matched but not captured.
# A raw bytes literal is used so the regex escapes (\d, \-, \.) are not
# treated as (invalid) Python string escapes; the compiled pattern is unchanged.
REGEX_INDEX = re.compile(
    rb'^(?:line_(\d+) )?(.+) \d+ ([\de\-\.]+) (\d+) (\d+) (\d+) (\d+)')
......@@ -12,13 +13,14 @@ REGEX_INDEX = re.compile(
# Module-level logger, named after this module's import path
logger = logging.getLogger(__name__)
def import_indexes(image, index_path, extension='jpg'):
def import_indexes(image, page, index_path, extension='jpg'):
"""
Import indexes from file
One gzipped index file per image
Format: #line word n/a score-[0,1] x1 y1 x2 y2
"""
assert isinstance(image, Image)
assert isinstance(page, Page)
assert os.path.exists(index_path), \
'Missing index {}'.format(index_path)
assert index_path.endswith('.idx.gz')
......@@ -54,7 +56,7 @@ def import_indexes(image, index_path, extension='jpg'):
logger.info('Created {} zones'.format(len(new_zones)))
# Create transcriptions
new_transcriptions = bulk_transcriptions(image, lines)
new_transcriptions = bulk_transcriptions(image, page, lines)
logger.info('Created {} transcriptions '.format(len(new_transcriptions)))
# Index all transcriptions into ES
......@@ -135,3 +137,36 @@ def bulk_transcriptions(image, page, items):
)
return all_ts
class IndexImporter(object):
    """
    Import index files (.idx.gz) as transcriptions.

    The importer is given either a single index file or a directory; a
    directory is searched recursively for ``*.idx.gz`` files. For each index
    file, the matching image and its page are looked up and handed to
    ``import_indexes``.
    """

    def __init__(self, path):
        """
        Store the file or directory to import from.

        Raises AssertionError when the path does not exist.
        """
        # Same assert-based validation as import_indexes, with a message so
        # the failing path shows up in the traceback
        assert os.path.exists(path), 'Missing path {}'.format(path)
        self.path = path

    def get_index_paths(self):
        """
        Yield the path of every index file to import.

        For a directory, yields each ``*.idx.gz`` file found while walking it,
        in sorted order. For anything else, yields the resolved real path of
        ``self.path`` itself, without checking its extension.
        """
        # Support single file & directories
        if os.path.isdir(self.path):
            for root, dirnames, filenames in os.walk(self.path):
                # Sorting in place makes os.walk descend into subdirectories
                # in a stable order, so imports are reproducible across runs
                dirnames.sort()
                for filename in sorted(fnmatch.filter(filenames, "*.idx.gz")):
                    yield os.path.join(root, filename)
        else:
            yield os.path.realpath(self.path)

    def get_image(self, path):
        """
        Return the Image whose path contains the index file's base name.

        NOTE(review): this looks like a Django ``get`` lookup, which would
        raise unless exactly one image matches - confirm against the model.
        """
        # Remove path and .idx.gz extension
        image_id = '.'.join(os.path.basename(path).split('.')[:-2])
        return Image.objects.get(path__contains=image_id)

    def get_page(self, image):
        """
        Return the page attached to the first Page-typed element of an image.

        Raises AssertionError when ``image`` is not an Image, or when the
        image has no element of type ElementType.Page.
        """
        assert isinstance(image, Image)
        element = image.elements.filter(type=ElementType.Page).first()
        # first() gives None on an empty result; fail with a clear message
        # instead of an AttributeError on None
        assert element is not None, \
            'No page element found for image {}'.format(image)
        return element.page

    def run(self):
        """Import every index file found under ``self.path``."""
        for index_path in self.get_index_paths():
            logger.info("Parsing index file {}".format(index_path))
            image = self.get_image(index_path)
            page = self.get_page(image)
            import_indexes(image, page, index_path)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment