Skip to content
Snippets Groups Projects
Commit 56df3cfe authored by Erwan Rouchet
Browse files

Add custom mask and regex mask

parent 3fde7df0
No related branches found
No related tags found
1 merge request: !22 "Add score to transcriptions"
#!/usr/bin/env python3
from django.core.management.base import BaseCommand
from django.core.management.base import BaseCommand, CommandError
from arkindex.images.importer import IndexImporter
import logging
import re
logging.basicConfig(
level=logging.INFO,
......@@ -24,6 +25,16 @@ class Command(BaseCommand):
action='store_true',
default=False,
)
parser.add_argument(
'--mask',
help='A mask to identify images from the index file path. Cannot be used with --regex.'
'Syntax: "something<ID>something" '
'<ID> will be used as the image ID.'
)
parser.add_argument(
'--regex',
help='A regex to use as a mask for more complex cases. Must have only one capturing group. Cannot be used with --mask.'
)
def handle(self, *args, **options):
# Handle verbosity level
......@@ -32,7 +43,20 @@ class Command(BaseCommand):
if verbosity > 1:
root_logger.setLevel(logging.DEBUG)
importer = IndexImporter(options['index_folder'])
# Handle mask
if options['mask'] is not None and options['regex'] is not None:
raise CommandError('--mask and --regex cannot be used simultaneously.')
# If --regex is set, use it, else use the default mask.
mask_regex = options['regex'] or IndexImporter.DEFAULT_MASK
if options['mask'] is not None:
mask = options['mask']
assert mask.count('<ID>') == 1
# Replace <ID> with (.+) and escape the rest
mask_regex = '^' + r'(.+)'.join(re.escape(p) for p in mask.split('<ID>')) + '$'
importer = IndexImporter(options['index_folder'], mask=mask_regex)
if options['dry_run']:
importer.dry_run()
else:
......
......@@ -142,9 +142,12 @@ def bulk_transcriptions(image, page, items):
class IndexImporter(object):
"""Import index files (.idx.gz) as transcriptions."""
def __init__(self, path):
DEFAULT_MASK = r'(?:.*/)?([^/]+)\.idx\.gz'
def __init__(self, path, mask=DEFAULT_MASK):
assert os.path.exists(path)
self.path = path
self.mask = re.compile(mask)
def get_index_paths(self):
# Support single file & directories
......@@ -156,10 +159,12 @@ class IndexImporter(object):
yield os.path.realpath(self.path)
def get_image(self, path):
    """Return the image matching the ID extracted from an index file path.

    The compiled ``self.mask`` pattern is applied to ``path``; its first
    match is used as the image ID, and the image whose path contains that
    ID (case-insensitively) is returned.

    Raises ``Image.DoesNotExist`` when the mask does not match ``path``.
    The final ``Image.objects.get`` lookup may raise as well when it does
    not find exactly one image.
    """
    # With a single capturing group in the pattern, findall() returns the
    # text captured by that group for each match.
    matches = self.mask.findall(path)
    if not matches:
        logger.debug('Mask did not match path {}'.format(path))
        # Report an unmatched path the same way as a missing image, so
        # callers only have one "not found" case to handle.
        raise Image.DoesNotExist

    image_id = matches[0]
    logger.debug('Matched {} for path {}'.format(image_id, path))
    return Image.objects.get(path__icontains=image_id)
def get_page(self, image):
......@@ -167,6 +172,7 @@ class IndexImporter(object):
return image.elements.filter(type=ElementType.Page).first().page
def run(self):
logger.debug('Using mask {}'.format(self.mask))
for index_path in self.get_index_paths():
logger.info("Parsing index file {}".format(index_path))
try:
......@@ -181,8 +187,8 @@ class IndexImporter(object):
logger.warning("Multiple associated images found for file {}".format(index_path))
def dry_run(self):
image = None
for index_path in self.get_index_paths():
image = None
try:
image = self.get_image(index_path)
except (Image.DoesNotExist, Image.MultipleObjectsReturned):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment