Skip to content
Snippets Groups Projects
Commit 43235f20 authored by Erwan Rouchet's avatar Erwan Rouchet
Browse files

Move script to Django admin command

parent ca239e78
No related branches found
No related tags found
1 merge request: !24 "Import volumes from CSV"
#!/usr/bin/env python3
"""
Import Himanis volumes from CSV file
"""
from django.core.management.base import BaseCommand
from multiprocessing.pool import Pool
from arkindex.documents.models import Element, ElementType
from arkindex.documents.importer import URLManifestsImporter
from arkindex.images.importer import IndexImporter, GallicaIndexImporter
import os
import django
import csv
import logging
# Root logging configuration: INFO and above, each record prefixed with its
# level and the name of the emitting process (left-aligned, padded to 9 chars).
logging.basicConfig(
    level=logging.INFO,
    format='[%(levelname)s] %(processName)-9s: %(message)s',
)

# Logger named after this module, used by everything below.
logger = logging.getLogger(__name__)
class Command(BaseCommand):
    """
    Django admin command importing Himanis volumes listed in a CSV file.

    The first CSV row is treated as a header and skipped. Each following row
    is read positionally: column 0 is the volume name, column 1 the manifest
    URL, column 2 the raw index path and column 3 the annotation source.
    """

    help = "Import Himanis volumes from CSV file. May require the Arkindex backend in PATH."

    # Both import helpers are static: they take no instance state and are
    # dispatched through Pool.starmap with explicit positional arguments.
    # As plain methods accessed via `self.`, the instance would be injected
    # as an extra first argument and every call would raise a TypeError.
    @staticmethod
    def import_manifest(url, name):
        """
        Import a manifest from a given URL with a given volume name.

        url: URL of the manifest to import
        name: Volume name
        """
        logger.info('Importing volume {} from {}'.format(name, url))
        URLManifestsImporter([], url, offline=True, volume_name=name).run()

    @staticmethod
    def import_annotations(source, raw_path, name, index_root):
        """
        Import annotations.

        source: 'bvmm' or 'gallica'
        raw_path: Raw path to index file
        name: Volume name
        index_root: Root folder for index files

        Raises AssertionError when the resolved index path does not exist.
        """
        # Drop the original server prefix, then any leading slash, so that the
        # path is relative and os.path.join does not discard index_root.
        prefix = '/home/data/indexes'
        if raw_path.startswith(prefix):
            raw_path = raw_path[len(prefix):]
        raw_path = raw_path.lstrip('/')
        index_path = os.path.join(index_root, raw_path)
        assert os.path.exists(index_path), 'Index path {} does not exist'.format(index_path)

        volume = Element.objects.get(name=name, type=ElementType.Volume)
        if source == 'bvmm':
            IndexImporter(index_path, volume, mask=r'(?:.*/)?([^/]+)_[A-Z]\.idx\.gz').run()
        elif source == 'gallica':
            GallicaIndexImporter(index_path, volume).run()
        else:  # Try anyway
            IndexImporter(index_path, volume).run()

    def add_arguments(self, parser):
        """Declare the CSV path plus the worker count and index root options."""
        parser.add_argument('csv')
        parser.add_argument(
            '-p', '--processes',
            type=int,
            help='Maximum number of worker processes for multiprocessing, defaults to 4',
            default=4,
        )
        parser.add_argument(
            '-i', '--index-root',
            help='Root folder for indexes (/home/data/indexes)',
            default='.',
        )

    def handle(self, *args, **options):
        """
        Read the CSV, then import every manifest followed by every annotation set.

        All manifests are imported before any annotation, since annotations
        look up the volume by name in the database.
        """
        # Handle verbosity level
        verbosity = int(options['verbosity'])
        if verbosity > 1:
            logger.setLevel(logging.DEBUG)

        logger.debug('Opening CSV')
        # newline='' is what the csv module documents for files given to csv.reader
        with open(options['csv'], newline='') as csvfile:
            csvreader = csv.reader(csvfile)
            # Skip the header row; the default keeps an empty file from raising StopIteration
            next(csvreader, None)
            data = list(csvreader)

        logger.debug('Setting up Django')
        os.environ.setdefault("DJANGO_SETTINGS_MODULE", "arkindex.project.settings")
        django.setup()

        with Pool(options['processes']) as pool:
            logger.info('Importing all manifests')
            pool.starmap(self.import_manifest, [(row[1], row[0]) for row in data])
            logger.info('Importing all annotations')
            # Parsed options live in `options`; `args` is the positional tuple
            pool.starmap(
                self.import_annotations,
                [(row[3], row[2], row[0], options['index_root']) for row in data],
            )
#!/usr/bin/env python3
"""
Import Himanis volumes from CSV file
"""
import argparse
import os
import django
import csv
import logging
import re
from multiprocessing.pool import Pool
# Root logging configuration: INFO and above, each record prefixed with its
# level and the name of the emitting process (left-aligned, padded to 9 chars).
logging.basicConfig(
    level=logging.INFO,
    format='[%(levelname)s] %(processName)-9s: %(message)s',
)

# Logger named after this module, shared by the functions below.
logger = logging.getLogger(__name__)
def import_manifest(url, name):
    """
    Import one manifest and store it under the given volume name.

    url: URL of the manifest to import
    name: Volume name
    """
    # Imported inside the function rather than at module level; presumably
    # because main() only configures Django just before the pool starts.
    from arkindex.documents.importer import URLManifestsImporter

    message = 'Importing volume {} from {}'.format(name, url)
    logger.info(message)

    importer = URLManifestsImporter([], url, offline=True, volume_name=name)
    importer.run()
def import_annotations(source, raw_path, name, index_root):
    """
    Import annotations.

    source: 'bvmm' or 'gallica'
    raw_path: Raw path to index file
    name: Volume name
    index_root: Root folder for index files

    Raises AssertionError when the resolved index path does not exist.
    """
    from arkindex.documents.models import Element, ElementType
    # GallicaIndexImporter is needed by the 'gallica' branch below; without
    # this import that branch raised a NameError.
    from arkindex.images.importer import IndexImporter, GallicaIndexImporter

    # Drop the original server prefix, then any leading slash, so that the
    # path is relative and os.path.join does not discard index_root.
    prefix = '/home/data/indexes'
    if raw_path.startswith(prefix):
        raw_path = raw_path[len(prefix):]
    raw_path = raw_path.lstrip('/')
    index_path = os.path.join(index_root, raw_path)
    assert os.path.exists(index_path), 'Index path {} does not exist'.format(index_path)

    volume = Element.objects.get(name=name, type=ElementType.Volume)
    if source == 'bvmm':
        IndexImporter(index_path, volume, mask=r'(?:.*/)?([^/]+)_[A-Z]\.idx\.gz').run()
    elif source == 'gallica':
        GallicaIndexImporter(index_path, volume).run()
    else:  # Try anyway
        IndexImporter(index_path, volume).run()
def main():
    """
    Script entry point: parse the command line, load the CSV, run the imports.

    The first CSV row is skipped as a header. All manifests are imported
    first, then all annotations, each step spread over a process pool.
    """
    cli = argparse.ArgumentParser(
        description="Import Himanis volumes from CSV file. May require the Arkindex backend in PATH.",
    )
    cli.add_argument('csv')
    cli.add_argument(
        '-p', '--processes',
        default=4,
        type=int,
        help='Maximum number of worker processes for multiprocessing, defaults to 4',
    )
    cli.add_argument(
        '-i', '--index-root',
        default='.',
        help='Root folder for indexes (/home/data/indexes)',
    )
    settings = vars(cli.parse_args())

    logger.debug('Opening CSV')
    with open(settings['csv']) as handle:
        reader = csv.reader(handle)
        next(reader)  # discard the header row
        rows = list(reader)

    logger.debug('Setting up Django')
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "arkindex.project.settings")
    django.setup()

    with Pool(settings['processes']) as workers:
        logger.info('Importing all manifests')
        # (manifest URL, volume name) for each row
        manifest_jobs = [(row[1], row[0]) for row in rows]
        workers.starmap(import_manifest, manifest_jobs)

        logger.info('Importing all annotations')
        # (source, raw index path, volume name, index root) for each row
        annotation_jobs = [
            (row[3], row[2], row[0], settings['index_root'])
            for row in rows
        ]
        workers.starmap(import_annotations, annotation_jobs)
# Run the import only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment