Commit 9c249430 authored by Bastien Abadie

Merge branch 'remove-from-csv' into 'master'

Remove ancestral Himanis import

See merge request !495
parents ec14fdea 43b41a4c
#!/usr/bin/env python3
"""
Import Himanis volumes from CSV file
"""
from django.core.management.base import BaseCommand
from arkindex.documents.tasks import import_manifest, import_annotations_csv
import logging
import csv

logging.basicConfig(
    format='[%(levelname)s] %(processName)-9s: %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)


class Command(BaseCommand):
    help = "Import Himanis volumes from CSV file as an asynchronous task."

    def add_arguments(self, parser):
        parser.add_argument(
            'csv',
            help='CSV path with manifests + annotations',
            type=open,
        )
        parser.add_argument(
            '-i', '--index-root',
            help='Root folder for indexes (/home/data/indexes)',
            default='.',
        )
        parser.add_argument(
            '--corpus-id',
            required=True,
            help='ID of corpus to import volumes into',
        )
        parser.add_argument(
            '--source-id',
            required=True,
            help='ID of source to import index files into',
        )
        parser.add_argument(
            '--col-name',
            help='Index of the volume name column',
            type=int,
            default=0,
        )
        parser.add_argument(
            '--col-url',
            help='Index of the manifest URL column',
            type=int,
            default=1,
        )
        parser.add_argument(
            '--col-path',
            help='Index of the index files path column',
            type=int,
            default=2,
        )
        parser.add_argument(
            '--col-source',
            help='Index of the index source column',
            type=int,
            default=3,
        )

    def handle(self, *args, **options):
        # Handle verbosity level
        verbosity = int(options['verbosity'])
        if verbosity > 1:
            logger.setLevel(logging.DEBUG)

        # Read CSV from management file
        logger.debug('Opening CSV: {}'.format(options['csv'].name))
        csvreader = csv.reader(options['csv'])
        next(csvreader)  # skip header
        csv_data = list(filter(None, csvreader))
        assert len(csv_data) > 0, 'No data in csv!'

        col_name = options['col_name']
        col_url = options['col_url']
        col_path = options['col_path']
        col_source = options['col_source']

        # Import both manifests & annotations
        # from a parsed csv file (or whatever table)
        for row in csv_data:
            try:
                import_manifest(
                    row[col_url],
                    offline=True,
                    annotations=False,
                    volume_name=row[col_name],
                    corpus_id=options['corpus_id'],
                )
            except Exception as e:
                logger.error('Failed to import manifest: {}'.format(e))
            try:
                import_annotations_csv(
                    row[col_name],
                    row[col_path],
                    row[col_source],
                    index_root=options['index_root'],
                    datasource_id=options['source_id'],
                )
            except Exception as e:
                logger.error('Failed to import annotations: {}'.format(e))
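
For reference, the command expects a CSV with a header row and four columns (volume name, manifest URL, index files path, index source, in that default order). A minimal sketch of how it could have been invoked from Python follows; the command name import_himanis, the file volumes.csv and both UUIDs are placeholders, since the actual module name is not shown in this diff:

from django.core.management import call_command

# Sketch only: command name, CSV file and IDs below are placeholders.
call_command(
    'import_himanis',
    'volumes.csv',
    corpus_id='<corpus-uuid>',
    source_id='<recognizer-datasource-uuid>',
    index_root='/home/data/indexes',
)

The two helpers it calls, import_manifest and import_annotations_csv, are the implementations removed from arkindex.documents.tasks, shown below.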
from arkindex_common.ml_tool import MLToolType
from arkindex.documents.models import Element, ElementType, DataSource
from arkindex.images.importer import IndexImporter, GallicaIndexImporter
from arkindex.dataimport.iiif import IIIFParser
import logging
import os

logger = logging.getLogger(__name__)


def import_manifest(*args, **kwargs):
    """
    Import a IIIF manifest or collection from a local file or a URL using IIIFParser.
    All arguments are directly passed to IIIFParser's constructor,
    except build_thumbnail.
    """
    mp = IIIFParser.load(*args, **kwargs)
    volume = mp.run()
    return volume.id


def import_annotations_csv(name, raw_path, source, datasource_id, index_root):
    """
    Import annotations from a CSV line
    """
    if raw_path.startswith('/home/data/indexes'):
        raw_path = raw_path[18:]
    raw_path = raw_path.lstrip('/')
    index_path = os.path.join(index_root, raw_path)
    assert os.path.exists(index_path), \
        'Index files path not found for volume {}: {}'.format(name, index_path)

    volume = Element.objects.get(name=name, type=ElementType.Volume)
    datasource = DataSource.objects.get(id=datasource_id, type=MLToolType.Recognizer)

    if source == 'bvmm':
        logger.info("Importing BVMM annotations for volume {}".format(name))
        importer = IndexImporter(index_path, volume, datasource, mask=r'(?:.*/)?([^/]+)_[A-Z]\.idx\.gz')
    elif source == 'gallica':
        logger.info("Importing Gallica annotations for volume {}".format(name))
        importer = GallicaIndexImporter(index_path, volume, datasource)
    else:  # Try anyway
        logger.info("Importing unknown annotations for volume {}".format(name))
        importer = IndexImporter(index_path, volume, datasource)
    importer.run()
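
Note on the path handling in import_annotations_csv: when a CSV row stores an absolute production path, the '/home/data/indexes' prefix is stripped and the remainder is re-rooted under --index-root. As an illustrative example, a raw path of '/home/data/indexes/gallica/vol42' combined with --index-root '/mnt/indexes' resolves to '/mnt/indexes/gallica/vol42'; with the default index root of '.', the same row resolves to './gallica/vol42' relative to the working directory.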