Skip to content
Snippets Groups Projects
Commit d94722e9 authored by Bastien Abadie's avatar Bastien Abadie
Browse files

Use bulk insert for indexes

parent 44488df5
No related branches found
No related tags found
No related merge requests found
certifi==2017.7.27.1
chardet==3.0.4
Django==1.11.6
djangorestframework==3.7.1
elasticsearch==5.4.0
idna==2.6
olefile==0.44
Pillow==4.3.0
pkg-resources==0.0.0
pytz==2017.2
requests==2.18.4
urllib3==1.22
......@@ -30,8 +30,17 @@ class Command(BaseCommand):
)
def handle(self, *args, **options):
imported = 0
for path in options['files']:
self.import_indexes(path, options['iiif_base_url'])
try:
self.import_indexes(path, options['iiif_base_url'])
imported += 1
except Exception as e:
logger.error(e)
logger.info('Imported {}/{} index files'.format(
imported,
len(options['files']),
))
def import_indexes(self, index_path, iiif_base_url, extension='jpg'):
"""
......@@ -50,10 +59,13 @@ class Command(BaseCommand):
extension,
)
image, _ = Image.objects.get_or_create(iiif_url=url)
assert image.check_source(), \
'Image not found {}'.format(url)
logger.info('Image {}'.format(image))
# Import each index
with gzip.open(index_path) as f:
indexes = []
for line in f.readlines():
if line.decode('utf-8').startswith('#'):
continue
......@@ -63,7 +75,7 @@ class Command(BaseCommand):
else:
# Store index
data = index.groups()
image.add_index(
indexes.append(image.build_index(
line=int(data[0]),
word=data[1].decode('utf-8'),
score=float(data[2]),
......@@ -73,4 +85,6 @@ class Command(BaseCommand):
int(data[5]),
int(data[6]),
],
)
))
image.store_indexes(indexes)
from django.db import models
from django.conf import settings
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk as es_bulk, scan as es_scan
import requests
import hashlib
import json
import base64
......@@ -19,9 +21,17 @@ class Image(models.Model):
def __str__(self):
return '{} - {}'.format(self.id, self.iiif_url)
def add_index(self, line, word, score, box):
def check_source(self):
"""
Store an index in ElasticSearch
Check the image is available through this url
"""
info_url = '{}/info.json'.format(self.iiif_url)
resp = requests.get(info_url, allow_redirects=True)
return resp.ok
def build_index(self, line, word, score, box):
"""
Build an index to store in ElasticSearch
"""
assert isinstance(line, int)
assert isinstance(word, str)
......@@ -41,16 +51,26 @@ class Image(models.Model):
h.update(json.dumps(payload, sort_keys=True).encode('utf-8'))
index_id = base64.urlsafe_b64encode(h.digest()).decode('utf-8')
# Insert in ElasticSearch database
# To insert in ElasticSearch database
# using the calculated index to avoid duplicates
return (index_id, payload)
def store_indexes(self, indexes):
"""
Store indexes in Bulk mode
"""
assert isinstance(indexes, list)
# Build raw ElasticSearch insert
actions = [{
'_index': 'volume_xxx', # TODO: use volume ?
'_type': 'word',
'_source': index_data,
'_id': index_id,
} for index_id, index_data in indexes]
# Run actions in bulk
elastic = Elasticsearch(settings.ELASTIC_SEARCH_HOSTS)
res = elastic.index(
index='volume_xxx', # TODO: use volume ?
doc_type='word',
id=index_id,
body=payload,
)
assert res['result'] in ('created', 'updated'), \
'Failed to insert word index {} : {}'.format(index_id, res)
return res['created']
nb_insert, _ = es_bulk(elastic, actions)
return nb_insert
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment