Skip to content
Snippets Groups Projects
Commit 272479dc authored by Bastien Abadie's avatar Bastien Abadie
Browse files

Merge branch 'optimize-solr-sql' into 'master'

Optimize SQL queries for Solr reindexation

Closes #777

See merge request !1390
parents a2f81217 f3e3a194
No related branches found
No related tags found
1 merge request!1390Optimize SQL queries for Solr reindexation
......@@ -10,15 +10,55 @@ from SolrClient.exceptions import SolrError
from teklia_toolbox.time import Timer
from arkindex.documents.models import Element
from arkindex.project.tools import CounterIterator
logger = logging.getLogger(__name__)
solr = SolrClient(settings.SOLR_API_URL)
# The SQL query that yields all indexable elements.
# Both indexable elements and all their children can be indexed, so child elements keep a link to their parent element
# using the parent_id, parent_name and parent_type columns. For parent elements, the columns are therefore duplicated.
PARENT_QUERY = """
SELECT
element.id AS parent_id,
element.name AS parent_name,
elementtype.display_name AS parent_type,
element.id AS id,
element.name AS name,
elementtype.display_name AS type_name
FROM documents_element element
INNER JOIN documents_elementtype elementtype ON (elementtype.id = element.type_id)
WHERE element.corpus_id = %(corpus)s
AND elementtype.indexable
ORDER BY element.id
"""
# The SQL query that yields all child elements of indexable elements.
ELEMENTS_QUERY = f"""
WITH parent AS ({PARENT_QUERY})
SELECT
parent_id,
parent_name,
parent_type,
element.id as id,
element.name as name,
elementtype.display_name as type_name
FROM (SELECT * FROM parent LIMIT %(limit)s OFFSET %(offset)s) AS parent_chunk
INNER JOIN documents_elementpath as elementpath ON (elementpath.path @> ARRAY[parent_chunk.id])
INNER JOIN documents_element as element ON (elementpath.element_id = element.id)
INNER JOIN documents_elementtype elementtype ON (element.type_id = elementtype.id)
"""
class Indexer(object):
# Chunk of element processed (can generate many more documents)
# The query yielding all the elements to run on will look for all the child elements of all indexable elements
# The joins can take a very long time, so the query gets split into one to fetch all the indexable elements,
# then one to fetch the child elements of {sql_chunk_size} indexable elements using LIMIT and OFFSET.
sql_chunk_size = 10000
# Number of elements to load in Python from all of the SQL queries (can generate many more documents)
elements_chunk_size = 100
# Chunk of documents sent to Solr
......@@ -123,35 +163,28 @@ class Indexer(object):
logger.info(f'Dropped index for {self.collection_name}')
def get_elements(self):
"""
Use a SQL query to keep the link between parent and child across element paths (a parent is also linked to itself)
This query lists the parent and child information (id, name and type name)
The WHERE clause limits the selection to a single corpus and limits the parent elements to indexable elements
"""
query = f"""
SELECT parent.id as parent_id,
parent.name as parent_name,
parenttype.display_name as parent_type,
element.id as id,
element.name as name,
elementtype.display_name as type_name
FROM documents_element as element
LEFT OUTER JOIN documents_elementpath as elementpath ON (element.id = elementpath.element_id)
LEFT OUTER JOIN documents_element as parent ON (
elementpath.path @> ARRAY[parent.id]
OR element.id = parent.id
)
INNER JOIN documents_elementtype parenttype ON (parent.type_id = parenttype.id)
INNER JOIN documents_elementtype elementtype ON (element.type_id = elementtype.id)
WHERE element.corpus_id = '{self.corpus_id}' AND parenttype.indexable
"""
return Element.objects.raw(query)
# First make the query that returns all indexable elements.
parents = Element.objects.raw(PARENT_QUERY, params={'corpus': self.corpus_id}).iterator()
# Keep track of the parents count without using SELECT COUNT(*) or storing everything in RAM,
# as we will need it to split the next queries into chunks.
iterator = CounterIterator(parents)
yield from iterator
# Execute one SQL query for each chunk of parent indexable elements.
# This query returns one row for each child element of each parent element,
# which can quickly makes joins very slow; chunking makes the issue manageable.
# We use LIMIT and OFFSET to perform the chunking as sending boatloads of element UUIDs
# back and forth between Python and the DB can be quite slow.
for offset in range(0, len(iterator), self.sql_chunk_size):
yield from Element.objects.raw(ELEMENTS_QUERY, params={
'corpus': self.corpus_id,
'limit': self.sql_chunk_size,
'offset': offset,
}).iterator()
def elements_chunk(self, elements):
element_iterator = elements.iterator()
while True:
chunk = tuple(itertools.islice(element_iterator, self.elements_chunk_size))
chunk = tuple(itertools.islice(elements, self.elements_chunk_size))
if not chunk:
return
yield chunk
......@@ -230,7 +263,7 @@ class Indexer(object):
def index(self):
"""
Insert items into Solr
Process elements with an indexable type and their children by chunck
Process elements with an indexable type and their children by chunk
For each chunk:
- load their dependencies (transcriptions, classifications...)
- serialize items in a Solr document
......
from collections.abc import Iterable, Sized
from collections.abc import Iterable, Iterator, Sized
from django.db.models import CharField, Func
from django.urls import reverse
......@@ -108,6 +108,28 @@ class BulkMap(Sized, Iterable):
return iter(self.func(list(self.iterable)))
class CounterIterator(Sized, Iterator):
"""
Iterate over any iterator while keeping count of how many items were iterated on.
This allows accessing the total amount of items in an iterator once a full iteration is completed
without storing any item in memory.
"""
def __init__(self, iterable):
self.iterator = iter(iterable)
self.count = 0
def __next__(self):
# This can fail with StopIteration if the iteration ends.
item = next(self.iterator)
# It didn't fail, so we can increment the counter.
self.count += 1
return item
def __len__(self):
return self.count
class RTrimChr(Func):
"""
Django's RTrim function only allows trimming spaces.
......
SELECT parent.id as parent_id,
parent.name as parent_name,
parenttype.display_name as parent_type,
SELECT element.id AS parent_id,
element.name AS parent_name,
elementtype.display_name AS parent_type,
element.id AS id,
element.name AS name,
elementtype.display_name AS type_name
FROM documents_element element
INNER JOIN documents_elementtype elementtype ON (elementtype.id = element.type_id)
WHERE element.corpus_id = '{corpus_id}'::uuid
AND elementtype.indexable
ORDER BY element.id ;
WITH parent AS
(SELECT element.id AS parent_id,
element.name AS parent_name,
elementtype.display_name AS parent_type,
element.id AS id,
element.name AS name,
elementtype.display_name AS type_name
FROM documents_element element
INNER JOIN documents_elementtype elementtype ON (elementtype.id = element.type_id)
WHERE element.corpus_id = '{corpus_id}'::uuid
AND elementtype.indexable
ORDER BY element.id)
SELECT parent_id,
parent_name,
parent_type,
element.id as id,
element.name as name,
elementtype.display_name as type_name
FROM documents_element as element
LEFT OUTER JOIN documents_elementpath as elementpath ON (element.id = elementpath.element_id)
LEFT OUTER JOIN documents_element as parent ON (elementpath.path @ > ARRAY[parent.id]
OR element.id = parent.id)
INNER JOIN documents_elementtype parenttype ON (parent.type_id = parenttype.id)
INNER JOIN documents_elementtype elementtype ON (element.type_id = elementtype.id)
WHERE element.corpus_id = '{corpus_id}'
AND parenttype.indexable ;
FROM
(SELECT *
FROM parent
LIMIT 10000
OFFSET 0) AS parent_chunk
INNER JOIN documents_elementpath as elementpath ON (elementpath.path @ > ARRAY[parent_chunk.id])
INNER JOIN documents_element as element ON (elementpath.element_id = element.id)
INNER JOIN documents_elementtype elementtype ON (element.type_id = elementtype.id) ;
SELECT "documents_element"."id",
"documents_element"."zone_id"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment