Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
Backend
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Deploy
Releases
Container Registry
Analyze
Contributor analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Arkindex
Backend
Commits
272479dc
Commit
272479dc
authored
3 years ago
by
Bastien Abadie
Browse files
Options
Downloads
Plain Diff
Merge branch 'optimize-solr-sql' into 'master'
Optimize SQL queries for Solr reindexation Closes
#777
See merge request
!1390
parents
a2f81217
f3e3a194
No related branches found
Branches containing commit
No related tags found
Tags containing commit
1 merge request
!1390
Optimize SQL queries for Solr reindexation
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
arkindex/documents/indexer_v2.py
+61
-28
61 additions, 28 deletions
arkindex/documents/indexer_v2.py
arkindex/project/tools.py
+23
-1
23 additions, 1 deletion
arkindex/project/tools.py
arkindex/sql_validation/indexer_prefetch_v2.sql
+35
-11
35 additions, 11 deletions
arkindex/sql_validation/indexer_prefetch_v2.sql
with
119 additions
and
40 deletions
arkindex/documents/indexer_v2.py
+
61
−
28
View file @
272479dc
...
...
@@ -10,15 +10,55 @@ from SolrClient.exceptions import SolrError
from
teklia_toolbox.time
import
Timer
from
arkindex.documents.models
import
Element
from
arkindex.project.tools
import
CounterIterator
logger
=
logging
.
getLogger
(
__name__
)
solr
=
SolrClient
(
settings
.
SOLR_API_URL
)
# The SQL query that yields all indexable elements.
# Both indexable elements and all their children can be indexed, so child elements keep a link to their parent element
# using the parent_id, parent_name and parent_type columns. For parent elements, the columns are therefore duplicated.
PARENT_QUERY
=
"""
SELECT
element.id AS parent_id,
element.name AS parent_name,
elementtype.display_name AS parent_type,
element.id AS id,
element.name AS name,
elementtype.display_name AS type_name
FROM documents_element element
INNER JOIN documents_elementtype elementtype ON (elementtype.id = element.type_id)
WHERE element.corpus_id = %(corpus)s
AND elementtype.indexable
ORDER BY element.id
"""
# The SQL query that yields all child elements of indexable elements.
ELEMENTS_QUERY
=
f
"""
WITH parent AS (
{
PARENT_QUERY
}
)
SELECT
parent_id,
parent_name,
parent_type,
element.id as id,
element.name as name,
elementtype.display_name as type_name
FROM (SELECT * FROM parent LIMIT %(limit)s OFFSET %(offset)s) AS parent_chunk
INNER JOIN documents_elementpath as elementpath ON (elementpath.path @> ARRAY[parent_chunk.id])
INNER JOIN documents_element as element ON (elementpath.element_id = element.id)
INNER JOIN documents_elementtype elementtype ON (element.type_id = elementtype.id)
"""
class
Indexer
(
object
):
# Chunk of element processed (can generate many more documents)
# The query yielding all the elements to run on will look for all the child elements of all indexable elements
# The joins can take a very long time, so the query gets split into one to fetch all the indexable elements,
# then one to fetch the child elements of {sql_chunk_size} indexable elements using LIMIT and OFFSET.
sql_chunk_size
=
10000
# Number of elements to load in Python from all of the SQL queries (can generate many more documents)
elements_chunk_size
=
100
# Chunk of documents sent to Solr
...
...
@@ -123,35 +163,28 @@ class Indexer(object):
logger
.
info
(
f
'
Dropped index for
{
self
.
collection_name
}
'
)
def
get_elements
(
self
):
"""
Use a SQL query to keep the link between parent and child across element paths (a parent is also linked to itself)
This query lists the parent and child information (id, name and type name)
The WHERE clause limits the selection to a single corpus and limits the parent elements to indexable elements
"""
query
=
f
"""
SELECT parent.id as parent_id,
parent.name as parent_name,
parenttype.display_name as parent_type,
element.id as id,
element.name as name,
elementtype.display_name as type_name
FROM documents_element as element
LEFT OUTER JOIN documents_elementpath as elementpath ON (element.id = elementpath.element_id)
LEFT OUTER JOIN documents_element as parent ON (
elementpath.path @> ARRAY[parent.id]
OR element.id = parent.id
)
INNER JOIN documents_elementtype parenttype ON (parent.type_id = parenttype.id)
INNER JOIN documents_elementtype elementtype ON (element.type_id = elementtype.id)
WHERE element.corpus_id =
'
{
self
.
corpus_id
}
'
AND parenttype.indexable
"""
return
Element
.
objects
.
raw
(
query
)
# First make the query that returns all indexable elements.
parents
=
Element
.
objects
.
raw
(
PARENT_QUERY
,
params
=
{
'
corpus
'
:
self
.
corpus_id
}).
iterator
()
# Keep track of the parents count without using SELECT COUNT(*) or storing everything in RAM,
# as we will need it to split the next queries into chunks.
iterator
=
CounterIterator
(
parents
)
yield
from
iterator
# Execute one SQL query for each chunk of parent indexable elements.
# This query returns one row for each child element of each parent element,
# which can quickly makes joins very slow; chunking makes the issue manageable.
# We use LIMIT and OFFSET to perform the chunking as sending boatloads of element UUIDs
# back and forth between Python and the DB can be quite slow.
for
offset
in
range
(
0
,
len
(
iterator
),
self
.
sql_chunk_size
):
yield
from
Element
.
objects
.
raw
(
ELEMENTS_QUERY
,
params
=
{
'
corpus
'
:
self
.
corpus_id
,
'
limit
'
:
self
.
sql_chunk_size
,
'
offset
'
:
offset
,
}).
iterator
()
def
elements_chunk
(
self
,
elements
):
element_iterator
=
elements
.
iterator
()
while
True
:
chunk
=
tuple
(
itertools
.
islice
(
element
_iterator
,
self
.
elements_chunk_size
))
chunk
=
tuple
(
itertools
.
islice
(
element
s
,
self
.
elements_chunk_size
))
if
not
chunk
:
return
yield
chunk
...
...
@@ -230,7 +263,7 @@ class Indexer(object):
def
index
(
self
):
"""
Insert items into Solr
Process elements with an indexable type and their children by chun
c
k
Process elements with an indexable type and their children by chunk
For each chunk:
- load their dependencies (transcriptions, classifications...)
- serialize items in a Solr document
...
...
This diff is collapsed.
Click to expand it.
arkindex/project/tools.py
+
23
−
1
View file @
272479dc
from
collections.abc
import
Iterable
,
Sized
from
collections.abc
import
Iterable
,
Iterator
,
Sized
from
django.db.models
import
CharField
,
Func
from
django.urls
import
reverse
...
...
@@ -108,6 +108,28 @@ class BulkMap(Sized, Iterable):
return
iter
(
self
.
func
(
list
(
self
.
iterable
)))
class
CounterIterator
(
Sized
,
Iterator
):
"""
Iterate over any iterator while keeping count of how many items were iterated on.
This allows accessing the total amount of items in an iterator once a full iteration is completed
without storing any item in memory.
"""
def
__init__
(
self
,
iterable
):
self
.
iterator
=
iter
(
iterable
)
self
.
count
=
0
def
__next__
(
self
):
# This can fail with StopIteration if the iteration ends.
item
=
next
(
self
.
iterator
)
# It didn't fail, so we can increment the counter.
self
.
count
+=
1
return
item
def
__len__
(
self
):
return
self
.
count
class
RTrimChr
(
Func
):
"""
Django
'
s RTrim function only allows trimming spaces.
...
...
This diff is collapsed.
Click to expand it.
arkindex/sql_validation/indexer_prefetch_v2.sql
+
35
−
11
View file @
272479dc
SELECT
parent
.
id
as
parent_id
,
parent
.
name
as
parent_name
,
parenttype
.
display_name
as
parent_type
,
SELECT
element
.
id
AS
parent_id
,
element
.
name
AS
parent_name
,
elementtype
.
display_name
AS
parent_type
,
element
.
id
AS
id
,
element
.
name
AS
name
,
elementtype
.
display_name
AS
type_name
FROM
documents_element
element
INNER
JOIN
documents_elementtype
elementtype
ON
(
elementtype
.
id
=
element
.
type_id
)
WHERE
element
.
corpus_id
=
'{corpus_id}'
::
uuid
AND
elementtype
.
indexable
ORDER
BY
element
.
id
;
WITH
parent
AS
(
SELECT
element
.
id
AS
parent_id
,
element
.
name
AS
parent_name
,
elementtype
.
display_name
AS
parent_type
,
element
.
id
AS
id
,
element
.
name
AS
name
,
elementtype
.
display_name
AS
type_name
FROM
documents_element
element
INNER
JOIN
documents_elementtype
elementtype
ON
(
elementtype
.
id
=
element
.
type_id
)
WHERE
element
.
corpus_id
=
'{corpus_id}'
::
uuid
AND
elementtype
.
indexable
ORDER
BY
element
.
id
)
SELECT
parent_id
,
parent_name
,
parent_type
,
element
.
id
as
id
,
element
.
name
as
name
,
elementtype
.
display_name
as
type_name
FROM
documents_element
as
element
LEFT
OUTER
JOIN
documents_elementpath
as
elementpath
ON
(
element
.
id
=
elementpath
.
element_id
)
LEFT
OUTER
JOIN
documents_element
as
parent
ON
(
elementpath
.
path
@
>
ARRAY
[
parent
.
id
]
OR
element
.
id
=
parent
.
id
)
INNER
JOIN
documents_elementtype
parenttype
ON
(
parent
.
type_id
=
parenttype
.
id
)
INNER
JOIN
documents_element
type
element
type
ON
(
element
.
type_id
=
elementtype
.
id
)
WHERE
element
.
corpus_id
=
'{corpus_id}'
AND
par
enttype
.
indexable
;
FROM
(
SELECT
*
FROM
parent
LIMIT
10000
OFFSET
0
)
AS
parent_chunk
INNER
JOIN
documents_element
path
as
element
path
ON
(
element
path
.
path
@
>
ARRAY
[
parent_chunk
.
id
]
)
INNER
JOIN
documents_element
as
element
ON
(
elementpath
.
element_id
=
element
.
id
)
INNER
JOIN
documents_elementtype
elementtype
ON
(
elem
ent
.
type
_id
=
elementtype
.
id
)
;
SELECT
"documents_element"
.
"id"
,
"documents_element"
.
"zone_id"
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment