Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
Backend
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Deploy
Releases
Container Registry
Analyze
Contributor analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Arkindex
Backend
Commits
26161ca2
Commit
26161ca2
authored
7 years ago
by
Bastien Abadie
Browse files
Options
Downloads
Patches
Plain Diff
Search API with images+indexes, sorted by best index first
parent
d94722e9
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
src/documents/api.py
+19
-0
19 additions, 0 deletions
src/documents/api.py
src/documents/models.py
+56
-2
56 additions, 2 deletions
src/documents/models.py
src/documents/serializers.py
+23
-0
23 additions, 0 deletions
src/documents/serializers.py
src/horae/api.py
+4
-0
4 additions, 0 deletions
src/horae/api.py
with
102 additions
and
2 deletions
src/documents/api.py
0 → 100644
+
19
−
0
View file @
26161ca2
from
rest_framework.generics
import
ListAPIView
from
documents.serializers
import
ImageSearchSerializer
from
documents.models
import
Image
class ImagesSearch(ListAPIView):
    """
    List the images matching a search query
    """
    serializer_class = ImageSearchSerializer

    def get_queryset(self):
        """
        Return the search results for the `query` URL keyword argument.

        The results are memoised on the view instance, so calling this
        method several times only triggers one search.
        """
        # Fast path: the search already ran for this view instance
        try:
            return self._cache
        except AttributeError:
            pass

        found = Image.search(self.kwargs['query'])
        self._cache = found
        return found
This diff is collapsed.
Click to expand it.
src/documents/models.py
+
56
−
2
View file @
26161ca2
...
...
@@ -4,10 +4,14 @@ from elasticsearch import Elasticsearch
from
elasticsearch.helpers
import
bulk
as
es_bulk
,
scan
as
es_scan
import
requests
import
hashlib
import
itertools
import
json
import
base64
import
uuid
# ElasticSearch document type used for the word index entries
ES_DOC_INDEX_TYPE = 'word'

# ElasticSearch index holding the word index entries
# NOTE(review): 'volume_xxx' looks like a placeholder - presumably to be
# derived from the volume later; confirm before relying on it
ES_INDEX = 'volume_xxx'
class
Image
(
models
.
Model
):
"""
A document image
...
...
@@ -21,6 +25,56 @@ class Image(models.Model):
def
__str__
(
self
):
return
'
{} - {}
'
.
format
(
self
.
id
,
self
.
iiif_url
)
@staticmethod
def search(query):
    """
    Search images through their word indexes stored in ElasticSearch

    Runs a `match` query on the `word` field, groups the matching index
    entries by image, and returns the matching images as a list ordered
    by their best index score (highest first).

    Every returned image carries a `word_results` attribute: the list of
    `_source` payloads of the index entries that matched for this image.
    """
    es_query = {
        'query': {
            'bool': {
                'must': [
                    {'match': {'word': query}},
                ],
            },
        },
    }
    client = Elasticsearch(settings.ELASTIC_SEARCH_HOSTS)
    hits = es_scan(
        client=client,
        index=ES_INDEX,
        query=es_query,
        doc_type=ES_DOC_INDEX_TYPE,
    )

    # Bucket the index payloads per image id, fully in memory
    # (the scan is consumed entirely here)
    payloads_by_image = {}
    for hit in hits:
        payload = hit['_source']
        payloads_by_image.setdefault(payload['image'], []).append(payload)

    # Fetch every image referenced by at least one hit
    found = list(Image.objects.filter(pk__in=payloads_by_image.keys()))

    # Attach to each image its own index payloads
    # The buckets are keyed by the hex form of the image id
    for img in found:
        img.word_results = payloads_by_image[img.id.hex]

    # Order images by their single best index score, highest first
    found.sort(
        key=lambda img: max(entry['score'] for entry in img.word_results),
        reverse=True,
    )
    return found
def
check_source
(
self
):
"""
Check the image is available through this url
...
...
@@ -63,8 +117,8 @@ class Image(models.Model):
# Build raw ElasticSearch insert
actions
=
[{
'
_index
'
:
'
volume_xxx
'
,
# TODO: use volume ?
'
_type
'
:
'
word
'
,
'
_index
'
:
ES_INDEX
,
'
_type
'
:
ES_DOC_INDEX_TYPE
,
'
_source
'
:
index_data
,
'
_id
'
:
index_id
,
}
for
index_id
,
index_data
in
indexes
]
...
...
This diff is collapsed.
Click to expand it.
src/documents/serializers.py
0 → 100644
+
23
−
0
View file @
26161ca2
from
rest_framework
import
serializers
from
documents.models
import
Image
class WordSerializer(serializers.Serializer):
    """
    Serialises a single word index entry
    """
    # Matched word
    word = serializers.CharField()
    # Score of the entry, exposed through a character field
    score = serializers.CharField()
    # NOTE(review): presumably the word bounding box on the image - confirm
    box = serializers.ListField()
class ImageSearchSerializer(serializers.ModelSerializer):
    """
    Serialises an image found by a search,
    along with the word index entries attached to it
    """
    # Expects a `word_results` iterable on the serialised image
    word_results = WordSerializer(many=True)

    class Meta:
        model = Image
        fields = ('id', 'iiif_url', 'word_results')
This diff is collapsed.
Click to expand it.
src/horae/api.py
+
4
−
0
View file @
26161ca2
from
django.conf.urls
import
url
,
include
from
documents.api
import
ImagesSearch
urlpatterns = [
    # Images search using indexes
    # The `query` group accepts word characters and whitespace only,
    # and is passed to the view as a keyword argument
    url(
        r'^images/search/(?P<query>[\w\s]+)$',
        ImagesSearch.as_view(),
        name='images-search',
    ),
]
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment