Skip to content
Snippets Groups Projects
Commit 0d34e569 authored by Bastien Abadie
Browse files

Merge branch 'tools-cleanup' into 'master'

Cleanup arkindex.project.tools

See merge request !868
parents 3d43ede7 64a23975
No related branches found
No related tags found
1 merge request: !868 Cleanup arkindex.project.tools
from itertools import groupby
from arkindex_common.tei import TeiElement, TeiParser as BaseTeiParser
from arkindex_common.enums import MetaType
from arkindex.project.tools import find_closest
from arkindex.project.triggers import reindex_start
from arkindex.documents.models import Element, Entity, DataSource, MLToolType
import logging
import Levenshtein
logger = logging.getLogger(__name__)
def find_closest(name, queryset, min_ratio=0.8):
    '''
    Find closest element using Levenshtein distance.

    Every item yielded by queryset is scored by comparing its `name`
    attribute to the given name with Levenshtein.ratio.
    The best scoring item is returned when its score is at least min_ratio.
    None is returned when the best score is below min_ratio,
    or when queryset does not yield any item.
    On equal scores, the first item encountered wins.
    '''
    assert isinstance(name, str)
    # Only the best candidate is needed: a single pass with max() is enough,
    # and default=None covers the empty queryset case without raising
    best = max(
        (
            (element, Levenshtein.ratio(name, element.name))
            for element in queryset
        ),
        key=lambda pair: pair[1],
        default=None,
    )
    if best is None:
        return None
    best_element, best_score = best
    if best_score >= min_ratio:
        return best_element
    return None
class TeiParser(BaseTeiParser):
def __init__(self, path, corpus, folder_type, element_type):
......
from unittest import TestCase
from arkindex.project.tools import elasticsearch_escape
class TestTools(TestCase):
    # Unit tests for helpers imported from arkindex.project.tools

    def test_elasticsearch_escape(self):
        # Each pair is (raw input, expected escaped output)
        cases = (
            ('abcdef', 'abcdef'),
            ('aaaaa+b', 'aaaaa\\\\+b'),
            ('\\', '\\\\'),
            ('[]', '\\\\[\\\\]'),
            (' \" ', ' \\\\\\" '),
            # Single & and | are left untouched, only && and || are escaped
            ('&|', '&|'),
            ('&&||', '\\\\&&\\\\||'),
            # Angle brackets are dropped entirely
            ('a<a>a', 'aaa'),
        )
        for raw, expected in cases:
            self.assertEqual(elasticsearch_escape(raw), expected)
from collections.abc import Sized, Iterable
from django.urls import reverse
from arkindex.documents.models import Element
import Levenshtein
import random
import string
import re
def random_string(n):
    '''
    Return a string of n characters, each one picked at random
    among ASCII letters (both cases) and digits.
    '''
    alphabet = string.ascii_letters + string.digits
    return ''.join(random.choice(alphabet) for _ in range(n))
# Matches every reserved character of the ElasticSearch query string syntax,
# plus the two-character operators || and &&
ES_ESCAPE_REGEX = re.compile(r'([+\-=\\><!(){}\[\]^"~*?:/]|\|\||&&)')


def elasticsearch_escape(s):
    '''
    Escape the reserved characters of a string so it can be used in an ElasticSearch query
    https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#_reserved_characters # noqa

    Angle brackets are removed, a backslash is doubled, a double quote gets
    three backslashes, any other match is prefixed with two backslashes.
    '''
    # Matches that do not follow the generic "prefix with two backslashes" rule
    special_cases = {
        '<': '',
        '>': '',
        '\\': '\\\\',
        '"': '\\\\\\"',
    }

    def replace(matchobj):
        token = matchobj.group(0)
        if token in special_cases:
            return special_cases[token]
        return '\\\\' + token

    return ES_ESCAPE_REGEX.sub(replace, s)
def find_closest(name, queryset, min_ratio=0.8):
    '''
    Find closest element using Levenshtein distance

    Every item yielded by queryset is scored by comparing its `name`
    attribute to the given name with Levenshtein.ratio.
    The best scoring item is returned when its score is at least min_ratio.
    None is returned when the best score is below min_ratio,
    or when queryset does not yield any item.
    On equal scores, the first item encountered wins.
    '''
    assert isinstance(name, str)
    # Only the best candidate is needed: a single pass with max() is enough,
    # and default=None covers the empty queryset case without raising
    best = max(
        (
            (element, Levenshtein.ratio(name, element.name))
            for element in queryset
        ),
        key=lambda pair: pair[1],
        default=None,
    )
    if best is None:
        return None
    best_element, best_score = best
    if best_score >= min_ratio:
        return best_element
    return None
def build_absolute_url(element, request, name, id_argument='pk', **kwargs):
    """
    Build an absolute URL for a specified view using the element ID. Used by IIIF serializers.

    The element ID is cast to a string and passed to the view named `name`
    as the `id_argument` URL keyword argument, alongside any extra kwargs.
    The resulting path is made absolute using the request.
    """
    url_kwargs = {**kwargs, id_argument: str(element.id)}
    path = reverse(name, kwargs=url_kwargs)
    return request.build_absolute_uri(path)
def read_file_range(path, start, end):
    """
    Return the lines of a file whose index is in the [start, end) range, joined as a single string.
    Line indexes are zero-based: start is included and end is excluded, as with Python slices.
    """
    # TODO: Optimize this
    selected = []
    with open(path) as handle:
        for index, text in enumerate(handle):
            if index >= start:
                if index >= end:
                    # Past the requested range: no need to read any further
                    break
                selected.append(text)
    return ''.join(selected)
class disconnect_signal():
"""
Context manager to temporarily disconnect a signal
......@@ -94,22 +27,6 @@ class disconnect_signal():
self.signal.connect(**self.kwargs)
def get_or_instance(model, defaults=None, **filters):
    """
    Like model.objects.get_or_create(),
    except it creates a Python instance that is not saved into DB.

    The filters are passed to model.objects.get(). When that raises
    model.DoesNotExist, a new instance is built from the filters updated
    with the defaults, skipping any key that contains a double underscore.

    Returns a tuple (instance, created): created is False when the instance
    was returned by model.objects.get(), True when it was built here.
    """
    try:
        return model.objects.get(**filters), False
    except model.DoesNotExist:
        # filters is this call's own keyword dict, so updating it is safe;
        # defaults take precedence over filters on shared keys
        if defaults is not None:
            filters.update(defaults)
        kwargs = {  # Filter to remove Django lookups
            key: value for key, value in filters.items()
            if '__' not in key
        }
        return model(**kwargs), True
def build_tree(tree, *, corpus, type):
"""
Build Element and ElementPath instances from a tree described by
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment