Skip to content
Snippets Groups Projects
Commit a7607237 authored by Erwan Rouchet, committed by Bastien Abadie
Browse files

Image re-check and statistics

parent 0003d22c
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3
from django.core.management.base import BaseCommand
from django.conf import settings
from django.db.models import Count
from django.utils.text import slugify
from arkindex.documents.models import Element, ElementType, Transcription, Corpus
from arkindex.images.models import ImageServer, ImageStatus
from urllib.parse import urljoin
import time
import requests
class Command(BaseCommand):
help = 'Display statistics for a Telegraf agent'
def add_arguments(self, parser):
    """Register the optional ``--influxdb`` flag on the command's parser.

    The flag can be passed bare, in which case its value is
    ``settings.INFLUXDB_API_URL``, or followed by an explicit URL.
    """
    influxdb_options = {
        'help': 'Post statistics to the InfluxDB API at a given URL',
        # '?' makes the value optional; ``const`` is what a bare flag yields.
        'nargs': '?',
        'const': settings.INFLUXDB_API_URL,
    }
    parser.add_argument('--influxdb', **influxdb_options)
def handle(self, *args, **options):
self.options = options
# Total transcriptions
self.output(
......@@ -51,9 +64,18 @@ class Command(BaseCommand):
with_transcriptions=vol_with_transcriptions_count.get(corpus.id, 0),
)
for server in ImageServer.objects.annotate(img_total=Count('images')):
self.output(
'images',
tags={'imageserver': server.grafana_tag},
total=server.img_total,
checked=server.images.filter(status=ImageStatus.Checked).count(),
errors=server.images.filter(status=ImageStatus.Error).count(),
)
def dict_to_str(self, data):
    """Serialize a mapping as comma-separated ``key=value`` pairs.

    Values are converted with ``str()`` and every space is replaced by an
    underscore, since a space separates the sections of an output line.
    No other character is escaped.

    :param data: Mapping of names to values.
    :returns: The joined pairs, or an empty string for an empty mapping.
    """
    return ','.join(
        '{}={}'.format(key, str(value).replace(' ', '_'))
        for key, value in data.items()
    )
......@@ -67,3 +89,8 @@ class Command(BaseCommand):
values_str = self.dict_to_str(values)
line = '{}{} {} {:.0f}'.format(name, tags_str, values_str, timestamp)
self.stdout.write(line)
if not self.options.get('influxdb'):
return
resp = requests.post(urljoin(self.options['influxdb'], 'write'), params={'db': 'arkindex'}, data=line)
resp.raise_for_status()
from django.core.management import call_command
from arkindex.project.tests import FixtureTestCase
from unittest.mock import patch
import io
import re
class TestTelegraf(FixtureTestCase):
    """Tests for telegraf reporting CLI"""

    @classmethod
    def setUpClass(cls):
        """Compile the parsing patterns and declare the expected statistics."""
        super().setUpClass()
        # One output line: "<name>[,<tags>] <values> <timestamp>"
        cls.influx_re = re.compile(r'^(\w+)(,[\w\-_,=]+)? ([\w\-_,=]+) (\d+)$')
        # Integer measurements inside the <values> section
        cls.influx_measurements_re = re.compile(r'(\w+)=(\d+)')
        # Tag pairs inside the <tags> section
        cls.influx_tags_re = re.compile(r'(\w+)=([\w\-]+)')
        cls.expected_output = {
            'transcriptions': {
                'total': 9,
                'tags': {},
            },
            'pages': {
                'total': 6,
                'with_transcriptions': 3,
                'tags': {},
            },
            'corpus': {
                'total': 2,
                'with_transcriptions': 1,
                'tags': {'name': 'unit-tests'},
            },
            'images': {
                'total': 6,
                'checked': 0,
                'errors': 0,
                'tags': {'imageserver': 'Test_Server_1'},
            },
        }

    def _parse_influx_lines(self, lines):
        """Parse the command's output lines into a dict keyed by measurement name.

        Each entry maps the measurement's value names to integers and holds
        the parsed tags under the ``"tags"`` key (an empty dict when the line
        carries no tags). Fails the test when a line does not match the
        expected format or has a non-positive timestamp.
        """
        data = {}
        for line in lines:
            details = self.influx_re.search(line)
            self.assertIsNotNone(details)
            name, tags, values, timestamp = details.groups()
            self.assertGreater(int(timestamp), 0)
            data[name] = {
                key: int(value)
                for key, value in self.influx_measurements_re.findall(values)
            }
            data[name]["tags"] = dict(self.influx_tags_re.findall(tags)) if tags else {}
        return data

    def test_command(self):
        """The command writes one line per measurement with the expected values."""
        output = io.StringIO()
        call_command('telegraf', stdout=output)
        self.assertIsNotNone(output)
        lines = list(filter(None, output.getvalue().split('\n')))
        self.assertEqual(len(lines), 4)
        self.assertDictEqual(
            self._parse_influx_lines(lines),
            self.expected_output,
        )

    @patch('arkindex.documents.management.commands.telegraf.requests')
    def test_post_influxdb(self, requests_mock):
        """With ``influxdb`` set, every output line is also POSTed to that URL."""
        output = io.StringIO()
        call_command(
            'telegraf',
            influxdb='http://somewhere:8086/write',
            stdout=output,
        )
        self.assertIsNotNone(output)
        lines = list(filter(None, output.getvalue().split('\n')))
        self.assertDictEqual(
            self._parse_influx_lines(lines),
            self.expected_output,
        )

        self.assertEqual(requests_mock.post.call_count, 4)
        args, kwargs = zip(*requests_mock.post.call_args_list)
        # All requests are to the same URL
        self.assertSetEqual(set(args), {('http://somewhere:8086/write', )})
        self.assertSequenceEqual(kwargs, [
            {'params': {'db': 'arkindex'}, 'data': line}
            for line in lines
        ])
        # Raise exceptions on each POST request; read the count through
        # return_value so the assertion itself does not record another call.
        self.assertEqual(requests_mock.post.return_value.raise_for_status.call_count, 4)
from django.core.management.base import CommandError
from django.conf import settings
from ponos.management.base import PonosCommand
from arkindex.project.argparse import CorpusArgument, ElementArgument
from arkindex.images.models import Image, ImageStatus
from arkindex.images.models import ImageServer, Image, ImageStatus
import logging
logging.basicConfig(
......@@ -32,10 +33,22 @@ class Command(PonosCommand):
type=ElementArgument(),
nargs='+',
)
parser.add_argument(
'--sample',
help='Also test a few already checked images from each server',
nargs='?',
const=settings.CHECK_IMAGES_SAMPLE_SIZE,
default=0,
)
def validate_args(self, corpus=None, element=None, force=False, **options):
def validate_args(self, corpus=None, element=None, force=False, sample=0, **options):
if corpus and element:
raise CommandError('--corpus and --element cannot be used together')
if (corpus or element) and sample:
raise CommandError('--corpus and --element cannot be used with --sample')
if force and sample:
raise CommandError('--force and --sample cannot be used together')
if corpus:
images = Image.objects.filter(zones__elements__corpus=corpus).distinct()
elif element:
......@@ -46,9 +59,28 @@ class Command(PonosCommand):
if not force:
images = images.exclude(status=ImageStatus.Checked)
return {'images': images}
return {'images': images, 'sample': sample}
def run(self, images=(), sample=0):
    """Check the selected images, optionally re-checking a sample per server.

    :param images: Iterable of images to check.
    :param sample: Number of already checked images to re-check on each
        image server before checking ``images``; nothing is re-checked
        when it is lower than 1.
    """
    # The --sample option declares no ``type``, so a value given on the
    # command line arrives as a string; normalise it before comparing.
    sample = int(sample or 0)
    if sample >= 1:
        # Re-check a few images from each server
        for server in ImageServer.objects.all():
            # Evaluate the random slice once so the logged count and the
            # checked images come from the same query.
            server_sample = list(
                server.images
                .filter(status=ImageStatus.Checked)
                .order_by('?')[:sample]
            )
            logger.info('Re-checking {} images in server {}'.format(len(server_sample), server.display_name))
            self.check(server_sample)
    self.check(images)
def check(self, images=()):
    """Run ``perform_check(save=True)`` on each image and tally the outcomes.

    :param images: Iterable of images to check.
    :returns: A ``(successful, failed)`` tuple counting the images whose
        status is ``ImageStatus.Checked`` or ``ImageStatus.Error`` after
        the check; images left in any other status are not counted.
    """
    successful, failed = 0, 0
    for image in images:
        logger.info('Checking image %s at %s', image.id, image.url)
        image.perform_check(save=True)
        if image.status == ImageStatus.Checked:
            successful += 1
        elif image.status == ImageStatus.Error:
            failed += 1
    return successful, failed
......@@ -170,6 +170,10 @@ class ImageServer(models.Model):
)
return new_server
@property
def grafana_tag(self):
    """Display name and ID of this server, joined by an underscore."""
    parts = (self.display_name, self.id)
    return '_'.join(str(part) for part in parts)
class ImageStatus(Enum):
"""
......
......@@ -248,6 +248,9 @@ ES_INDEX_PAGES = 'pages'
# Silent logger for elasticsearch: only WARNING and above are emitted
logging.getLogger('elasticsearch').setLevel(logging.WARNING)
# InfluxDB API root, overridable through the INFLUXDB_API_URL environment
# variable. The telegraf command uses it as the value of a bare --influxdb
# flag and resolves 'write' against it with urljoin, so keep the trailing
# slash: without it urljoin would replace the last path segment.
INFLUXDB_API_URL = os.environ.get('INFLUXDB_API_URL', 'http://localhost:8086/')
# SSLify proxy (None when the SSLIFY_HOST environment variable is not set)
SSLIFY_HOST = os.environ.get('SSLIFY_HOST')
......@@ -268,6 +271,9 @@ IIIF_TRANSCRIPTION_LIST = False
# See http://docs.python-requests.org/en/master/user/advanced/#timeouts
# NOTE(review): presumably a (connect, read) timeout pair in seconds, as
# described at the link above; confirm against the code that uses it.
IIIF_DOWNLOAD_TIMEOUT = (30, 60)
# check_images sample size when checking all servers: the value of a bare
# --sample flag, i.e. how many already checked images are re-checked on
# each image server.
CHECK_IMAGES_SAMPLE_SIZE = 20
TRANSCRIPTIONS_IMPORT_QUEUE_SIZE = 25000
TRANSCRIPTIONS_IMPORT_CHUNK_SIZE = 10000
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment