diff --git a/arkindex/dataimport/filetypes.py b/arkindex/dataimport/filetypes.py
index 254a6eead75a938e7f9000a81f99a13f92c1f5c4..eea3b36f950d66c34f51c7f6c8357904dc48e024 100644
--- a/arkindex/dataimport/filetypes.py
+++ b/arkindex/dataimport/filetypes.py
@@ -5,6 +5,7 @@ from arkindex.documents.surface import SurfaceImporter
 from arkindex.documents.surface_link import CorpusSurfaceLinker
 from arkindex.documents.tei import TeiParser
 from arkindex.documents.date_parser import DateParser
+from arkindex.documents.models import MetaData, MetaType
 import os.path
 import logging
 
@@ -175,8 +176,10 @@ class MetadataFileType(FileType):
         db_metadatas = []
         for db_elt, tei_elt in matches:
             db_metadatas += tei_elt.save(db_elt, self.flow.dataimport.revision)
-        # Date metadatas interpretation
-        DateParser(db_metadatas).run()
+        # run Date interpretation on Date MetaDatas queryset
+        md_ids = [md.id for md in db_metadatas]
+        date_metadatas = MetaData.objects.filter(id__in=md_ids, type=MetaType.Date)
+        DateParser(date_metadatas).run()
 
 
 class ConfigFileType(FileType):
diff --git a/arkindex/documents/date_parser.py b/arkindex/documents/date_parser.py
index 18bdcdf5a5bd388e79b999d87252a22df118c7b2..748fb2d2fd57e0e455f0004c0b050105dee05c3f 100644
--- a/arkindex/documents/date_parser.py
+++ b/arkindex/documents/date_parser.py
@@ -1,6 +1,7 @@
+from django.conf import settings
 from django.core.exceptions import ValidationError
-from django.db import transaction
-from arkindex.documents.models import DateType, InterpretedDate, MetaType, MetaData
+from arkindex.documents.models import DateType, InterpretedDate, Act, Page
+from arkindex.documents.indexer import Indexer
 import unicodedata
 import re
 import logging
@@ -86,6 +87,54 @@ def year_month_str(raw_date):
     return None
 
 
+# Functions tried in order by parse_date; the first one yielding a valid date wins
+DATE_FUNCTIONS_TABLE = (
+    year,
+    year_month,
+    year_month_day,
+    year_month_str,
+)
+
+
+def _normalize(raw_date):
+    """
+    Return an unaccented lowercase string from input
+    """
+    return unicodedata.normalize('NFKD', raw_date) \
+        .encode('ASCII', 'ignore') \
+        .lower() \
+        .decode("utf-8")
+
+
+def parse_date(raw_date, functions_table=DATE_FUNCTIONS_TABLE):
+    """
+    Return a tuple of unsaved interpreted dates from a raw date string
+    The tuple is empty when no function of the table yields a valid date
+    """
+    assert isinstance(raw_date, str)
+    # Normalize once: the result is the same for every function of the table
+    normalized_date = _normalize(raw_date)
+    # Try to match regex one by one
+    for f in functions_table:
+        try:
+            date_elts = f(normalized_date)
+            if date_elts:
+                assert isinstance(date_elts, dict)
+                date = InterpretedDate(type=DateType.Exact)
+                for k in date_elts:
+                    setattr(date, k, int(date_elts[k]))
+                try:
+                    date.full_clean(exclude=('metadata', ))
+                except ValidationError as e:
+                    logger.warning('Date fields are incorrect : {}'.format(e))
+                    continue
+                return (date, )
+        except Exception:
+            logger.warning('Failed parsing {} with function {}'.format(raw_date, f.__name__))
+    logger.warning('Date not supported : {}'.format(raw_date))
+    return ()
+
+
 class DateParser(object):
     """
     Interprete date elements from non-standard TEI input
@@ -93,91 +142,57 @@ class DateParser(object):
         "s.d. [1323-1325]"
     """
 
-    date_table = (
-        year,
-        year_month,
-        year_month_day,
-        year_month_str,
-    )
-
-    def __init__(self, metadatas=[]):
-        if metadatas:
-            assert all(isinstance(m, MetaData) for m in metadatas), 'DateParser can only handle metadata list'
-        self.metadatas = metadatas
-
-    def _normalize(self, raw_date):
-        """
-        Return an unaccented lowercase string from input
-        """
-        return unicodedata.normalize('NFKD', raw_date) \
-            .encode('ASCII', 'ignore') \
-            .lower() \
-            .decode("utf-8")
+    def __init__(self, metadatas):
+        """
+        Build a parser from a MetaData queryset (a plain list is not supported)
+        """
+        self.metadatas = metadatas.prefetch_related('dates')
 
-    def parse(self, raw_date):
+    def index(self):
         """
-        Return a list of interpreted dates from a raw date string
+        Create or Update ElasticSearch index for indexed elements
         """
-        assert isinstance(raw_date, str)
-        # Try to match regex one by one
-        for f in self.date_table:
-            try:
-                date_elts = f(self._normalize(raw_date))
-                if date_elts:
-                    assert isinstance(date_elts, dict)
-                    date = InterpretedDate(type=DateType.Exact)
-                    for k in date_elts:
-                        setattr(date, k, int(date_elts[k]))
-                    try:
-                        date.full_clean(exclude=('metadata', ))
-                    except ValidationError as e:
-                        logger.warning('Date fields are incorrect : {}'.format(e))
-                        continue
-                    return (date, )
-            except Exception:
-                logger.warning('Failed parsing {} with function {}'.format(raw_date, f.__name__))
-        logger.warning('Date not supported : {}'.format(raw_date))
-        return ()
+        # distinct() : the join on metadatas may otherwise return an element once per matching metadata
+        acts = Act.objects.filter(metadatas__in=self.metadatas).distinct()
+        if acts.exists():
+            Indexer().run_index(settings.ES_INDEX_ACTS, Act.INDEX_TYPE, acts)
+        pages = Page.objects.filter(metadatas__in=self.metadatas).distinct()
+        if pages.exists():
+            Indexer().run_index(settings.ES_INDEX_PAGES, Page.INDEX_TYPE, pages)
 
     def delete(self):
         """
-        Delete interpreted dates corresponding to a metadatas list
+        Delete a queryset of interpreted dates corresponding to a metadatas list
         """
-        deleted_dates_num = 0
-        with transaction.atomic():
-            for md in self.metadatas:
-                deleted = md.dates.all().delete()
-                logger.debug('Deleted {} interpreted date(s) for {} Metadata'.format(deleted[0], md.value))
-                deleted_dates_num += deleted[0]
-        if deleted_dates_num:
-            logger.info('{} old InterpretedDates deleted from database'.format(deleted_dates_num))
+        dates = InterpretedDate.objects.filter(metadata__in=self.metadatas)
+        nb_deleted, _ = dates.delete()
+        logger.info('Deleted {} old interpreted dates in database'.format(nb_deleted))
 
     def run(self, dry_run=False):
         """
         Create and save new interpreted dates for a list of metadatas
-        Optionnaly Delete old interpreted dates
+        Unless dry_run is set, old interpreted dates are deleted first
+        and the related acts and pages are re-indexed afterwards
         """
-        date_metadatas = list(filter(lambda x: x.type == MetaType.Date, self.metadatas))
-        len_dates_md = len(date_metadatas)
-        len_interpreted_dates = sum(m.dates.exists() for m in date_metadatas)
-        logger.info('Found {} MetaData elements corresponding to a Date ({} interpreted)'.format(
-            len_dates_md,
-            len_interpreted_dates
-        ))
+        # distinct() : count each metadata once, even when it has several interpreted dates
+        old_coverage = self.metadatas.filter(dates__isnull=False).distinct().count()
+        new_coverage = 0
         if not dry_run:
             self.delete()
-        covered_metadatas = 0
-        for md in date_metadatas:
-            updated = 0
-            for date in self.parse(md.value):
-                date.metadata = md
-                updated = 1
-                if not dry_run:
+        for md in self.metadatas:
+            dates = parse_date(md.value)
+            if dates:
+                new_coverage += 1
+            if not dry_run:
+                for date in dates:
+                    date.metadata = md
                     date.save()
-            covered_metadatas += updated
+        logger.info('{} interpreted dates have been interpreted'.format(new_coverage))
+        total_metadatas = self.metadatas.count()
+        if total_metadatas:
+            logger.info('Coverage of Date Metadatas is now {:.2%} ({:+.2%})'.format(
+                new_coverage / total_metadatas,
+                (new_coverage - old_coverage) / total_metadatas
+            ))
         if not dry_run:
-            logger.info('{} values have been updated'.format(covered_metadatas))
-        logger.info('Updated interpreted dates covers {:.2%} of date metadatas ({:+.2%})'.format(
-            covered_metadatas / len_dates_md,
-            (covered_metadatas - len_interpreted_dates) / len_dates_md
-        ))
+            self.index()
diff --git a/arkindex/documents/serializers/search.py b/arkindex/documents/serializers/search.py
index 3185f307bee2aca7927c0c21fc05348944bf7b6f..7fc969efe92893c248340b192fb8e7222e746e5e 100644
--- a/arkindex/documents/serializers/search.py
+++ b/arkindex/documents/serializers/search.py
@@ -4,7 +4,7 @@ from arkindex.documents.serializers.light import CorpusLightSerializer
 from arkindex.documents.serializers.elements import ElementLightSerializer
 from arkindex.documents.serializers.ml import TranscriptionSerializer
 from arkindex.project.serializer_fields import EnumField, SearchTermsField
-from arkindex.documents.date_parser import DateParser
+from arkindex.documents.date_parser import parse_date
 
 
 class SearchQuerySerializer(serializers.Serializer):
@@ -22,8 +22,7 @@ class SearchQuerySerializer(serializers.Serializer):
     corpus = serializers.UUIDField(source='corpus_id', default='')
 
     def parse_date(self, raw_date):
-        parser = DateParser()
-        date = parser.parse(raw_date)
+        date = parse_date(raw_date)
         if len(date) != 1:
             raise serializers.ValidationError('Could not parse Date')
         return date[0]
diff --git a/arkindex/documents/tests/commands/test_parse_dates.py b/arkindex/documents/tests/commands/test_parse_dates.py
index af9336ad358a9d8d890c24bed51d7e939d3a828b..259d925912bbdb5c17c4a2822d18388064389029 100644
--- a/arkindex/documents/tests/commands/test_parse_dates.py
+++ b/arkindex/documents/tests/commands/test_parse_dates.py
@@ -16,7 +16,7 @@ class TestParseDates(FixtureTestCase):
             element=cls.corpus.elements.get(type=ElementType.Act, name='Act 1'),
         )
 
-    @patch('arkindex.documents.date_parser.DateParser.parse')
+    @patch('arkindex.documents.date_parser.parse_date')
     def test_date_parser(self, parse_mock):
         call_command(
             'parse_dates',
diff --git a/arkindex/documents/tests/test_date_parser.py b/arkindex/documents/tests/test_date_parser.py
index 07b19f0aeeb1c40b05daa86e2dbc3f415fe70a1e..6853634c1d99f3bfe126f7838a984cbb113e549d 100644
--- a/arkindex/documents/tests/test_date_parser.py
+++ b/arkindex/documents/tests/test_date_parser.py
@@ -1,5 +1,8 @@
 from unittest.mock import patch
-from arkindex.documents.date_parser import DateParser, year, year_month, year_month_day, year_month_str
+from arkindex.documents.date_parser import (
+    parse_date, year, year_month, year_month_day,
+    year_month_str, DATE_FUNCTIONS_TABLE,
+)
 from arkindex.documents.models import DateType, InterpretedDate
 from django.test import TestCase
 
@@ -16,7 +19,6 @@ class TestDateParser(TestCase):
 
     @classmethod
     def setUpTestData(cls):
-        cls.parser = DateParser()
         cls.test_dates = {
             '1343': ((1343, None, None, DateType.Exact), ),
             '1326-05': ((1326, 5, None, DateType.Exact), ),
@@ -30,7 +32,7 @@ class TestDateParser(TestCase):
     def test_parser(self):
         for t in self.test_dates:
             self.assertEqual(
-                self.parser.parse(t),
+                parse_date(t),
                 tuple([self._create_date(elts) for elts in self.test_dates[t]])
             )
 
@@ -42,8 +44,8 @@ class TestDateParser(TestCase):
         """
         Check that a error-raising function do not break table iteration
         """
-        self.parser.date_table = (self._wrong_function, ) + self.parser.date_table
-        self.assertEqual(self.parser.parse('1343')[0], InterpretedDate(year=1343, type=DateType.Exact))
+        functions = (self._wrong_function, ) + DATE_FUNCTIONS_TABLE
+        self.assertTupleEqual(parse_date('1343', functions), (InterpretedDate(year=1343, type=DateType.Exact), ))
         self.assertEqual(warning_mock.call_count, 1)
 
     def test_year(self):
diff --git a/arkindex/documents/tests/test_interpreted_date.py b/arkindex/documents/tests/test_interpreted_date.py
index 3de0e0539001296a1966c66c3536ca0a81ea6a50..caa4857650e9de7c2c8e2ba7d243bcf71f32909b 100644
--- a/arkindex/documents/tests/test_interpreted_date.py
+++ b/arkindex/documents/tests/test_interpreted_date.py
@@ -1,5 +1,5 @@
 from arkindex.documents.models import MetaData, MetaType, DateType, InterpretedDate
-from arkindex.documents.date_parser import DateParser
+from arkindex.documents.date_parser import parse_date
 from django.test import TestCase
 
 
@@ -11,8 +11,7 @@ class TestInterpretedDate(TestCase):
         cls.metadata = MetaData(name="date", type=MetaType.Date, value='1337-may')
 
     def test_date_str(self):
-        date_parser = DateParser()
-        interpreted_date = date_parser.parse(self.metadata.value)
+        interpreted_date = parse_date(self.metadata.value)
         self.assertEqual(1, len(interpreted_date))
         self.assertEqual('1337-05', str(interpreted_date[0]))
 