Skip to content
Snippets Groups Projects
Commit 3aaf30f6 authored by Erwan Rouchet's avatar Erwan Rouchet
Browse files

Merge branch 'reindex-dates-during-git-import' into 'master'

reindex dates during git import

See merge request !285
parents f6bfb5a5 1c5a67d2
No related branches found
No related tags found
1 merge request: !285 reindex dates during git import
...@@ -5,6 +5,7 @@ from arkindex.documents.surface import SurfaceImporter ...@@ -5,6 +5,7 @@ from arkindex.documents.surface import SurfaceImporter
from arkindex.documents.surface_link import CorpusSurfaceLinker from arkindex.documents.surface_link import CorpusSurfaceLinker
from arkindex.documents.tei import TeiParser from arkindex.documents.tei import TeiParser
from arkindex.documents.date_parser import DateParser from arkindex.documents.date_parser import DateParser
from arkindex.documents.models import MetaData, MetaType
import os.path import os.path
import logging import logging
...@@ -175,8 +176,10 @@ class MetadataFileType(FileType): ...@@ -175,8 +176,10 @@ class MetadataFileType(FileType):
db_metadatas = [] db_metadatas = []
for db_elt, tei_elt in matches: for db_elt, tei_elt in matches:
db_metadatas += tei_elt.save(db_elt, self.flow.dataimport.revision) db_metadatas += tei_elt.save(db_elt, self.flow.dataimport.revision)
# Date metadatas interpretation # run Date interpretation on Date MetaDatas queryset
DateParser(db_metadatas).run() md_ids = (md.id for md in db_metadatas)
date_metadatas = MetaData.objects.filter(id__in=md_ids, type=MetaType.Date)
DateParser(date_metadatas).run()
class ConfigFileType(FileType): class ConfigFileType(FileType):
......
from django.conf import settings
from django.core.exceptions import ValidationError from django.core.exceptions import ValidationError
from django.db import transaction from arkindex.documents.models import DateType, InterpretedDate, Act, Page
from arkindex.documents.models import DateType, InterpretedDate, MetaType, MetaData from arkindex.documents.indexer import Indexer
import unicodedata import unicodedata
import re import re
import logging import logging
...@@ -86,6 +87,50 @@ def year_month_str(raw_date): ...@@ -86,6 +87,50 @@ def year_month_str(raw_date):
return None return None
# Ordered table of the date interpretation functions: parse_date() uses it as
# its default functions_table and tries the entries from first to last.
DATE_FUNCTIONS_TABLE = (
year,
year_month,
year_month_day,
year_month_str,
)
def _normalize(raw_date):
"""
Return an unaccented lowercase string from input
"""
return unicodedata.normalize('NFKD', raw_date) \
.encode('ASCII', 'ignore') \
.lower() \
.decode("utf-8")
def parse_date(raw_date, functions_table=DATE_FUNCTIONS_TABLE):
    """
    Return a tuple of interpreted dates from a raw date string

    Each function of functions_table is tried in order on the normalized
    input. The first one returning date elements that pass model validation
    wins.

    :param raw_date: Raw date string to interpret
    :param functions_table: Iterable of callables taking the normalized string
        and returning a dict mapping InterpretedDate attribute names to
        int-convertible values, or a falsy value when nothing matched
    :return: A one-element tuple holding an unsaved InterpretedDate of type
        Exact, or an empty tuple when no function could interpret the input
    :raises AssertionError: if raw_date is not a string
    """
    assert isinstance(raw_date, str)
    # The normalized input does not depend on the function being tried,
    # so compute it once instead of once per function
    normalized_date = _normalize(raw_date)
    # Try to match regex one by one
    for f in functions_table:
        try:
            date_elts = f(normalized_date)
            if not date_elts:
                continue
            assert isinstance(date_elts, dict)
            date = InterpretedDate(type=DateType.Exact)
            for k, value in date_elts.items():
                setattr(date, k, int(value))
            try:
                date.full_clean(exclude=('metadata', ))
            except ValidationError as e:
                # Matched but invalid values: let the next function try
                logger.warning('Date fields are incorrect : {}'.format(e))
                continue
            return (date, )
        except Exception:
            # A failing function must not break the iteration over the table.
            # Not every callable has a __name__ (partials, callable objects),
            # so fall back to its repr rather than raising from the handler.
            function_name = getattr(f, '__name__', repr(f))
            logger.warning(
                'Failed parsing {} with function {}'.format(raw_date, function_name),
                exc_info=True,
            )
    logger.warning('Date not supported : {}'.format(raw_date))
    return ()
class DateParser(object): class DateParser(object):
""" """
Interpret date elements from non-standard TEI input Interpret date elements from non-standard TEI input
...@@ -93,91 +138,51 @@ class DateParser(object): ...@@ -93,91 +138,51 @@ class DateParser(object):
"s.d. [1323-1325]" "s.d. [1323-1325]"
""" """
date_table = ( def __init__(self, metadatas):
year, self.metadatas = metadatas.prefetch_related('dates')
year_month,
year_month_day,
year_month_str,
)
def __init__(self, metadatas=[]):
if metadatas:
assert all(isinstance(m, MetaData) for m in metadatas), 'DateParser can only handle metadata list'
self.metadatas = metadatas
def _normalize(self, raw_date):
"""
Return an unaccented lowercase string from input
"""
return unicodedata.normalize('NFKD', raw_date) \
.encode('ASCII', 'ignore') \
.lower() \
.decode("utf-8")
def parse(self, raw_date): def index(self):
""" """
Return a list of interpreted dates from a raw date string Create or Update ElasticSearch index for indexed elements
""" """
assert isinstance(raw_date, str) acts = Act.objects.filter(metadatas__in=self.metadatas)
# Try to match regex one by one if acts.exists():
for f in self.date_table: Indexer().run_index(settings.ES_INDEX_ACTS, Act.INDEX_TYPE, acts)
try: pages = Page.objects.filter(metadatas__in=self.metadatas)
date_elts = f(self._normalize(raw_date)) if pages.exists():
if date_elts: Indexer().run_index(settings.ES_INDEX_PAGES, Page.INDEX_TYPE, pages)
assert isinstance(date_elts, dict)
date = InterpretedDate(type=DateType.Exact)
for k in date_elts:
setattr(date, k, int(date_elts[k]))
try:
date.full_clean(exclude=('metadata', ))
except ValidationError as e:
logger.warning('Date fields are incorrect : {}'.format(e))
continue
return (date, )
except Exception:
logger.warning('Failed parsing {} with function {}'.format(raw_date, f.__name__))
logger.warning('Date not supported : {}'.format(raw_date))
return ()
def delete(self): def delete(self):
""" """
Delete interpreted dates corresponding to a metadatas list Delete a queryset of interpreted dates corresponding to a metadatas list
""" """
deleted_dates_num = 0 dates = InterpretedDate.objects.filter(metadata__in=self.metadatas)
with transaction.atomic(): nb_deleted, _ = dates.delete()
for md in self.metadatas: logger.info('Deleted {} old interpreted dates in database'.format(nb_deleted))
deleted = md.dates.all().delete()
logger.debug('Deleted {} interpreted date(s) for {} Metadata'.format(deleted[0], md.value))
deleted_dates_num += deleted[0]
if deleted_dates_num:
logger.info('{} old InterpretedDates deleted from database'.format(deleted_dates_num))
def run(self, dry_run=False): def run(self, dry_run=False):
""" """
Create and save new interpreted dates for a list of metadatas Create and save new interpreted dates for a list of metadatas
Optionally delete old interpreted dates Optionally delete old interpreted dates
""" """
date_metadatas = list(filter(lambda x: x.type == MetaType.Date, self.metadatas)) old_coverage = self.metadatas.filter(dates__isnull=False).count()
len_dates_md = len(date_metadatas) new_coverage = 0
len_interpreted_dates = sum(m.dates.exists() for m in date_metadatas)
logger.info('Found {} MetaData elements corresponding to a Date ({} interpreted)'.format(
len_dates_md,
len_interpreted_dates
))
if not dry_run: if not dry_run:
self.delete() self.delete()
covered_metadatas = 0 for md in self.metadatas:
for md in date_metadatas: dates = parse_date(md.value)
updated = 0 if dates:
for date in self.parse(md.value): new_coverage += 1
date.metadata = md if not dry_run:
updated = 1 for date in dates:
if not dry_run: date.metadata = md
date.save() date.save()
covered_metadatas += updated logger.info('{} interpreted dates have been interpreted'.format(new_coverage))
total_metadatas = self.metadatas.count()
if total_metadatas:
logger.info('Coverage of Date Metadatas is now {:.2%} ({:+.2%})'.format(
new_coverage / total_metadatas,
(new_coverage - old_coverage) / total_metadatas
))
if not dry_run: if not dry_run:
logger.info('{} values have been updated'.format(covered_metadatas)) self.index()
logger.info('Updated interpreted dates covers {:.2%} of date metadatas ({:+.2%})'.format(
covered_metadatas / len_dates_md,
(covered_metadatas - len_interpreted_dates) / len_dates_md
))
...@@ -4,7 +4,7 @@ from arkindex.documents.serializers.light import CorpusLightSerializer ...@@ -4,7 +4,7 @@ from arkindex.documents.serializers.light import CorpusLightSerializer
from arkindex.documents.serializers.elements import ElementLightSerializer from arkindex.documents.serializers.elements import ElementLightSerializer
from arkindex.documents.serializers.ml import TranscriptionSerializer from arkindex.documents.serializers.ml import TranscriptionSerializer
from arkindex.project.serializer_fields import EnumField, SearchTermsField from arkindex.project.serializer_fields import EnumField, SearchTermsField
from arkindex.documents.date_parser import DateParser from arkindex.documents.date_parser import parse_date
class SearchQuerySerializer(serializers.Serializer): class SearchQuerySerializer(serializers.Serializer):
...@@ -22,8 +22,7 @@ class SearchQuerySerializer(serializers.Serializer): ...@@ -22,8 +22,7 @@ class SearchQuerySerializer(serializers.Serializer):
corpus = serializers.UUIDField(source='corpus_id', default='') corpus = serializers.UUIDField(source='corpus_id', default='')
def parse_date(self, raw_date): def parse_date(self, raw_date):
parser = DateParser() date = parse_date(raw_date)
date = parser.parse(raw_date)
if len(date) != 1: if len(date) != 1:
raise serializers.ValidationError('Could not parse Date') raise serializers.ValidationError('Could not parse Date')
return date[0] return date[0]
......
...@@ -16,7 +16,7 @@ class TestParseDates(FixtureTestCase): ...@@ -16,7 +16,7 @@ class TestParseDates(FixtureTestCase):
element=cls.corpus.elements.get(type=ElementType.Act, name='Act 1'), element=cls.corpus.elements.get(type=ElementType.Act, name='Act 1'),
) )
@patch('arkindex.documents.date_parser.DateParser.parse') @patch('arkindex.documents.date_parser.parse_date')
def test_date_parser(self, parse_mock): def test_date_parser(self, parse_mock):
call_command( call_command(
'parse_dates', 'parse_dates',
......
from unittest.mock import patch from unittest.mock import patch
from arkindex.documents.date_parser import DateParser, year, year_month, year_month_day, year_month_str from arkindex.documents.date_parser import (
parse_date, year, year_month, year_month_day,
year_month_str, DATE_FUNCTIONS_TABLE,
)
from arkindex.documents.models import DateType, InterpretedDate from arkindex.documents.models import DateType, InterpretedDate
from django.test import TestCase from django.test import TestCase
...@@ -16,7 +19,6 @@ class TestDateParser(TestCase): ...@@ -16,7 +19,6 @@ class TestDateParser(TestCase):
@classmethod @classmethod
def setUpTestData(cls): def setUpTestData(cls):
cls.parser = DateParser()
cls.test_dates = { cls.test_dates = {
'1343': ((1343, None, None, DateType.Exact), ), '1343': ((1343, None, None, DateType.Exact), ),
'1326-05': ((1326, 5, None, DateType.Exact), ), '1326-05': ((1326, 5, None, DateType.Exact), ),
...@@ -30,7 +32,7 @@ class TestDateParser(TestCase): ...@@ -30,7 +32,7 @@ class TestDateParser(TestCase):
def test_parser(self): def test_parser(self):
for t in self.test_dates: for t in self.test_dates:
self.assertEqual( self.assertEqual(
self.parser.parse(t), parse_date(t),
tuple([self._create_date(elts) for elts in self.test_dates[t]]) tuple([self._create_date(elts) for elts in self.test_dates[t]])
) )
...@@ -42,8 +44,8 @@ class TestDateParser(TestCase): ...@@ -42,8 +44,8 @@ class TestDateParser(TestCase):
""" """
Check that an error-raising function does not break table iteration Check that an error-raising function does not break table iteration
""" """
self.parser.date_table = (self._wrong_function, ) + self.parser.date_table functions = (self._wrong_function, ) + DATE_FUNCTIONS_TABLE
self.assertEqual(self.parser.parse('1343')[0], InterpretedDate(year=1343, type=DateType.Exact)) self.assertTupleEqual(parse_date('1343', functions), (InterpretedDate(year=1343, type=DateType.Exact), ))
self.assertEqual(warning_mock.call_count, 1) self.assertEqual(warning_mock.call_count, 1)
def test_year(self): def test_year(self):
......
from arkindex.documents.models import MetaData, MetaType, DateType, InterpretedDate from arkindex.documents.models import MetaData, MetaType, DateType, InterpretedDate
from arkindex.documents.date_parser import DateParser from arkindex.documents.date_parser import parse_date
from django.test import TestCase from django.test import TestCase
...@@ -11,8 +11,7 @@ class TestInterpretedDate(TestCase): ...@@ -11,8 +11,7 @@ class TestInterpretedDate(TestCase):
cls.metadata = MetaData(name="date", type=MetaType.Date, value='1337-may') cls.metadata = MetaData(name="date", type=MetaType.Date, value='1337-may')
def test_date_str(self): def test_date_str(self):
date_parser = DateParser() interpreted_date = parse_date(self.metadata.value)
interpreted_date = date_parser.parse(self.metadata.value)
self.assertEqual(1, len(interpreted_date)) self.assertEqual(1, len(interpreted_date))
self.assertEqual('1337-05', str(interpreted_date[0])) self.assertEqual('1337-05', str(interpreted_date[0]))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment