From cb56ffa36c642315197d871a9a1c6c4854d91f5a Mon Sep 17 00:00:00 2001 From: Valentin Rigal <rigal@teklia.com> Date: Mon, 25 Mar 2019 12:01:13 +0000 Subject: [PATCH] interpreted dates basic model --- arkindex/documents/admin.py | 14 +- arkindex/documents/date_parser.py | 131 ++++++++++++++++++ .../management/commands/parse_dates.py | 65 +++++++++ .../migrations/0002_interpreteddate.py | 47 +++++++ arkindex/documents/models.py | 41 ++++++ arkindex/documents/serializers/elements.py | 21 ++- arkindex/documents/tei.py | 17 ++- .../tests/commands/test_parse_dates.py | 28 ++++ .../tests/tei_samples/update_after.xml | 2 +- .../tests/tei_samples/update_before.xml | 2 +- arkindex/documents/tests/test_act.py | 9 +- arkindex/documents/tests/test_date_parser.py | 79 +++++++++++ .../documents/tests/test_interpreted_date.py | 12 ++ arkindex/documents/tests/test_tei.py | 2 + 14 files changed, 462 insertions(+), 8 deletions(-) create mode 100644 arkindex/documents/date_parser.py create mode 100644 arkindex/documents/management/commands/parse_dates.py create mode 100644 arkindex/documents/migrations/0002_interpreteddate.py create mode 100644 arkindex/documents/tests/commands/test_parse_dates.py create mode 100644 arkindex/documents/tests/test_date_parser.py create mode 100644 arkindex/documents/tests/test_interpreted_date.py diff --git a/arkindex/documents/admin.py b/arkindex/documents/admin.py index 8cf9d51749..ddac73de56 100644 --- a/arkindex/documents/admin.py +++ b/arkindex/documents/admin.py @@ -2,7 +2,7 @@ from django.contrib import admin from django.urls import path, reverse from django.utils.html import format_html from arkindex.documents.models import \ - Corpus, Page, Element, ElementType, Act, Transcription, MetaData, Classification, DataSource + Corpus, Page, Element, ElementType, Act, Transcription, MetaData, InterpretedDate, Classification, DataSource from arkindex.documents.views import DumpActs from arkindex.dataimport.models import Event from enumfields.admin import EnumFieldListFilter @@ -35,6 +35,17 @@ class PageAdmin(admin.ModelAdmin): inlines = (EventInline, ClassificationInline, ) +class DateInline(admin.TabularInline): + extra = 1 + model = InterpretedDate + + +class MetaDataAdmin(admin.ModelAdmin): + list_display = ('id', 'type', 'revision') + readonly_fields = ('id', 'revision', ) + inlines = (DateInline, ) + + class MetaDataInline(admin.TabularInline): model = MetaData @@ -89,3 +100,4 @@ admin.site.register(Page, PageAdmin) admin.site.register(Element, ElementAdmin) admin.site.register(Act, ActAdmin) admin.site.register(Transcription, TranscriptionAdmin) +admin.site.register(MetaData, MetaDataAdmin) diff --git a/arkindex/documents/date_parser.py b/arkindex/documents/date_parser.py new file mode 100644 index 0000000000..e12aef32cd --- /dev/null +++ b/arkindex/documents/date_parser.py @@ -0,0 +1,131 @@ +from django.core.exceptions import ValidationError +from arkindex.documents.models import DateType, InterpretedDate +import unicodedata +import re +import logging + +logger = logging.getLogger(__name__) + +MONTHS = { + 'en': ( + 'january', + 'february', + 'march', + 'april', + 'may', + 'june', + 'july', + 'august', + 'september', + 'october', + 'november', + 'december', + ), + 'fr': ( + 'janvier', + 'fevrier', + 'mars', + 'avril', + 'mai', + 'juin', + 'juillet', + 'aout', + 'septembre', + 'octobre', + 'novembre', + 'decembre', + ), +} + + +def year(raw_date): + """ + Matches 4-digit year only: '1320', '2010' + """ + match = re.match(r'^(?P<year>\d{4})$', raw_date) + return match.groupdict() if match else None + + +def year_month(raw_date): + """ + Matches 4-digit year followed by 1 or 2-digit month: '1320-3', '2010-02' + """ + match = re.match(r'^(?P<year>\d{4})-(?P<month>\d{1,2})$', raw_date) + return match.groupdict() if match else None + + +def year_month_day(raw_date): + """ + Matches 4-digit year followed by 1 or 2-digit month then + 1 or 2-digit day : '1320-3-12', '2010-02-31' + """ + match = re.match(r'^(?P<year>\d{4})-(?P<month>\d{1,2})-(?P<day>\d{1,2})$', raw_date) + return match.groupdict() if match else None + + +def year_month_str(raw_date): + """ + Matches a date with alpha-written month and 4-digit year + Both must be unique : '1320, march', 'february 2010' + Days are not interpreted yet if present in the raw date text + """ + years = [] + months = [] + for match in re.finditer(r'(?=(^|[^\d])(?P<year>\d{4})($|\D))', raw_date): + years.append(match.groupdict()['year']) + words = re.findall(r'[a-z]+', raw_date) + for w in words: + for month_lang in MONTHS: + if w in MONTHS[month_lang]: + months.append(MONTHS[month_lang].index(w) + 1) + break + # Exactly one year and 1 month are found + if len(years) == len(months) == 1: + return {'year': years[0], 'month': months[0]} + return None + + +class DateParser(object): + """ + Return interpreted date elements from non-standard TEI input + Input examples : "1325-11-04" + "s.d. [1323-1325]" + """ + + date_table = ( + year, + year_month, + year_month_day, + year_month_str, + ) + + def _normalize(self, raw_date): + """ + Return an unaccented lowercase string from input + """ + return unicodedata.normalize('NFKD', raw_date) \ + .encode('ASCII', 'ignore') \ + .lower() \ + .decode("utf-8") + + def parse(self, raw_date): + assert isinstance(raw_date, str) + # Try to match regex one by one + for f in self.date_table: + try: + date_elts = f(self._normalize(raw_date)) + if date_elts: + assert isinstance(date_elts, dict) + date = InterpretedDate(type=DateType.Exact) + for k in date_elts: + setattr(date, k, int(date_elts[k])) + try: + date.full_clean(exclude=('metadata', )) + except ValidationError as e: + logger.warning('Date fields are incorrect : {}'.format(e)) + continue + return (date, ) + except Exception as e: + logger.warning('Exception {} has been raised during parsing of date {}'.format(e, raw_date), f) + logger.warning('Date not supported : {}'.format(raw_date)) + return () diff --git a/arkindex/documents/management/commands/parse_dates.py b/arkindex/documents/management/commands/parse_dates.py new file mode 100644 index 0000000000..3655ca24a2 --- /dev/null +++ b/arkindex/documents/management/commands/parse_dates.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 +from django.conf import settings +from django.db import transaction +from ponos.management.base import PonosCommand +from arkindex.project.argparse import CorpusArgument +from arkindex.documents.models import MetaData, MetaType +from arkindex.documents.date_parser import DateParser +import logging + +logging.basicConfig( + format='%(asctime)s [%(levelname)s] %(message)s', + level=logging.INFO, +) + +logger = logging.getLogger(__name__) + + +class Command(PonosCommand): + help = "Create fresh interpreted dates from metadatas" + docker_image = settings.ARKINDEX_APP_IMAGE + base_recipe = settings.PONOS_RECIPE + + def add_arguments(self, parser): + parser.add_argument( + '--corpus', + help='ID or part of the name of the corpus to fetch metadatas from', + type=CorpusArgument(), + ) + parser.add_argument( + '--dry-run', + action='store_true', + default=False, + help='Print number of dates to update', + ) + + def run(self, corpus=None, dry_run=False, **options): + metadatas = MetaData.objects.filter(type=MetaType.Date) + if corpus: + metadatas = metadatas.filter(element__corpus_id=corpus.id) + total_metadatas = len(metadatas) + covered_metadatas = sum(1 if m.dates.count() else 0 for m in metadatas) + logger.info('Found {} MetaData elements corresponding to a Date'.format(total_metadatas)) + date_parser = DateParser() + covered_metadatas_update = 0 + if not dry_run: + deleted_dates_num = 0 + with transaction.atomic(): + for metadata in metadatas: + deleted = metadata.dates.all().delete() + logger.debug('Deleted {} interpreted date(s) for {} Metadata'.format(deleted[0], metadata.value)) + deleted_dates_num += deleted[0] + logger.info('{} old InterpretedDates deleted from database'.format(deleted_dates_num)) + for metadata in metadatas: + updated = 0 + for date in date_parser.parse(metadata.value): + date.metadata = metadata + updated = 1 + if not dry_run: + date.save() + covered_metadatas_update += updated + logger.info('{} values have been updated'.format(covered_metadatas_update)) + logger.info('Updated coverage is {:.2%} ({:+.2%})'.format( + covered_metadatas_update / total_metadatas, + (covered_metadatas_update - covered_metadatas) / total_metadatas, + )) diff --git a/arkindex/documents/migrations/0002_interpreteddate.py b/arkindex/documents/migrations/0002_interpreteddate.py new file mode 100644 index 0000000000..d70aa5c050 --- /dev/null +++ b/arkindex/documents/migrations/0002_interpreteddate.py @@ -0,0 +1,47 @@ +# Generated by Django 2.1 on 2019-03-15 08:19 + +import arkindex.documents.models +import django.core.validators +from django.db import migrations, models +import django.db.models.deletion +import enumfields.fields +import uuid + + +class Migration(migrations.Migration): + + dependencies = [ + ('documents', '0001_initial'), + ] + + operations = [ + migrations.CreateModel( + name='InterpretedDate', + fields=[ + ('id', models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False)), + ('type', enumfields.fields.EnumField(enum=arkindex.documents.models.DateType, max_length=16)), + ('year', models.PositiveSmallIntegerField()), + ('month', models.PositiveSmallIntegerField( + blank=True, + null=True, + validators=[ + django.core.validators.MinValueValidator(1), + django.core.validators.MaxValueValidator(12), + ] + )), + ('day', models.PositiveSmallIntegerField( + blank=True, + null=True, + validators=[ + django.core.validators.MinValueValidator(1), + django.core.validators.MaxValueValidator(31), + ] + )), + ('metadata', models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name='dates', + to='documents.MetaData', + )), + ], + ), + ] diff --git a/arkindex/documents/models.py b/arkindex/documents/models.py index 1b3e3327a7..5106882c6b 100644 --- a/arkindex/documents/models.py +++ b/arkindex/documents/models.py @@ -1,6 +1,7 @@ from django.db import models, transaction from django.contrib.postgres.indexes import GinIndex from django.utils.functional import cached_property +from django.core.validators import MinValueValidator, MaxValueValidator from enumfields import EnumField, Enum from arkindex_common.ml_tool import MLToolType from arkindex.project.models import IndexableModel @@ -13,6 +14,9 @@ import itertools logger = logging.getLogger(__name__) +# Language used for display +DEFAULT_LANGUAGE = 'en' + class Right(enum.Enum): Read = 'read' @@ -620,3 +624,40 @@ class MetaData(models.Model): def __ge__(self, other): return self.__eq__(other) or self.__gt__(other) + + +class DateType(Enum): + Exact = 'exact' + Lower = 'lower' + Upper = 'upper' + Unknown = 'unknown' + + +class InterpretedDate(models.Model): + year = models.PositiveSmallIntegerField() + month = models.PositiveSmallIntegerField( + validators=[MinValueValidator(1), MaxValueValidator(12)], + null=True, + blank=True, + ) + day = models.PositiveSmallIntegerField( + validators=[MinValueValidator(1), MaxValueValidator(31)], + null=True, + blank=True, + ) + id = models.UUIDField(default=uuid.uuid4, primary_key=True) + metadata = models.ForeignKey(MetaData, related_name='dates', on_delete=models.CASCADE) + type = EnumField(DateType, max_length=16) + + def __iter__(self): + return iter((self.year, self.month, self.day)) + + def __eq__(self, other): + return tuple(self) == tuple(other) and self.type == other.type + + def __str__(self): + year, month, day = tuple(self) + if month: + from arkindex.documents.date_parser import MONTHS + month = MONTHS[DEFAULT_LANGUAGE][month - 1] + return ' '.join([str(e) for e in (year, ',', month, day) if e]) diff --git a/arkindex/documents/serializers/elements.py b/arkindex/documents/serializers/elements.py index 4b4120140c..6b57e7414a 100644 --- a/arkindex/documents/serializers/elements.py +++ b/arkindex/documents/serializers/elements.py @@ -1,6 +1,7 @@ from rest_framework import serializers from arkindex.documents.models import \ - Element, ElementType, Page, PageType, PageDirection, Act, Corpus, MetaData, MetaType + Element, ElementType, Page, PageType, PageDirection, Act, Corpus, \ + MetaData, MetaType, InterpretedDate, DateType from arkindex.images.serializers import ZoneSerializer, ImageSerializer from arkindex.documents.serializers.light import ElementLightSerializer, CorpusLightSerializer from arkindex.documents.serializers.ml import ClassificationSerializer, TranscriptionSerializer @@ -9,12 +10,29 @@ from arkindex.dataimport.models import EventType from arkindex.project.serializer_fields import EnumField +class InterpretedDateSerializer(serializers.ModelSerializer): + """ + Serialize a list of interpreted dates linked to a metadata + """ + type = EnumField(DateType) + + class Meta: + model = InterpretedDate + fields = ( + 'year', + 'month', + 'day', + 'type', + ) + + class MetaDataSerializer(serializers.ModelSerializer): """ Serialises some Metadata for any Element """ type = EnumField(MetaType) revision = RevisionSerializer() + dates = InterpretedDateSerializer(many=True) class Meta: model = MetaData @@ -24,6 +42,7 @@ class MetaDataSerializer(serializers.ModelSerializer): 'name', 'value', 'revision', + 'dates', ) diff --git a/arkindex/documents/tei.py b/arkindex/documents/tei.py index 811923bd2f..9ff7a7bb7c 100644 --- a/arkindex/documents/tei.py +++ b/arkindex/documents/tei.py @@ -4,6 +4,7 @@ from lxml import etree from arkindex.project.tools import find_closest, read_file_range from arkindex.documents.models import ElementType, Act, Element, MetaType from arkindex.dataimport.models import Revision, DataImportFailure +from arkindex.documents.date_parser import DateParser import logging import sys import os @@ -139,18 +140,33 @@ class TeiElement(object): ) db_metadatas.append(db_meta) if created: + self.post_save(db_meta, meta_type, value) continue if (db_meta.type, db_meta.value) == (meta_type, value) and db_meta.revision: # Nothing to update and revision is set continue db_meta.type, db_meta.value, db_meta.revision = meta_type, value, revision db_meta.save() + self.post_save(db_meta, meta_type, value) # Remove deleted metadatas db_elt.metadatas.exclude( id__in=[dbm.id for dbm in db_metadatas], ).delete() + def post_save(self, db_meta, meta_type, value): + ''' + Element-specific operations after being saved to db + ''' + if meta_type != MetaType.Date: + return + # Create new interpreted dates + db_meta.dates.all().delete() + date_parser = DateParser() + for date in date_parser.parse(value): + date.metadata = db_meta + date.save() + class Witness(TeiElement): optional = ('lang', ) @@ -188,7 +204,6 @@ class Date(TeiElement): ''' Convert to a human readable date ''' - # TODO: support common formats if self.when: return self.when return self.text diff --git a/arkindex/documents/tests/commands/test_parse_dates.py b/arkindex/documents/tests/commands/test_parse_dates.py new file mode 100644 index 0000000000..af9336ad35 --- /dev/null +++ b/arkindex/documents/tests/commands/test_parse_dates.py @@ -0,0 +1,28 @@ +from django.core.management import call_command +from arkindex.project.tests import FixtureTestCase +from arkindex.documents.models import ElementType, MetaData, MetaType +from unittest.mock import patch, call + + +class TestParseDates(FixtureTestCase): + + @classmethod + def setUpTestData(cls): + super().setUpTestData() + cls.metadata = MetaData.objects.create( + name='date', + type=MetaType.Date, + value='2020-10-20', + element=cls.corpus.elements.get(type=ElementType.Act, name='Act 1'), + ) + + @patch('arkindex.documents.date_parser.DateParser.parse') + def test_date_parser(self, parse_mock): + call_command( + 'parse_dates', + corpus=self.corpus, + ) + self.assertEqual( + parse_mock.call_args_list, + [call(self.metadata.value)], + ) diff --git a/arkindex/documents/tests/tei_samples/update_after.xml b/arkindex/documents/tests/tei_samples/update_after.xml index 6a0473dd04..38af152939 100644 --- a/arkindex/documents/tests/tei_samples/update_after.xml +++ b/arkindex/documents/tests/tei_samples/update_after.xml @@ -2,7 +2,7 @@ <front> <head>1</head> <docDate> - <origDate when="1337-01">1337, janvier</origDate> + <date when="1337-01">1337, janvier</date> <origPlace>Mars</origPlace> </docDate> <argument> diff --git a/arkindex/documents/tests/tei_samples/update_before.xml b/arkindex/documents/tests/tei_samples/update_before.xml index 22d44b5479..b1a47aafc3 100644 --- a/arkindex/documents/tests/tei_samples/update_before.xml +++ b/arkindex/documents/tests/tei_samples/update_before.xml @@ -2,7 +2,7 @@ <front> <head>1</head> <docDate> - <origDate when="1337-01">1337, janvier</origDate> + <date when="1337-01">1337, janvier</date> <origPlace>Mars</origPlace> </docDate> <argument> diff --git a/arkindex/documents/tests/test_act.py b/arkindex/documents/tests/test_act.py index cfd28f5544..2661f4d166 100644 --- a/arkindex/documents/tests/test_act.py +++ b/arkindex/documents/tests/test_act.py @@ -73,15 +73,18 @@ class TestAct(FixtureAPITestCase): 'name': 'origin', 'type': 'date', 'value': '2010/01', - 'revision': None}, + 'revision': None, + 'dates': []}, {'id': str(metas[2].id), 'name': 'place', 'type': 'location', 'value': 'somewhere', - 'revision': None}, + 'revision': None, + 'dates': []}, {'id': str(metas[0].id), 'name': 'test', 'type': 'text', 'value': 'aha', - 'revision': None}] + 'revision': None, + 'dates': []}] ) diff --git a/arkindex/documents/tests/test_date_parser.py b/arkindex/documents/tests/test_date_parser.py new file mode 100644 index 0000000000..cdb5084d99 --- /dev/null +++ b/arkindex/documents/tests/test_date_parser.py @@ -0,0 +1,79 @@ +from unittest.mock import patch, ANY +from arkindex.documents.date_parser import DateParser, year, year_month, year_month_day, year_month_str +from arkindex.documents.models import DateType, InterpretedDate +from django.test import TestCase + + +class TestDateParser(TestCase): + + def _create_date(self, elts): + return InterpretedDate( + year=elts[0], + month=elts[1], + day=elts[2], + type=elts[3], + ) + + @classmethod + def setUpTestData(cls): + cls.parser = DateParser() + cls.test_dates = { + '1343': ((1343, None, None, DateType.Exact), ), + '1326-05': ((1326, 5, None, DateType.Exact), ), + '1420-01-23': ((1420, 1, 23, DateType.Exact), ), + '1398 february': ((1398, 2, None, DateType.Exact), ), + 'Août 1300': ((1300, 8, None, DateType.Exact), ), + 'march 12 11223344': (), + '2222-22-22': (), + } + + def test_parser(self): + for t in self.test_dates: + self.assertEqual( + self.parser.parse(t), + tuple([self._create_date(elts) for elts in self.test_dates[t]]) + ) + + def _wrong_function(raw_date): + return 'not a dict' + + @patch('logging.Logger.warning') + def test_parser_exception(self, warning_mock): + """ + Check that a error-raising function do not break table iteration + """ + self.parser.date_table = (self._wrong_function, ) + self.parser.date_table + self.assertEqual(self.parser.parse('1343')[0], InterpretedDate(year=1343, type=DateType.Exact)) + warning_mock.assert_called_with(ANY, self._wrong_function) + + def test_year(self): + dates = ('1343', ) + wrong_dates = ('1343-03', ) + for d in dates: + self.assertIsInstance(year(d), dict) + for d in wrong_dates: + self.assertIsNone(year(d)) + + def test_year_month(self): + dates = ('1343-02', ) + wrong_dates = ('1343-03-26', '1546') + for d in dates: + self.assertIsInstance(year_month(d), dict) + for d in wrong_dates: + self.assertIsNone(year_month(d)) + + def test_year_month_day(self): + dates = ('1343-2-5', ) + wrong_dates = ('1343-03', '1088') + for d in dates: + self.assertIsInstance(year_month_day(d), dict) + for d in wrong_dates: + self.assertIsNone(year_month_day(d)) + + def test_year_month_str(self): + dates = ('1343 july', '23 october 1245') + wrong_dates = ('1343-03', 'july') + for d in dates: + self.assertIsInstance(year_month_str(d), dict) + for d in wrong_dates: + self.assertIsNone(year_month_str(d)) diff --git a/arkindex/documents/tests/test_interpreted_date.py b/arkindex/documents/tests/test_interpreted_date.py new file mode 100644 index 0000000000..7df723ca39 --- /dev/null +++ b/arkindex/documents/tests/test_interpreted_date.py @@ -0,0 +1,12 @@ +from arkindex.documents.models import MetaData, MetaType +from arkindex.documents.date_parser import DateParser +from django.test import TestCase + + +class TestInterpretedDate(TestCase): + + def test_date_str(self): + metadata = MetaData(name="date", type=MetaType.Date, value='1337-05-02') + date_parser = DateParser() + interpreted_date = date_parser.parse(metadata.value)[0] + self.assertIn('may', str(interpreted_date)) diff --git a/arkindex/documents/tests/test_tei.py b/arkindex/documents/tests/test_tei.py index 648beaca3e..29bdef5f26 100644 --- a/arkindex/documents/tests/test_tei.py +++ b/arkindex/documents/tests/test_tei.py @@ -52,6 +52,7 @@ class TestTeiElement(FixtureTestCase): act.metadatas.exclude(name='summary').values_list('name', 'index', 'value'), [ ('folio', 0, '1r'), + ('date', 0, '1337-01'), ('id', 0, 'test_update'), ('language', 0, 'lat.'), ('name', 0, '1'), @@ -68,6 +69,7 @@ class TestTeiElement(FixtureTestCase): act.metadatas.exclude(name='summary').values_list('name', 'index', 'value', 'revision_id'), [ ('folio', 0, '1r', rev1.id), + ('date', 0, '1337-01', rev1.id), ('id', 0, 'test_update', rev1.id), ('language', 0, 'lat.', rev1.id), ('name', 0, '1', rev1.id), -- GitLab