def save(self, db_elt, revision):
    """
    Synchronise the metadata rows of ``db_elt`` with this TEI element.

    Entries returned by ``build_metadata()`` whose value is falsy are
    ignored. Rows of ``db_elt.metadatas`` whose name is not among the
    remaining entries are deleted, missing names are created with
    ``revision``, and an existing row is rewritten (and stamped with
    ``revision``) only when its type or value differs from the entry.
    """
    assert isinstance(db_elt, Element)
    assert isinstance(revision, Revision)

    # Keep only the entries that actually carry a value
    wanted = {}
    for key, entry in self.build_metadata().items():
        if entry[1]:
            wanted[key] = entry

    # Drop every stored row whose name is no longer provided
    stale = db_elt.metadatas.exclude(name__in=wanted.keys())
    stale.delete()

    for key, entry in wanted.items():
        row, is_new = db_elt.metadatas.get_or_create(
            name=key,
            defaults={
                'type': entry[0],
                'value': entry[1],
                'revision': revision,
            },
        )
        # A fresh row is already complete; an identical row is left
        # untouched so that it keeps the revision it was saved with
        if is_new or (row.type, row.value) == entry:
            continue
        row.type, row.value = entry
        row.revision = revision
        row.save()
class TestTeiElement(FixtureTestCase):
    """
    Tests for Text built from the TEI samples: argument rendering and
    metadata saving
    """

    def test_apply_xslt(self):
        """
        Check the argument of a Text built from arguments.xml matches
        the expected markup once indentation is stripped
        """
        sample = os.path.join(FIXTURES, 'arguments.xml')
        text = Text(etree.parse(sample).getroot())

        # Strip every line so indentation does not affect the comparison
        flattened = "".join([line.strip() for line in text.argument.splitlines()])
        expected = (
            '<div xmlns:tei="http://www.tei-c.org/ns/1.0" class="argument">'
            '<p>Argument</p>'
            '<p><em>Italique</em></p>'
            '<p><sup>Supérieur</sup></p>'
            '<p><sup>Supérieur</sup></p>'
            '<p><blockquote>Citation</blockquote></p>'
            '</div>'
        )
        self.assertMultiLineEqual(flattened, expected)

    def test_save_update(self):
        """
        Check that saving a TeiElement a second time creates, updates and
        deletes metadata, and leaves unchanged metadata on its first revision
        """
        act = Act.objects.get(number="1")
        roots = [
            etree.parse(os.path.join(FIXTURES, filename)).getroot()
            for filename in ('update_before.xml', 'update_after.xml')
        ]
        te_before, te_after = (Text(root) for root in roots)

        repo = Repository.objects.create(
            url='http://repo',
            hook_token='token',
            clone_user='user',
            clone_token='token',
            corpus=self.corpus,
            user=self.user,
        )
        shared = {'repo': repo, 'ref': 'ref/heads/master', 'author': 'me'}
        rev1 = Revision.objects.create(hash='42', message='a', **shared)
        rev2 = Revision.objects.create(hash='43', message='b', **shared)

        # First import: location, persons and places exist, subjects does not
        te_before.save(act, rev1)
        metadatas = act.metadatas
        location = metadatas.get(name="location")
        persons = metadatas.get(name="persons")
        self.assertEqual(location.value, 'Mars')
        self.assertEqual(location.revision, rev1)
        self.assertEqual(persons.value, 'Someone')
        self.assertEqual(metadatas.get(name="places").value, 'Somewhere')
        self.assertFalse(metadatas.filter(name="subjects").exists())

        # Second import: subjects is created, persons is updated, places is
        # deleted and location stays as it was
        te_after.save(act, rev2)
        for stored in (location, persons):
            stored.refresh_from_db()
        self.assertEqual(location.value, 'Mars')
        self.assertEqual(location.revision, rev1)
        self.assertEqual(persons.value, 'Someone else')
        self.assertEqual(persons.revision, rev2)
        self.assertFalse(act.metadatas.filter(name="places").exists())
        self.assertEqual(act.metadatas.get(name="subjects").value, 'Something')
Text(self.tree.getroot()) - self.assertMultiLineEqual( - # Remove indentations - "".join(line.strip() for line in te.argument.splitlines()), - '<div xmlns:tei="http://www.tei-c.org/ns/1.0" class="argument">' - '<p>Argument</p>' - '<p><em>Italique</em></p>' - '<p><sup>Supérieur</sup></p>' - '<p><sup>Supérieur</sup></p>' - '<p><blockquote>Citation</blockquote></p>' - '</div>')