From 6955d89b2ed4edd3275e5d2faf0baf1232b661d4 Mon Sep 17 00:00:00 2001 From: manon blanco <blanco@teklia.com> Date: Thu, 19 Dec 2019 11:10:04 +0000 Subject: [PATCH] do not merge entities into XML page import --- arkindex/documents/pagexml.py | 40 ++++++------------------ arkindex/documents/tests/test_pagexml.py | 10 +++--- 2 files changed, 14 insertions(+), 36 deletions(-) diff --git a/arkindex/documents/pagexml.py b/arkindex/documents/pagexml.py index e8e8fc76e8..0a2a1c978e 100644 --- a/arkindex/documents/pagexml.py +++ b/arkindex/documents/pagexml.py @@ -243,7 +243,7 @@ class PageXmlParser(object): """ entities = [] for tags in blocks.values(): - new_entities, new_tags = self.create_entities(tags, element, types, ratio) + new_entities, new_tags = self.create_entities(tags, element, types) entities += new_entities self.create_link_transcription_entity(new_tags) @@ -251,42 +251,20 @@ class PageXmlParser(object): self.create_links(tags, path, ratio) return entities - def create_entities(self, tags, element, types, ratio): + def create_entities(self, tags, element, types): """ Create entity according to 'name' in tag """ entities = [] for tag in tags: if tag['name'] in types.keys(): - entities_name = Entity.objects.filter( - corpus__elements=element, - type=types[tag['name']] - ).values('name') - if entities_name: - names = [e['name'] for e in entities_name] - name = best_levenshtein( - tag['value'], - names - ) - score = name[1] - else: - score = 0 - - if score < ratio: - entity = Entity.objects.create( - name=tag['value'], - type=types[tag['name']], - corpus_id=element.corpus_id, - source_id=self.source.id - ) - entities.append(entity.id) - else: - entity = Entity.objects.distinct('name').get( - corpus__elements=element, - name=name[0], - type=types[tag['name']], - corpus_id=element.corpus_id - ) + entity = Entity.objects.create( + name=tag['value'], + type=types[tag['name']], + corpus_id=element.corpus_id, + source_id=self.source.id + ) + entities.append(entity.id) tag['entity_id'] = entity.id return entities, tags diff --git a/arkindex/documents/tests/test_pagexml.py b/arkindex/documents/tests/test_pagexml.py index 902a60cdfa..5614d1fef3 100644 --- a/arkindex/documents/tests/test_pagexml.py +++ b/arkindex/documents/tests/test_pagexml.py @@ -933,7 +933,7 @@ class TestPageXml(FixtureAPITestCase): entities = [] for block in blocks: - entities += parser.create_entities(blocks[block], self.page, self.types, 0.90)[0] + entities += parser.create_entities(blocks[block], self.page, self.types)[0] nb_entities = Entity.objects.filter(corpus_id=self.page.corpus.id).count() self.assertEqual(len(entities), nb_entities) @@ -945,7 +945,7 @@ class TestPageXml(FixtureAPITestCase): blocks = parser.merge(blocks) for block in blocks.values(): - entities, tags = parser.create_entities(block, self.page, self.types, 0.90) + entities, tags = parser.create_entities(block, self.page, self.types) parser.create_roles(tags, self.page, TRANSKRIBUS_ROLE, 0.90) nb_roles = EntityRole.objects.filter(corpus_id=self.page.corpus.id).count() @@ -958,14 +958,14 @@ class TestPageXml(FixtureAPITestCase): blocks = parser.merge(blocks) for block in blocks.values(): - entities, tags = parser.create_entities(block, self.page, self.types, 0.90) + entities, tags = parser.create_entities(block, self.page, self.types) tags = parser.create_roles(tags, self.page, TRANSKRIBUS_ROLE, 0.90) parser.create_links(tags, TRANSKRIBUS_ROLE, 0.90) nb_entities = Entity.objects.filter(corpus_id=self.page.corpus.id).count() nb_roles = EntityRole.objects.filter(corpus_id=self.page.corpus.id).count() nb_links = EntityLink.objects.filter(role__corpus__id=self.page.corpus.id).count() - self.assertEqual(3, nb_entities) + self.assertEqual(4, nb_entities) self.assertEqual(2, nb_roles) self.assertEqual(2, nb_links) @@ -980,7 +980,7 @@ class TestPageXml(FixtureAPITestCase): nb_roles = EntityRole.objects.filter(corpus_id=self.page.corpus.id).count() nb_links = EntityLink.objects.filter(role__corpus__id=self.page.corpus.id).count() nb_transcription_entity = TranscriptionEntity.objects.filter(entity__id__in=entities_id).count() - self.assertEqual(3, nb_entities) + self.assertEqual(4, nb_entities) self.assertEqual(2, nb_roles) self.assertEqual(2, nb_links) self.assertEqual(5, nb_transcription_entity) -- GitLab