diff --git a/arkindex/documents/pagexml.py b/arkindex/documents/pagexml.py index e8e8fc76e8fa95addd4a58ae5d62e158b159f69b..0a2a1c978e41714b20fb99f49f8e090d5d57b437 100644 --- a/arkindex/documents/pagexml.py +++ b/arkindex/documents/pagexml.py @@ -243,7 +243,7 @@ class PageXmlParser(object): """ entities = [] for tags in blocks.values(): - new_entities, new_tags = self.create_entities(tags, element, types, ratio) + new_entities, new_tags = self.create_entities(tags, element, types) entities += new_entities self.create_link_transcription_entity(new_tags) @@ -251,42 +251,20 @@ class PageXmlParser(object): self.create_links(tags, path, ratio) return entities - def create_entities(self, tags, element, types, ratio): + def create_entities(self, tags, element, types): """ Create entity according to 'name' in tag """ entities = [] for tag in tags: if tag['name'] in types.keys(): - entities_name = Entity.objects.filter( - corpus__elements=element, - type=types[tag['name']] - ).values('name') - if entities_name: - names = [e['name'] for e in entities_name] - name = best_levenshtein( - tag['value'], - names - ) - score = name[1] - else: - score = 0 - - if score < ratio: - entity = Entity.objects.create( - name=tag['value'], - type=types[tag['name']], - corpus_id=element.corpus_id, - source_id=self.source.id - ) - entities.append(entity.id) - else: - entity = Entity.objects.distinct('name').get( - corpus__elements=element, - name=name[0], - type=types[tag['name']], - corpus_id=element.corpus_id - ) + entity = Entity.objects.create( + name=tag['value'], + type=types[tag['name']], + corpus_id=element.corpus_id, + source_id=self.source.id + ) + entities.append(entity.id) tag['entity_id'] = entity.id return entities, tags diff --git a/arkindex/documents/tests/test_pagexml.py b/arkindex/documents/tests/test_pagexml.py index 902a60cdfaa7b679525e36c309fd490c2c9477c7..5614d1fef36d7564d15a222abf373979a7c1eb57 100644 --- a/arkindex/documents/tests/test_pagexml.py +++ b/arkindex/documents/tests/test_pagexml.py @@ -933,7 +933,7 @@ class TestPageXml(FixtureAPITestCase): entities = [] for block in blocks: - entities += parser.create_entities(blocks[block], self.page, self.types, 0.90)[0] + entities += parser.create_entities(blocks[block], self.page, self.types)[0] nb_entities = Entity.objects.filter(corpus_id=self.page.corpus.id).count() self.assertEqual(len(entities), nb_entities) @@ -945,7 +945,7 @@ class TestPageXml(FixtureAPITestCase): blocks = parser.merge(blocks) for block in blocks.values(): - entities, tags = parser.create_entities(block, self.page, self.types, 0.90) + entities, tags = parser.create_entities(block, self.page, self.types) parser.create_roles(tags, self.page, TRANSKRIBUS_ROLE, 0.90) nb_roles = EntityRole.objects.filter(corpus_id=self.page.corpus.id).count() @@ -958,14 +958,14 @@ class TestPageXml(FixtureAPITestCase): blocks = parser.merge(blocks) for block in blocks.values(): - entities, tags = parser.create_entities(block, self.page, self.types, 0.90) + entities, tags = parser.create_entities(block, self.page, self.types) tags = parser.create_roles(tags, self.page, TRANSKRIBUS_ROLE, 0.90) parser.create_links(tags, TRANSKRIBUS_ROLE, 0.90) nb_entities = Entity.objects.filter(corpus_id=self.page.corpus.id).count() nb_roles = EntityRole.objects.filter(corpus_id=self.page.corpus.id).count() nb_links = EntityLink.objects.filter(role__corpus__id=self.page.corpus.id).count() - self.assertEqual(3, nb_entities) + self.assertEqual(4, nb_entities) self.assertEqual(2, nb_roles) self.assertEqual(2, nb_links) @@ -980,7 +980,7 @@ class TestPageXml(FixtureAPITestCase): nb_roles = EntityRole.objects.filter(corpus_id=self.page.corpus.id).count() nb_links = EntityLink.objects.filter(role__corpus__id=self.page.corpus.id).count() nb_transcription_entity = TranscriptionEntity.objects.filter(entity__id__in=entities_id).count() - self.assertEqual(3, nb_entities) + self.assertEqual(4, nb_entities) self.assertEqual(2, nb_roles) self.assertEqual(2, nb_links) self.assertEqual(5, nb_transcription_entity)