From f37ae1f05f61b508e4a4a6f55b86ea8bf4533522 Mon Sep 17 00:00:00 2001
From: Yoann Schneider <yschneider@teklia.com>
Date: Wed, 16 Nov 2022 17:22:19 +0000
Subject: [PATCH] final touches

---
 dan/datasets/extract/extract_from_arkindex.py | 63 ++++++++++---------
 1 file changed, 33 insertions(+), 30 deletions(-)

diff --git a/dan/datasets/extract/extract_from_arkindex.py b/dan/datasets/extract/extract_from_arkindex.py
index 1a2276b3..131bf8ce 100644
--- a/dan/datasets/extract/extract_from_arkindex.py
+++ b/dan/datasets/extract/extract_from_arkindex.py
@@ -233,6 +233,37 @@ class ArkindexExtractor:
         else:
             return self.split_names[2]
 
+    def extract_entities(self, transcription):
+        entities = self.client.request(
+            "ListTranscriptionEntities",
+            id=transcription["id"],
+            worker_version=self.entity_worker_version,
+        )
+        if entities["count"] == 0:
+            logger.warning(
+                f"No entities found on transcription ({transcription['id']})."
+            )
+            return
+        else:
+            text = transcription["text"]
+
+        count = 0
+        for entity in entities["results"]:
+            matching_tokens = self.tokens[entity["entity"]["metas"]["subtype"]]
+            start_token, end_token = (
+                matching_tokens["start"],
+                matching_tokens["end"],
+            )
+            text, count = insert_token(
+                text,
+                count,
+                start_token,
+                end_token,
+                offset=entity["offset"],
+                length=entity["length"],
+            )
+        return text
+
     def extract_transcription(
         self,
         element,
@@ -251,37 +282,9 @@ class ArkindexExtractor:
         transcription = transcriptions["results"].pop()
 
         if self.load_entities:
-            entities = self.client.request(
-                "ListTranscriptionEntities",
-                id=transcription["id"],
-                worker_version=self.entity_worker_version,
-            )
-            if entities["count"] == 0:
-                logger.warning(
-                    f"No entities found on transcription ({transcription['id']})."
-                )
-                return
-            else:
-                text = transcription["text"]
-
-            count = 0
-            for entity in entities["results"]:
-                matching_tokens = self.tokens[entity["entity"]["metas"]["subtype"]]
-                start_token, end_token = (
-                    matching_tokens["start"],
-                    matching_tokens["end"],
-                )
-                text, count = insert_token(
-                    text,
-                    count,
-                    start_token,
-                    end_token,
-                    offset=entity["offset"],
-                    length=entity["length"],
-                )
+            return self.extract_entities(transcription)
         else:
-            text = transcription["text"].strip()
-            return text
+            return transcription["text"].strip()
 
     def process_element(
         self,
-- 
GitLab