diff --git a/dan/datasets/extract/extract_from_arkindex.py b/dan/datasets/extract/extract_from_arkindex.py
index ea7860de60df6e6670f172f8d988361b820a1821..1bc792c013daf8e0b3b47379675148d68abc02b0 100644
--- a/dan/datasets/extract/extract_from_arkindex.py
+++ b/dan/datasets/extract/extract_from_arkindex.py
@@ -180,6 +180,42 @@ class ArkindexExtractor:
         else:
             return self.split_names[2]
 
+    def extract_entities(self, transcription):
+        """
+        Insert entity tokens into the text of a transcription.
+
+        Entities are requested through ``ListTranscriptionEntities`` for
+        ``self.entity_worker_version``; each one is handed to ``insert_token``
+        with the start/end tokens registered in ``self.tokens`` for its subtype.
+
+        Returns the resulting text, or None (after logging a warning) when the
+        transcription has no entities.
+        """
+        entities = self.client.request(
+            "ListTranscriptionEntities",
+            id=transcription["id"],
+            worker_version=self.entity_worker_version,
+        )
+        if entities["count"] == 0:
+            logger.warning(
+                f"No entities found on transcription ({transcription['id']})."
+            )
+            return None
+
+        text = transcription["text"]
+        count = 0
+        for entity in entities["results"]:
+            matching_tokens = self.tokens[entity["entity"]["metas"]["subtype"]]
+            text, count = insert_token(
+                text,
+                count,
+                matching_tokens["start"],
+                matching_tokens["end"],
+                offset=entity["offset"],
+                length=entity["length"],
+            )
+        return text
+
     def extract_transcription(
         self,
         element,
@@ -198,37 +234,9 @@ class ArkindexExtractor:
 
         transcription = transcriptions["results"].pop()
         if self.load_entities:
-            entities = self.client.request(
-                "ListTranscriptionEntities",
-                id=transcription["id"],
-                worker_version=self.entity_worker_version,
-            )
-            if entities["count"] == 0:
-                logger.warning(
-                    f"No entities found on transcription ({transcription['id']})."
-                )
-                return
-            else:
-                text = transcription["text"]
-
-            count = 0
-            for entity in entities["results"]:
-                matching_tokens = self.tokens[entity["entity"]["metas"]["subtype"]]
-                start_token, end_token = (
-                    matching_tokens["start"],
-                    matching_tokens["end"],
-                )
-                text, count = insert_token(
-                    text,
-                    count,
-                    start_token,
-                    end_token,
-                    offset=entity["offset"],
-                    length=entity["length"],
-                )
+            return self.extract_entities(transcription)
         else:
-            text = transcription["text"].strip()
-        return text
+            return transcription["text"].strip()
 
     def process_element(
         self,