From f37ae1f05f61b508e4a4a6f55b86ea8bf4533522 Mon Sep 17 00:00:00 2001
From: Yoann Schneider <yschneider@teklia.com>
Date: Wed, 16 Nov 2022 17:22:19 +0000
Subject: [PATCH] Extract entity handling into a dedicated extract_entities method

---
 dan/datasets/extract/extract_from_arkindex.py | 63 ++++++++++---------
 1 file changed, 33 insertions(+), 30 deletions(-)

diff --git a/dan/datasets/extract/extract_from_arkindex.py b/dan/datasets/extract/extract_from_arkindex.py
index 1a2276b3..131bf8ce 100644
--- a/dan/datasets/extract/extract_from_arkindex.py
+++ b/dan/datasets/extract/extract_from_arkindex.py
@@ -233,6 +233,37 @@ class ArkindexExtractor:
         else:
             return self.split_names[2]
 
+    def extract_entities(self, transcription):
+        entities = self.client.request(
+            "ListTranscriptionEntities",
+            id=transcription["id"],
+            worker_version=self.entity_worker_version,
+        )
+        if entities["count"] == 0:
+            logger.warning(
+                f"No entities found on transcription ({transcription['id']})."
+            )
+            return
+        else:
+            text = transcription["text"]
+
+        count = 0
+        for entity in entities["results"]:
+            matching_tokens = self.tokens[entity["entity"]["metas"]["subtype"]]
+            start_token, end_token = (
+                matching_tokens["start"],
+                matching_tokens["end"],
+            )
+            text, count = insert_token(
+                text,
+                count,
+                start_token,
+                end_token,
+                offset=entity["offset"],
+                length=entity["length"],
+            )
+        return text
+
     def extract_transcription(
         self,
         element,
@@ -251,37 +282,9 @@ class ArkindexExtractor:
 
         transcription = transcriptions["results"].pop()
         if self.load_entities:
-            entities = self.client.request(
-                "ListTranscriptionEntities",
-                id=transcription["id"],
-                worker_version=self.entity_worker_version,
-            )
-            if entities["count"] == 0:
-                logger.warning(
-                    f"No entities found on transcription ({transcription['id']})."
-                )
-                return
-            else:
-                text = transcription["text"]
-
-            count = 0
-            for entity in entities["results"]:
-                matching_tokens = self.tokens[entity["entity"]["metas"]["subtype"]]
-                start_token, end_token = (
-                    matching_tokens["start"],
-                    matching_tokens["end"],
-                )
-                text, count = insert_token(
-                    text,
-                    count,
-                    start_token,
-                    end_token,
-                    offset=entity["offset"],
-                    length=entity["length"],
-                )
+            return self.extract_entities(transcription)
         else:
-            text = transcription["text"].strip()
-        return text
+            return transcription["text"].strip()
 
     def process_element(
         self,
-- 
GitLab