final touches

85deca6d · Yoann Schneider · 58f3a2e3 · 85deca6d
Commit 85deca6d authored 2 years ago by Yoann Schneider
--- a/dan/datasets/extract/extract_from_arkindex.py
+++ b/dan/datasets/extract/extract_from_arkindex.py
@@ -180,6 +180,37 @@ class ArkindexExtractor:
        else:
            return self.split_names[2]

+    def extract_entities(self, transcription):
+        entities = self.client.request(
+            "ListTranscriptionEntities",
+            id=transcription["id"],
+            worker_version=self.entity_worker_version,
+        )
+        if entities["count"] == 0:
+            logger.warning(
+                f"No entities found on transcription ({transcription['id']})."
+            )
+            return
+        else:
+            text = transcription["text"]
+
+        count = 0
+        for entity in entities["results"]:
+            matching_tokens = self.tokens[entity["entity"]["metas"]["subtype"]]
+            start_token, end_token = (
+                matching_tokens["start"],
+                matching_tokens["end"],
+            )
+            text, count = insert_token(
+                text,
+                count,
+                start_token,
+                end_token,
+                offset=entity["offset"],
+                length=entity["length"],
+            )
+        return text
+
    def extract_transcription(
        self,
        element,
@@ -198,37 +229,9 @@ class ArkindexExtractor:

        transcription = transcriptions["results"].pop()
        if self.load_entities:
-            entities = self.client.request(
-                "ListTranscriptionEntities",
-                id=transcription["id"],
-                worker_version=self.entity_worker_version,
-            )
-            if entities["count"] == 0:
-                logger.warning(
-                    f"No entities found on transcription ({transcription['id']})."
-                )
-                return
-            else:
-                text = transcription["text"]
-
-            count = 0
-            for entity in entities["results"]:
-                matching_tokens = self.tokens[entity["entity"]["metas"]["subtype"]]
-                start_token, end_token = (
-                    matching_tokens["start"],
-                    matching_tokens["end"],
-                )
-                text, count = insert_token(
-                    text,
-                    count,
-                    start_token,
-                    end_token,
-                    offset=entity["offset"],
-                    length=entity["length"],
-                )
+            return self.extract_entities(transcription)
        else:
-            text = transcription["text"].strip()
-        return text
+            return transcription["text"].strip()

    def process_element(
        self,