From a0f977ebede15de365b8e70c652e0b8e700c3008 Mon Sep 17 00:00:00 2001
From: Yoann Schneider <yschneider@teklia.com>
Date: Wed, 29 Mar 2023 14:47:24 +0200
Subject: [PATCH] wip

---
 tests/test_worker.py                      |  4 +-
 worker_generic_training_dataset/worker.py | 79 ++++++++++++++++++++++-
 2 files changed, 78 insertions(+), 5 deletions(-)

diff --git a/tests/test_worker.py b/tests/test_worker.py
index bb38787..b5880cc 100644
--- a/tests/test_worker.py
+++ b/tests/test_worker.py
@@ -9,5 +9,5 @@ def test_dummy():
 def test_import():
     """Import our newly created module, through importlib to avoid parsing issues"""
     worker = importlib.import_module("worker_generic_training_dataset.worker")
-    assert hasattr(worker, "Demo")
-    assert hasattr(worker.Demo, "process_element")
+    assert hasattr(worker, "DatasetExtractor")
+    assert hasattr(worker.DatasetExtractor, "process_element")
diff --git a/worker_generic_training_dataset/worker.py b/worker_generic_training_dataset/worker.py
index 8489d56..5b18a7b 100644
--- a/worker_generic_training_dataset/worker.py
+++ b/worker_generic_training_dataset/worker.py
@@ -1,14 +1,87 @@
 # -*- coding: utf-8 -*-
+import logging
+import operator
+
+from apistar.exceptions import ErrorResponse
+from arkindex_worker.cache import create_tables, create_version_table, init_cache_db
 from arkindex_worker.worker import ElementsWorker
 
+logger = logging.getLogger(__name__)
 
-class Demo(ElementsWorker):
+
+class DatasetExtractor(ElementsWorker):
+    def configure(self):
+        super().configure()
+
+        # database arg is mandatory in dev mode
+        assert (
+            not self.is_read_only or self.args.database is not None
+        ), "`--database` arg is mandatory in developer mode."
+
+        # Download corpus
+        self.download_latest_export()
+
+        # Initialize db that will be written
+        self.initialize_database()
+
+    def initialize_database(self):
+        # Create db at
+        # - self.workdir.parent / self.task_id in Arkindex mode
+        # - self.args.database in dev mode
+        database_path = (
+            self.args.database
+            if self.is_read_only
+            else self.workdir.parent / self.task_id
+        )
+
+        init_cache_db(database_path)
+
+        create_version_table()
+
+        create_tables()
+
+    def download_latest_export(self):
+        # Find export of corpus
+        try:
+            exports = self.api_client.request(
+                "ListExports",
+                id=self.corpus_id,
+            )["results"]
+        except ErrorResponse as e:
+            logger.error(
+                f"Could not list exports of corpus ({self.corpus_id}): {str(e)}"
+            )
+            raise
+
+        # Find latest that is in "done" state
+        exports = sorted(
+            list(filter(lambda exp: exp["state"] == "done", exports)),
+            key=operator.itemgetter("updated"),
+        )
+        assert len(exports) > 0, "No available exports found."
+
+        # Download the latest export (last after ascending sort) to a tmpfile
+        try:
+            export_id = exports[-1]["id"]
+            download_url = self.api_client.request(
+                "DownloadExport",
+                id=export_id,
+            )["results"]
+        except ErrorResponse as e:
+            logger.error(
+                f"Could not download export ({export_id}) of corpus ({self.corpus_id}): {str(e)}"
+            )
+            raise
+        print(download_url)
+
     def process_element(self, element):
-        print("Demo processing element", element)
+        ...
+
+        # List Transcriptions, Metas
 
 
 def main():
-    Demo(
+    DatasetExtractor(
         description="Fill base-worker cache with information about dataset and extract images"
     ).run()
 
-- 
GitLab