Skip to content
Snippets Groups Projects
Verified Commit a0f977eb authored by Yoann Schneider's avatar Yoann Schneider :tennis:
Browse files

wip

parent 876fcd3b
No related branches found
No related tags found
1 merge request!2Implement worker
Pipeline #81791 passed
...@@ -9,5 +9,5 @@ def test_dummy(): ...@@ -9,5 +9,5 @@ def test_dummy():
def test_import(): def test_import():
"""Import our newly created module, through importlib to avoid parsing issues""" """Import our newly created module, through importlib to avoid parsing issues"""
worker = importlib.import_module("worker_generic_training_dataset.worker") worker = importlib.import_module("worker_generic_training_dataset.worker")
assert hasattr(worker, "Demo") assert hasattr(worker, "DatasetExtractor")
assert hasattr(worker.Demo, "process_element") assert hasattr(worker.DatasetExtractor, "process_element")
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import logging
import operator
from apistar.exceptions import ErrorResponse
from arkindex_worker.cache import create_tables, create_version_table, init_cache_db
from arkindex_worker.worker import ElementsWorker from arkindex_worker.worker import ElementsWorker
logger = logging.getLogger(__name__)
class DatasetExtractor(ElementsWorker):
def configure(self):
    """Configure the worker, then fetch the corpus export and set up the database.

    Runs the parent class configuration first. When the worker is read-only
    (developer mode), a ``--database`` argument must have been supplied.

    Raises:
        AssertionError: if the worker is read-only and ``self.args.database``
            is ``None``.
    """
    super().configure()

    # In developer mode the output database path must come from the CLI
    database_missing = self.is_read_only and self.args.database is None
    assert not database_missing, "`--database` arg is mandatory in developer mode."

    # Fetch the corpus export before preparing the database to be written
    self.download_latest_export()
    self.initialize_database()
def initialize_database(self):
    """Initialise the cache database and create its version table and tables.

    The database location depends on the running mode:
    - developer mode (``self.is_read_only``): ``self.args.database``
    - Arkindex mode: ``self.workdir.parent / self.task_id``
    """
    if self.is_read_only:
        db_location = self.args.database
    else:
        db_location = self.workdir.parent / self.task_id

    init_cache_db(db_location)
    create_version_table()
    create_tables()
def download_latest_export(self):
    """Find the most recent finished export of the corpus and print its download URL.

    Lists the exports of ``self.corpus_id``, keeps those whose ``state`` is
    ``"done"``, selects the one with the greatest ``updated`` value and
    requests ``DownloadExport`` for it. The resulting value is only printed
    for now; nothing is written to disk yet.

    Raises:
        ErrorResponse: if either API request fails (logged, then re-raised).
        AssertionError: if no export is in the ``"done"`` state.
    """
    # Find exports of the corpus
    # NOTE(review): only the "results" of a single response are read; if this
    # endpoint is paginated, later pages are ignored - confirm.
    try:
        exports = self.api_client.request(
            "ListExports",
            id=self.corpus_id,
        )["results"]
    except ErrorResponse as e:
        logger.error(
            f"Could not list exports of corpus ({self.corpus_id}): {str(e)}"
        )
        # Re-raise: falling through would hit an unbound `exports` below and
        # hide the real API error behind an UnboundLocalError
        raise

    # Keep only the exports that are in "done" state
    done_exports = [exp for exp in exports if exp["state"] == "done"]
    assert len(done_exports) > 0, "No available exports found."

    # Pick the latest one, i.e. the greatest "updated" value
    # (presumably an ISO timestamp string, compared as-is - TODO confirm)
    latest_export = max(done_exports, key=operator.itemgetter("updated"))
    export_id = latest_export["id"]

    # Request the download of the latest export
    # NOTE(review): assumes the response exposes a "results" key - confirm
    try:
        download_url = self.api_client.request(
            "DownloadExport",
            id=export_id,
        )["results"]
    except ErrorResponse as e:
        logger.error(
            f"Could not download export ({export_id}) of corpus ({self.corpus_id}): {str(e)}"
        )
        # Re-raise: `download_url` would otherwise be unbound below
        raise

    print(download_url)
class Demo(ElementsWorker):
def process_element(self, element): def process_element(self, element):
print("Demo processing element", element) ...
# List Transcriptions, Metas
#
def main(): def main():
Demo( DatasetExtractor(
description="Fill base-worker cache with information about dataset and extract images" description="Fill base-worker cache with information about dataset and extract images"
).run() ).run()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment