diff --git a/.arkindex.yml b/.arkindex.yml
new file mode 100644
index 0000000000000000000000000000000000000000..1f435a4bbdf7f6476795a1af19d462a5ac1b4331
--- /dev/null
+++ b/.arkindex.yml
@@ -0,0 +1,11 @@
+---
+version: 2
+
+type: worker
+
+workers:
+  - slug: generic-training-dataset
+    name: Generic Training Dataset Extractor
+    type: data-extract
+    docker:
+      build: Dockerfile
diff --git a/.cookiecutter.json b/.cookiecutter.json
new file mode 100644
index 0000000000000000000000000000000000000000..9cbe71b55868b43f8b6d930af081deb77531d38e
--- /dev/null
+++ b/.cookiecutter.json
@@ -0,0 +1,8 @@
+{
+  "slug": "generic-training-dataset",
+  "name": "Generic Training Dataset Extractor",
+  "description": "Fill base-worker cache with information about dataset and extract images",
+  "worker_type": "data-extract",
+  "author": "Teklia",
+  "email": "contact@teklia.com"
+}
diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000000000000000000000000000000000000..e64c35dd6b8480f97c8b387bb681f762b705895b
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,2 @@
+.tox
+.git
diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000000000000000000000000000000000000..7a3797fc6b71677df500d8386cf70e3173a7f79f
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,4 @@
+[flake8]
+max-line-length = 150
+exclude = .git,__pycache__
+ignore = E203,E501,W503
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..1287c575804425c85bfb8ea466ca806fd0810199
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+*.pyc
+*.egg-info/
+.tox/
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 0000000000000000000000000000000000000000..23f062fa642bf0a4068aa3e274a8861b8625c0ba
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,89 @@
+stages:
+  - test
+  - build
+  - release
+
+test:
+  image: python:3
+
+  stage: test
+  cache:
+    paths:
+      - .cache/pip
+
+  variables:
+    PIP_CACHE_DIR: "$CI_PROJECT_DIR/.cache/pip"
+    ARKINDEX_API_SCHEMA_URL: schema.yml
+
+  before_script:
+    - pip install tox
+
+    # Download OpenAPI schema from last backend build
+    - curl https://assets.teklia.com/arkindex/openapi.yml > schema.yml
+
+  except:
+    - schedules
+
+  script:
+    - tox -- --junitxml=test-report.xml --durations=50
+
+  # Collect the JUnit report (even on failure) so GitLab shows test results in MRs
+  artifacts:
+    when: always
+    reports:
+      junit: test-report.xml
+
+lint:
+  image: python:3
+
+  cache:
+    paths:
+      - .cache/pip
+      - .cache/pre-commit
+
+  variables:
+    PIP_CACHE_DIR: "$CI_PROJECT_DIR/.cache/pip"
+    PRE_COMMIT_HOME: "$CI_PROJECT_DIR/.cache/pre-commit"
+
+  before_script:
+    - pip install pre-commit
+
+  except:
+    - schedules
+
+  script:
+    - pre-commit run -a
+
+docker-build:
+  stage: build
+  image: docker:19.03.1
+  services:
+    - docker:dind
+  variables:
+    DOCKER_DRIVER: overlay2
+    DOCKER_HOST: tcp://docker:2375/
+
+  except:
+    - schedules
+
+  script:
+    - ci/build.sh
+
+release-notes:
+  stage: release
+  image: registry.gitlab.com/teklia/devops:latest
+
+  only:
+    - tags
+
+  script:
+    - devops release-notes
+
+bump-python-deps:
+  stage: release
+  image: registry.gitlab.com/teklia/devops:latest
+
+  only:
+    - schedules
+
+  script:
+    - devops python-deps requirements.txt
diff --git a/.isort.cfg b/.isort.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..f5257078ed83767ea2142f9099320f0f4319ee56
--- /dev/null
+++ b/.isort.cfg
@@ -0,0 +1,7 @@
+[settings]
+# Compatible with black
+profile = black
+
+default_section=FIRSTPARTY
+known_first_party = arkindex,arkindex_worker
+known_third_party = pytest,setuptools
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..93d07e6993e07300b856e6c8457e9074805baa49
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,41 @@
+repos:
+  - repo: https://github.com/PyCQA/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+  - repo: https://github.com/psf/black
+    rev: 23.1.0
+    hooks:
+      - id: black
+  - repo: https://github.com/pycqa/flake8
+    rev: 6.0.0
+    hooks:
+      - id: flake8
+        additional_dependencies:
+          - 'flake8-coding==1.3.2'
+          - 'flake8-debugger==4.1.2'
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.4.0
+    hooks:
+      - id: check-ast
+      - id: check-docstring-first
+      - id: check-executables-have-shebangs
+      - id: check-merge-conflict
+      - id: check-symlinks
+      - id: debug-statements
+      - id: trailing-whitespace
+      - id: check-yaml
+        args: [--allow-multiple-documents]
+      - id: mixed-line-ending
+      - id: name-tests-test
+        args: ['--django']
+      - id: check-json
+      - id: requirements-txt-fixer
+  - repo: https://github.com/codespell-project/codespell
+    rev: v2.2.2
+    hooks:
+      - id: codespell
+        args: ['--write-changes']
+  - repo: meta
+    hooks:
+      - id: check-useless-excludes
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..90a7be25544148b18184577eea0397eb6ffa8262
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,14 @@
+FROM python:3
+
+WORKDIR /src
+
+# Install worker as a package
+COPY worker_generic_training_dataset worker_generic_training_dataset
+COPY requirements.txt setup.py VERSION ./
+RUN pip install .
+
+# Install the Teklia dev CA (as arkindex-dev.crt) so HTTPS requests to dev Arkindex instances verify
+RUN curl https://assets.teklia.com/teklia_dev_ca.pem > /usr/local/share/ca-certificates/arkindex-dev.crt && update-ca-certificates
+ENV REQUESTS_CA_BUNDLE /etc/ssl/certs/ca-certificates.crt
+
+CMD ["worker-generic-training-dataset"]
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000000000000000000000000000000000000..fd959fa8501e56bc4f1869e363b4a2118a86edce
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,2 @@
+include requirements.txt
+include VERSION
diff --git a/README.md b/README.md
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..af907a18f989a703b0039b869f4abd64c2a7f243 100644
--- a/README.md
+++ b/README.md
@@ -0,0 +1,32 @@
+# generic-training-dataset
+
+Fill base-worker cache with information about dataset and extract images
+
+### Development
+
+For development and testing purposes, it may be useful to install the worker as an editable package with pip.
+
+```shell
+pip3 install -e .
+```
+
+### Linter
+
+Code syntax is analyzed before submitting the code.\
+To run the linter tools suite you may use pre-commit.
+
+```shell
+pip install pre-commit
+pre-commit run -a
+```
+
+### Run tests
+
+Tests are executed with tox using [pytest](https://pytest.org).
+
+```shell
+pip install tox
+tox
+```
+
+To recreate tox virtual environment (e.g. a dependencies update), you may run `tox -r`
diff --git a/VERSION b/VERSION
new file mode 100644
index 0000000000000000000000000000000000000000..6e8bf73aa550d4c57f6f35830f1bcdc7a4a62f38
--- /dev/null
+++ b/VERSION
@@ -0,0 +1 @@
+0.1.0
diff --git a/ci/build.sh b/ci/build.sh
new file mode 100755
index 0000000000000000000000000000000000000000..f29f50f27b88056216e6f880bb713012fc4e9956
--- /dev/null
+++ b/ci/build.sh
@@ -0,0 +1,32 @@
+#!/bin/sh -e
+# Build the tasks Docker image.
+# Requires CI_PROJECT_DIR and CI_REGISTRY_IMAGE to be set.
+# VERSION defaults to latest.
+# Will automatically login to a registry if CI_REGISTRY, CI_REGISTRY_USER and CI_REGISTRY_PASSWORD are set.
+# Will only push an image if $CI_REGISTRY is set.
+
+if [ -z "$VERSION" ]; then
+	VERSION=${CI_COMMIT_TAG:-latest}
+fi
+
+if [ -z "$VERSION" ] || [ -z "$CI_PROJECT_DIR" ] || [ -z "$CI_REGISTRY_IMAGE" ]; then
+	echo "Missing environment variables"
+	exit 1
+fi
+
+IMAGE_TAG="$CI_REGISTRY_IMAGE:$VERSION"
+
+cd "$CI_PROJECT_DIR"
+docker build -f Dockerfile . -t "$IMAGE_TAG"
+
+# Publish the image on the main branch or on a tag
+if [ "$CI_COMMIT_REF_NAME" = "main" ] || [ -n "$CI_COMMIT_TAG" ]; then
+  if [ -n "$CI_REGISTRY" ] && [ -n "$CI_REGISTRY_USER" ] && [ -n "$CI_REGISTRY_PASSWORD" ]; then
+    echo "$CI_REGISTRY_PASSWORD" | docker login -u "$CI_REGISTRY_USER" --password-stdin "$CI_REGISTRY"
+    docker push "$IMAGE_TAG"
+  else
+    echo "Missing environment variables to log in to the container registry…"
+  fi
+else
+  echo "The build was not published to the repository registry (only for main branch or tags)…"
+fi
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5ff1be6e274e323aa1ab7d829d7653efed63ff8c
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+arkindex-base-worker==0.3.2
diff --git a/setup.py b/setup.py
new file mode 100755
index 0000000000000000000000000000000000000000..130a32ccefa323ccd88c9d3cdf8afaeb27ff5ba1
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from pathlib import Path
+
+from setuptools import find_packages, setup
+
+MODULE = "worker_generic_training_dataset"
+COMMAND = "worker-generic-training-dataset"
+
+
+def parse_requirements_line(line):
+    """Special case for git requirements"""
+    if line.startswith("git+http"):
+        assert "@" in line, "Branch should be specified with suffix (ex: @master)"
+        assert (
+            "#egg=" in line
+        ), "Package name should be specified with suffix (ex: #egg=kraken)"
+        package_name = line.split("#egg=")[-1]
+        return f"{package_name} @ {line}"
+    else:
+        return line
+
+
+def parse_requirements():
+    path = Path(__file__).parent.resolve() / "requirements.txt"
+    assert path.exists(), f"Missing requirements: {path}"
+    return list(
+        map(parse_requirements_line, map(str.strip, path.read_text().splitlines()))
+    )
+
+
+setup(
+    name=MODULE,
+    # read_text() closes the handle (bare open() leaked it) and strip()
+    # drops the trailing newline from the VERSION file
+    version=Path("VERSION").read_text().strip(),
+    description="Fill base-worker cache with information about dataset and extract images",
+    author="Teklia",
+    author_email="contact@teklia.com",
+    install_requires=parse_requirements(),
+    entry_points={"console_scripts": [f"{COMMAND}={MODULE}.worker:main"]},
+)
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..14dfec177864ca1da9e6470d5bcb84f1a51e530d
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+import os
+
+import pytest
+
+from arkindex.mock import MockApiClient
+from arkindex_worker.worker.base import BaseWorker
+
+
+@pytest.fixture(autouse=True)
+def setup_environment(responses, monkeypatch):
+    """Setup needed environment variables"""
+
+    # Allow accessing remote API schemas
+    # defaulting to the prod environment
+    schema_url = os.environ.get(
+        "ARKINDEX_API_SCHEMA_URL",
+        "https://arkindex.teklia.com/api/v1/openapi/?format=openapi-json",
+    )
+    responses.add_passthru(schema_url)
+
+    # Set schema url in environment
+    os.environ["ARKINDEX_API_SCHEMA_URL"] = schema_url
+    # Setup a fake worker run ID
+    os.environ["ARKINDEX_WORKER_RUN_ID"] = "1234-generic-training-dataset"
+
+    # Setup a mock api client instead of using a real one
+    monkeypatch.setattr(BaseWorker, "setup_api_client", lambda _: MockApiClient())
diff --git a/tests/test_worker.py b/tests/test_worker.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb38787b81cbba08837e9c16035209e7c37fdcdd
--- /dev/null
+++ b/tests/test_worker.py
@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+import importlib
+
+
+def test_dummy():
+    assert True
+
+
+def test_import():
+    """Import our newly created module, through importlib to avoid parsing issues"""
+    worker = importlib.import_module("worker_generic_training_dataset.worker")
+    assert hasattr(worker, "Demo")
+    assert hasattr(worker.Demo, "process_element")
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000000000000000000000000000000000000..dcc4e6c9ff0ae34f0df3801c145121eb61438eb4
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,12 @@
+[tox]
+envlist = worker-generic-training-dataset
+
+[testenv]
+passenv = ARKINDEX_API_SCHEMA_URL
+commands =
+  pytest {posargs}
+
+deps =
+  pytest
+  pytest-responses
+  -rrequirements.txt
diff --git a/worker_generic_training_dataset/__init__.py b/worker_generic_training_dataset/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/worker_generic_training_dataset/worker.py b/worker_generic_training_dataset/worker.py
new file mode 100644
index 0000000000000000000000000000000000000000..8489d561aeba8354d60155803cdcb9c405d68112
--- /dev/null
+++ b/worker_generic_training_dataset/worker.py
@@ -0,0 +1,17 @@
+# -*- coding: utf-8 -*-
+from arkindex_worker.worker import ElementsWorker
+
+
+class Demo(ElementsWorker):
+    def process_element(self, element):
+        print("Demo processing element", element)
+
+
+def main():
+    Demo(
+        description="Fill base-worker cache with information about dataset and extract images"
+    ).run()
+
+
+if __name__ == "__main__":
+    main()