diff --git a/.dockerignore b/.dockerignore index 83782ddb8a0746a03fdd5441c69cfd62706fa63b..47fc5cf3ae128d135b9bc472869b95c21f0532a4 100644 --- a/.dockerignore +++ b/.dockerignore @@ -2,6 +2,9 @@ .git .eggs *.egg -logs **/__pycache__/ **/*.pyc +docker/ +Makefile +test-report.xml +arkindex/config.yml diff --git a/.gitignore b/.gitignore index 7cde2291adad29a847d6444d485d04730962cc73..eb7fbe52d583a8f98cc01ed484fd4f3dc7691df2 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,4 @@ htmlcov *.key arkindex/config.yml test-report.xml +docker/ssl/*.pem diff --git a/Makefile b/Makefile index ce25922e9888a3d2e41bc7ebf357b0eb5867c152..75c24b64eacb139c08e384fce043459ec541a78e 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ ROOT_DIR:=$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) IMAGE_TAG=registry.gitlab.teklia.com/arkindex/backend -.PHONY: all release +.PHONY: all release services all: clean build @@ -20,8 +20,8 @@ worker: test-fixtures: $(eval export PGPASSWORD=devdata) - psql -h 127.0.0.1 -p 9100 -U devuser -c 'ALTER DATABASE arkindex_dev RENAME TO arkindex_tmp_fixtures' template1 - psql -h 127.0.0.1 -p 9100 -U devuser -c 'CREATE DATABASE arkindex_dev' template1 + psql -h 127.0.0.1 -p 5432 -U devuser -c 'ALTER DATABASE arkindex_dev RENAME TO arkindex_tmp_fixtures' template1 + psql -h 127.0.0.1 -p 5432 -U devuser -c 'CREATE DATABASE arkindex_dev' template1 # A "try...finally" block in a Makefile: ensure we bring back the dev database even when test-fixtures fails -$(MAKE) test-fixtures-run $(MAKE) test-fixtures-restore @@ -33,9 +33,9 @@ test-fixtures-run: test-fixtures-restore: # This first renaming ensures that arkindex_tmp_fixtures exists; we don't want to drop arkindex_dev without a backup - psql -h 127.0.0.1 -p 9100 -U devuser -c 'ALTER DATABASE arkindex_tmp_fixtures RENAME TO arkindex_dev_replace' template1 - psql -h 127.0.0.1 -p 9100 -U devuser -c 'DROP DATABASE arkindex_dev' template1 - psql -h 127.0.0.1 -p 9100 -U devuser -c 'ALTER DATABASE 
arkindex_dev_replace RENAME TO arkindex_dev' template1 + psql -h 127.0.0.1 -p 5432 -U devuser -c 'ALTER DATABASE arkindex_tmp_fixtures RENAME TO arkindex_dev_replace' template1 + psql -h 127.0.0.1 -p 5432 -U devuser -c 'DROP DATABASE arkindex_dev' template1 + psql -h 127.0.0.1 -p 5432 -U devuser -c 'ALTER DATABASE arkindex_dev_replace RENAME TO arkindex_dev' template1 require-version: @if [ ! "$(version)" ]; then echo "Missing version to publish"; exit 1; fi @@ -50,3 +50,14 @@ release: git commit VERSION -m "Version $(version)" git tag $(version) git push origin master $(version) + +stack: docker/ssl/ark-cert.pem + docker compose -p arkindex up --build + +services: docker/ssl/ark-cert.pem + docker compose -p arkindex -f docker/docker-compose.services.yml up + +docker/ssl/ark-cert.pem: + $(eval export CAROOT=$(ROOT_DIR)/docker/ssl) + mkcert -install + mkcert -cert-file=$(ROOT_DIR)/docker/ssl/ark-cert.pem -key-file=$(ROOT_DIR)/docker/ssl/ark-key.pem ark.localhost *.ark.localhost *.iiif.ark.localhost diff --git a/README.md b/README.md index 7c9bfdcbfd544b9a13ca2cb6a6ff3cfcf0b7724d..0e7b72a7d04628c276f55bddc4cad564036d4763 100644 --- a/README.md +++ b/README.md @@ -6,43 +6,82 @@ Backend for Historical Manuscripts Indexing ## Requirements -* Clone of the [architecture](https://gitlab.teklia.com/arkindex/architecture) * Git * Make -* Python 3.6+ +* Python 3.10+ * pip * [virtualenvwrapper](https://virtualenvwrapper.readthedocs.io/en/latest/) +* Docker 24+ +* [mkcert](https://github.com/FiloSottile/mkcert?tab=readme-ov-file#installation) -## Dev Setup +## Setup for developers -``` +You'll also need the [Arkindex frontend](https://gitlab.teklia.com/arkindex/frontend) to be able to develop on the whole platform. + +```console git clone git@gitlab.teklia.com:arkindex/backend.git +git clone git@gitlab.teklia.com:arkindex/frontend.git cd backend -mkvirtualenv ark -a . +mkvirtualenv ark -a . 
-p /usr/bin/python3.10 pip install -e .[test] ``` -When the [architecture](https://gitlab.teklia.com/arkindex/architecture) is running locally to provide required services: +The Arkindex backend relies on some open-source services to store data and communicate to asynchronous workers. +To run all the required services, please run in a dedicated shell: +```console +make services ``` + +On a first run, you'll need to: + +1. Configure the instance by enabling the sample configuration. +2. Populate the database structure. +3. Initialize some fields in the database. +4. Create an administration account. + +All of these steps are done through: + +```console +cp config.yml.sample arkindex/config.yml arkindex migrate +arkindex bootstrap arkindex createsuperuser ``` -### Local configuration +Finally, you can run the backend: + +```console +arkindex runserver +``` + +At this stage, you can use `http://localhost:8000/admin` to access the administration interface. + +### Asynchronous tasks + +To run asynchronous tasks, run in another shell: + +```console +make worker +``` -For development purposes, you can customize the Arkindex settings by adding a YAML file as `arkindex/config.yml`. This file is not tracked by Git; if it exists, any configuration directive set in this file will be used for exposed settings from `settings.py`. You can view the full list of settings [on the wiki](https://wiki.vpn/en/arkindex/deploy/configuration). +### Dockerized stack +It is also possible to run the whole Arkindex stack through Docker containers. This is useful to quickly test the platform. -Another mean to customize your Arkindex instance is to add a Python file in `arkindex/project/local_settings.py`. Here you are not limited to exposed settings, and can customize any setting, or even load Python dependencies at boot time. This is not recommended, as your customization may not be available to real-world Arkindex instances. 
+This command will build all the required Docker images (backend & frontend) and run them as Docker containers: -### ImageMagick setup +```console +make stack +``` + +You'll be able to access the platform at the url `https://ark.localhost`. -PDF and image imports in Arkindex will require ImageMagick. Due to its ability to take any computer down if you give it the right parameters (for example, converting a 1000-page PDF file into JPEG files at 30 000 DPI), it has a security policy file. By default, on Ubuntu, PDF conversion is forbidden. +### Local configuration -You will need to edit the ImageMagick policy file to get PDF and Image imports to work in Arkindex. The file is located at `/etc/ImageMagick-6/policy.xml`. +For development purposes, you can customize the Arkindex settings by adding a YAML file as `arkindex/config.yml`. This file is not tracked by Git; if it exists, any configuration directive set in this file will be used for exposed settings from `settings.py`. You can view the full list of settings [on the wiki](https://redmine.teklia.com/projects/arkindex/wiki/Backend_configuration). -The line that sets the PDF policy is `<policy domain="coder" rights="none" pattern="PDF" />`. Replace `none` with `read|write` for it to work. See [this StackOverflow question](https://stackoverflow.com/questions/52998331) for more info. +Another way to customize your Arkindex instance is to add a Python file in `arkindex/project/local_settings.py`. Here you are not limited to exposed settings, and can customize any setting, or even load Python dependencies at boot time. This is not recommended, as your customization may not be available to real-world Arkindex instances. 
### Local image server @@ -54,7 +93,7 @@ local_imageserver_id: 999 Here is how to quickly create the ImageServer using the shell: -``` +```python $ arkindex shell >>> from arkindex.images.models import ImageServer >>> ImageServer.objects.create(id=1, display_name='local', url='https://ark.localhost/iiif') @@ -62,11 +101,6 @@ $ arkindex shell Note that this local server will only work inside Docker. -### User groups - -We use a custom group model in `arkindex.users.models` (not the `django.contrib.auth` one). -In this early version groups do not define any right yet. - ## Usage ### Makefile @@ -83,14 +117,10 @@ At the root of the repository is a Makefile that provides commands for common op Aside from the usual Django commands, some custom commands are available via `arkindex`: -* `build_fixtures`: Create a set of database elements designed for use by unit tests in a fixture (see `make test-fixtures`); -* `from_csv`: Import manifests and index files from a CSV list; -* `import_annotations`: Import index files from a folder into a specific volume; -* `import_acts`: Import XML surface files and CSV act files; -* `delete_corpus`: Delete a big corpus using an RQ task; -* `reindex`: Reindex elements into Solr; -* `telegraf`: A special command with InfluxDB-compatible output for Grafana statistics. -* `move_lines_to_parents`: Moves element children to their geographical parents; +* `build_fixtures`: Create a set of database elements designed for use by unit tests in a fixture (see `make test-fixtures`). +* `delete_corpus`: Delete a big corpus using an RQ task. +* `reindex`: Reindex elements into Solr. +* `move_lines_to_parents`: Moves element children to their geographical parents. See `arkindex <command> --help` to view more details about a specific command. @@ -108,9 +138,9 @@ We use [pre-commit](https://pre-commit.com/) to check the Python source code syn To be efficient, you should run pre-commit before committing (hence the name...). 
-To do that, run once : +To do that, run once: -``` +```console pip install pre-commit pre-commit install ``` @@ -127,9 +157,9 @@ IPython will give you a nicer shell with syntax highlighting, auto reloading and [Django Debug Toolbar](https://django-debug-toolbar.readthedocs.io/en/latest/) provides you with a neat debug sidebar that will help diagnosing slow API endpoints or weird template bugs. Since the Arkindex frontend is completely decoupled from the backend, you will need to browse to an API endpoint to see the debug toolbar. -[Django Extensions](https://django-extensions.readthedocs.io/en/latest/) adds a *lot* of `arkindex` commands ; the most important one is `arkindex shell_plus` which runs the usual shell but with all the available models pre-imported. You can add your own imports with the `local_settings.py` file. Here is an example that imports most of the backend's enums and some special QuerySet features: +[Django Extensions](https://django-extensions.readthedocs.io/en/latest/) adds a *lot* of `arkindex` commands ; the most important one is `arkindex shell_plus` which runs the usual shell but with all the available models pre-imported. You can add your own imports with the `local_settings.py` file. Here is an example that imports some of the backend's enums and some special QuerySet features: -``` python +```python SHELL_PLUS_POST_IMPORTS = [ ('django.db.models', ('Value', )), ('django.db.models.functions', '*'), @@ -138,7 +168,7 @@ SHELL_PLUS_POST_IMPORTS = [ 'Right', )), ('arkindex.process.models', ( - 'DataImportMode', + 'ProcessMode', )), ('arkindex.project.aws', ( 'S3FileStatus', @@ -148,23 +178,29 @@ SHELL_PLUS_POST_IMPORTS = [ ## Asynchronous tasks -We use [rq](https://python-rq.org/), integrated via [django-rq](https://pypi.org/project/django-rq/), to run tasks without blocking an API request or causing timeouts. 
To call them in Python code, you should use the trigger methods in `arkindex.project.triggers`; those will do some safety checks to make catching some errors easier in dev. The actual tasks are in `arkindex.documents.tasks`. The following tasks exist: +We use [rq](https://python-rq.org/), integrated via [django-rq](https://pypi.org/project/django-rq/), to run tasks without blocking an API request or causing timeouts. To call them in Python code, you should use the trigger methods in `arkindex.project.triggers`; those will do some safety checks to make catching some errors easier in dev. The actual tasks are in `arkindex.documents.tasks`, or in other `tasks` modules within each Django app. The following tasks exist: * Delete a corpus: `corpus_delete` * Delete a list of elements: `element_trash` * Delete worker results (transcriptions, classifications, etc. of a worker version): `worker_results_delete` * Move an element to another parent: `move_element` -* Create `WorkerActivity` instances for all elements of a process: `intitialize_activity` +* Create `WorkerActivity` instances for all elements of a process: `initialize_activity` * Delete a process and its worker activities: `process_delete` * Export a corpus to an SQLite database: `export_corpus` -To run them, use `make worker` to start a RQ worker. You will need to have Redis running; `make slim` or `make` in the architecture will provide it. `make` in the architecture also provides a RQ worker running in Docker from a binary build. - -Process tasks are run in RQ by default (Community Edition). Two RQ workers must be running at the same time to actually run a process with worker activities, so the initialisation task can wait for the worker activity task to finish: -```sh -$ arkindex rqworker -v 3 default high & arkindex rqworker -v 3 tasks -``` +To run them, use `make worker` to start an RQ worker. You will need to have Redis running; `make services` will provide it. 
`make stack` also provides an RQ worker running in Docker from a binary build. ## Metrics + The application serves metrics for Prometheus under the `/metrics` prefix. A specific port can be used by setting the `PROMETHEUS_METRICS_PORT` environment variable, thus separating the application from the metrics API. + +## Migration from `architecture` setup + +If you were using the `architecture` repository previously to run Arkindex, you'll need to migrate MinIO data from a static path on your computer towards a new docker volume. + +```console +docker volume create arkindex_miniodata +mv /usr/share/arkindex/s3/data/iiif /var/lib/docker/volumes/arkindex_miniodata/_data/uploads +mv /usr/share/arkindex/s3/data/{export,iiif-cache,ponos-logs,ponos-artifacts,staging,thumbnails,training} /var/lib/docker/volumes/arkindex_miniodata/_data/ +``` diff --git a/arkindex/documents/management/commands/bootstrap.py b/arkindex/documents/management/commands/bootstrap.py index 580ea91591c98f96245eef3f1c6988b3596b4315..3ca2dacd4a0fbdfc5e63450c7c7c5e2d4664dbd2 100644 --- a/arkindex/documents/management/commands/bootstrap.py +++ b/arkindex/documents/management/commands/bootstrap.py @@ -14,9 +14,14 @@ from arkindex.process.models import FeatureUsage, Repository, Worker, WorkerType from arkindex.users.models import User # Constants used in architecture project -IMAGE_SERVER_ID = 12345 -IMAGE_SERVER_BUCKET = "iiif" -IMAGE_SERVER_REGION = "local" +UPLOADS_IMAGE_SERVER_ID = 12345 +UPLOADS_IMAGE_SERVER_URL = "https://uploads.iiif.ark.localhost/iiif/2" +UPLOADS_IMAGE_SERVER_BUCKET = "uploads" +UPLOADS_IMAGE_SERVER_REGION = "local" +INGEST_IMAGE_SERVER_ID = 67890 +INGEST_IMAGE_SERVER_URL = "https://ingest.iiif.ark.localhost/iiif/2" +INGEST_IMAGE_SERVER_BUCKET = "ingest" +INGEST_IMAGE_SERVER_REGION = "local" PONOS_FARM_ID = "001e411a-1111-2222-3333-444455556666" PONOS_FARM_NAME = "Bootstrap farm" PONOS_FARM_SEED = "b12868101dab84984481741663d809d2393784894d6e807ceee0bd95051bf971" @@ -52,6 +57,46 @@ 
class Command(BaseCommand): user.save() self.warn(f"Updated user {user} to admin") + def create_image_server(self, id, url, bucket, region, display_name): + try: + server = ImageServer.objects.get(Q(id=id) | Q(url=url)) + if server.id != id: + # Migrate existing images & server id in a single transaction + with transaction.atomic(): + server.images.update(server_id=id) + ImageServer.objects.filter(id=server.id).update(id=id) + self.warn(f"Image server {server.id} updated to {id}") + + # Update internal reference for updates below + server.id = id + + if server.url != url: + server.url = url + server.save() + + # Update base settings + if server.s3_bucket != bucket or server.s3_region != region: + server.s3_bucket = bucket + server.s3_region = region + server.save() + self.warn(f"Updated image server {server.id} S3 settings") + else: + self.success(f"Image server {server.id} valid") + except ImageServer.DoesNotExist: + try: + server = ImageServer.objects.create( + id=id, + url=url, + s3_bucket=bucket, + s3_region=region, + display_name=display_name, + ) + self.success(f"Image server {server.id} created") + except IntegrityError as e: + self.fail(f"Failed to create image server: {e}") + return + return server + def handle(self, **options): # Never allow running this script in production if not settings.DEBUG: @@ -108,47 +153,18 @@ class Command(BaseCommand): self.success(f"Created token {ADMIN_API_TOKEN}") # an image server for local cantaloupe https://ark.localhost/iiif/2 - try: - server = ImageServer.objects.get(url="https://ark.localhost/iiif/2") - if server.id != IMAGE_SERVER_ID: - # Migrate existing images & server id in a single transaction - with transaction.atomic(): - server.images.update(server_id=IMAGE_SERVER_ID) - ImageServer.objects.filter(id=server.id).update(id=IMAGE_SERVER_ID) - self.warn(f"Image server {server.id} updated to {IMAGE_SERVER_ID}") - - # Update internal reference for updates below - server.id = IMAGE_SERVER_ID - - # Update base settings 
- if server.s3_bucket != IMAGE_SERVER_BUCKET or server.s3_region != IMAGE_SERVER_REGION: - server.s3_bucket = IMAGE_SERVER_BUCKET - server.s3_region = IMAGE_SERVER_REGION - server.save() - self.warn("Updated image server S3 settings") - else: - self.success(f"Image server {server.id} valid") - except ImageServer.DoesNotExist: - try: - server = ImageServer.objects.create( - id=IMAGE_SERVER_ID, - url="https://ark.localhost/iiif/2", - s3_bucket=IMAGE_SERVER_BUCKET, - s3_region=IMAGE_SERVER_REGION, - display_name="Development local server", - ) - self.success("Image server created") - except IntegrityError as e: - self.fail(f"Failed to create image server: {e}") - return + uploads_server = self.create_image_server(UPLOADS_IMAGE_SERVER_ID , UPLOADS_IMAGE_SERVER_URL, UPLOADS_IMAGE_SERVER_BUCKET , UPLOADS_IMAGE_SERVER_REGION , "Local IIIF server for user uploaded files through frontend") + if uploads_server is None: + return + self.create_image_server(INGEST_IMAGE_SERVER_ID , INGEST_IMAGE_SERVER_URL, INGEST_IMAGE_SERVER_BUCKET , INGEST_IMAGE_SERVER_REGION , "Local IIIF server for ingested files from minio") # Check there is not already a local server with invalid path # We'll merge its image into the new one # This bad server may have been created by automatic IIIF server detection try: - bad_server = ImageServer.objects.get(url="https://ark.localhost/iiif") - bad_server.merge_into(server) - self.warn(f"Merged images from {bad_server.id} into {server.id}") + bad_server = ImageServer.objects.get(url="https://uploads.iiif.ark.localhost/iiif") + bad_server.merge_into(uploads_server) + self.warn(f"Merged images from {bad_server.id} into {uploads_server.id}") bad_server.delete() self.warn("Deleted old server") @@ -194,17 +210,21 @@ class Command(BaseCommand): ) self.success(f"Created revision {revision.hash}") - version, created = worker.versions.get_or_create( - revision=revision, - defaults={ - "id": IMPORT_WORKER_VERSION_ID, - "configuration": {}, - "state": 
WorkerVersionState.Created, - "gpu_usage": FeatureUsage.Disabled, - "docker_image": None, - "docker_image_iid": None, - } - ) + try: + version = WorkerVersion.objects.get(id=IMPORT_WORKER_VERSION_ID) + created = False + except WorkerVersion.DoesNotExist: + version, created = worker.versions.get_or_create( + revision=revision, + defaults={ + "id": IMPORT_WORKER_VERSION_ID, + "configuration": {}, + "state": WorkerVersionState.Created, + "gpu_usage": FeatureUsage.Disabled, + "docker_image": None, + "docker_image_iid": None, + } + ) if created: self.success(f"Created worker version {version.slug}") else: diff --git a/arkindex/images/models.py b/arkindex/images/models.py index 17b3f44b0e709f1e24b75cdad3270f2529df077b..a6825414399ad31b889581815fb32e22ef5fe8d7 100644 --- a/arkindex/images/models.py +++ b/arkindex/images/models.py @@ -15,7 +15,7 @@ from django.utils.text import slugify from enumfields import EnumField from arkindex.images.managers import ImageServerManager -from arkindex.project.aws import S3FileMixin, S3FileStatus +from arkindex.project.aws import S3FileMixin, S3FileStatus, should_verify_cert from arkindex.project.fields import LStripTextField, MD5HashField, StripSlashURLField from arkindex.project.models import IndexableModel @@ -238,7 +238,7 @@ class Image(S3FileMixin, IndexableModel): requests_exception = None try: # Load info - resp = requests.get(info_url, timeout=15, allow_redirects=True) + resp = requests.get(info_url, timeout=15, allow_redirects=True, verify=should_verify_cert(info_url)) resp.raise_for_status() try: payload = resp.json() diff --git a/arkindex/project/aws.py b/arkindex/project/aws.py index 0fb148394589596ef9e4936e46d8548affa48ba0..ef0edfb7251a1c988cd35ddf823522acb460821c 100644 --- a/arkindex/project/aws.py +++ b/arkindex/project/aws.py @@ -1,6 +1,7 @@ import logging from functools import wraps from io import BytesIO +from urllib.parse import urlparse import boto3.session from botocore.config import Config @@ -13,6 +14,18 @@ from 
tenacity import retry, retry_if_exception, stop_after_delay logger = logging.getLogger(__name__) +def should_verify_cert(url): + """ + Skip SSL certification validation when hitting a development instance + """ + # Special case when no url is provided + if url is None: + return True + + host = urlparse(url).netloc + return not host.endswith("ark.localhost") + + def get_s3_resource( access_key_id=settings.AWS_ACCESS_KEY, secret_access_key=settings.AWS_SECRET_KEY, @@ -35,6 +48,7 @@ def get_s3_resource( "s3", endpoint_url=endpoint, config=config, + verify=should_verify_cert(endpoint), ) diff --git a/arkindex/project/config.py b/arkindex/project/config.py index 8e373583ac0f4ded09cd49ee77f34950b1c6892d..147e8528cc828f8d73f334f83f07da985065d686 100644 --- a/arkindex/project/config.py +++ b/arkindex/project/config.py @@ -97,7 +97,7 @@ def get_settings_parser(base_dir): database_parser = parser.add_subparser("database", default={}) database_parser.add_option("name", type=str, default="arkindex_dev") database_parser.add_option("host", type=str, default="localhost") - database_parser.add_option("port", type=int, default=9100) + database_parser.add_option("port", type=int, default=5432) database_parser.add_option("user", type=str, default="devuser") database_parser.add_option("password", type=str, default="devdata") diff --git a/arkindex/project/settings.py b/arkindex/project/settings.py index 4399e4c525b16c8ff019dd036ae7d60599a6c7a5..e9db6bcd612378d17492b9918f9f25b96163ed1d 100644 --- a/arkindex/project/settings.py +++ b/arkindex/project/settings.py @@ -27,7 +27,7 @@ BASE_DIR = Path(_base_dir) if _base_dir else Path(__file__).resolve().parent.par # Used for special cases during configuration parsing and settings loading TEST_ENV = "test" in sys.argv -# Matches ./manage.py shell[_plus] and arkindex shell[_plus] to disable Sentry reporting +# Matches arkindex shell[_plus] to disable Sentry reporting DJANGO_SHELL = len(sys.argv) > 1 and sys.argv[1] in ("shell", "shell_plus") 
CONFIG_PATH = Path(os.environ.get("CONFIG_PATH", BASE_DIR / "config.yml")) diff --git a/arkindex/project/tests/config_samples/defaults.yaml b/arkindex/project/tests/config_samples/defaults.yaml index bd19a0a3efbee8dc6770f782bc1d3927ad64986c..0fd43ab9933478d6dafb0434ea0e94781d2d4d47 100644 --- a/arkindex/project/tests/config_samples/defaults.yaml +++ b/arkindex/project/tests/config_samples/defaults.yaml @@ -24,7 +24,7 @@ database: host: localhost name: arkindex_dev password: devdata - port: 9100 + port: 5432 replica: null user: devuser docker: diff --git a/arkindex/project/tests/test_aws.py b/arkindex/project/tests/test_aws.py new file mode 100644 index 0000000000000000000000000000000000000000..7aae4cb45d7bb9c3654230aa3fd8d6d8ceff86c8 --- /dev/null +++ b/arkindex/project/tests/test_aws.py @@ -0,0 +1,12 @@ +from django.test import TestCase + +from arkindex.project.aws import should_verify_cert # noqa + + +class AWSTestCase(TestCase): + + def test_should_verify_cert(self): + self.assertTrue(should_verify_cert("https://google.fr/whatever")) + self.assertFalse(should_verify_cert("https://minio.ark.localhost/bucket/object")) + self.assertFalse(should_verify_cert("https://ark.localhost/corpus")) + self.assertTrue(should_verify_cert(None)) diff --git a/base/Dockerfile b/base/Dockerfile index 4f7b450ba7a7cbc29a6e07248dedc405bac66d1c..5bf809476cd2e090d7ec41a60b3a4184a4bad7db 100644 --- a/base/Dockerfile +++ b/base/Dockerfile @@ -6,8 +6,3 @@ RUN /bootstrap.sh # Add unprivilegied user RUN addgroup --gid 1000 teklia && adduser --disabled-login --uid 1000 --ingroup teklia ark - -# Add archi local CA -COPY ./dev-ca.pem /usr/local/share/ca-certificates/arkindex-dev.crt -RUN update-ca-certificates -ENV REQUESTS_CA_BUNDLE /etc/ssl/certs/ca-certificates.crt diff --git a/base/dev-ca.pem b/base/dev-ca.pem deleted file mode 100644 index 0184b64712a5a7f36d32127c13a8ce7d1aefcc95..0000000000000000000000000000000000000000 --- a/base/dev-ca.pem +++ /dev/null @@ -1,23 +0,0 @@ ------BEGIN 
CERTIFICATE----- -MIIDzTCCArWgAwIBAgIJAMIk32qc3uH5MA0GCSqGSIb3DQEBCwUAMH0xCzAJBgNV -BAYTAkZSMREwDwYDVQQIDAhJc8ODwqhyZTERMA8GA1UEBwwIR3Jlbm9ibGUxDzAN -BgNVBAoMBlRla2xpYTERMA8GA1UEAwwIQXJraW5kZXgxJDAiBgkqhkiG9w0BCQEW -FWtlcm1vcnZhbnRAdGVrbGlhLmNvbTAeFw0xODA0MDkwODI1MzBaFw00MjExMzAw -ODI1MzBaMH0xCzAJBgNVBAYTAkZSMREwDwYDVQQIDAhJc8ODwqhyZTERMA8GA1UE -BwwIR3Jlbm9ibGUxDzANBgNVBAoMBlRla2xpYTERMA8GA1UEAwwIQXJraW5kZXgx -JDAiBgkqhkiG9w0BCQEWFWtlcm1vcnZhbnRAdGVrbGlhLmNvbTCCASIwDQYJKoZI -hvcNAQEBBQADggEPADCCAQoCggEBALDSzuXMJotLPqA8rK8c1GCK9G54VQKgieG8 -agLOd0a3ALh+Qz9uLSPEPz40zxjXLLMVIYqHW9CynP5su62gdcpZ0CVImF1e0bgF -U+x0RpNFtceh/RixNL5b9XA9Y3By67jpZfjLC9d0WRQOaIOSW/wUTGWUbW0y/OWg -dc5Qihn32icVit8ogfUCBoH8v0OypiF+AmJHFUq2rjCB0fmvLLZscSdMe4XsYLa0 -7eFRdnKesfE3ooQODnoL2zDkDqhY31PRsCrHquHLO0U7v5NhsfUJs5K9COQeCHW3 -q03kOIecoi1otPYGf07MWKn3AR399HifYHjm5+YYBZ9t7MhOkScCAwEAAaNQME4w -HQYDVR0OBBYEFLj6DCMMKOYYQE8KvRRjFJEwxbXUMB8GA1UdIwQYMBaAFLj6DCMM -KOYYQE8KvRRjFJEwxbXUMAwGA1UdEwQFMAMBAf8wDQYJKoZIhvcNAQELBQADggEB -AHPpwfg/N4QNgzmK0BV//H6n96C+Vu5E3A71zKsAZj231K+pcwvOWEZHPV/h9fcV -jHQg5crQsZy7CoV2VdTKbprL/F7W+JsUEPrk3xnqnqqIexDm1m8pua1XCLurU+Sy -588XbzNlOGDzfI8kWhWS9rEJWpVvadQ4PhOlORlU7oRgAjCOXZHNLHs6IdS4yUH5 -TqG9Tv3n7503Jyc5gnLzjJdUg7a3r/7awGr6nWZrdSE29ErLVY+NAUQmjkD7A0kD -ds4tZOSq44zthZiwI33Jj56eGcN+MjRcqFuziZnJt/NWPX7F0+4XenGmqmcjvlKO -zFjKKiOsA01MRJmxSUw6CF8= ------END CERTIFICATE----- diff --git a/config.yml.sample b/config.yml.sample new file mode 100644 index 0000000000000000000000000000000000000000..539c78314dd191721d662aa4dad56088111b1bef --- /dev/null +++ b/config.yml.sample @@ -0,0 +1,36 @@ +--- + +public_hostname: http://localhost:8000 + +local_imageserver_id: 12345 + +s3: + access_key_id: minio1234 + secret_access_key: minio1234 + endpoint: https://minio.ark.localhost + region: localdev + +ponos: + default_env: + ARKINDEX_API_TOKEN: deadbeefTestToken + +features: + signup: yes + search: yes + +docker: + tasks_image: registry.gitlab.teklia.com/arkindex/tasks:latest + 
+imports_worker_version: f2bb8dd7-55e9-49ae-9bd9-b1d2e5d491b9 + +solr: + api_url: http://localhost:8983/solr/ + +ingest: + access_key_id: minio1234 + secret_access_key: minio1234 + endpoint: https://minio.ark.localhost + region: localdev + imageserver_id: 3 + extra_buckets: + - ingest diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000000000000000000000000000000000000..40e5a91e080811108aae3aa523c64118224e972b --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,66 @@ +--- +include: + - docker/docker-compose.services.yml + +services: + + backend: + container_name: ark-backend + build: . + + depends_on: + - db + - redis + - lb + + labels: + traefik.enable: true + traefik.http.routers.backend.rule: Host(`ark.localhost`) && (PathPrefix(`/api/`) || PathPrefix(`/api-docs/`) || PathPrefix(`/admin/`) || PathPrefix(`/rq/`) || PathPrefix(`/static/`)) + traefik.http.routers.backend.tls: true + + environment: + CONFIG_PATH: /arkindex.yml + + volumes: + - ./docker/config.yml:/arkindex.yml:ro + + healthcheck: + # start_interval is not fully implemented in Docker, until then we will use a short interval all the time + # https://github.com/moby/moby/issues/45897 + interval: 5s + + worker: + container_name: ark-worker + build: . 
+ command: arkindex rqworker-pool --num-workers 2 -v 1 default high tasks + + depends_on: + - db + - redis + - backend + environment: + CONFIG_PATH: /arkindex.yml + + volumes: + - ./docker/config.yml:/arkindex.yml:ro + + # Required to host temporary ponos data + # and share common paths between host and containers + - /tmp:/tmp + + # Required to run process tasks + - /var/run/docker.sock:/var/run/docker.sock + + front: + container_name: ark-front + + build: ../frontend + + depends_on: + - lb + - backend + + labels: + traefik.enable: true + traefik.http.routers.frontend.rule: Host(`ark.localhost`) + traefik.http.routers.frontend.tls: true diff --git a/docker/cantaloupe.properties b/docker/cantaloupe.properties new file mode 100644 index 0000000000000000000000000000000000000000..3b18a3497266dcd2cee6597497371477e37984d8 --- /dev/null +++ b/docker/cantaloupe.properties @@ -0,0 +1,43 @@ +# Base http setup behind traefik +http.enabled = true +http.host = 0.0.0.0 +http.port = 80 +http.http2.enabled = false +https.enabled = false + +# Explicitly run only IIIF 2 +endpoint.iiif.1.enabled = false +endpoint.iiif.2.enabled = true + +# Use minio with multiple buckets (for uploads and ingest) +source.static = S3Source +S3Source.endpoint = http://minio:9000 +S3Source.region = local +S3Source.top_domain = iiif.ark.localhost +S3Source.lookup_strategy = ScriptLookupStrategy +S3Source.BasicLookupStrategy.bucket.name = + +# Use minio also for cache, in a dedicated bucket, for a full week +cache.server.derivative.enabled = true +cache.server.derivative = S3Cache +cache.server.derivative.ttl_seconds = 604800 +S3Cache.endpoint = http://minio:9000 +S3Cache.region = local +S3Cache.bucket.name = iiif-cache + +# Display info level on console +log.application.level = info +log.application.ConsoleAppender.enabled = true + +# Log all 4xx/5xx errors. 
This can cause duplicate logs to show up, since some +# errors will already be logged by some exception handlers, but some errors +# might not be logged otherwise. +log_error_responses = true + +# Configure light heap cache +HeapCache.target_size = 2G +HeapCache.persist = false + +# Setup JPEG processors compatible with libjpegturbo +processor.downscale_filter = lanczos3 +processor.upscale_filter = lanczos3 diff --git a/docker/config.yml b/docker/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..f9a36f9de348945432b42da1d02070c3bdf84266 --- /dev/null +++ b/docker/config.yml @@ -0,0 +1,54 @@ +cache: + type: memory + +static: + static_root: /backend_static + +database: + host: ark-database + port: 5432 + name: arkindex_dev + user: devuser + password: devdata + +redis: + host: ark-redis + +s3: + access_key_id: minio1234 + secret_access_key: minio1234 + endpoint: https://minio.ark.localhost + region: local + +allowed_hosts: + - .ark.localhost + +public_hostname: https://ark.localhost + +session: + cookie_domain: ark.localhost + +csrf: + cookie_domain: ark.localhost + trusted_origins: + - 'https://*.ark.localhost' + +cors: + origin_whitelist: + - http://localhost:8080 + - http://127.0.0.1:8080 + - https://ark.localhost + +local_imageserver_id: 12345 +imports_worker_version: f2bb8dd7-55e9-49ae-9bd9-b1d2e5d491b9 + +features: + signup: yes + search: yes + +solr: + api_url: http://ark-solr:8983/solr/ + +ponos: + default_env: + ARKINDEX_API_URL: https://ark.localhost/api/v1/ diff --git a/docker/docker-compose.services.yml b/docker/docker-compose.services.yml new file mode 100644 index 0000000000000000000000000000000000000000..1dc148c7b6ce6716589b6e55dde7fd3d5c4d7d8e --- /dev/null +++ b/docker/docker-compose.services.yml @@ -0,0 +1,146 @@ +--- +services: + + lb: + container_name: ark-lb + image: traefik:2.11 + ports: + # No need to expose on 0.0.0.0 as other users would not resolve + # the .localhost domain on an external IP + - 127.0.0.1:80:80 + 
- 127.0.0.1:443:443 + volumes: + - /var/run/docker.sock:/var/run/docker.sock + - ./traefik.toml:/traefik.toml + - ./ssl:/certs + + networks: + default: + aliases: + - ark.localhost + - traefik.ark.localhost + + # Required for backend to resolve services using the public DNS + - minio.ark.localhost + - uploads.iiif.ark.localhost + - ingest.iiif.ark.localhost + + labels: + # Expose traefik dashboard on traefik.ark.localhost + traefik.enable: true + traefik.http.routers.traefik.rule: Host(`traefik.ark.localhost`) + traefik.http.routers.traefik.service: api@internal + traefik.http.routers.traefik.tls: true + + redis: + container_name: ark-redis + image: redis:alpine + volumes: + - redisdata:/data + ports: + - 127.0.0.1:6379:6379 + + # Solr image + solr: + image: solr:9 + container_name: ark-solr + command: + - solr + - -f + - -cloud + - -noprompt + volumes: + - solrdata:/var/solr + ports: + - 127.0.0.1:8983:8983 + + # Dev exposes postgresql + # and uses local postgresql data + db: + container_name: ark-database + image: postgis/postgis:14-3.2 + shm_size: '512mb' + + ports: + - 127.0.0.1:5432:5432 + volumes: + - pgdata:/var/lib/postgresql/data + environment: + POSTGRES_USER: devuser + POSTGRES_PASSWORD: devdata + POSTGRES_DB: arkindex_dev + + # IIIF server + cantaloupe: + container_name: ark-cantaloupe + image: registry.gitlab.teklia.com/iiif/cantaloupe:5.0.5-p2 + expose: + - 80 + environment: + AWS_ACCESS_KEY_ID: minio1234 + AWS_SECRET_ACCESS_KEY: minio1234 + labels: + traefik.enable: true + traefik.http.routers.cantaloupe.rule: HostRegexp(`iiif.ark.localhost`, `{subdomain:[a-z]+}.iiif.ark.localhost`) + traefik.http.routers.cantaloupe.tls: true + volumes: + - ./cantaloupe.properties:/etc/cantaloupe.properties:ro + depends_on: + - minio + - lb + + minio: + container_name: ark-minio + image: minio/minio:RELEASE.2021-10-02T16-31-05Z.fips + command: server /data --compat --console-address :9001 + environment: + MINIO_BROWSER_REDIRECT_URL: 
https://minio-console.ark.localhost + MINIO_SERVER_URL: https://minio.ark.localhost + MINIO_ROOT_USER: minio1234 + MINIO_ROOT_PASSWORD: minio1234 + expose: + - 9000 + - 9001 + volumes: + - miniodata:/data + + # Embed our internal CA cert to allow the console + # to verify the backend through traefik using real urls + - type: bind + target: /root/.minio/certs/CAs/arkindex-dev.pem + source: ./ssl/rootCA.pem + labels: + traefik.enable: true + traefik.http.routers.minio.rule: Host(`minio.ark.localhost`) + traefik.http.routers.minio.tls: true + traefik.http.routers.minio.service: minio-service + traefik.http.services.minio-service.loadbalancer.server.port: 9000 + traefik.http.routers.minio-console.rule: Host(`minio-console.ark.localhost`) + traefik.http.routers.minio-console.tls: true + traefik.http.routers.minio-console.service: minio-console-service + traefik.http.services.minio-console-service.loadbalancer.server.port: 9001 + depends_on: + - lb + + minio-buckets: + container_name: ark-minio-buckets + image: minio/mc:latest + + environment: + MC_HOST_ark: http://minio1234:minio1234@minio:9000 + + # Create all required buckets on the minio described above + command: mb -p ark/export ark/uploads ark/iiif-cache ark/ponos-artifacts ark/ponos-logs ark/staging ark/thumbnails ark/ingest + + depends_on: + - minio + +volumes: + solrdata: + driver: local + redisdata: + driver: local + miniodata: + driver: local + pgdata: + driver: local diff --git a/docker/ssl/traefik.toml b/docker/ssl/traefik.toml new file mode 100644 index 0000000000000000000000000000000000000000..261e7b3225144370b47dc469f2736e02e84cd365 --- /dev/null +++ b/docker/ssl/traefik.toml @@ -0,0 +1,3 @@ +[[tls.certificates]] + certFile = "/certs/ark-cert.pem" + keyFile = "/certs/ark-key.pem" diff --git a/docker/traefik.toml b/docker/traefik.toml new file mode 100644 index 0000000000000000000000000000000000000000..0951d2e10de26ca2fc4a2e0314bc2e5d2b5dd411 --- /dev/null +++ b/docker/traefik.toml @@ -0,0 +1,29 @@ 
+[log] + level = "DEBUG" + +[providers] + [providers.docker] + endpoint = "unix:///var/run/docker.sock" + defaultRule = "Host(`localhost`)" + watch = true + exposedByDefault = false + + [providers.file] + directory = "/certs" + watch = true + +[accessLog] + +[api] + dashboard = true + +[entryPoints] + [entryPoints.web] + address = ":80" + + [entryPoints.web.http.redirections.entryPoint] + to = "websecure" + scheme = "https" + + [entryPoints.websecure] + address = ":443"