Skip to content
Snippets Groups Projects
Verified commit 5bb22d95, authored by Erwan Rouchet
Browse files

Cleanup IIIF images on S3 that do not exist in DB

parent 06c4b4a6
No related branches found
No related tags found
1 merge request: !1652 "Cleanup IIIF images on S3 that do not exist in DB"
import re
from datetime import timedelta
from urllib.parse import quote
from botocore.exceptions import ClientError
from django.conf import settings
......@@ -9,6 +10,7 @@ from django.utils import timezone
from arkindex.dataimport.models import DataFile, GitRef, GitRefType, WorkerVersion, WorkerVersionState
from arkindex.documents.models import CorpusExport, CorpusExportState
from arkindex.images.models import Image, ImageServer
from arkindex.project.aws import s3
from ponos.models import Artifact, Task, Workflow
......@@ -28,6 +30,8 @@ class Command(BaseCommand):
self.cleanup_trashed_datafiles()
self.cleanup_local_images()
def cleanup_artifacts(self):
"""
Remove all Ponos artifacts that are not tied to a DataImport
......@@ -151,3 +155,25 @@ class Command(BaseCommand):
datafile.delete()
self.stdout.write(self.style.SUCCESS('Successfully cleaned up DataFiles marked as trashed.'))
def cleanup_local_images(self):
    """
    Remove all images on the S3 bucket of the local IIIF server that are not in the database.

    Each object key found in the bucket is translated into an IIIF path and
    looked up among the images of the local server. Objects without a matching
    image are deleted from S3. An S3 error on one deletion is written to the
    output and does not interrupt the cleanup of the remaining objects.
    """
    self.stdout.write('Removing orphaned local images…')

    # The local server does not change while iterating: resolve it once
    # instead of once per S3 object.
    local_server = ImageServer.objects.local
    bucket = s3.Bucket(local_server.s3_bucket)

    for obj in bucket.objects.all():
        # Use quote() to translate S3 keys into IIIF paths.
        # safe='' ensures that slashes are translated into %2F
        iiif_path = quote(obj.key, safe='')

        # Only the presence of the image matters, so there is no need to fetch it
        if local_server.images.filter(path=iiif_path).exists():
            continue

        # When no image is found, delete the S3 file.
        # The trailing ellipsis matches the other progress messages of this command.
        self.stdout.write(f'Removing image {obj.key}…')
        try:
            obj.delete()
        except ClientError as e:
            self.stdout.write(self.style.ERROR(str(e)))

    self.stdout.write(self.style.SUCCESS('Successfully cleaned up orphaned local images.'))
......@@ -10,6 +10,7 @@ from django.test import override_settings
from arkindex.dataimport.models import DataFile, GitRefType, Repository, RepositoryType, WorkerVersionState
from arkindex.documents.models import CorpusExport, CorpusExportState
from arkindex.images.models import ImageServer
from arkindex.project.tests import FixtureTestCase
from ponos.models import Artifact, Farm, Task, Workflow
......@@ -53,6 +54,8 @@ class TestCleanupCommand(FixtureTestCase):
Successfully cleaned up old corpus exports.
Deleting 0 DataFiles marked as trashed from S3 and the database…
Successfully cleaned up DataFiles marked as trashed.
Removing orphaned local images…
Successfully cleaned up orphaned local images.
"""
).strip()
)
......@@ -83,6 +86,8 @@ class TestCleanupCommand(FixtureTestCase):
Successfully cleaned up old corpus exports.
Deleting 0 DataFiles marked as trashed from S3 and the database…
Successfully cleaned up DataFiles marked as trashed.
Removing orphaned local images…
Successfully cleaned up orphaned local images.
"""
).strip()
)
......@@ -114,6 +119,8 @@ class TestCleanupCommand(FixtureTestCase):
Successfully cleaned up old corpus exports.
Deleting 0 DataFiles marked as trashed from S3 and the database…
Successfully cleaned up DataFiles marked as trashed.
Removing orphaned local images…
Successfully cleaned up orphaned local images.
"""
).strip()
)
......@@ -152,6 +159,8 @@ class TestCleanupCommand(FixtureTestCase):
Successfully cleaned up old corpus exports.
Deleting 0 DataFiles marked as trashed from S3 and the database…
Successfully cleaned up DataFiles marked as trashed.
Removing orphaned local images…
Successfully cleaned up orphaned local images.
"""
).strip()
)
......@@ -193,6 +202,8 @@ class TestCleanupCommand(FixtureTestCase):
Deleting 1 DataFiles marked as trashed from S3 and the database…
Deleting DataFile {trashed_df.id}
Successfully cleaned up DataFiles marked as trashed.
Removing orphaned local images…
Successfully cleaned up orphaned local images.
"""
).strip()
)
......@@ -242,6 +253,8 @@ class TestCleanupCommand(FixtureTestCase):
Deleting DataFile {trashed_df.id}
Something went wrong
Successfully cleaned up DataFiles marked as trashed.
Removing orphaned local images…
Successfully cleaned up orphaned local images.
"""
).strip()
)
......@@ -264,10 +277,11 @@ class TestCleanupCommand(FixtureTestCase):
broken_s3_artifact.key = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa/bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb/nope.zip'
broken_s3_artifact.delete.side_effect = ClientError({'Error': {'Code': '500'}}, 'delete_object')
cleanup_s3_mock.Bucket.return_value.objects.all.return_value = [
good_s3_artifact,
orphan_s3_artifact,
unsupported_s3_artifact,
cleanup_s3_mock.Bucket.return_value.objects.all.side_effect = [
# Bucket for Ponos artifacts
[good_s3_artifact, orphan_s3_artifact, unsupported_s3_artifact],
# Bucket for IIIF images
[]
]
self.assertEqual(
......@@ -290,12 +304,17 @@ class TestCleanupCommand(FixtureTestCase):
Successfully cleaned up old corpus exports.
Deleting 0 DataFiles marked as trashed from S3 and the database…
Successfully cleaned up DataFiles marked as trashed.
Removing orphaned local images…
Successfully cleaned up orphaned local images.
"""
).strip()
)
self.assertEqual(cleanup_s3_mock.Bucket.call_count, 1)
self.assertEqual(cleanup_s3_mock.Bucket.call_args, call('ponos-artifacts'))
self.assertEqual(cleanup_s3_mock.Bucket().objects.all.call_count, 1)
self.assertEqual(cleanup_s3_mock.Bucket.call_count, 2)
self.assertListEqual(cleanup_s3_mock.Bucket.call_args_list, [
call('ponos-artifacts'),
call('iiif')
])
self.assertEqual(cleanup_s3_mock.Bucket().objects.all.call_count, 2)
self.assertEqual(good_s3_artifact.delete.call_count, 0)
self.assertEqual(orphan_s3_artifact.delete.call_count, 1)
self.assertEqual(unsupported_s3_artifact.delete.call_count, 0)
......@@ -348,6 +367,8 @@ class TestCleanupCommand(FixtureTestCase):
Successfully cleaned up old corpus exports.
Deleting 0 DataFiles marked as trashed from S3 and the database…
Successfully cleaned up DataFiles marked as trashed.
Removing orphaned local images…
Successfully cleaned up orphaned local images.
"""
).strip()
)
......@@ -437,6 +458,8 @@ class TestCleanupCommand(FixtureTestCase):
Successfully cleaned up old corpus exports.
Deleting 0 DataFiles marked as trashed from S3 and the database…
Successfully cleaned up DataFiles marked as trashed.
Removing orphaned local images…
Successfully cleaned up orphaned local images.
"""
).strip()
)
......@@ -465,3 +488,57 @@ class TestCleanupCommand(FixtureTestCase):
tagged_revision.refresh_from_db()
self.assertEqual(ponos_s3_mock.Object().delete.call_count, 4)
@patch('arkindex.documents.management.commands.cleanup.s3')
def test_cleanup_local_images(self, cleanup_s3_mock, s3_mock):
    """
    Only S3 objects without a matching image on the local server are deleted,
    and a failed deletion is reported without stopping the cleanup.
    """
    # The only image known to the database; its path is the quoted form of the S3 key
    ImageServer.objects.local.images.create(path='path%2Fto%2Fimage.jpg')

    known_object = MagicMock()
    known_object.key = 'path/to/image.jpg'

    # Orphaned object whose deletion fails on the S3 side
    failing_object = MagicMock()
    failing_object.key = 'thisisfine.jpg'
    failing_object.delete.side_effect = ClientError({'Error': {'Code': '500'}}, 'delete_object')

    # Orphaned object that gets deleted normally
    orphaned_object = MagicMock()
    orphaned_object.key = 'jeanpaul2.jp2'

    # One listing per bucket, in the order the command goes through them
    ponos_artifacts_listing = []
    iiif_images_listing = [failing_object, known_object, orphaned_object]
    cleanup_s3_mock.Bucket.return_value.objects.all.side_effect = [
        ponos_artifacts_listing,
        iiif_images_listing,
    ]

    expected_output = dedent(
        """
        Removing orphaned Ponos artifacts…
        Successfully cleaned up orphaned Ponos artifacts.
        Removing 0 artifacts of expired workflows from S3…
        Removing logs for 0 tasks of expired workflows from S3…
        Updating 0 available worker versions to the Error state…
        Removing 0 artifacts of expired workflows…
        Removing 0 tasks of expired workflows…
        Removing 0 expired workflows…
        Successfully cleaned up expired workflows.
        Removing 0 old corpus exports from S3…
        Removing 0 old corpus exports…
        Successfully cleaned up old corpus exports.
        Deleting 0 DataFiles marked as trashed from S3 and the database…
        Successfully cleaned up DataFiles marked as trashed.
        Removing orphaned local images…
        Removing image thisisfine.jpg…
        An error occurred (500) when calling the delete_object operation: Unknown
        Removing image jeanpaul2.jp2…
        Successfully cleaned up orphaned local images.
        """
    ).strip()
    self.assertEqual(self.cleanup(), expected_output)

    # Both buckets were opened, artifacts first. These checks must come before
    # the Bucket() call below, which is itself recorded by the mock.
    self.assertEqual(cleanup_s3_mock.Bucket.call_count, 2)
    self.assertListEqual(cleanup_s3_mock.Bucket.call_args_list, [
        call('ponos-artifacts'),
        call('iiif')
    ])
    self.assertEqual(cleanup_s3_mock.Bucket().objects.all.call_count, 2)

    # The known image is kept; deletion is attempted on both orphans
    expected_delete_calls = (
        (known_object, 0),
        (failing_object, 1),
        (orphaned_object, 1),
    )
    for s3_object, delete_calls in expected_delete_calls:
        self.assertEqual(s3_object.delete.call_count, delete_calls)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment