diff --git a/arkindex/dataimport/api.py b/arkindex/dataimport/api.py
index 08152866e1e64a222cd58a5d8455814237290827..ead830deaf1a7952b21815726f949e7d0bca797a 100644
--- a/arkindex/dataimport/api.py
+++ b/arkindex/dataimport/api.py
@@ -501,7 +501,7 @@ class DataFileList(CorpusACLMixin, ListAPIView):
     queryset = DataFile.objects.none()
 
     def get_queryset(self):
-        return DataFile.objects.filter(corpus=self.get_corpus(self.kwargs['pk']))
+        return DataFile.objects.filter(corpus=self.get_corpus(self.kwargs['pk']), trashed=False)
 
 
 @extend_schema(tags=['files'])
@@ -533,11 +533,10 @@ class DataFileRetrieve(CorpusACLMixin, RetrieveUpdateDestroyAPIView):
     def perform_destroy(self, instance):
         if not self.has_write_access(instance.corpus):
             raise PermissionDenied
-        try:
-            instance.s3_delete()
-        except Exception as e:
-            logger.warning(f'Could not delete DataFile from S3: {e}')
-        return super().perform_destroy(instance)
+
+        # The file is only marked as trashed here; it is deleted asynchronously by a cron job
+        instance.trashed = True
+        instance.save()
 
 
 @extend_schema_view(post=extend_schema(operation_id='CreateDataFile', tags=['files']))
diff --git a/arkindex/dataimport/migrations/0036_datafile_trashed.py b/arkindex/dataimport/migrations/0036_datafile_trashed.py
new file mode 100644
index 0000000000000000000000000000000000000000..1950da120a24dbd62db1b988bd16e3fe1efb3f37
--- /dev/null
+++ b/arkindex/dataimport/migrations/0036_datafile_trashed.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.2.6 on 2021-09-09 07:21
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('dataimport', '0035_corpus_version_cache'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='datafile',
+            name='trashed',
+            field=models.BooleanField(default=False),
+        ),
+    ]
diff --git a/arkindex/dataimport/models.py b/arkindex/dataimport/models.py
index 5f5ff566005247671b027510dd8926259f0908c3..af1ee145d2e90e1fd7f4ad7b675d3191f23bcbdf 100644
--- a/arkindex/dataimport/models.py
+++ b/arkindex/dataimport/models.py
@@ -403,6 +403,7 @@ class DataFile(S3FileMixin, models.Model):
     content_type = models.CharField(max_length=120)
     corpus = models.ForeignKey('documents.Corpus', on_delete=models.CASCADE, related_name='files')
     status = EnumField(S3FileStatus, default=S3FileStatus.Unchecked, max_length=50)
+    trashed = models.BooleanField(default=False)
 
     s3_bucket = settings.AWS_STAGING_BUCKET
 
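Note: the soft-delete filter is applied per-view above (`trashed=False` in `get_queryset`). For reference, a common alternative is to centralise it in a custom default manager; here is a minimal sketch using only standard Django manager APIs, where `TrashedAwareManager` and `all_objects` are hypothetical names, not part of this diff:

```python
from django.db import models


class TrashedAwareManager(models.Manager):
    """Hypothetical sketch -- not part of this diff."""

    def get_queryset(self):
        # Hide soft-deleted files from every default query
        return super().get_queryset().filter(trashed=False)


class DataFile(models.Model):
    trashed = models.BooleanField(default=False)

    objects = TrashedAwareManager()  # default queries exclude trashed files
    all_objects = models.Manager()   # full table, e.g. for the cleanup command
```

Keeping the explicit per-view filter, as this diff does, avoids silently changing the behaviour of every existing `DataFile.objects` call site at once.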
diff --git a/arkindex/dataimport/tests/test_files.py b/arkindex/dataimport/tests/test_files.py
index e7d7b7facffaf33f83c15d74478b4ba76cfa79cd..a83da7eb713851301338cb38bab3ddf8ede0d744 100644
--- a/arkindex/dataimport/tests/test_files.py
+++ b/arkindex/dataimport/tests/test_files.py
@@ -1,6 +1,5 @@
 from unittest.mock import patch
 
-from botocore.exceptions import ClientError
 from django.test import override_settings
 from django.urls import reverse
 from rest_framework import status
@@ -44,6 +43,21 @@ class TestFiles(FixtureAPITestCase):
         self.assertEqual(file['name'], self.df.name)
         self.assertEqual(file['size'], self.df.size)
 
+    def test_file_list_filter_out_trashed_files(self):
+        DataFile.objects.create(
+            name='test2.txt',
+            size=42,
+            content_type='text/plain',
+            corpus=self.corpus,
+            trashed=True
+        )
+        self.assertEqual(DataFile.objects.count(), 2)
+        response = self.client.get(reverse('api:file-list', kwargs={'pk': self.corpus.id}))
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        data = response.json()
+        self.assertIn('results', data)
+        self.assertEqual(len(data['results']), 1)
+
     def test_file_list_requires_login(self):
         self.client.logout()
         response = self.client.get(reverse('api:file-list', kwargs={'pk': self.corpus.id}))
@@ -54,30 +68,9 @@ class TestFiles(FixtureAPITestCase):
         self.assertTrue(DataFile.objects.exists())
         response = self.client.delete(reverse('api:file-retrieve', kwargs={'pk': self.df.id}))
         self.assertEqual(response.status_code, status.HTTP_204_NO_CONTENT)
-        self.assertFalse(DataFile.objects.exists())
-        self.assertEqual(s3_mock.Object().delete.call_count, 1)
-
-    @patch('arkindex.project.aws.s3')
-    def test_file_delete_ignore_s3_errors(self, s3_mock):
-        """
-        Test the DataFile deletion tries multiple times to delete from S3, but ignores errors if it fails
-        """
-        exceptions = [
-            ClientError({'Error': {'Code': '500'}}, 'delete_object'),
-            ClientError({'Error': {'Code': '500'}}, 'delete_object'),
-            ValueError,
-        ]
-
-        def _raise(*args, **kwargs):
-            raise exceptions.pop(0)
-
-        s3_mock.Object().delete.side_effect = _raise
-
-        self.assertTrue(DataFile.objects.exists())
-        response = self.client.delete(reverse('api:file-retrieve', kwargs={'pk': self.df.id}))
-        self.assertEqual(response.status_code, status.HTTP_204_NO_CONTENT)
-        self.assertFalse(DataFile.objects.exists())
-        self.assertEqual(s3_mock.Object().delete.call_count, 3)
+        self.assertTrue(DataFile.objects.get().trashed)
+        self.assertEqual(s3_mock.Object().delete.call_count, 0)
 
     def test_file_delete_requires_login(self):
         """
diff --git a/arkindex/documents/management/commands/cleanup.py b/arkindex/documents/management/commands/cleanup.py
index 1aafe4aea465c865c88b41ee8d8e8d0386c52b7f..f6e1162e7e936593d51517c7e8dbe1150ac0d370 100644
--- a/arkindex/documents/management/commands/cleanup.py
+++ b/arkindex/documents/management/commands/cleanup.py
@@ -3,11 +3,12 @@ from datetime import datetime, timedelta, timezone
 from botocore.exceptions import ClientError
 from django.core.management.base import BaseCommand
 
+from arkindex.dataimport.models import DataFile
 from arkindex.documents.models import CorpusExport, CorpusExportState
 
 
 class Command(BaseCommand):
-    help = 'Clean up old exports'
+    help = 'Clean up old exports and DataFiles marked as trashed'
 
     def handle(self, *args, **options):
         # Corpus exports that are over 2 weeks old
@@ -31,3 +32,16 @@ class Command(BaseCommand):
         old_exports.delete()
 
         self.stdout.write(self.style.SUCCESS('Successfully cleaned up old corpus exports.'))
+
+        datafiles = DataFile.objects.filter(trashed=True)
+        self.stdout.write(f'Deleting {datafiles.count()} DataFiles marked as trashed from S3 and the database…')
+        for datafile in datafiles:
+            self.stdout.write(f'Deleting DataFile {datafile.id}…')
+            try:
+                datafile.s3_delete()
+            except Exception as e:
+                self.stdout.write(f'Could not delete DataFile {datafile.id} from S3: {e}')
+            else:
+                datafile.delete()
+
+        self.stdout.write(self.style.SUCCESS('Successfully cleaned up DataFiles marked as trashed.'))
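The view above only marks files; actual removal happens whenever `cleanup` runs. How the command gets scheduled is outside this diff — a minimal sketch, where the cron schedule and path are placeholders (the programmatic equivalent uses Django's standard `call_command`):

```python
# From cron this would be something like (placeholders, not part of this diff):
#   0 3 * * * cd /srv/arkindex && python manage.py cleanup
# Programmatic equivalent, using Django's standard management API:
from django.core.management import call_command

call_command('cleanup')  # removes old exports, then trashed DataFiles
```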
diff --git a/arkindex/documents/tests/commands/test_cleanup.py b/arkindex/documents/tests/commands/test_cleanup.py
index f159d5ddcc95f5ae750e9ad28cd77ad192d6d0ec..cb066414968dbe3d47dbf5c212646549986b0b91 100644
--- a/arkindex/documents/tests/commands/test_cleanup.py
+++ b/arkindex/documents/tests/commands/test_cleanup.py
@@ -7,6 +7,7 @@ from botocore.exceptions import ClientError
 from django.core.management import call_command
 from django.test import override_settings
 
+from arkindex.dataimport.models import DataFile
 from arkindex.documents.models import CorpusExport, CorpusExportState
 from arkindex.project.tests import FixtureTestCase
 
@@ -39,6 +40,8 @@ class TestCleanupCommand(FixtureTestCase):
                 Removing export {done_export.id} from S3…
                 Removing 2 old corpus exports…
                 Successfully cleaned up old corpus exports.
+                Deleting 0 DataFiles marked as trashed from S3 and the database…
+                Successfully cleaned up DataFiles marked as trashed.
                 """
             ).strip()
         )
@@ -58,6 +61,8 @@ class TestCleanupCommand(FixtureTestCase):
                 Removing 0 old corpus exports from S3…
                 Removing 0 old corpus exports…
                 Successfully cleaned up old corpus exports.
+                Deleting 0 DataFiles marked as trashed from S3 and the database…
+                Successfully cleaned up DataFiles marked as trashed.
                 """
             ).strip()
         )
@@ -79,6 +84,8 @@ class TestCleanupCommand(FixtureTestCase):
                 Export {done_export.id} not found on S3, skipping
                 Removing 1 old corpus exports…
                 Successfully cleaned up old corpus exports.
+                Deleting 0 DataFiles marked as trashed from S3 and the database…
+                Successfully cleaned up DataFiles marked as trashed.
                 """
             ).strip()
         )
@@ -107,6 +114,8 @@ class TestCleanupCommand(FixtureTestCase):
                 Removing export {done_export.id} from S3…
                 Removing 1 old corpus exports…
                 Successfully cleaned up old corpus exports.
+                Deleting 0 DataFiles marked as trashed from S3 and the database…
+                Successfully cleaned up DataFiles marked as trashed.
                 """
             ).strip()
         )
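The expected outputs above are compared against `self.cleanup()`, a pre-existing helper of `TestCleanupCommand` that this diff does not touch. For context, its likely shape — an assumption inferred from how the output is asserted, not code from this repository:

```python
from io import StringIO

from django.core.management import call_command


def cleanup(self):
    # Run the management command and capture what it writes to stdout;
    # call_command(..., stdout=...) is standard Django.
    output = StringIO()
    call_command('cleanup', stdout=output)
    return output.getvalue().strip()
```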
@@ -115,3 +124,80 @@ class TestCleanupCommand(FixtureTestCase):
         self.assertEqual(s3_object_mock.call_count, 1)
         self.assertEqual(s3_object_mock.call_args, call('export', str(done_export.id)))
         self.assertEqual(s3_object_mock.return_value.delete.call_count, 3)
+
+    @patch('arkindex.project.aws.s3')
+    def test_cleanup_trashed_datafiles(self, s3_mock):
+        DataFile.objects.bulk_create([
+            DataFile(
+                name=f'test{i}.txt',
+                size=42,
+                content_type='text/plain',
+                corpus=self.corpus,
+                trashed=bool(i % 2)
+            ) for i in range(2)
+        ])
+
+        trashed_df = DataFile.objects.filter(trashed=True).get()
+        self.assertEqual(DataFile.objects.count(), 2)
+        self.assertEqual(
+            self.cleanup(),
+            dedent(
+                f"""
+                Removing 0 old corpus exports from S3…
+                Removing 0 old corpus exports…
+                Successfully cleaned up old corpus exports.
+                Deleting 1 DataFiles marked as trashed from S3 and the database…
+                Deleting DataFile {trashed_df.id}…
+                Successfully cleaned up DataFiles marked as trashed.
+                """
+            ).strip()
+        )
+        self.assertEqual(DataFile.objects.count(), 1)
+        self.assertEqual(DataFile.objects.filter(trashed=True).count(), 0)
+        self.assertEqual(s3_mock.Object().delete.call_count, 1)
+
+    @patch('arkindex.project.aws.s3')
+    def test_cleanup_trashed_datafiles_ignore_s3_errors(self, s3_mock):
+        """
+        Test that the cleanup command retries S3 deletions, reports failures, and keeps DataFiles it could not delete from S3
+        """
+        DataFile.objects.bulk_create([
+            DataFile(
+                name=f'test{i}.txt',
+                size=42,
+                content_type='text/plain',
+                corpus=self.corpus,
+                trashed=bool(i % 2)
+            ) for i in range(2)
+        ])
+
+        exceptions = [
+            ClientError({'Error': {'Code': '500'}}, 'delete_object'),
+            ClientError({'Error': {'Code': '500'}}, 'delete_object'),
+            ValueError('Something went wrong'),
+        ]
+
+        def _raise(*args, **kwargs):
+            raise exceptions.pop(0)
+
+        s3_mock.Object().delete.side_effect = _raise
+
+        trashed_df = DataFile.objects.filter(trashed=True).get()
+        self.assertEqual(DataFile.objects.count(), 2)
+        self.assertEqual(
+            self.cleanup(),
+            dedent(
+                f"""
+                Removing 0 old corpus exports from S3…
+                Removing 0 old corpus exports…
+                Successfully cleaned up old corpus exports.
+                Deleting 1 DataFiles marked as trashed from S3 and the database…
+                Deleting DataFile {trashed_df.id}…
+                Could not delete DataFile {trashed_df.id} from S3: Something went wrong
+                Successfully cleaned up DataFiles marked as trashed.
+                """
+            ).strip()
+        )
+        self.assertEqual(DataFile.objects.count(), 2)
+        self.assertEqual(DataFile.objects.filter(trashed=True).count(), 1)
+        self.assertEqual(s3_mock.Object().delete.call_count, 3)
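The final assertion expects three S3 calls for a single file: `s3_delete`, inherited from `S3FileMixin`, retries failed deletions before giving up. That retry logic predates this diff and is not shown here; below is a minimal sketch of the behaviour the test encodes, assuming the `tenacity` library and an `s3_object` attribute (both assumptions, not confirmed by this diff):

```python
from botocore.exceptions import ClientError
from tenacity import retry, retry_if_exception_type, stop_after_attempt


class S3FileMixin:
    @retry(
        retry=retry_if_exception_type(ClientError),  # retry transient S3 errors only
        stop=stop_after_attempt(3),                  # matches the call_count == 3 assertion
        reraise=True,                                # let the cleanup command report the last error
    )
    def s3_delete(self):
        # Hypothetical: delete the backing S3 object. A non-ClientError
        # (like the test's ValueError) propagates without further retries.
        self.s3_object.delete()
```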