diff --git a/arkindex/dataimport/models.py b/arkindex/dataimport/models.py index 4fc66f5b1076332ecd51e7acd5aee4dc5b01b818..e9db41ea12d4119999f7966979ab108b5c0bbdff 100644 --- a/arkindex/dataimport/models.py +++ b/arkindex/dataimport/models.py @@ -209,16 +209,14 @@ class DataImport(IndexableModel): ''' ml_workflow_chunks = 1 import_task_name = 'import' - if self.mode == DataImportMode.Workers: - import_task_name = 'initialisation' assert self.workflow is None, 'A workflow is already setup' - if self.mode == DataImportMode.Repository and self.revision is not None and not self.revision.repo.enabled: - raise ValidationError('Git repository does not have any valid credentials') if self.mode == DataImportMode.Repository: assert self.revision is not None, \ 'A revision is required to create an import workflow from GitLab repository' + if not self.revision.repo.enabled: + raise ValidationError('Git repository does not have any valid credentials') tasks = { import_task_name: { 'image': settings.ARKINDEX_TASKS_IMAGE, @@ -230,6 +228,7 @@ class DataImport(IndexableModel): } elif self.mode == DataImportMode.Workers: + import_task_name = 'initialisation' if chunks is not None: assert chunks <= settings.MAX_CHUNKS, f'Import distribution is limited to {settings.MAX_CHUNKS} chunks' ml_workflow_chunks = chunks @@ -257,18 +256,29 @@ class DataImport(IndexableModel): } elif self.mode == DataImportMode.Transkribus: - args = [ - 'python', '-m', 'arkindex_tasks.import_transkribus', - str(self.collection_id), - '--corpus', str(self.corpus.id), - ] + import_task_name = 'import_arkindex' tasks = { + 'export_transkribus': { + 'image': settings.ARKINDEX_TASKS_IMAGE, + 'command': f'python -m arkindex_tasks.export_transkribus {self.collection_id}' + }, import_task_name: { 'image': settings.ARKINDEX_TASKS_IMAGE, - 'command': ' '.join(args), + 'command': 'python -m arkindex_tasks.import_transkribus ' + f'/data/export_transkribus/transkribus_export.zip --corpus {self.corpus.id}', + 'parents': 
['export_transkribus'], } } + # Import entities directly after import step + if self.build_entities: + tasks['build_entities'] = { + 'image': settings.ARKINDEX_TASKS_IMAGE, + 'command': 'python -m arkindex_tasks.build_entities ' + f'/data/{import_task_name}/transcriptions.json /data/export_transkribus/transkribus_export.zip', + 'parents': [import_task_name], + } + else: tasks = { import_task_name: { @@ -277,15 +287,6 @@ class DataImport(IndexableModel): }, } - # Import entities directy after import step - if self.mode == DataImportMode.Transkribus and self.build_entities: - transcriptions_path = shlex.quote(path.join('/data', import_task_name, str(self.collection_id), 'transcriptions.json')) - tasks['build_entities'] = { - 'image': settings.ARKINDEX_TASKS_IMAGE, - 'command': f'python -m arkindex_tasks.build_entities {transcriptions_path}', - 'parents': [import_task_name], - } - elts_chunk_files = ['elements.json'] if ml_workflow_chunks > 1: elts_chunk_files = [f'elements_chunk_{n}.json' for n in range(1, ml_workflow_chunks + 1)] diff --git a/arkindex/dataimport/tests/test_transkribus_import.py b/arkindex/dataimport/tests/test_transkribus_import.py index 0fdd9132cd302cf23fcc799e9a829f67aa303939..af6f88628efbaafb868a6fec07c60344011454c6 100644 --- a/arkindex/dataimport/tests/test_transkribus_import.py +++ b/arkindex/dataimport/tests/test_transkribus_import.py @@ -114,14 +114,19 @@ class TestTranskribusImport(FixtureAPITestCase): 'TRANSKRIBUS_WORKER_VERSION': settings.TRANSKRIBUS_WORKER_VERSION }, 'tasks': { - 'import': { - 'command': 'python -m arkindex_tasks.import_transkribus {} --corpus {}'.format(dataimport.collection_id, corpus.id), - 'image': 'registry.gitlab.com/arkindex/tasks' + 'export_transkribus': { + 'command': 'python -m arkindex_tasks.export_transkribus 12345', + 'image': 'registry.gitlab.com/arkindex/tasks', + }, + 'import_arkindex': { + 'command': f'python -m arkindex_tasks.import_transkribus /data/export_transkribus/transkribus_export.zip --corpus 
{corpus.id}', + 'image': 'registry.gitlab.com/arkindex/tasks', + 'parents': ['export_transkribus'] }, 'thumbnails': { - 'command': 'python3 -m arkindex_tasks.generate_thumbnails /data/import/elements.json', + 'command': 'python3 -m arkindex_tasks.generate_thumbnails /data/import_arkindex/elements.json', 'image': 'registry.gitlab.com/arkindex/tasks', - 'parents': ['import'] + 'parents': ['import_arkindex'] } } }) @@ -173,19 +178,24 @@ class TestTranskribusImport(FixtureAPITestCase): 'TRANSKRIBUS_WORKER_VERSION': settings.TRANSKRIBUS_WORKER_VERSION }, 'tasks': { - 'import': { - 'command': 'python -m arkindex_tasks.import_transkribus {} --corpus {}'.format(dataimport.collection_id, corpus.id), - 'image': 'registry.gitlab.com/arkindex/tasks' + 'export_transkribus': { + 'command': 'python -m arkindex_tasks.export_transkribus 12345', + 'image': 'registry.gitlab.com/arkindex/tasks', + }, + 'import_arkindex': { + 'command': f'python -m arkindex_tasks.import_transkribus /data/export_transkribus/transkribus_export.zip --corpus {corpus.id}', + 'image': 'registry.gitlab.com/arkindex/tasks', + 'parents': ['export_transkribus'] }, 'build_entities': { - 'command': 'python -m arkindex_tasks.build_entities /data/import/{}/transcriptions.json'.format(dataimport.collection_id), + 'command': 'python -m arkindex_tasks.build_entities /data/import_arkindex/transcriptions.json /data/export_transkribus/transkribus_export.zip', 'image': 'registry.gitlab.com/arkindex/tasks', - 'parents': ['import'] + 'parents': ['import_arkindex'] }, 'thumbnails': { - 'command': 'python3 -m arkindex_tasks.generate_thumbnails /data/import/elements.json', + 'command': 'python3 -m arkindex_tasks.generate_thumbnails /data/import_arkindex/elements.json', 'image': 'registry.gitlab.com/arkindex/tasks', - 'parents': ['import'] + 'parents': ['import_arkindex'] } } })