From b987ef0e5c7410cf653b84e5945fddb08127de4b Mon Sep 17 00:00:00 2001 From: Erwan Rouchet <rouchet@teklia.com> Date: Mon, 14 Oct 2019 16:21:21 +0200 Subject: [PATCH 1/3] Use new element types in files import --- arkindex_tasks/import_files/base.py | 48 ++++++------- tests/import_files/test_base.py | 101 ++++++++++++++-------------- 2 files changed, 76 insertions(+), 73 deletions(-) diff --git a/arkindex_tasks/import_files/base.py b/arkindex_tasks/import_files/base.py index 4dae6f03..bd1f3afc 100644 --- a/arkindex_tasks/import_files/base.py +++ b/arkindex_tasks/import_files/base.py @@ -24,13 +24,13 @@ DOWNLOAD_CHUNK_SIZE = 8192 class FileImport(object): """ - A generic file import: takes a DataImport and creates pages in a volume from its files. + A generic file import: takes a DataImport and creates pages in a folder from its files. """ def __init__(self, dataimport_id): self.dataimport_id = dataimport_id self.dataimport = None - self.volume = None + self.folder = None if 'PONOS_DATA' in os.environ: self.working_dir = Path(os.environ['PONOS_DATA']) / 'current' else: @@ -62,27 +62,27 @@ class FileImport(object): return files - def get_or_create_volume(self): - if 'volume_id' in self.dataimport['payload']: + def get_or_create_folder(self): + if 'element_id' in self.dataimport['payload']: try: - self.volume = default_client.request( + self.folder = default_client.request( 'RetrieveElement', - id=self.dataimport['payload']['volume_id'], + id=self.dataimport['payload']['element_id'], ) except ErrorResponse as e: if e.status_code == 404: - raise ValueError('Volume {} not found'.format(self.dataimport['payload']['volume_id'])) + raise ValueError('Folder {} not found'.format(self.dataimport['payload']['element_id'])) raise - elif 'volume_name' in self.dataimport['payload']: - self.volume = default_client.request('CreateElement', body={ + elif 'element_name' in self.dataimport['payload'] and 'element_type' in self.dataimport['payload']: + self.folder = default_client.request('CreateElement', body={ 'corpus': self.dataimport['corpus'], - 'name': self.dataimport['payload']['volume_name'], - 'type': 'volume', + 'name': self.dataimport['payload']['element_name'], + 'type': self.dataimport['payload']['element_type'], }) else: - raise ValueError('Missing volume information in import payload') + raise ValueError('Missing folder information in import payload') def get_images(self, datafile): """ @@ -154,9 +154,9 @@ class FileImport(object): for i, path in enumerate(image_paths, start=1): image = self.create_image(path, datafile) page = None - for volume_page in self.existing_pages: - if image['id'] == volume_page['zone']['image']['id']: - page = default_client.request('RetrieveElement', id=volume_page['id']) + for folder_page in self.existing_pages: + if image['id'] == folder_page['zone']['image']['id']: + page = default_client.request('RetrieveElement', id=folder_page['id']) break if page: logger.info('Page already exists for this import, skipping') @@ -169,7 +169,7 @@ class FileImport(object): 'metadata': { 'folio': str(i), }, - 'parent': self.volume['id'], + 'parent': self.folder['id'], }) # Add the image S3 url so that it can be accessed for ML config @@ -182,14 +182,14 @@ class FileImport(object): elements = [ { - "type": "volume", - "id": self.volume['id'], + "type": self.folder['type'], + "id": self.folder['id'], } ] elements.extend([ { - "type": "page", + "type": page['type'], "id": page['id'], } for page in pages @@ -210,11 +210,11 @@ class FileImport(object): logger.error('No files downloaded') sys.exit(1) - self.get_or_create_volume() - logger.info('Using volume {} "{}"'.format(self.volume['id'], self.volume['name'])) + self.get_or_create_folder() + logger.info('Using folder {} "{}"'.format(self.folder['id'], self.folder['name'])) - logger.info('Retrieving existing pages in volume') - self.existing_pages = list(default_client.paginate('ListElementChildren', id=self.volume['id'], type='page')) + logger.info('Retrieving existing pages in folder') + self.existing_pages = list(default_client.paginate('ListElementChildren', id=self.folder['id'], type='page')) logger.info('Retrieved {} existing pages'.format(len(self.existing_pages))) pages = [] @@ -230,5 +230,5 @@ class FileImport(object): logger.error("No pages have been imported") sys.exit(1) - logger.info('Imported {}Â pages into {}'.format(len(pages), self.volume['name'])) + logger.info('Imported {}Â pages into {}'.format(len(pages), self.folder['name'])) self.write_json(pages) diff --git a/tests/import_files/test_base.py b/tests/import_files/test_base.py index aefb5a92..1b140175 100644 --- a/tests/import_files/test_base.py +++ b/tests/import_files/test_base.py @@ -80,71 +80,72 @@ class TestFileImport(TestCase): self.assertListEqual(result, []) self.assertEqual(mock.call_count, 4) - def test_get_volume(self, mock): - mock.get('/api/v1/element/volumeid/', json={'id': 'volumeid'}) + def test_get_folder(self, mock): + mock.get('/api/v1/element/folderid/', json={'id': 'folderid', 'type': 'potato'}) fi = FileImport('importid') fi.dataimport = { "id": "importid", "payload": { - "volume_id": "volumeid", + "element_id": "folderid", }, } - self.assertIsNone(fi.volume) - fi.get_or_create_volume() + self.assertIsNone(fi.folder) + fi.get_or_create_folder() - self.assertEqual(fi.volume, {'id': 'volumeid'}) + self.assertEqual(fi.folder, {'id': 'folderid', 'type': 'potato'}) self.assertEqual(mock.call_count, 1) - def test_create_volume(self, mock): - mock.post('/api/v1/elements/create/', json={'id': 'volumeid'}) + def test_create_folder(self, mock): + mock.post('/api/v1/elements/create/', json={'id': 'folderid', 'type': 'potato'}) fi = FileImport('importid') fi.dataimport = { "id": "importid", "corpus": "corpusid", "payload": { - "volume_name": "Untitled", + "element_name": "Untitled", + "element_type": "potato", }, } - self.assertIsNone(fi.volume) - fi.get_or_create_volume() + self.assertIsNone(fi.folder) + fi.get_or_create_folder() - self.assertEqual(fi.volume, {'id': 'volumeid'}) + self.assertEqual(fi.folder, {'id': 'folderid', 'type': 'potato'}) self.assertEqual(mock.call_count, 1) self.assertDictEqual(mock.request_history[0].json(), { 'corpus': 'corpusid', 'name': 'Untitled', - 'type': 'volume', + 'type': 'potato', }) - def test_get_volume_missing(self, mock): - mock.get('/api/v1/element/volumeid/', status_code=404) + def test_get_folder_missing(self, mock): + mock.get('/api/v1/element/folderid/', status_code=404) fi = FileImport('importid') fi.dataimport = { "id": "importid", "payload": { - "volume_id": "volumeid", + "element_id": "folderid", }, } - self.assertIsNone(fi.volume) + self.assertIsNone(fi.folder) - with self.assertRaises(ValueError, msg='Volume volumeid not found'): - fi.get_or_create_volume() + with self.assertRaises(ValueError, msg='Folder folderid not found'): + fi.get_or_create_folder() self.assertEqual(mock.call_count, 1) - def test_no_volume(self, mock): + def test_no_folder(self, mock): fi = FileImport('importid') fi.dataimport = { "id": "importid", "payload": {}, } - self.assertIsNone(fi.volume) + self.assertIsNone(fi.folder) - with self.assertRaises(ValueError, msg='Missing volume information in import payload'): - fi.get_or_create_volume() + with self.assertRaises(ValueError, msg='Missing folder information in import payload'): + fi.get_or_create_folder() self.assertFalse(mock.called) @@ -236,8 +237,9 @@ class TestFileImport(TestCase): "id": "importid", "corpus": "corpusid", } - fi.volume = { - "id": "volumeid", + fi.folder = { + "id": "folderid", + "type": "potato", } fi.existing_pages = [] pages = fi.save_pages([SAMPLES / '200x200.jpg', ], {'id': 'fileid'}) @@ -259,7 +261,7 @@ class TestFileImport(TestCase): "type": "page", "name": "Page 1", "image": "imageid", - "parent": "volumeid", + "parent": "folderid", "metadata": { "folio": "1", }, @@ -270,18 +272,19 @@ class TestFileImport(TestCase): fi.dataimport = { "id": "importid", } - fi.volume = { - "id": "volumeid", + fi.folder = { + "id": "folderid", + "type": "potato", } fi.write_json([ - {'id': 'page1', 's3_url': 'http://s3/url1'}, - {'id': 'page2', 's3_url': 'http://s3/url2'}, + {'id': 'page1', 's3_url': 'http://s3/url1', 'type': 'page'}, + {'id': 'page2', 's3_url': 'http://s3/url2', 'type': 'page'}, ]) with (fi.working_dir / 'elements.json').open() as f: elements = json.load(f) self.assertListEqual(elements, [ - {'id': 'volumeid', 'type': 'volume'}, + {'id': 'folderid', 'type': 'potato'}, {'id': 'page1', 'type': 'page'}, {'id': 'page2', 'type': 'page'}, ]) @@ -303,7 +306,7 @@ class TestFileImport(TestCase): "corpus": "corpusid", "mode": "images", "payload": { - "volume_id": "volumeid", + "element_id": "folderid", }, "files": [ "file1", @@ -317,8 +320,8 @@ class TestFileImport(TestCase): }) # DataFile on S3 mock.get('http://s3/600x600.jpg', body=open(SAMPLES / '600x600.jpg', 'rb')) - # Volume - mock.get('/api/v1/element/volumeid/', json={'id': 'volumeid', 'name': 'Untitled'}) + # Folder + mock.get('/api/v1/element/folderid/', json={'id': 'folderid', 'name': 'Untitled', 'type': 'potato'}) # Image creation mock.post('/api/v1/image/', json={ 'id': 'imageid', @@ -328,10 +331,10 @@ class TestFileImport(TestCase): mock.put('http://s3/put', status_code=201) mock.patch('/api/v1/image/imageid/', status_code=200) # Page creation - mock.post('/api/v1/elements/create/', json={'id': 'pageid'}) - # Volume pages list retrieve + mock.post('/api/v1/elements/create/', json={'id': 'pageid', 'type': 'page'}) + # Folder pages list retrieve mock.get( - '/api/v1/elements/volumeid/children/', + '/api/v1/elements/folderid/children/', complete_qs=False, json={'count': 0, 'results': [], 'next': None} ) @@ -346,8 +349,8 @@ class TestFileImport(TestCase): ('GET', 'https://arkindex.teklia.com/api/v1/imports/importid/'), ('GET', 'https://arkindex.teklia.com/api/v1/imports/file/file1/'), ('GET', 'http://s3/600x600.jpg'), - ('GET', 'https://arkindex.teklia.com/api/v1/element/volumeid/'), - ('GET', 'https://arkindex.teklia.com/api/v1/elements/volumeid/children/?page=1&type=page'), + ('GET', 'https://arkindex.teklia.com/api/v1/element/folderid/'), + ('GET', 'https://arkindex.teklia.com/api/v1/elements/folderid/children/?page=1&type=page'), ('POST', 'https://arkindex.teklia.com/api/v1/image/'), ('PUT', 'http://s3/put'), ('PATCH', 'https://arkindex.teklia.com/api/v1/image/imageid/'), @@ -358,7 +361,7 @@ class TestFileImport(TestCase): with (fi.working_dir / 'elements.json').open() as f: elements = json.load(f) self.assertListEqual(elements, [ - {'id': 'volumeid', 'type': 'volume'}, + {'id': 'folderid', 'type': 'potato'}, {'id': 'pageid', 'type': 'page'}, ]) @@ -368,7 +371,7 @@ class TestFileImport(TestCase): "corpus": "corpusid", "mode": "images", "payload": { - "volume_id": "volumeid", + "element_id": "folderid", }, "files": [ "file1", @@ -380,7 +383,7 @@ class TestFileImport(TestCase): # DataFiles on S3 mock.get('http://s3/1', body=open(SAMPLES / '600x600.jpg', 'rb')) mock.get('http://s3/2', body=open(SAMPLES / '600x600.jpg', 'rb')) - mock.get('/api/v1/element/volumeid/', json={"id": "volumeid", "name": "volume"}) + mock.get('/api/v1/element/folderid/', json={"id": "folderid", "name": "Untitled", "type": "potato"}) # Mock image creation mock.post('/api/v1/image/', [ @@ -388,12 +391,12 @@ class TestFileImport(TestCase): {"status_code": 400, "json": {"id": "image2"}}, ]) - # Image pages are already present in volume + # Image pages are already present in folder mock.get( - '/api/v1/elements/volumeid/children/?page=1&type=page', + '/api/v1/elements/folderid/children/?page=1&type=page', json={"count": 2, "next": None, "results": [ - {"id": "page1", "zone": {"image": {"id": "image1"}}}, - {"id": "page2", "zone": {"image": {"id": "image2"}}} + {"id": "page1", "type": "page", "zone": {"image": {"id": "image1"}}}, + {"id": "page2", "type": "page", "zone": {"image": {"id": "image2"}}} ]} ) mock.get('/api/v1/image/image1/', json={"id": "image1", "s3_url": "http://s3/1"}) @@ -405,7 +408,7 @@ class TestFileImport(TestCase): mock.get('/api/v1/element/page1/', status_code=200) mock.get('/api/v1/element/page2/', status_code=200) - mock.post('/api/v1/elements/create/', json={'id': 'pageid'}) + mock.post('/api/v1/elements/create/', json={'id': 'pageid', 'type': 'page'}) fi = FileImport('importid') fi.run() @@ -418,8 +421,8 @@ class TestFileImport(TestCase): ('GET', 'http://s3/1'), ('GET', 'https://arkindex.teklia.com/api/v1/imports/file/file2/'), ('GET', 'http://s3/2'), - ('GET', 'https://arkindex.teklia.com/api/v1/element/volumeid/'), - ('GET', 'https://arkindex.teklia.com/api/v1/elements/volumeid/children/?page=1&type=page'), + ('GET', 'https://arkindex.teklia.com/api/v1/element/folderid/'), + ('GET', 'https://arkindex.teklia.com/api/v1/elements/folderid/children/?page=1&type=page'), ('POST', 'https://arkindex.teklia.com/api/v1/image/'), ('GET', 'https://arkindex.teklia.com/api/v1/image/image1/'), ('PATCH', 'https://arkindex.teklia.com/api/v1/image/image1/'), -- GitLab From 9a80d5210f1d899cfb37ba4fe359f6c4c3bbefa9 Mon Sep 17 00:00:00 2001 From: Erwan Rouchet <rouchet@teklia.com> Date: Wed, 16 Oct 2019 12:21:35 +0200 Subject: [PATCH 2/3] Bump to 0.2.5-dev0 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index abd41058..0eac58ed 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.2.4 +0.2.5-dev0 -- GitLab From affa519bb21250cbe888ee01e4d1f5fde0cc8bc1 Mon Sep 17 00:00:00 2001 From: Erwan Rouchet <rouchet@teklia.com> Date: Wed, 16 Oct 2019 12:31:46 +0200 Subject: [PATCH 3/3] Use element_type --- arkindex_tasks/import_files/__main__.py | 2 +- arkindex_tasks/import_files/base.py | 60 +++++++++++++------------ tests/import_files/test_base.py | 21 +++++---- 3 files changed, 46 insertions(+), 37 deletions(-) diff --git a/arkindex_tasks/import_files/__main__.py b/arkindex_tasks/import_files/__main__.py index 38a18cc4..e1dc56f2 100755 --- a/arkindex_tasks/import_files/__main__.py +++ b/arkindex_tasks/import_files/__main__.py @@ -8,7 +8,7 @@ import uuid def main(): parser = argparse.ArgumentParser( - description='Import files as pages in a volume', + description='Import files as elements in a folder', ) parser.add_argument( 'dataimport_id', diff --git a/arkindex_tasks/import_files/base.py b/arkindex_tasks/import_files/base.py index bd1f3afc..67496508 100644 --- a/arkindex_tasks/import_files/base.py +++ b/arkindex_tasks/import_files/base.py @@ -24,7 +24,7 @@ DOWNLOAD_CHUNK_SIZE = 8192 class FileImport(object): """ - A generic file import: takes a DataImport and creates pages in a folder from its files. + A generic file import: takes a DataImport and creates elements in a folder from its files. """ def __init__(self, dataimport_id): @@ -63,15 +63,15 @@ class FileImport(object): return files def get_or_create_folder(self): - if 'element_id' in self.dataimport['payload']: + if 'folder_id' in self.dataimport['payload']: try: self.folder = default_client.request( 'RetrieveElement', - id=self.dataimport['payload']['element_id'], + id=self.dataimport['payload']['folder_id'], ) except ErrorResponse as e: if e.status_code == 404: - raise ValueError('Folder {} not found'.format(self.dataimport['payload']['element_id'])) + raise ValueError('Folder {} not found'.format(self.dataimport['payload']['folder_id'])) raise elif 'element_name' in self.dataimport['payload'] and 'element_type' in self.dataimport['payload']: @@ -149,22 +149,22 @@ class FileImport(object): logger.warning('Upload failed ({!s}), retrying in {} seconds'.format(e, cooldown)) sleep(cooldown) - def save_pages(self, image_paths, datafile): - pages = [] + def save_elements(self, image_paths, datafile): + elements = [] for i, path in enumerate(image_paths, start=1): image = self.create_image(path, datafile) - page = None - for folder_page in self.existing_pages: - if image['id'] == folder_page['zone']['image']['id']: - page = default_client.request('RetrieveElement', id=folder_page['id']) + element = None + for folder_element in self.existing_elements: + if image['id'] == folder_element['zone']['image']['id']: + element = default_client.request('RetrieveElement', id=folder_element['id']) break - if page: - logger.info('Page already exists for this import, skipping') + if element: + logger.info('Element already exists for this import, skipping') else: - page = default_client.request('CreateElement', body={ + element = default_client.request('CreateElement', body={ 'corpus': self.dataimport['corpus'], - 'type': 'page', - 'name': 'Page {}'.format(i), + 'type': self.dataimport['payload']['element_type'], + 'name': 'Element {}'.format(i), 'image': image['id'], 'metadata': { 'folio': str(i), @@ -173,9 +173,9 @@ class FileImport(object): }) # Add the image S3 url so that it can be accessed for ML config - page['s3_url'] = image['s3_url'] - pages.append(page) - return pages + element['s3_url'] = image['s3_url'] + elements.append(element) + return elements def write_json(self, pages): assert len(pages), 'ML configuration requires a nonempty list of pages' @@ -213,22 +213,26 @@ class FileImport(object): self.get_or_create_folder() logger.info('Using folder {} "{}"'.format(self.folder['id'], self.folder['name'])) - logger.info('Retrieving existing pages in folder') - self.existing_pages = list(default_client.paginate('ListElementChildren', id=self.folder['id'], type='page')) - logger.info('Retrieved {} existing pages'.format(len(self.existing_pages))) + logger.info('Retrieving existing elements in folder') + self.existing_elements = list(default_client.paginate( + 'ListElementChildren', + id=self.folder['id'], + type=self.dataimport['payload']['element_type'], + )) + logger.info('Retrieved {} existing elements'.format(len(self.existing_elements))) - pages = [] + elements = [] for df in datafiles: logger.info('Fetching images for {}'.format(df['name'])) images = self.get_images(df) if not images: continue - logger.info('Creating {}Â pages'.format(len(images))) - pages.extend(self.save_pages(images, df)) + logger.info('Creating {}Â elements'.format(len(images))) + elements.extend(self.save_elements(images, df)) - if len(pages) < 1: - logger.error("No pages have been imported") + if len(elements) < 1: + logger.error("No elements have been imported") sys.exit(1) - logger.info('Imported {}Â pages into {}'.format(len(pages), self.folder['name'])) - self.write_json(pages) + logger.info('Imported {}Â elements into {}'.format(len(elements), self.folder['name'])) + self.write_json(elements) diff --git a/tests/import_files/test_base.py b/tests/import_files/test_base.py index 1b140175..56e6e2ea 100644 --- a/tests/import_files/test_base.py +++ b/tests/import_files/test_base.py @@ -87,7 +87,7 @@ class TestFileImport(TestCase): fi.dataimport = { "id": "importid", "payload": { - "element_id": "folderid", + "folder_id": "folderid", }, } self.assertIsNone(fi.folder) @@ -126,7 +126,7 @@ class TestFileImport(TestCase): fi.dataimport = { "id": "importid", "payload": { - "element_id": "folderid", + "folder_id": "folderid", }, } self.assertIsNone(fi.folder) @@ -222,7 +222,7 @@ class TestFileImport(TestCase): self.assertEqual(sleep_mock.call_count, 2) self.assertEqual(sleep_mock.call_args_list, [call(5), call(5)]) - def test_save_pages(self, mock): + def test_save_elements(self, mock): mock.post('/api/v1/image/', json={ 'id': 'imageid', 's3_put_url': 'http://s3/put', @@ -236,13 +236,16 @@ class TestFileImport(TestCase): fi.dataimport = { "id": "importid", "corpus": "corpusid", + "payload": { + "element_type": "page" + } } fi.folder = { "id": "folderid", "type": "potato", } - fi.existing_pages = [] - pages = fi.save_pages([SAMPLES / '200x200.jpg', ], {'id': 'fileid'}) + fi.existing_elements = [] + pages = fi.save_elements([SAMPLES / '200x200.jpg', ], {'id': 'fileid'}) self.assertListEqual(pages, [ { 'id': 'pageid', @@ -259,7 +262,7 @@ class TestFileImport(TestCase): self.assertEqual(mock.request_history[3].json(), { "corpus": "corpusid", "type": "page", - "name": "Page 1", + "name": "Element 1", "image": "imageid", "parent": "folderid", "metadata": { @@ -306,7 +309,8 @@ class TestFileImport(TestCase): "corpus": "corpusid", "mode": "images", "payload": { - "element_id": "folderid", + "folder_id": "folderid", + "element_type": "page", }, "files": [ "file1", @@ -371,7 +375,8 @@ class TestFileImport(TestCase): "corpus": "corpusid", "mode": "images", "payload": { - "element_id": "folderid", + "folder_id": "folderid", + "element_type": "page", }, "files": [ "file1", -- GitLab