diff --git a/arkindex_worker/models.py b/arkindex_worker/models.py index b154cfa00f0453985305fa6e1e83abf7576fd86c..b07f936d470ade3867bb41a7bce4c09b954d0f73 100644 --- a/arkindex_worker/models.py +++ b/arkindex_worker/models.py @@ -229,12 +229,3 @@ class Transcription(MagicDict): def __str__(self): return "Transcription ({})".format(self.id) - - -class Corpus(MagicDict): - """ - Describes an Arkindex corpus. - """ - - def __str__(self): - return "Corpus {} ({})".format(self.name, self.id) diff --git a/arkindex_worker/worker/classification.py b/arkindex_worker/worker/classification.py index 5cdb63e669d38ac8c720dc9a0aa068b5c6778933..ee1e1b2da1f8712490a371442be698068cf3b51b 100644 --- a/arkindex_worker/worker/classification.py +++ b/arkindex_worker/worker/classification.py @@ -15,46 +15,38 @@ class ClassificationMixin(object): Mixin for the :class:`ElementsWorker` to add ``MLClass`` and ``Classification`` helpers. """ - def load_corpus_classes(self, corpus_id): + def load_corpus_classes(self): """ Load all ML classes for the given corpus ID and store them in the ``self.classes`` cache. - - :param corpus_id str: ID of the corpus. """ corpus_classes = self.api_client.paginate( "ListCorpusMLClasses", - id=corpus_id, + id=self.corpus_id, ) - self.classes[corpus_id] = { + self.classes[self.corpus_id] = { ml_class["name"]: ml_class["id"] for ml_class in corpus_classes } - logger.info(f"Loaded {len(self.classes[corpus_id])} ML classes") + logger.info(f"Loaded {len(self.classes[self.corpus_id])} ML classes") - def get_ml_class_id(self, corpus_id, ml_class): + def get_ml_class_id(self, ml_class): """ Return the MLClass ID corresponding to the given class name on a specific corpus. If no MLClass exists for this class name, a new one is created. - - :param corpus_id: ID of the corpus, or None to use the ``ARKINDEX_CORPUS_ID`` environment variable. - :type corpus_id: str or None :param ml_class str: Name of the MLClass. :returns str: ID of the retrieved or created MLClass. """ - if corpus_id is None: - corpus_id = self.corpus_id - - if not self.classes.get(corpus_id): - self.load_corpus_classes(corpus_id) + if not self.classes.get(self.corpus_id): + self.load_corpus_classes() - ml_class_id = self.classes[corpus_id].get(ml_class) + ml_class_id = self.classes[self.corpus_id].get(ml_class) if ml_class_id is None: - logger.info(f"Creating ML class {ml_class} on corpus {corpus_id}") + logger.info(f"Creating ML class {ml_class} on corpus {self.corpus_id}") try: response = self.request( - "CreateMLClass", id=corpus_id, body={"name": ml_class} + "CreateMLClass", id=self.corpus_id, body={"name": ml_class} ) - ml_class_id = self.classes[corpus_id][ml_class] = response["id"] + ml_class_id = self.classes[self.corpus_id][ml_class] = response["id"] logger.debug(f"Created ML class {response['id']}") except ErrorResponse as e: # Only reload for 400 errors @@ -65,11 +57,11 @@ class ClassificationMixin(object): logger.info( f"Reloading corpus classes to see if {ml_class} already exists" ) - self.load_corpus_classes(corpus_id) + self.load_corpus_classes() assert ( - ml_class in self.classes[corpus_id] + ml_class in self.classes[self.corpus_id] ), "Missing class {ml_class} even after reloading" - ml_class_id = self.classes[corpus_id][ml_class] + ml_class_id = self.classes[self.corpus_id][ml_class] return ml_class_id @@ -109,7 +101,7 @@ class ClassificationMixin(object): "CreateClassification", body={ "element": str(element.id), - "ml_class": self.get_ml_class_id(None, ml_class), + "ml_class": self.get_ml_class_id(ml_class), "worker_version": self.worker_version_id, "confidence": confidence, "high_confidence": high_confidence, diff --git a/arkindex_worker/worker/element.py b/arkindex_worker/worker/element.py index daa53433c1e83e809d9d0884ee878d07351bc350..c3c01de3f0f1f57e33b409ccba5a3ff75245a975 100644 --- a/arkindex_worker/worker/element.py +++ b/arkindex_worker/worker/element.py @@ -2,15 +2,13 @@ """ ElementsWorker methods for elements and element types. """ - -import uuid from typing import Dict, Iterable, List, NamedTuple, Optional, Union from peewee import IntegrityError from arkindex_worker import logger from arkindex_worker.cache import CachedElement, CachedImage -from arkindex_worker.models import Corpus, Element +from arkindex_worker.models import Element ElementType = NamedTuple("ElementType", name=str, slug=str, is_folder=bool) @@ -26,7 +24,7 @@ class ElementMixin(object): Mixin for the :class:`ElementsWorker` to provide ``Element`` helpers. """ - def create_required_types(self, corpus: Corpus, element_types: List[ElementType]): + def create_required_types(self, element_types: List[ElementType]): """Creates given element types in the corpus. :param Corpus corpus: The corpus to create types on. @@ -39,48 +37,42 @@ class ElementMixin(object): "slug": element_type.slug, "display_name": element_type.name, "folder": element_type.is_folder, - "corpus": corpus.id, + "corpus": self.corpus_id, }, ) logger.info(f"Created a new element type with slug {element_type.slug}") def check_required_types( - self, corpus_id: str, *type_slugs: str, create_missing: bool = False + self, *type_slugs: str, create_missing: bool = False ) -> bool: """ Check that a corpus has a list of required element types, and raise an exception if any of them are missing. - :param str corpus_id: ID of the corpus to check types on. :param str \\*type_slugs: Type slugs to look for. :param bool create_missing: Whether missing types should be created. :returns bool: True if all of the specified type slugs have been found. :raises MissingTypeError: If any of the specified type slugs were not found. """ - assert isinstance( - corpus_id, (uuid.UUID, str) - ), "Corpus ID should be a string or UUID" assert len(type_slugs), "At least one element type slug is required." assert all( isinstance(slug, str) for slug in type_slugs ), "Element type slugs must be strings." - corpus = Corpus(self.request("RetrieveCorpus", id=corpus_id)) - # corpus = self.corpus_id - available_slugs = {element_type.slug for element_type in corpus.types} + corpus = self.request("RetrieveCorpus", id=self.corpus_id) + available_slugs = {element_type["slug"] for element_type in corpus["types"]} missing_slugs = set(type_slugs) - available_slugs if missing_slugs: if create_missing: self.create_required_types( - corpus, element_types=[ ElementType(slug, slug, False) for slug in missing_slugs ], ) else: raise MissingTypeError( - f'Element type(s) {", ".join(sorted(missing_slugs))} were not found in the {corpus.name} corpus ({corpus.id}).' + f'Element type(s) {", ".join(sorted(missing_slugs))} were not found in the {corpus["name"]} corpus ({corpus["id"]}).' ) return True diff --git a/arkindex_worker/worker/entity.py b/arkindex_worker/worker/entity.py index b0b529038c35a9a0925b2cf25054aea134741d15..173b6c34d3c025d909e30a1ced077c0cab56bfd7 100644 --- a/arkindex_worker/worker/entity.py +++ b/arkindex_worker/worker/entity.py @@ -30,9 +30,7 @@ class EntityMixin(object): Mixin for the :class:`ElementsWorker` to add ``Entity`` helpers. """ - def create_entity( - self, element, name, type, corpus=None, metas=dict(), validated=None - ): + def create_entity(self, element, name, type, metas=dict(), validated=None): """ Create an entity on the given corpus. If cache support is enabled, a :class:`CachedEntity` will also be created. @@ -46,9 +44,6 @@ class EntityMixin(object): value of the ``ARKINDEX_CORPUS_ID`` environment variable. :type corpus: str or None """ - if corpus is None: - corpus = self.corpus_id - assert element and isinstance( element, (Element, CachedElement) ), "element shouldn't be null and should be an Element or CachedElement" @@ -58,9 +53,6 @@ class EntityMixin(object): assert type and isinstance( type, EntityType ), "type shouldn't be null and should be of type EntityType" - assert corpus and isinstance( - corpus, str - ), "corpus shouldn't be null and should be of type str" if metas: assert isinstance(metas, dict), "metas should be of type dict" if validated is not None: @@ -76,7 +68,7 @@ class EntityMixin(object): "type": type.value, "metas": metas, "validated": validated, - "corpus": corpus, + "corpus": self.corpus_id, "worker_version": self.worker_version_id, }, ) diff --git a/tests/test_elements_worker/test_classifications.py b/tests/test_elements_worker/test_classifications.py index 2532c3591f0927ca2ed5d3b1c066bb9c3b04a41c..97531fe32260d090eb7b934013c1a47fc736cffa 100644 --- a/tests/test_elements_worker/test_classifications.py +++ b/tests/test_elements_worker/test_classifications.py @@ -12,7 +12,7 @@ from . import BASE_API_CALLS def test_get_ml_class_id_load_classes(responses, mock_elements_worker): - corpus_id = "12341234-1234-1234-1234-123412341234" + corpus_id = mock_elements_worker.corpus_id responses.add( responses.GET, f"http://testserver/api/v1/corpus/{corpus_id}/classes/", @@ -30,7 +30,7 @@ def test_get_ml_class_id_load_classes(responses, mock_elements_worker): ) assert not mock_elements_worker.classes - ml_class_id = mock_elements_worker.get_ml_class_id(corpus_id, "good") + ml_class_id = mock_elements_worker.get_ml_class_id("good") assert len(responses.calls) == len(BASE_API_CALLS) + 1 assert [ @@ -39,16 +39,16 @@ def test_get_ml_class_id_load_classes(responses, mock_elements_worker): ("GET", f"http://testserver/api/v1/corpus/{corpus_id}/classes/"), ] assert mock_elements_worker.classes == { - "12341234-1234-1234-1234-123412341234": {"good": "0000"} + "11111111-1111-1111-1111-111111111111": {"good": "0000"} } assert ml_class_id == "0000" def test_get_ml_class_id_inexistant_class(mock_elements_worker, responses): # A missing class is now created automatically - corpus_id = "12341234-1234-1234-1234-123412341234" + corpus_id = mock_elements_worker.corpus_id mock_elements_worker.classes = { - "12341234-1234-1234-1234-123412341234": {"good": "0000"} + "11111111-1111-1111-1111-111111111111": {"good": "0000"} } responses.add( @@ -60,15 +60,15 @@ def test_get_ml_class_id_inexistant_class(mock_elements_worker, responses): # Missing class at first assert mock_elements_worker.classes == { - "12341234-1234-1234-1234-123412341234": {"good": "0000"} + "11111111-1111-1111-1111-111111111111": {"good": "0000"} } - ml_class_id = mock_elements_worker.get_ml_class_id(corpus_id, "bad") + ml_class_id = mock_elements_worker.get_ml_class_id("bad") assert ml_class_id == "new-ml-class-1234" # Now it's available assert mock_elements_worker.classes == { - "12341234-1234-1234-1234-123412341234": { + "11111111-1111-1111-1111-111111111111": { "good": "0000", "bad": "new-ml-class-1234", } @@ -76,17 +76,17 @@ def test_get_ml_class_id_inexistant_class(mock_elements_worker, responses): def test_get_ml_class_id(mock_elements_worker): - corpus_id = "12341234-1234-1234-1234-123412341234" + # corpus_id = "12341234-1234-1234-1234-123412341234" mock_elements_worker.classes = { - "12341234-1234-1234-1234-123412341234": {"good": "0000"} + "11111111-1111-1111-1111-111111111111": {"good": "0000"} } - ml_class_id = mock_elements_worker.get_ml_class_id(corpus_id, "good") + ml_class_id = mock_elements_worker.get_ml_class_id("good") assert ml_class_id == "0000" def test_get_ml_class_reload(responses, mock_elements_worker): - corpus_id = "12341234-1234-1234-1234-123412341234" + corpus_id = mock_elements_worker.corpus_id # Add some initial classes responses.add( @@ -133,7 +133,7 @@ def test_get_ml_class_reload(responses, mock_elements_worker): ) # Simply request class 2, it should be reloaded - assert mock_elements_worker.get_ml_class_id(corpus_id, "class2") == "class2_id" + assert mock_elements_worker.get_ml_class_id("class2") == "class2_id" assert len(responses.calls) == len(BASE_API_CALLS) + 3 assert mock_elements_worker.classes == { diff --git a/tests/test_elements_worker/test_elements.py b/tests/test_elements_worker/test_elements.py index c74aa273ce87aaebd7ed9335df1aef610a982227..05aac1300f56564d3b3732a3173a69cbaad0d61c 100644 --- a/tests/test_elements_worker/test_elements.py +++ b/tests/test_elements_worker/test_elements.py @@ -22,20 +22,17 @@ from . import BASE_API_CALLS def test_check_required_types_argument_types(mock_elements_worker): - corpus_id = "12341234-1234-1234-1234-123412341234" - worker = ElementsWorker() - with pytest.raises(AssertionError) as e: - worker.check_required_types(corpus_id) + mock_elements_worker.check_required_types() assert str(e.value) == "At least one element type slug is required." with pytest.raises(AssertionError) as e: - worker.check_required_types(corpus_id, "lol", 42) + mock_elements_worker.check_required_types("lol", 42) assert str(e.value) == "Element type slugs must be strings." -def test_check_required_types(responses): - corpus_id = "12341234-1234-1234-1234-123412341234" +def test_check_required_types(responses, mock_elements_worker): + corpus_id = mock_elements_worker.corpus_id responses.add( responses.GET, f"http://testserver/api/v1/corpus/{corpus_id}/", @@ -45,22 +42,21 @@ def test_check_required_types(responses): "types": [{"slug": "folder"}, {"slug": "page"}], }, ) - worker = ElementsWorker() - worker.setup_api_client() + mock_elements_worker.setup_api_client() - assert worker.check_required_types(corpus_id, "page") - assert worker.check_required_types(corpus_id, "page", "folder") + assert mock_elements_worker.check_required_types("page") + assert mock_elements_worker.check_required_types("page", "folder") with pytest.raises(MissingTypeError) as e: - assert worker.check_required_types(corpus_id, "page", "text_line", "act") + assert mock_elements_worker.check_required_types("page", "text_line", "act") assert ( str(e.value) - == "Element type(s) act, text_line were not found in the Some Corpus corpus (12341234-1234-1234-1234-123412341234)." + == "Element type(s) act, text_line were not found in the Some Corpus corpus (11111111-1111-1111-1111-111111111111)." ) -def test_create_missing_types(responses): - corpus_id = "12341234-1234-1234-1234-123412341234" +def test_create_missing_types(responses, mock_elements_worker): + corpus_id = mock_elements_worker.corpus_id responses.add( responses.GET, @@ -99,11 +95,10 @@ def test_create_missing_types(responses): ) ], ) - worker = ElementsWorker() - worker.setup_api_client() + mock_elements_worker.setup_api_client() - assert worker.check_required_types( - corpus_id, "page", "text_line", "act", create_missing=True + assert mock_elements_worker.check_required_types( + "page", "text_line", "act", create_missing=True ) @@ -276,7 +271,7 @@ def test_database_arg_cache_missing_version_table( def test_load_corpus_classes_api_error(responses, mock_elements_worker): - corpus_id = "12341234-1234-1234-1234-123412341234" + corpus_id = mock_elements_worker.corpus_id responses.add( responses.GET, f"http://testserver/api/v1/corpus/{corpus_id}/classes/", @@ -287,7 +282,7 @@ def test_load_corpus_classes_api_error(responses, mock_elements_worker): with pytest.raises( Exception, match="Stopping pagination as data will be incomplete" ): - mock_elements_worker.load_corpus_classes(corpus_id) + mock_elements_worker.load_corpus_classes() assert len(responses.calls) == len(BASE_API_CALLS) + 5 assert [ @@ -304,7 +299,7 @@ def test_load_corpus_classes_api_error(responses, mock_elements_worker): def test_load_corpus_classes(responses, mock_elements_worker): - corpus_id = "12341234-1234-1234-1234-123412341234" + corpus_id = mock_elements_worker.corpus_id responses.add( responses.GET, f"http://testserver/api/v1/corpus/{corpus_id}/classes/", @@ -330,7 +325,7 @@ def test_load_corpus_classes(responses, mock_elements_worker): ) assert not mock_elements_worker.classes - mock_elements_worker.load_corpus_classes(corpus_id) + mock_elements_worker.load_corpus_classes() assert len(responses.calls) == len(BASE_API_CALLS) + 1 assert [ @@ -339,7 +334,7 @@ def test_load_corpus_classes(responses, mock_elements_worker): ("GET", f"http://testserver/api/v1/corpus/{corpus_id}/classes/"), ] assert mock_elements_worker.classes == { - "12341234-1234-1234-1234-123412341234": { + "11111111-1111-1111-1111-111111111111": { "good": "0000", "average": "1111", "bad": "2222", diff --git a/tests/test_elements_worker/test_entities.py b/tests/test_elements_worker/test_entities.py index 71ed34ab29e5e6f18ddcab31d94165125e0a23bb..fded79794d9adbadbb53d0df8d7f31ebb9dd595e 100644 --- a/tests/test_elements_worker/test_entities.py +++ b/tests/test_elements_worker/test_entities.py @@ -24,7 +24,6 @@ def test_create_entity_wrong_element(mock_elements_worker): element=None, name="Bob Bob", type=EntityType.Person, - corpus="12341234-1234-1234-1234-123412341234", ) assert ( str(e.value) @@ -36,7 +35,6 @@ def test_create_entity_wrong_element(mock_elements_worker): element="not element type", name="Bob Bob", type=EntityType.Person, - corpus="12341234-1234-1234-1234-123412341234", ) assert ( str(e.value) @@ -52,7 +50,6 @@ def test_create_entity_wrong_name(mock_elements_worker): element=elt, name=None, type=EntityType.Person, - corpus="12341234-1234-1234-1234-123412341234", ) assert str(e.value) == "name shouldn't be null and should be of type str" @@ -61,7 +58,6 @@ def test_create_entity_wrong_name(mock_elements_worker): element=elt, name=1234, type=EntityType.Person, - corpus="12341234-1234-1234-1234-123412341234", ) assert str(e.value) == "name shouldn't be null and should be of type str" @@ -74,7 +70,6 @@ def test_create_entity_wrong_type(mock_elements_worker): element=elt, name="Bob Bob", type=None, - corpus="12341234-1234-1234-1234-123412341234", ) assert str(e.value) == "type shouldn't be null and should be of type EntityType" @@ -83,7 +78,6 @@ def test_create_entity_wrong_type(mock_elements_worker): element=elt, name="Bob Bob", type=1234, - corpus="12341234-1234-1234-1234-123412341234", ) assert str(e.value) == "type shouldn't be null and should be of type EntityType" @@ -92,7 +86,6 @@ def test_create_entity_wrong_type(mock_elements_worker): element=elt, name="Bob Bob", type="not_an_entity_type", - corpus="12341234-1234-1234-1234-123412341234", ) assert str(e.value) == "type shouldn't be null and should be of type EntityType" @@ -101,6 +94,7 @@ def test_create_entity_wrong_corpus(monkeypatch, mock_elements_worker): elt = Element({"id": "12341234-1234-1234-1234-123412341234"}) # Triggering an error on metas param, not giving corpus should work since + # ARKINDEX_CORPUS_ID environment variable is set on mock_elements_worker with pytest.raises(AssertionError) as e: mock_elements_worker.create_entity( element=elt, @@ -110,27 +104,6 @@ def test_create_entity_wrong_corpus(monkeypatch, mock_elements_worker): ) assert str(e.value) == "metas should be of type dict" - # # Removing corpus_id variable should give an error when corpus=None - mock_elements_worker.corpus_id = None - with pytest.raises(AssertionError) as e: - mock_elements_worker.create_entity( - element=elt, - name="Bob Bob", - type=EntityType.Person, - corpus=None, - ) - assert str(e.value) == "corpus shouldn't be null and should be of type str" - - with pytest.raises(AssertionError) as e: - mock_elements_worker.create_entity( - element=elt, - name="Bob Bob", - type=EntityType.Person, - corpus=1234, - ) - assert str(e.value) == "corpus shouldn't be null and should be of type str" - mock_elements_worker.corpus_id = ("11111111-1111-1111-1111-111111111111",) - def test_create_entity_wrong_metas(mock_elements_worker): elt = Element({"id": "12341234-1234-1234-1234-123412341234"}) @@ -140,7 +113,6 @@ def test_create_entity_wrong_metas(mock_elements_worker): element=elt, name="Bob Bob", type=EntityType.Person, - corpus="12341234-1234-1234-1234-123412341234", metas="wrong metas", ) assert str(e.value) == "metas should be of type dict" @@ -154,7 +126,6 @@ def test_create_entity_wrong_validated(mock_elements_worker): element=elt, name="Bob Bob", type=EntityType.Person, - corpus="12341234-1234-1234-1234-123412341234", validated="wrong validated", ) assert str(e.value) == "validated should be of type bool" @@ -173,7 +144,6 @@ def test_create_entity_api_error(responses, mock_elements_worker): element=elt, name="Bob Bob", type=EntityType.Person, - corpus="12341234-1234-1234-1234-123412341234", ) assert len(responses.calls) == len(BASE_API_CALLS) + 5 @@ -202,7 +172,6 @@ def test_create_entity(responses, mock_elements_worker): element=elt, name="Bob Bob", type=EntityType.Person, - corpus="12341234-1234-1234-1234-123412341234", ) assert len(responses.calls) == len(BASE_API_CALLS) + 1 @@ -216,7 +185,7 @@ def test_create_entity(responses, mock_elements_worker): "type": "person", "metas": {}, "validated": None, - "corpus": "12341234-1234-1234-1234-123412341234", + "corpus": "11111111-1111-1111-1111-111111111111", "worker_version": "12341234-1234-1234-1234-123412341234", } assert entity_id == "12345678-1234-1234-1234-123456789123" @@ -235,7 +204,6 @@ def test_create_entity_with_cache(responses, mock_elements_worker_with_cache): element=elt, name="Bob Bob", type=EntityType.Person, - corpus="12341234-1234-1234-1234-123412341234", ) assert len(responses.calls) == len(BASE_API_CALLS) + 1 @@ -250,7 +218,7 @@ def test_create_entity_with_cache(responses, mock_elements_worker_with_cache): "type": "person", "metas": {}, "validated": None, - "corpus": "12341234-1234-1234-1234-123412341234", + "corpus": "11111111-1111-1111-1111-111111111111", "worker_version": "12341234-1234-1234-1234-123412341234", } assert entity_id == "12345678-1234-1234-1234-123456789123"