New Process Dataset Sets management

Merged: ml bonhomme requested to merge process-dataset-sets into master
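For orientation, the diff below assumes a schema roughly like the following, inferred from the lookups it uses (the "sets" relation on Process, set__dataset__creator, dataset__sets). This is only a sketch: the real ProcessDatasetSet and DatasetSet models live in arkindex.process.models and arkindex.training.models and are not part of this views diff; field names and options here are illustrative.

from django.db import models

class DatasetSet(models.Model):
    # The dataset__sets / set__dataset__sets lookups below imply a "sets" related name on the Dataset FK
    name = models.CharField(max_length=100)
    dataset = models.ForeignKey("training.Dataset", related_name="sets", on_delete=models.CASCADE)

class ProcessDatasetSet(models.Model):
    # Replaces ProcessDataset: a process now references individual dataset sets rather than whole datasets
    process = models.ForeignKey("process.Process", on_delete=models.CASCADE)
    set = models.ForeignKey("training.DatasetSet", on_delete=models.CASCADE)

# Prefetch("sets", queryset=DatasetSet.objects...) in StartProcess below suggests Process itself exposes
# a ManyToManyField to DatasetSet through ProcessDatasetSet; that definition is not shown in this diff.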
@@ -46,7 +46,6 @@ from rest_framework.generics import (
     RetrieveDestroyAPIView,
     RetrieveUpdateAPIView,
     RetrieveUpdateDestroyAPIView,
-    UpdateAPIView,
 )
 from rest_framework.response import Response
 from rest_framework.serializers import Serializer
@@ -61,7 +60,7 @@ from arkindex.process.models import (
     GitRef,
     GitRefType,
     Process,
-    ProcessDataset,
+    ProcessDatasetSet,
     ProcessMode,
     Revision,
     Worker,
@@ -87,7 +86,7 @@ from arkindex.process.serializers.imports import (
     StartProcessSerializer,
 )
 from arkindex.process.serializers.ingest import BucketSerializer, S3ImportSerializer
-from arkindex.process.serializers.training import ProcessDatasetSerializer
+from arkindex.process.serializers.training import ProcessDatasetSetSerializer
 from arkindex.process.serializers.worker_runs import (
     CorpusWorkerRunSerializer,
     UserWorkerRunSerializer,
@@ -126,7 +125,7 @@ from arkindex.project.pagination import CountCursorPagination
 from arkindex.project.permissions import IsVerified, IsVerifiedOrReadOnly
 from arkindex.project.tools import PercentileCont
 from arkindex.project.triggers import process_delete
-from arkindex.training.models import Model
+from arkindex.training.models import DatasetSet, Model
 from arkindex.users.models import Role, Scope

 logger = logging.getLogger(__name__)
@@ -565,7 +564,7 @@ class StartProcess(CorpusACLMixin, CreateAPIView):
                 "model_version__model",
                 "configuration",
             )))
-            .prefetch_related("datasets")
+            .prefetch_related(Prefetch("sets", queryset=DatasetSet.objects.select_related("dataset")))
             # Uses Exists() for has_tasks and not a __isnull because we are not joining on tasks and do not need to fetch them
             .annotate(has_tasks=Exists(Task.objects.filter(process=OuterRef("pk"))))
         )
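Two ORM details in the StartProcess hunk above carry the weight: the Prefetch object loads every selected process's dataset sets in a single extra query with each set's dataset already joined, and the Exists() annotation checks for tasks through a correlated subquery instead of a join. A minimal sketch of the combined pattern; Process and DatasetSet are the models imported above, while the import path of Task is not shown in this diff and is assumed to be available in the module:

from django.db.models import Exists, OuterRef, Prefetch

processes = (
    Process.objects
    # One extra query fetches all related DatasetSets, each with its Dataset pre-selected,
    # so iterating process.sets.all() afterwards triggers no further queries
    .prefetch_related(Prefetch("sets", queryset=DatasetSet.objects.select_related("dataset")))
    # Correlated EXISTS subquery: no join on tasks and no task rows are fetched
    .annotate(has_tasks=Exists(Task.objects.filter(process=OuterRef("pk"))))
)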
@@ -677,20 +676,20 @@ class DataFileCreate(CreateAPIView):
 @extend_schema(tags=["process"])
 @extend_schema_view(
     get=extend_schema(
-        operation_id="ListProcessDatasets",
+        operation_id="ListProcessSets",
         description=dedent(
             """
-            List all datasets on a process.
+            List all dataset sets on a process.

             Requires a **guest** access to the process.
             """
         ),
     ),
 )
-class ProcessDatasets(ProcessACLMixin, ListAPIView):
+class ProcessDatasetSets(ProcessACLMixin, ListAPIView):
     permission_classes = (IsVerified, )
-    serializer_class = ProcessDatasetSerializer
-    queryset = ProcessDataset.objects.none()
+    serializer_class = ProcessDatasetSetSerializer
+    queryset = ProcessDatasetSet.objects.none()

     @cached_property
     def process(self):
@@ -704,10 +703,10 @@ class ProcessDatasets(ProcessACLMixin, ListAPIView):
     def get_queryset(self):
         return (
-            ProcessDataset.objects.filter(process_id=self.process.id)
-            .select_related("process__creator", "dataset__creator")
-            .prefetch_related("dataset__sets")
-            .order_by("dataset__name")
+            ProcessDatasetSet.objects.filter(process_id=self.process.id)
+            .select_related("process__creator", "set__dataset__creator")
+            .prefetch_related(Prefetch("set__dataset__sets", queryset=DatasetSet.objects.order_by("name")))
+            .order_by("set__dataset__name", "set__name")
         )

     def get_serializer_context(self):
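As a usage sketch for the renamed list endpoint, using DRF's test client; the URL below is hypothetical since the route declaration is not part of this diff, and the assertions only restate the documented behaviour:

from rest_framework.test import APIClient

client = APIClient()
client.force_authenticate(user=user)  # any verified user with at least guest access to the process

# Hypothetical path for the ProcessDatasetSets view; the real URL pattern lives elsewhere
response = client.get(f"/api/v1/process/{process.id}/sets/")
assert response.status_code == 200
# One entry per ProcessDatasetSet, ordered by dataset name and then set name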
@@ -722,51 +721,52 @@ class ProcessDatasets(ProcessACLMixin, ListAPIView):
 @extend_schema(tags=["process"])
 @extend_schema_view(
     post=extend_schema(
-        operation_id="CreateProcessDataset",
+        operation_id="CreateProcessSet",
         description=dedent(
             """
-            Add a dataset to a process.
+            Add a dataset set to a process.

             Requires an **admin** access to the process and a **guest** access to the dataset's corpus.
             """
         ),
     ),
     delete=extend_schema(
-        operation_id="DestroyProcessDataset",
+        operation_id="DestroyProcessSet",
         description=dedent(
             """
-            Remove a dataset from a process.
+            Remove a dataset set from a process.

             Requires an **admin** access to the process.
             """
         ),
     ),
 )
-class ProcessDatasetManage(CreateAPIView, UpdateAPIView, DestroyAPIView):
+class ProcessDatasetSetManage(CreateAPIView, DestroyAPIView):
     permission_classes = (IsVerified, )
-    serializer_class = ProcessDatasetSerializer
+    serializer_class = ProcessDatasetSetSerializer

     def get_object(self):
-        process_dataset = get_object_or_404(
-            ProcessDataset.objects
-            .select_related("dataset__creator", "process__corpus")
-            .prefetch_related("dataset__sets")
+        qs = (
+            ProcessDatasetSet.objects
+            .select_related("set__dataset__creator", "process__corpus")
             # Required to check for a process that has already started
-            .annotate(process_has_tasks=Exists(Task.objects.filter(process_id=self.kwargs["process"]))),
-            dataset_id=self.kwargs["dataset"], process_id=self.kwargs["process"]
+            .annotate(process_has_tasks=Exists(Task.objects.filter(process_id=self.kwargs["process"])))
+        )
+        # Only prefetch the dataset sets when creating
+        if self.request.method != "DELETE":
+            qs = qs.prefetch_related(Prefetch("set__dataset__sets", queryset=DatasetSet.objects.order_by("name")))
+        process_set = get_object_or_404(
+            qs,
+            set_id=self.kwargs["set"], process_id=self.kwargs["process"]
         )
         # Copy the has_tasks annotation onto the process
-        process_dataset.process.has_tasks = process_dataset.process_has_tasks
-        return process_dataset
+        process_set.process.has_tasks = process_set.process_has_tasks
+        return process_set

     def destroy(self, request, *args, **kwargs):
         serializer = self.get_serializer(data=request.data)
         serializer.is_valid(raise_exception=True)
-        # Ignore the sets when retrieving the ProcessDataset instance, as there cannot be
-        # two ProcessDatasets with the same dataset and process, whatever the sets
-        validated_data = serializer.validated_data
-        del validated_data["sets"]
-        get_object_or_404(ProcessDataset, **validated_data).delete()
+        get_object_or_404(ProcessDatasetSet, **serializer.validated_data).delete()
         return Response(status=status.HTTP_204_NO_CONTENT)
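destroy() can now resolve the row straight from the serializer's validated data because a process may reference a given dataset set only once; the old workaround of stripping the sets key from validated_data disappears. A sketch of the uniqueness this relies on, as it might appear in the ProcessDatasetSet model's Meta (illustrative only, the actual constraint is defined in the models module, not in this diff):

# inside class ProcessDatasetSet(models.Model):
class Meta:
    constraints = [
        # At most one ProcessDatasetSet per (process, set) pair, so
        # get_object_or_404(ProcessDatasetSet, **serializer.validated_data) is unambiguous
        models.UniqueConstraint(fields=["process", "set"], name="unique_process_set"),
    ]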