Skip to content
Snippets Groups Projects

Dataset unique elements

Merged Valentin Rigal requested to merge dataset-unique-elements into master
All threads resolved!
1 file
+ 17
12
Compare changes
  • Side-by-side
  • Inline
@@ -542,6 +542,10 @@ class DatasetSerializer(serializers.ModelSerializer):
# Distribution of elements across the dataset's sets; per the help text this
# is null when several datasets are listed at once.
set_elements = DatasetSetsCountField(
help_text="Distribution of elements in sets. This value is set to null when listing multiple datasets.",
)
# Uniqueness flag, declared with a default of True: when enabled, an element
# may only belong to one set of the dataset at a time.
unique_elements = serializers.BooleanField(
default=True,
help_text="Ensures that an element is only present in a single set at a time.",
)
def validate_state(self, state):
"""
@@ -575,6 +579,20 @@ class DatasetSerializer(serializers.ModelSerializer):
raise ValidationError("Either do not specify set names to use the default values, or specify a non-empty list of names.")
return set_names
def validate_unique_elements(self, unique):
    """
    Validate the ``unique_elements`` flag.

    The only case that needs a database check is an update that switches an
    existing dataset from ``unique_elements=False`` to ``True``: the switch
    is refused when any element is referenced more than once across the
    sets of that dataset.

    Returns the value unchanged, or raises ``ValidationError`` when
    duplicates are found.
    """
    # Nothing to verify unless uniqueness is explicitly being requested
    if unique is not True:
        return unique

    dataset = self.instance
    # Creation (no instance yet), or a dataset that already enforces
    # uniqueness: no transition from False to True is happening.
    if not dataset or dataset.unique_elements:
        return unique

    # Group the dataset's set elements by element and keep those that
    # appear at least twice
    duplicated_elements = (
        DatasetElement.objects
        .filter(set__dataset_id=dataset.pk)
        .values("element_id")
        .annotate(dupes=Count("element_id"))
        .filter(dupes__gte=2)
    )
    if duplicated_elements.exists():
        raise ValidationError("Some elements are currently contained by multiple sets.")

    return unique
def validate(self, data):
data = super().validate(data)
@@ -636,6 +654,7 @@ class DatasetSerializer(serializers.ModelSerializer):
# Hidden field to set the creator as the authenticated user
"default_creator",
"task_id",
"unique_elements",
"created",
"updated",
)
@@ -700,6 +719,21 @@ class DatasetElementSerializer(serializers.ModelSerializer):
self.fields["element_id"].queryset = Element.objects.filter(corpus=dataset.corpus)
self.fields["set"].queryset = dataset.sets.all()
def validate_element_id(self, element):
    """
    Validate the element against the dataset's uniqueness setting.

    When the serializer context holds a dataset with ``unique_elements``
    enabled, the element is rejected if one of the dataset's sets already
    contains it; the error message names the first such set.

    Returns the element unchanged when the check passes or does not apply.
    """
    dataset = self.context.get("dataset")
    # No dataset in the context, or uniqueness not enforced: accept as is
    if not dataset or not dataset.unique_elements:
        return element

    # Name of a set of this dataset already holding the element, if any
    existing_set = (
        dataset.sets
        .filter(set_elements__element=element)
        .values_list("name", flat=True)
        .first()
    )
    if existing_set:
        raise ValidationError([
            f"The dataset requires unique elements and this element is already present in set {existing_set}."
        ])

    return element
def validate(self, data):
data = super().validate(data)
data.pop("dataset")
@@ -759,6 +793,20 @@ class SelectionDatasetElementSerializer(serializers.Serializer):
raise ValidationError(f"Dataset {set.dataset.id} is not part of corpus {corpus.name}.")
if set.dataset.state == DatasetState.Complete:
raise ValidationError(f"Dataset {set.dataset.id} is marked as completed.")
# Ensure adding elements to the dataset does not break uniqueness constraint
selection = self.context["request"].user.selected_elements.filter(corpus=corpus)
if set.dataset.unique_elements and (
existing_set := (
set.dataset.sets
.exclude(id=set.id)
.filter(set_elements__element_id__in=selection.values_list("id", flat=True))
.values_list("name", flat=True)
.first()
)
):
raise ValidationError([
f"The dataset requires unique elements and some elements are already present in set {existing_set}."
])
return set
def create(self, validated_data):
Loading