Skip to content
Snippets Groups Projects

Rework the worker due to `Dataset` API changes

Merged: Eva Bardou requested to merge `rework-worker` into `main`
All threads resolved!
@@ -356,14 +356,15 @@ class DatasetExtractor(DatasetWorker):
self.data_folder.cleanup()
def run(self):
super().configure()
self.configure()
# Download corpus
self.download_latest_export()
dataset_sets: list[Set] = list(self.list_sets())
- grouped_sets: list[list[Set]] = [
- list(sets) for _, sets in groupby(dataset_sets, attrgetter("dataset"))
+ grouped_sets: list[tuple[Dataset, list[Set]]] = [
+ (dataset, list(sets))
+ for dataset, sets in groupby(dataset_sets, attrgetter("dataset"))
]
if not grouped_sets:
logger.warning("No datasets to process, stopping.")
@@ -372,9 +373,7 @@ class DatasetExtractor(DatasetWorker):
# Process every dataset
count = len(grouped_sets)
failed = 0
- for i, sets in enumerate(grouped_sets, start=1):
- dataset = sets[0].dataset
+ for i, (dataset, sets) in enumerate(grouped_sets, start=1):
try:
assert dataset.state in [
DatasetState.Open.value,
Loading