Training crashes with data augmentation
I've encountered this problem several times when training on 2 or 4 GPUs (batch_size > 1). It has never happened on a single GPU.
This could be related to padding, but we'll have to investigate. A minimal reproduction attempt is sketched below the traceback.
Traceback (most recent call last):
  File "/lustre/fsn1/projects/rech/yfq/ubz97wr/envs/dan/bin/teklia-dan", line 8, in <module>
    sys.exit(main())
  File "/lustre/fswork/projects/rech/yfq/ubz97wr/dan/dan/cli.py", line 31, in main
    status = args.pop("func")(**args)
  File "/lustre/fswork/projects/rech/yfq/ubz97wr/dan/dan/ocr/train.py", line 141, in run
    start_training(config, mlflow_logging=False)
  File "/lustre/fswork/projects/rech/yfq/ubz97wr/dan/dan/ocr/train.py", line 105, in start_training
    train(0, config, mlflow_logging)
  File "/lustre/fswork/projects/rech/yfq/ubz97wr/dan/dan/ocr/train.py", line 52, in train
    model.train(mlflow_logging=mlflow_logging)
  File "/lustre/fswork/projects/rech/yfq/ubz97wr/dan/dan/ocr/manager/training.py", line 669, in train
    for ind_batch, batch_data in enumerate(self.dataset.train_loader):
  File "/lustre/fsn1/projects/rech/yfq/ubz97wr/envs/dan/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 631, in __next__
    data = self._next_data()
  File "/lustre/fsn1/projects/rech/yfq/ubz97wr/envs/dan/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 675, in _next_data
    data = self._dataset_fetcher.fetch(index) # may raise StopIteration
  File "/lustre/fsn1/projects/rech/yfq/ubz97wr/envs/dan/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 51, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/lustre/fsn1/projects/rech/yfq/ubz97wr/envs/dan/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 51, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/lustre/fswork/projects/rech/yfq/ubz97wr/dan/dan/ocr/manager/dataset.py", line 73, in __getitem__
    sample["img"] = self.augmentation_transforms(image=sample["img"])["image"]
  File "/lustre/fsn1/projects/rech/yfq/ubz97wr/envs/dan/lib/python3.10/site-packages/albumentations/core/composition.py", line 210, in __call__
    data = t(**data)
  File "/lustre/fsn1/projects/rech/yfq/ubz97wr/envs/dan/lib/python3.10/site-packages/albumentations/core/composition.py", line 359, in __call__
    data = t(force_apply=True, **data)
  File "/lustre/fsn1/projects/rech/yfq/ubz97wr/envs/dan/lib/python3.10/site-packages/albumentations/core/transforms_interface.py", line 109, in __call__
    params_dependent_on_targets = self.get_params_dependent_on_targets(targets_as_params)
  File "/lustre/fsn1/projects/rech/yfq/ubz97wr/envs/dan/lib/python3.10/site-packages/albumentations/augmentations/dropout/coarse_dropout.py", line 151, in get_params_dependent_on_targets
    x1 = random.randint(0, width - hole_width)
  File "/lustre/fsn1/projects/rech/yfq/ubz97wr/envs/dan/lib/python3.10/random.py", line 370, in randint
    return self.randrange(a, b+1)
  File "/lustre/fsn1/projects/rech/yfq/ubz97wr/envs/dan/lib/python3.10/random.py", line 353, in randrange
    raise ValueError("empty range for randrange() (%d, %d, %d)" % (istart, istop, width))
ValueError: empty range for randrange() (0, 0, 0)
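For context, the empty range comes from Albumentations' CoarseDropout: get_params_dependent_on_targets() calls random.randint(0, width - hole_width), which fails as soon as the sampled hole is wider (or taller) than the image. The snippet below is a minimal sketch of that condition, not our actual training config: the image shape and hole sizes are made-up values chosen only to trigger the error, using the albumentations 1.x CoarseDropout API.

import albumentations as A
import numpy as np

# Hypothetical sample that is only 7 px wide, i.e. narrower than the hole we request.
img = np.zeros((64, 7, 3), dtype=np.uint8)

# Assumed settings: a fixed 8x8 hole. With an image width of 7,
# get_params_dependent_on_targets() ends up calling random.randint(0, 7 - 8),
# which raises exactly "empty range for randrange() (0, 0, 0)".
transform = A.CoarseDropout(
    max_holes=1,
    min_height=8,
    max_height=8,
    min_width=8,
    max_width=8,
    p=1.0,  # always apply, like the force_apply=True call in the traceback
)

transform(image=img)  # raises ValueError: empty range for randrange() (0, 0, 0)

If this is indeed the trigger, the crash would come from individual very small samples rather than from batch padding itself, since the augmentation runs per image in __getitem__ before collation.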