diff --git a/kaldi_data_generator/kaldi_data_generator.py b/kaldi_data_generator/kaldi_data_generator.py index 5156c7735c3adf1af8714b0d75bba6a490cf6ba4..df09faf0379606944d667d66dca25d3b48c359d9 100644 --- a/kaldi_data_generator/kaldi_data_generator.py +++ b/kaldi_data_generator/kaldi_data_generator.py @@ -368,7 +368,8 @@ class KaldiPartitionSplitter: self.use_existing_split = use_existing_split def page_level_split(self, line_ids: list) -> dict: - page_ids = list({"_".join(line_id.split("_")[:-1]) for line_id in line_ids}) + # need to sort again, because `set` will lose the order + page_ids = sorted({"_".join(line_id.split("_")[:-1]) for line_id in line_ids}) random.Random(SEED).shuffle(page_ids) page_count = len(page_ids) @@ -398,7 +399,7 @@ class KaldiPartitionSplitter: lines_path = Path(f"{self.out_dir_base}/Lines") line_ids = [ str(file.relative_to(lines_path).with_suffix("")) - for file in lines_path.glob("**/*.jpg") + for file in sorted(lines_path.glob("**/*.jpg")) ] if self.use_existing_split: