From fcfdfcbed9db89d8b1e6bd870751950d0d8e5c36 Mon Sep 17 00:00:00 2001 From: Martin <maarand@teklia.com> Date: Fri, 29 Jan 2021 13:31:02 +0100 Subject: [PATCH] add sorts to make the splits reproducible --- kaldi_data_generator/kaldi_data_generator.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kaldi_data_generator/kaldi_data_generator.py b/kaldi_data_generator/kaldi_data_generator.py index 5156c77..df09faf 100644 --- a/kaldi_data_generator/kaldi_data_generator.py +++ b/kaldi_data_generator/kaldi_data_generator.py @@ -368,7 +368,8 @@ class KaldiPartitionSplitter: self.use_existing_split = use_existing_split def page_level_split(self, line_ids: list) -> dict: - page_ids = list({"_".join(line_id.split("_")[:-1]) for line_id in line_ids}) + # need to sort again, because `set` will lose the order + page_ids = sorted({"_".join(line_id.split("_")[:-1]) for line_id in line_ids}) random.Random(SEED).shuffle(page_ids) page_count = len(page_ids) @@ -398,7 +399,7 @@ class KaldiPartitionSplitter: lines_path = Path(f"{self.out_dir_base}/Lines") line_ids = [ str(file.relative_to(lines_path).with_suffix("")) - for file in lines_path.glob("**/*.jpg") + for file in sorted(lines_path.glob("**/*.jpg")) ] if self.use_existing_split: -- GitLab