From fcfdfcbed9db89d8b1e6bd870751950d0d8e5c36 Mon Sep 17 00:00:00 2001
From: Martin <maarand@teklia.com>
Date: Fri, 29 Jan 2021 13:31:02 +0100
Subject: [PATCH] add sorts to make the splits reproducible

---
 kaldi_data_generator/kaldi_data_generator.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kaldi_data_generator/kaldi_data_generator.py b/kaldi_data_generator/kaldi_data_generator.py
index 5156c77..df09faf 100644
--- a/kaldi_data_generator/kaldi_data_generator.py
+++ b/kaldi_data_generator/kaldi_data_generator.py
@@ -368,7 +368,8 @@ class KaldiPartitionSplitter:
         self.use_existing_split = use_existing_split
 
     def page_level_split(self, line_ids: list) -> dict:
-        page_ids = list({"_".join(line_id.split("_")[:-1]) for line_id in line_ids})
+        # sort: `set` order is arbitrary, so the seeded shuffle needs a stable input
+        page_ids = sorted({"_".join(line_id.split("_")[:-1]) for line_id in line_ids})
         random.Random(SEED).shuffle(page_ids)
         page_count = len(page_ids)
 
@@ -398,7 +399,7 @@ class KaldiPartitionSplitter:
         lines_path = Path(f"{self.out_dir_base}/Lines")
         line_ids = [
             str(file.relative_to(lines_path).with_suffix(""))
-            for file in lines_path.glob("**/*.jpg")
+            for file in sorted(lines_path.glob("**/*.jpg"))
         ]
 
         if self.use_existing_split:
-- 
GitLab