From ec0b94eb56d11dbc806e10985a349caf76f6ecfe Mon Sep 17 00:00:00 2001
From: Yoann Schneider <yschneider@teklia.com>
Date: Thu, 1 Dec 2022 10:22:19 +0000
Subject: [PATCH] Properly compute file hashes during model publication

---
 arkindex_worker/worker/training.py                     | 10 +++++++---
 tests/conftest.py                                      |  4 ++--
 .../model-best => root_folder}/model_file.pth          |  0
 .../subfolder1}/model_file.pth                         |  0
 tests/samples/root_folder/subfolder2/model_file.pth    |  1 +
 tests/test_elements_worker/test_training.py            |  2 +-
 6 files changed, 11 insertions(+), 6 deletions(-)
 rename tests/samples/{model_files_with_subfolder/model-best => root_folder}/model_file.pth (100%)
 rename tests/samples/{model_files_with_subfolder/model-last => root_folder/subfolder1}/model_file.pth (100%)
 create mode 100644 tests/samples/root_folder/subfolder2/model_file.pth

diff --git a/arkindex_worker/worker/training.py b/arkindex_worker/worker/training.py
index ffe52d6f..bacd1dde 100644
--- a/arkindex_worker/worker/training.py
+++ b/arkindex_worker/worker/training.py
@@ -50,10 +50,14 @@ def create_archive(path: DirPath) -> Tuple[Path, Hash, FileSize, Hash]:
 
     # Create an uncompressed tar archive with all the needed files
     # Files hierarchy ifs kept in the archive.
-
+    file_list = []
     with tarfile.open(path_to_tar_archive, "w") as tar:
-        tar.add(path)
-        file_list = [member for member in tar.getnames() if os.path.isfile(member)]
+        for p in path.glob("**/*"):
+            x = p.relative_to(path)
+            tar.add(p, arcname=x, recursive=False)
+            # Only keep files when computing the hash
+            if p.is_file():
+                file_list.append(p)
 
     # Sort by path
     file_list.sort()
diff --git a/tests/conftest.py b/tests/conftest.py
index 8b6f16ef..1ab4047a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -26,7 +26,7 @@ from arkindex_worker.worker import BaseWorker, ElementsWorker
 from arkindex_worker.worker.transcription import TextOrientation
 
 FIXTURES_DIR = Path(__file__).resolve().parent / "data"
-SAMPLES_DIR = Path("tests") / "samples"
+SAMPLES_DIR = Path(__file__).resolve().parent / "samples"
 
 __yaml_cache = {}
 
@@ -282,7 +282,7 @@ def model_file_dir():
 
 @pytest.fixture
 def model_file_dir_with_subfolder():
-    return SAMPLES_DIR / "model_files_with_subfolder"
+    return SAMPLES_DIR / "root_folder"
 
 
 @pytest.fixture
diff --git a/tests/samples/model_files_with_subfolder/model-best/model_file.pth b/tests/samples/root_folder/model_file.pth
similarity index 100%
rename from tests/samples/model_files_with_subfolder/model-best/model_file.pth
rename to tests/samples/root_folder/model_file.pth
diff --git a/tests/samples/model_files_with_subfolder/model-last/model_file.pth b/tests/samples/root_folder/subfolder1/model_file.pth
similarity index 100%
rename from tests/samples/model_files_with_subfolder/model-last/model_file.pth
rename to tests/samples/root_folder/subfolder1/model_file.pth
diff --git a/tests/samples/root_folder/subfolder2/model_file.pth b/tests/samples/root_folder/subfolder2/model_file.pth
new file mode 100644
index 00000000..cc78ba30
--- /dev/null
+++ b/tests/samples/root_folder/subfolder2/model_file.pth
@@ -0,0 +1 @@
+Wow this is actually the data of the best model ever created on Arkindex
\ No newline at end of file
diff --git a/tests/test_elements_worker/test_training.py b/tests/test_elements_worker/test_training.py
index 4448554d..551fdf6d 100644
--- a/tests/test_elements_worker/test_training.py
+++ b/tests/test_elements_worker/test_training.py
@@ -56,7 +56,7 @@ def test_create_archive_with_subfolder(model_file_dir_with_subfolder):
     ):
         assert os.path.exists(zst_archive_path), "The archive was not created"
         assert (
-            hash == "e2fa86cefc33b24502ad4151a638dd29"
+            hash == "3e453881404689e6e125144d2db3e605"
         ), "Hash was not properly computed"
         assert 300 < size < 1500
 
-- 
GitLab