From 74ffe2839ee84ddd993b6f84d5c5296160ae966c Mon Sep 17 00:00:00 2001
From: manonBlanco <blanco@teklia.com>
Date: Mon, 6 Nov 2023 16:49:12 +0100
Subject: [PATCH] Catch RuntimeError when formatting LM files

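SentencePiece training can raise a RuntimeError, typically when the
training corpus is too small for the requested subword_vocab_size.
Instead of crashing the extraction, catch the error, log a warning and
skip the generation of the language-model files. Also reduce the
verbosity of the SentencePiece training logs and downgrade the
failed-downloads message from error to warning.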
---
 dan/datasets/extract/arkindex.py |  5 ++++-
 dan/datasets/extract/utils.py    | 19 +++++++++++++------
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/dan/datasets/extract/arkindex.py b/dan/datasets/extract/arkindex.py
index 5fa595eb..b98fc672 100644
--- a/dan/datasets/extract/arkindex.py
+++ b/dan/datasets/extract/arkindex.py
@@ -384,6 +384,9 @@ class ArkindexExtractor:
             subword_vocab_size=self.subword_vocab_size,
         )
 
+        if not tokenizer.sentencepiece_model:
+            return
+
         for level, tokenize in (
             ("characters", tokenizer.char_tokenize),
             ("words", tokenizer.word_tokenize),
@@ -454,7 +457,7 @@ class ArkindexExtractor:
                 )
 
         if failed_downloads:
-            logger.error(f"Failed to download {len(failed_downloads)} image(s).")
+            logger.warning(f"Failed to download {len(failed_downloads)} image(s).")
             print(*list(map(": ".join, failed_downloads)), sep="\n")
 
     def run(self):
diff --git a/dan/datasets/extract/utils.py b/dan/datasets/extract/utils.py
index 8ee14af3..183c9222 100644
--- a/dan/datasets/extract/utils.py
+++ b/dan/datasets/extract/utils.py
@@ -186,12 +186,19 @@ class Tokenizer:
         with NamedTemporaryFile(dir=self.outdir, suffix=".txt", mode="w") as tmp:
             tmp.write("\n".join(self.training_corpus))
             tmp.flush()
-            spm.SentencePieceTrainer.train(
-                input=tmp.name,
-                vocab_size=self.subword_vocab_size,
-                model_prefix=self.prefix,
-                user_defined_symbols=self.special_tokens,
-            )
+
+            try:
+                spm.SentencePieceTrainer.train(
+                    input=tmp.name,
+                    vocab_size=self.subword_vocab_size,
+                    model_prefix=self.prefix,
+                    user_defined_symbols=self.special_tokens,
+                    minloglevel=1,
+                )
+            except Exception as e:
+                logger.warning(f"Failed to train a sentencepiece model for subword tokenization: {e}")
+                self.sentencepiece_model = None
+                return
 
         # Load the model
         self.sentencepiece_model = spm.SentencePieceProcessor(
-- 
GitLab