Skip to content
Snippets Groups Projects
Commit 74ffe283 authored by Manon Blanco
Browse files

Catch runtimeError when formatting LM files

parent a62a2366
No related branches found
No related tags found
1 merge request: !313 Catch runtimeError when formatting LM files
This commit is part of merge request !313. Comments created here will be created in the context of that merge request.
......@@ -384,6 +384,9 @@ class ArkindexExtractor:
subword_vocab_size=self.subword_vocab_size,
)
if not tokenizer.sentencepiece_model:
return
for level, tokenize in (
("characters", tokenizer.char_tokenize),
("words", tokenizer.word_tokenize),
......@@ -454,7 +457,7 @@ class ArkindexExtractor:
)
if failed_downloads:
logger.error(f"Failed to download {len(failed_downloads)} image(s).")
logger.warning(f"Failed to download {len(failed_downloads)} image(s).")
print(*list(map(": ".join, failed_downloads)), sep="\n")
def run(self):
......
......@@ -186,12 +186,19 @@ class Tokenizer:
with NamedTemporaryFile(dir=self.outdir, suffix=".txt", mode="w") as tmp:
tmp.write("\n".join(self.training_corpus))
tmp.flush()
spm.SentencePieceTrainer.train(
input=tmp.name,
vocab_size=self.subword_vocab_size,
model_prefix=self.prefix,
user_defined_symbols=self.special_tokens,
)
try:
spm.SentencePieceTrainer.train(
input=tmp.name,
vocab_size=self.subword_vocab_size,
model_prefix=self.prefix,
user_defined_symbols=self.special_tokens,
minloglevel=1,
)
except Exception as e:
logger.warning(f"Failed to train a sentencepiece model for subword tokenization: {e}")
self.sentencepiece_model = None
return
# Load the model
self.sentencepiece_model = spm.SentencePieceProcessor(
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment