diff --git a/dan/datasets/extract/utils.py b/dan/datasets/extract/utils.py index dd57da22ad7349cfff042f09a6c0982a9cc35380..6bd3693c68c9422b8709c001f5166fc6a4d54b4c 100644 --- a/dan/datasets/extract/utils.py +++ b/dan/datasets/extract/utils.py @@ -193,10 +193,12 @@ class Tokenizer: vocab_size=self.subword_vocab_size, model_prefix=self.prefix, user_defined_symbols=self.special_tokens, + minloglevel=1, ) except Exception as e: logger.warning( - f"Failed to train a sentencepiece model for subword tokenization: {e}" + f"Failed to train a sentencepiece model for subword tokenization: {e} " + "Try again by editing the `--subword-vocab-size` parameter." ) self.sentencepiece_model = None return