Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision were being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • atr/dan
1 result
Show changes
Commits on Source (2)
......@@ -384,6 +384,9 @@ class ArkindexExtractor:
subword_vocab_size=self.subword_vocab_size,
)
if not tokenizer.sentencepiece_model:
return
for level, tokenize in (
("characters", tokenizer.char_tokenize),
("words", tokenizer.word_tokenize),
......@@ -478,6 +481,11 @@ class ArkindexExtractor:
pbar.update()
pbar.refresh()
if not self.data:
raise Exception(
"No data was extracted using the provided export database and parameters."
)
self.download_images()
self.format_lm_files()
self.export()
......
......@@ -186,12 +186,22 @@ class Tokenizer:
with NamedTemporaryFile(dir=self.outdir, suffix=".txt", mode="w") as tmp:
tmp.write("\n".join(self.training_corpus))
tmp.flush()
spm.SentencePieceTrainer.train(
input=tmp.name,
vocab_size=self.subword_vocab_size,
model_prefix=self.prefix,
user_defined_symbols=self.special_tokens,
)
try:
spm.SentencePieceTrainer.train(
input=tmp.name,
vocab_size=self.subword_vocab_size,
model_prefix=self.prefix,
user_defined_symbols=self.special_tokens,
minloglevel=1,
)
except Exception as e:
logger.warning(
f"Failed to train a sentencepiece model for subword tokenization: {e} "
"Try again by editing the `--subword-vocab-size` parameter."
)
self.sentencepiece_model = None
return
# Load the model
self.sentencepiece_model = spm.SentencePieceProcessor(
......