Commit 95570042 authored by Solene Tarride

Add vocabulary size parameter for subword tokenizer

parent c0d6f936
This commit is part of merge request !287.
@@ -147,6 +147,13 @@ def add_extract_parser(subcommands) -> None:
         help="Images larger than this height will be resized to this height.",
     )
+    parser.add_argument(
+        "--subword-vocab-size",
+        type=int,
+        default=1000,
+        help="Size of the vocabulary used to train the sentencepiece subword tokenizer needed for the language model.",
+    )
     # Formatting arguments
     parser.add_argument(
         "--image-format",
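For reference, a minimal standalone sketch of how the new option behaves once wired into argparse. Only --subword-vocab-size, its type, and its default of 1000 come from this commit; the bare parser and the sample invocations are illustrative:

import argparse

# Hypothetical parser mirroring the new option added in this commit.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--subword-vocab-size",
    type=int,
    default=1000,
    help="Size of the vocabulary used to train the sentencepiece subword tokenizer.",
)

args = parser.parse_args(["--subword-vocab-size", "2000"])
assert args.subword_vocab_size == 2000  # explicit value overrides the default
args = parser.parse_args([])
assert args.subword_vocab_size == 1000  # default applies when the flag is omitted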
@@ -78,6 +78,7 @@ class ArkindexExtractor:
         keep_spaces: bool = False,
         image_extension: str = "",
         allow_empty: bool = False,
+        subword_vocab_size: int = 1000,
     ) -> None:
         self.folders = folders
         self.element_type = element_type
@@ -93,8 +94,8 @@
         self.image_extension = image_extension
         self.allow_empty = allow_empty
         self.mapping = LMTokenMapping()
         self.keep_spaces = keep_spaces
+        self.subword_vocab_size = subword_vocab_size
         self.data: Dict = defaultdict(dict)
         self.charset = set()
@@ -375,6 +376,7 @@ class ArkindexExtractor:
             outdir=self.output / "language_model",
             mapping=self.mapping,
             tokens=self.tokens,
+            subword_vocab_size=self.subword_vocab_size,
         )
         self.language_corpus["characters"] = [
             tokenizer.char_tokenize(doc) for doc in train_corpus
@@ -518,6 +520,7 @@ def run(
     image_format: str,
     keep_spaces: bool,
     allow_empty: bool,
+    subword_vocab_size: int,
 ):
     assert database.exists(), f"No file found @ {database}"
     open_database(path=database)
@@ -544,4 +547,5 @@ def run(
         keep_spaces=keep_spaces,
         image_extension=image_format,
         allow_empty=allow_empty,
+        subword_vocab_size=subword_vocab_size,
     ).run()
@@ -134,7 +134,7 @@ class Tokenizer:
         self.tokens = tokens
         self.mapping = mapping
         # Train the subword tokenizer
-        self.user_subword_vocab_size = subword_vocab_size
+        self.subword_vocab_size = subword_vocab_size
         self.sentencepiece_model = self.train_subword_tokenizer()
 
     @property
@@ -153,11 +153,6 @@
     def special_tokens(self) -> List[str]:
        return list(set(self.ner_tokens + self.mapping_tokens))
 
-    @property
-    def subword_vocab_size(self):
-        n_words = len(set([word for doc in self.corpus for word in doc.split()]))
-        return min(self.user_subword_vocab_size, 3 * n_words)
-
     def train_subword_tokenizer(self):
         """
         Train a sentencepiece model on the training corpus.
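Note the behavior change in this last hunk: the removed property capped the vocabulary at three times the number of distinct words in the training corpus, while after this commit the Tokenizer stores the user-supplied value directly (whether any capping still happens inside train_subword_tokenizer is not visible in this diff). For context, a hedged sketch of how such a vocabulary size is typically passed to sentencepiece; the file names and sample sentence are illustrative, not taken from this repository:

import sentencepiece as spm

# Train a subword model on a plain-text corpus (one sentence per line).
# "corpus.txt" and "subword" are illustrative names.
spm.SentencePieceTrainer.train(
    input="corpus.txt",
    model_prefix="subword",  # writes subword.model and subword.vocab
    vocab_size=1000,         # the value this commit makes configurable
)

# Load the trained model and tokenize a sentence into subword pieces.
sp = spm.SentencePieceProcessor(model_file="subword.model")
print(sp.encode("a line of handwritten text", out_type=str))

Sentencepiece raises an error when vocab_size exceeds what the corpus can support, which is plausibly what the removed 3 * n_words cap guarded against; with this commit, choosing a feasible value is left to the user.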