Skip to content
Snippets Groups Projects
Commit c9f4a20b authored by Solene Tarride's avatar Solene Tarride
Browse files

Add vocabulary size parameter for subword tokenizer

parent 29106bb0
No related branches found
No related tags found
1 merge request!287Support subword and word language models
This commit is part of merge request !287. Comments created here will be created in the context of that merge request.
......@@ -147,6 +147,13 @@ def add_extract_parser(subcommands) -> None:
help="Images larger than this height will be resized to this height.",
)
parser.add_argument(
"--subword-vocab-size",
type=int,
default=1000,
help="Size of the vocabulary to train the sentencepiece subword tokenizer needed for language model.",
)
# Formatting arguments
parser.add_argument(
"--image-format",
......
......@@ -78,6 +78,7 @@ class ArkindexExtractor:
keep_spaces: bool = False,
image_extension: str = "",
allow_empty: bool = False,
subword_vocab_size: int = 1000,
) -> None:
self.folders = folders
self.element_type = element_type
......@@ -93,8 +94,8 @@ class ArkindexExtractor:
self.image_extension = image_extension
self.allow_empty = allow_empty
self.mapping = LMTokenMapping()
self.keep_spaces = keep_spaces
self.subword_vocab_size = subword_vocab_size
self.data: Dict = defaultdict(dict)
self.charset = set()
......@@ -375,6 +376,7 @@ class ArkindexExtractor:
outdir=self.output / "language_model",
mapping=self.mapping,
tokens=self.tokens,
subword_vocab_size=self.subword_vocab_size,
)
self.language_corpus["characters"] = [
tokenizer.char_tokenize(doc) for doc in train_corpus
......@@ -518,6 +520,7 @@ def run(
image_format: str,
keep_spaces: bool,
allow_empty: bool,
subword_vocab_size: int,
):
assert database.exists(), f"No file found @ {database}"
open_database(path=database)
......@@ -544,4 +547,5 @@ def run(
keep_spaces=keep_spaces,
image_extension=image_format,
allow_empty=allow_empty,
subword_vocab_size=subword_vocab_size,
).run()
......@@ -142,7 +142,7 @@ class Tokenizer:
self.tokens = tokens
self.mapping = mapping
# Train the subword tokenizer
self.user_subword_vocab_size = subword_vocab_size
self.subword_vocab_size = subword_vocab_size
self.sentencepiece_model = self.train_subword_tokenizer()
@property
......@@ -161,11 +161,6 @@ class Tokenizer:
def special_tokens(self) -> List[str]:
    """Return the deduplicated union of NER tokens and mapping tokens."""
    merged = set(self.ner_tokens)
    merged.update(self.mapping_tokens)
    return list(merged)
@property
def subword_vocab_size(self) -> int:
    """Effective sentencepiece vocabulary size.

    Caps the user-requested size at 3x the number of distinct
    whitespace-separated words in the training corpus, since
    sentencepiece cannot train a vocabulary much larger than the
    corpus supports.
    """
    # Set comprehension avoids materializing an intermediate list (ruff C403).
    n_words = len({word for doc in self.corpus for word in doc.split()})
    return min(self.user_subword_vocab_size, 3 * n_words)
def train_subword_tokenizer(self):
"""
Train a sentencepiece model on the training corpus.
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment