Commit 1611527b authored by Solene Tarride, committed by Solene Tarride

Add vocabulary size parameter for subword tokenizer

parent d6dcd979
@@ -147,6 +147,13 @@ def add_extract_parser(subcommands) -> None:
         help="Images larger than this height will be resized to this width.",
     )
+    parser.add_argument(
+        "--subword-vocab-size",
+        type=int,
+        default=1000,
+        help="Size of the vocabulary to train the sentencepiece subword tokenizer needed for language model.",
+    )
+
     # Formatting arguments
     parser.add_argument(
         "--image-format",
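For illustration, here is a minimal standalone sketch of the new option; the parser setup around it is hypothetical, and only the `--subword-vocab-size` definition mirrors the diff:

import argparse

# Minimal sketch: the new option in an isolated parser. Only the
# --subword-vocab-size definition mirrors the diff; the rest is hypothetical.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--subword-vocab-size",
    type=int,
    default=1000,
    help="Size of the vocabulary to train the sentencepiece subword tokenizer needed for language model.",
)

# When the flag is omitted, the default of 1000 applies; argparse maps the
# dashes to underscores on the resulting namespace.
assert parser.parse_args([]).subword_vocab_size == 1000
assert parser.parse_args(["--subword-vocab-size", "1500"]).subword_vocab_size == 1500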
@@ -78,6 +78,7 @@ class ArkindexExtractor:
         keep_spaces: bool = False,
         image_extension: str = "",
         allow_empty: bool = False,
+        subword_vocab_size: int = 1000,
     ) -> None:
         self.folders = folders
         self.element_type = element_type

@@ -93,8 +94,8 @@ class ArkindexExtractor:
         self.image_extension = image_extension
         self.allow_empty = allow_empty
         self.mapping = LMTokenMapping()
         self.keep_spaces = keep_spaces
-
+        self.subword_vocab_size = subword_vocab_size
         self.data: Dict = defaultdict(dict)
         self.charset = set()

@@ -375,6 +376,7 @@ class ArkindexExtractor:
             outdir=self.output / "language_model",
             mapping=self.mapping,
             tokens=self.tokens,
+            subword_vocab_size=self.subword_vocab_size,
         )
         self.language_corpus["characters"] = [
             tokenizer.char_tokenize(doc) for doc in train_corpus

@@ -518,6 +520,7 @@ def run(
     image_format: str,
     keep_spaces: bool,
     allow_empty: bool,
+    subword_vocab_size: int,
 ):
     assert database.exists(), f"No file found @ {database}"
     open_database(path=database)

@@ -544,4 +547,5 @@ def run(
         keep_spaces=keep_spaces,
         image_extension=image_format,
         allow_empty=allow_empty,
+        subword_vocab_size=subword_vocab_size,
     ).run()
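Taken together, these hunks thread the CLI value through `run()` into the extractor, which stores it for the `Tokenizer`. Note that the default of 1000 now lives in two places: the argparse definition and the `ArkindexExtractor` keyword default. A hedged sketch of the flow, with a stand-in class rather than the real one:

# Hedged sketch of the parameter flow; Extractor stands in for the real
# ArkindexExtractor, and only the parameter names come from the diff.
class Extractor:
    def __init__(self, subword_vocab_size: int = 1000) -> None:
        # Stored on the instance, later forwarded to Tokenizer(...) as
        # subword_vocab_size=self.subword_vocab_size.
        self.subword_vocab_size = subword_vocab_size

def run(subword_vocab_size: int) -> None:
    # run() forwards the parsed CLI value unchanged.
    extractor = Extractor(subword_vocab_size=subword_vocab_size)
    assert extractor.subword_vocab_size == 1500

run(subword_vocab_size=1500)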
@@ -142,7 +142,7 @@ class Tokenizer:
         self.tokens = tokens
         self.mapping = mapping
         # Train the subword tokenizer
-        self.user_subword_vocab_size = subword_vocab_size
+        self.subword_vocab_size = subword_vocab_size
         self.sentencepiece_model = self.train_subword_tokenizer()

     @property

@@ -161,11 +161,6 @@ class Tokenizer:
     def special_tokens(self) -> List[str]:
         return list(set(self.ner_tokens + self.mapping_tokens))

-    @property
-    def subword_vocab_size(self):
-        n_words = len(set([word for doc in self.corpus for word in doc.split()]))
-        return min(self.user_subword_vocab_size, 3 * n_words)
-
     def train_subword_tokenizer(self):
         """
         Train a sentencepiece model on the training corpus.
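The removed `subword_vocab_size` property clamped the requested size to at most three times the number of unique words in the training corpus; after this change, `train_subword_tokenizer` presumably reads the user-supplied value directly. This matters because sentencepiece refuses to train when `vocab_size` exceeds what the corpus can support, which appears to be what the clamp guarded against. A hedged sketch of training with an explicit `vocab_size` (illustrative corpus and settings, not the project's `train_subword_tokenizer`):

import io

import sentencepiece as spm

# Hedged sketch: train a sentencepiece model with an explicit vocabulary size.
# Corpus and settings are illustrative, not the project's actual configuration.
corpus = ["the quick brown fox", "jumps over the lazy dog"] * 50

model = io.BytesIO()
spm.SentencePieceTrainer.train(
    sentence_iterator=iter(corpus),
    model_writer=model,
    vocab_size=40,  # too large a value for a tiny corpus makes training fail
)

sp = spm.SentencePieceProcessor(model_proto=model.getvalue())
print(sp.encode("the quick dog", out_type=str))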