diff --git a/dan/datasets/extract/__init__.py b/dan/datasets/extract/__init__.py
index 3f58b4f6338afae6eac410931675a7691eb949ac..6771e77bc522d435ea8e2b163f273fa4d3a2e1e1 100644
--- a/dan/datasets/extract/__init__.py
+++ b/dan/datasets/extract/__init__.py
@@ -147,6 +147,13 @@ def add_extract_parser(subcommands) -> None:
         help="Images larger than this height will be resized to this height.",
     )
 
+    parser.add_argument(
+        "--subword-vocab-size",
+        type=int,
+        default=1000,
+        help="Size of the vocabulary to train the sentencepiece subword tokenizer needed for language model.",
+    )
+
     # Formatting arguments
     parser.add_argument(
         "--image-format",
diff --git a/dan/datasets/extract/arkindex.py b/dan/datasets/extract/arkindex.py
index 2befb801b680b89328c99eb714f0e3733802f223..9bff74d9cf0f3b5f5a935d542231dc2ebd84e710 100644
--- a/dan/datasets/extract/arkindex.py
+++ b/dan/datasets/extract/arkindex.py
@@ -78,6 +78,7 @@ class ArkindexExtractor:
         keep_spaces: bool = False,
         image_extension: str = "",
         allow_empty: bool = False,
+        subword_vocab_size: int = 1000,
     ) -> None:
         self.folders = folders
         self.element_type = element_type
@@ -93,8 +94,8 @@ class ArkindexExtractor:
         self.image_extension = image_extension
         self.allow_empty = allow_empty
         self.mapping = LMTokenMapping()
-
         self.keep_spaces = keep_spaces
+        self.subword_vocab_size = subword_vocab_size
 
         self.data: Dict = defaultdict(dict)
         self.charset = set()
@@ -375,6 +376,7 @@
             outdir=self.output / "language_model",
             mapping=self.mapping,
             tokens=self.tokens,
+            subword_vocab_size=self.subword_vocab_size,
         )
         self.language_corpus["characters"] = [
             tokenizer.char_tokenize(doc) for doc in train_corpus
@@ -518,6 +520,7 @@ def run(
     image_format: str,
     keep_spaces: bool,
     allow_empty: bool,
+    subword_vocab_size: int,
 ):
     assert database.exists(), f"No file found @ {database}"
     open_database(path=database)
@@ -544,4 +547,5 @@
         keep_spaces=keep_spaces,
         image_extension=image_format,
         allow_empty=allow_empty,
+        subword_vocab_size=subword_vocab_size,
     ).run()
diff --git a/dan/datasets/extract/utils.py b/dan/datasets/extract/utils.py
index 8116e1961edae0bd1fc1ce0c127474335501a290..5cd8f5333f3043cb97fe9113e7c96c0731e15b1e 100644
--- a/dan/datasets/extract/utils.py
+++ b/dan/datasets/extract/utils.py
@@ -142,7 +142,7 @@ class Tokenizer:
         self.tokens = tokens
         self.mapping = mapping
         # Train the subword tokenizer
-        self.user_subword_vocab_size = subword_vocab_size
+        self.subword_vocab_size = subword_vocab_size
         self.sentencepiece_model = self.train_subword_tokenizer()
 
     @property
@@ -161,11 +161,6 @@
     def special_tokens(self) -> List[str]:
        return list(set(self.ner_tokens + self.mapping_tokens))
 
-    @property
-    def subword_vocab_size(self):
-        n_words = len(set([word for doc in self.corpus for word in doc.split()]))
-        return min(self.user_subword_vocab_size, 3 * n_words)
-
    def train_subword_tokenizer(self):
        """
        Train a sentencepiece model on the training corpus.
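
Reviewer note: the patch only threads the user-provided subword_vocab_size from the CLI down to Tokenizer and drops the old 3 * n_words cap; train_subword_tokenizer itself is not shown. For context, a minimal sketch of how such a vocabulary size is typically handed to sentencepiece (the SentencePieceTrainer.train keywords follow the public sentencepiece API; the corpus path and model prefix below are placeholders, not values taken from this patch):

    # Hypothetical sketch, not the repository's train_subword_tokenizer implementation.
    import sentencepiece as spm

    spm.SentencePieceTrainer.train(
        input="language_model/corpus.txt",            # placeholder training corpus path
        model_prefix="language_model/subword_model",  # placeholder output prefix
        vocab_size=1000,                              # value now exposed via --subword-vocab-size
    )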