diff --git a/dan/datasets/extract/__init__.py b/dan/datasets/extract/__init__.py
index 3f58b4f6338afae6eac410931675a7691eb949ac..6771e77bc522d435ea8e2b163f273fa4d3a2e1e1 100644
--- a/dan/datasets/extract/__init__.py
+++ b/dan/datasets/extract/__init__.py
@@ -147,6 +147,13 @@ def add_extract_parser(subcommands) -> None:
         help="Images larger than this height will be resized to this height.",
     )
 
+    parser.add_argument(
+        "--subword-vocab-size",
+        type=int,
+        default=1000,
+        help="Size of the vocabulary to train the sentencepiece subword tokenizer needed for language model.",
+    )
+
     # Formatting arguments
     parser.add_argument(
         "--image-format",
diff --git a/dan/datasets/extract/arkindex.py b/dan/datasets/extract/arkindex.py
index 2befb801b680b89328c99eb714f0e3733802f223..9bff74d9cf0f3b5f5a935d542231dc2ebd84e710 100644
--- a/dan/datasets/extract/arkindex.py
+++ b/dan/datasets/extract/arkindex.py
@@ -78,6 +78,7 @@ class ArkindexExtractor:
         keep_spaces: bool = False,
         image_extension: str = "",
         allow_empty: bool = False,
+        subword_vocab_size: int = 1000,
     ) -> None:
         self.folders = folders
         self.element_type = element_type
@@ -93,8 +94,8 @@ class ArkindexExtractor:
         self.image_extension = image_extension
         self.allow_empty = allow_empty
         self.mapping = LMTokenMapping()
-
         self.keep_spaces = keep_spaces
+        self.subword_vocab_size = subword_vocab_size
 
         self.data: Dict = defaultdict(dict)
         self.charset = set()
@@ -375,6 +376,7 @@
             outdir=self.output / "language_model",
             mapping=self.mapping,
             tokens=self.tokens,
+            subword_vocab_size=self.subword_vocab_size,
         )
         self.language_corpus["characters"] = [
             tokenizer.char_tokenize(doc) for doc in train_corpus
@@ -518,6 +520,7 @@ def run(
     image_format: str,
     keep_spaces: bool,
     allow_empty: bool,
+    subword_vocab_size: int,
 ):
     assert database.exists(), f"No file found @ {database}"
     open_database(path=database)
@@ -544,4 +547,5 @@
         keep_spaces=keep_spaces,
         image_extension=image_format,
         allow_empty=allow_empty,
+        subword_vocab_size=subword_vocab_size,
     ).run()
diff --git a/dan/datasets/extract/utils.py b/dan/datasets/extract/utils.py
index 8116e1961edae0bd1fc1ce0c127474335501a290..5cd8f5333f3043cb97fe9113e7c96c0731e15b1e 100644
--- a/dan/datasets/extract/utils.py
+++ b/dan/datasets/extract/utils.py
@@ -142,7 +142,7 @@ class Tokenizer:
         self.tokens = tokens
         self.mapping = mapping
         # Train the subword tokenizer
-        self.user_subword_vocab_size = subword_vocab_size
+        self.subword_vocab_size = subword_vocab_size
         self.sentencepiece_model = self.train_subword_tokenizer()
 
     @property
@@ -161,11 +161,6 @@
     def special_tokens(self) -> List[str]:
        return list(set(self.ner_tokens + self.mapping_tokens))
 
-    @property
-    def subword_vocab_size(self):
-        n_words = len(set([word for doc in self.corpus for word in doc.split()]))
-        return min(self.user_subword_vocab_size, 3 * n_words)
-
    def train_subword_tokenizer(self):
        """
        Train a sentencepiece model on the training corpus.
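
Reviewer note: the patch only threads the user-provided subword_vocab_size from the CLI down to Tokenizer and drops the old 3 * n_words cap; train_subword_tokenizer itself is not shown. For context, a minimal sketch of how such a vocabulary size is typically handed to sentencepiece (the SentencePieceTrainer.train keywords follow the public sentencepiece API; the corpus path and model prefix below are placeholders, not values taken from this patch):

    # Hypothetical sketch, not the repository's train_subword_tokenizer implementation.
    import sentencepiece as spm

    spm.SentencePieceTrainer.train(
        input="language_model/corpus.txt",            # placeholder training corpus path
        model_prefix="language_model/subword_model",  # placeholder output prefix
        vocab_size=1000,                              # value now exposed via --subword-vocab-size
    )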