diff --git a/tests/test_extract.py b/tests/test_extract.py
index 19f59cd1ca99fb9092d523e755b8ab53919c5d06..3502f496d8fad2e63ce38ee1732b95af0f34c284 100644
--- a/tests/test_extract.py
+++ b/tests/test_extract.py
@@ -319,11 +319,127 @@ def test_process_element_unknown_token_in_text_error(mock_database, tmp_path):
         arkindex_extractor.process_element(element, "val")
 
 
-@pytest.mark.parametrize("load_entities", (True, False))
-@pytest.mark.parametrize("keep_spaces", (True, False))
-# Transcription and entities have the same worker version
 @pytest.mark.parametrize(
-    "transcription_entities_worker_version", ("worker_version_id", False)
+    "load_entities,keep_spaces,transcription_entities_worker_version,expected_subword_language_corpus,subword_vocab_size",
+    (
+        (
+            True,
+            True,
+            "worker_version_id",
+            """▁ ⓢ c a i l l e t ▁ ⓕ m a u r i c e ▁ ⓑ 28. 9.0 6
+▁ ⓢ re b ou l ▁ ⓕ j e a n ▁ ⓑ 30. 9.0 2
+▁ ⓢ b a re y re ▁ ⓕ j e a n ▁ ⓑ 28. 3 . 1 1
+▁ ⓢ r ou s s y ▁ ⓕ j e a n ▁ ⓑ 4 . 1 1 . 1 4
+▁ ⓢ m a r i n ▁ ⓕ m a r c e l ▁ ⓑ 1 0 . 8 . 0 6
+▁ ⓢ a m i c a l ▁ ⓕ e l o i ▁ ⓑ 1 1 . 1 0 . 0 4
+▁ ⓢ b i r o s ▁ ⓕ m a e l ▁ ⓑ 30. 1 0 . 1 0""",
+            40,
+        ),
+        (
+            True,
+            False,
+            "worker_version_id",
+            """▁ ⓢ c a i l l e t ▁ ⓕ m a u r i c e ▁ ⓑ 28. 9.0 6
+▁ ⓢ re b ou l ▁ ⓕ j e a n ▁ ⓑ 30. 9.0 2
+▁ ⓢ b a re y re ▁ ⓕ j e a n ▁ ⓑ 28. 3 . 1 1
+▁ ⓢ r ou s s y ▁ ⓕ j e a n ▁ ⓑ 4 . 1 1 . 1 4
+▁ ⓢ m a r i n ▁ ⓕ m a r c e l ▁ ⓑ 1 0 . 8 . 0 6
+▁ ⓢ a m i c a l ▁ ⓕ e l o i ▁ ⓑ 1 1 . 1 0 . 0 4
+▁ ⓢ b i r o s ▁ ⓕ m a e l ▁ ⓑ 30. 1 0 . 1 0""",
+            40,
+        ),
+        (
+            False,
+            True,
+            "worker_version_id",
+            """▁ ca i l l e t ▁ ma u r i ce ▁ 28. 9.0 6
+▁ re b o u l ▁ j e a n ▁ 30. 9.0 2
+▁ b a re y re ▁ j e a n ▁ 28. 3 . 1 1
+▁ r o u s s y ▁ j e a n ▁ 4 . 11.1 4
+▁ ma r i n ▁ ma r ce l ▁ 10. 8 . 0 6
+▁ a m i ca l ▁ el o i ▁ 11.1 0 . 0 4
+▁ b i r o s ▁ ma el ▁ 30. 10. 1 0""",
+            40,
+        ),
+        (
+            False,
+            False,
+            "worker_version_id",
+            """▁ ca i l l e t ▁ ma u r i ce ▁ 28. 9.0 6
+▁ re b o u l ▁ j e a n ▁ 30. 9.0 2
+▁ b a re y re ▁ j e a n ▁ 28. 3 . 1 1
+▁ r o u s s y ▁ j e a n ▁ 4 . 11.1 4
+▁ ma r i n ▁ ma r ce l ▁ 10. 8 . 0 6
+▁ a m i ca l ▁ el o i ▁ 11.1 0 . 0 4
+▁ b i r o s ▁ ma el ▁ 30. 10. 1 0""",
+            40,
+        ),
+        (
+            True,
+            True,
+            False,
+            """▁ ⓢ C a i l l e t ▁ ⓕ M a u r i c e ▁ ⓑ 2 8 . 9 . 0 6
+▁ ⓢ R e b o u l ▁ ⓕ J e a n ▁ ⓑ 3 0 . 9 . 0 2
+▁ ⓢ B a r e y r e ▁ ⓕ J e a n ▁ ⓑ 2 8 . 3 . 1 1
+▁ ⓢ R o u s s y ▁ ⓕ J e a n ▁ ⓑ 4 . 1 1 . 1 4
+▁ ⓢ M a r i n ▁ ⓕ M a r c e l ▁ ⓑ 1 0 . 8 . 0 6
+▁ ⓢ A m i c a l ▁ ⓕ E l o i ▁ ⓑ 1 1 . 1 0 . 0 4
+▁ ⓢ B i r o s ▁ ⓕ M a e l ▁ ⓑ 3 0 . 1 0 . 1 0""",
+            40,
+        ),
+        (
+            True,
+            True,
+            False,
+            """▁ ⓢ C a i l l e t ▁ ⓕ M a u ri ce ▁ ⓑ 28. 9.0 6
+▁ ⓢ R e b ou l ▁ ⓕ J e a n ▁ ⓑ 30. 9.0 2
+▁ ⓢ B a re y re ▁ ⓕ J e a n ▁ ⓑ 28. 3 . 1 1
+▁ ⓢ R ou s s y ▁ ⓕ J e a n ▁ ⓑ 4 . 11.1 4
+▁ ⓢ Mar i n ▁ ⓕ Mar ce l ▁ ⓑ 10. 8 . 0 6
+▁ ⓢ A m ic a l ▁ ⓕ E l o i ▁ ⓑ 11.1 0 . 0 4
+▁ ⓢ B i r o s ▁ ⓕ M a e l ▁ ⓑ 30. 10. 10""",
+            55,
+        ),
+        (
+            True,
+            False,
+            False,
+            """▁ ⓢ C a i l l e t ▁ ⓕ M a u r i c e ▁ ⓑ 2 8 . 9 . 0 6
+▁ ⓢ R e b o u l ▁ ⓕ J e a n ▁ ⓑ 3 0 . 9 . 0 2
+▁ ⓢ B a r e y r e ▁ ⓕ J e a n ▁ ⓑ 2 8 . 3 . 1 1
+▁ ⓢ R o u s s y ▁ ⓕ J e a n ▁ ⓑ 4 . 1 1 . 1 4
+▁ ⓢ M a r i n ▁ ⓕ M a r c e l ▁ ⓑ 1 0 . 8 . 0 6
+▁ ⓢ A m i c a l ▁ ⓕ E l o i ▁ ⓑ 1 1 . 1 0 . 0 4
+▁ ⓢ B i r o s ▁ ⓕ M a e l ▁ ⓑ 3 0 . 1 0 . 1 0""",
+            40,
+        ),
+        (
+            False,
+            True,
+            False,
+            """▁ C a i l l e t ▁ Ma u r i c e ▁ 28. 9.0 6
+▁ R e b o u l ▁ J e a n ▁ 30. 9.0 2
+▁ B a r e y r e ▁ J e a n ▁ 28. 3 . 1 1
+▁ R o u s s y ▁ J e a n ▁ 4 . 1 1 . 1 4
+▁ Ma r i n ▁ Ma r c e l ▁ 1 0 . 8 . 0 6
+▁ A m i c a l ▁ E l o i ▁ 1 1 . 1 0 . 0 4
+▁ B i r o s ▁ Ma e l ▁ 30. 1 0 . 1 0""",
+            40,
+        ),
+        (
+            False,
+            False,
+            False,
+            """▁ C a i l l e t ▁ Ma u r i c e ▁ 28. 9.0 6
+▁ R e b o u l ▁ J e a n ▁ 30. 9.0 2
+▁ B a r e y r e ▁ J e a n ▁ 28. 3 . 1 1
+▁ R o u s s y ▁ J e a n ▁ 4 . 1 1 . 1 4
+▁ Ma r i n ▁ Ma r c e l ▁ 1 0 . 8 . 0 6
+▁ A m i c a l ▁ E l o i ▁ 1 1 . 1 0 . 0 4
+▁ B i r o s ▁ Ma e l ▁ 30. 1 0 . 1 0""",
+            40,
+        ),
+    ),
 )
 @patch("dan.datasets.extract.arkindex.download_image")
 def test_extract(
@@ -332,6 +448,8 @@ def test_extract(
     keep_spaces,
     transcription_entities_worker_version,
     mock_database,
+    expected_subword_language_corpus,
+    subword_vocab_size,
     tmp_path,
 ):
     output = tmp_path / "extraction"
@@ -362,6 +480,7 @@ def test_extract(
         else None,
         keep_spaces=keep_spaces,
         image_extension=".jpg",
+        subword_vocab_size=subword_vocab_size,
     )
     # Mock build_image_url to simply return the path to the image
     extractor.build_iiif_url = mock_build_image_url
@@ -488,17 +607,11 @@ def test_extract(
 ⓢ Amical ▁ ⓕ Eloi ▁ ⓑ 11 ▁ . ▁ 10 ▁ . ▁ 04
 ⓢ Biros ▁ ⓕ Mael ▁ ⓑ 30 ▁ . ▁ 10 ▁ . ▁ 10"""
 
-    expected_subword_language_corpus = """▁ ⓢ C a i l l e t ▁ ⓕ M a u ri ce ▁ ⓑ 28. 9.0 6
-▁ ⓢ R e b ou l ▁ ⓕ J e a n ▁ ⓑ 30. 9.0 2
-▁ ⓢ B a re y re ▁ ⓕ J e a n ▁ ⓑ 28. 3 .11
-▁ ⓢ R ou s s y ▁ ⓕ J e a n ▁ ⓑ 4 . 11.1 4
-▁ ⓢ Mar i n ▁ ⓕ Mar ce l ▁ ⓑ 10. 8. 0 6
-▁ ⓢ A m ic a l ▁ ⓕ E l o i ▁ ⓑ 11.1 0 . 0 4
-▁ ⓢ B i r o s ▁ ⓕ M a e l ▁ ⓑ 30. 10. 10"""
-
     # Transcriptions with worker version are in lowercase
     if transcription_entities_worker_version:
         expected_char_language_corpus = expected_char_language_corpus.lower()
+        expected_word_language_corpus = expected_word_language_corpus.lower()
+        expected_subword_language_corpus = expected_subword_language_corpus.lower()
 
     # If we do not load entities, remove tokens
     if not load_entities:
@@ -506,12 +619,23 @@ def test_extract(
         expected_char_language_corpus = ENTITY_TOKEN_SPACE.sub(
             "", expected_char_language_corpus
         )
-
+        expected_word_language_corpus = ENTITY_TOKEN_SPACE.sub(
+            "", expected_word_language_corpus
+        )
+        expected_subword_language_corpus = ENTITY_TOKEN_SPACE.sub(
+            "", expected_subword_language_corpus
+        )
     # Replace double spaces with regular space
     if not keep_spaces:
         expected_char_language_corpus = TWO_SPACES_LM_REGEX.sub(
             "▁", expected_char_language_corpus
         )
+        expected_word_language_corpus = TWO_SPACES_LM_REGEX.sub(
+            "▁", expected_word_language_corpus
+        )
+        expected_subword_language_corpus = TWO_SPACES_LM_REGEX.sub(
+            "▁", expected_subword_language_corpus
+        )
 
     assert (
         output / "language_model" / "corpus_characters.txt"
@@ -520,7 +644,6 @@
     assert (
         output / "language_model" / "corpus_words.txt"
     ).read_text() == expected_word_language_corpus
-
     assert (
         output / "language_model" / "corpus_subwords.txt"
     ).read_text() == expected_subword_language_corpus