Commit 646805ae authored by Yoann Schneider

Merge branch 'language-model-command' into 'main'

Use a dedicated command to build language model resources

Closes #296

See merge request !439
parents e2f3d8f3 b04d27c6
Showing 914 additions and 510 deletions
......@@ -10,6 +10,7 @@ from dan.datasets.analyze import add_analyze_parser
from dan.datasets.download import add_download_parser
from dan.datasets.entities import add_entities_parser
from dan.datasets.extract import add_extract_parser
from dan.datasets.language_model import add_language_model_parser
from dan.datasets.tokens import add_tokens_parser
......@@ -26,3 +27,4 @@ def add_dataset_parser(subcommands) -> None:
add_analyze_parser(subcommands)
add_entities_parser(subcommands)
add_tokens_parser(subcommands)
add_language_model_parser(subcommands)
......@@ -60,18 +60,4 @@ def add_download_parser(subcommands) -> None:
help="Token to use to replace character in the validation/test sets that is not included in the training set.",
)
parser.add_argument(
"--subword-vocab-size",
type=int,
help="Size of the vocabulary to train the sentencepiece subword tokenizer needed for language model.",
default=1000,
)
parser.add_argument(
"--tokens",
type=pathlib.Path,
help="Mapping between starting tokens and end tokens to extract text with their entities.",
required=False,
)
parser.set_defaults(func=run)
......@@ -19,13 +19,10 @@ from tqdm import tqdm
from dan.datasets.download.exceptions import ImageDownloadError
from dan.datasets.download.utils import (
Tokenizer,
download_image,
get_bbox,
get_vocabulary,
)
from dan.datasets.extract.arkindex import TRAIN_NAME
from dan.utils import LMTokenMapping, parse_tokens
from line_image_extractor.extractor import extract
from line_image_extractor.image_utils import (
BoundingBox,
......@@ -33,8 +30,6 @@ from line_image_extractor.image_utils import (
polygon_to_bbox,
)
LANGUAGE_DIR = "language_model" # Subpath to the language model directory.
IMAGES_DIR = "images" # Subpath to the images directory.
IIIF_URL = "{image_url}/{bbox}/{size}/0/default.jpg"
......@@ -56,8 +51,6 @@ class ImageDownloader:
max_height: int | None = None,
image_extension: str = "",
unknown_token: str = "",
subword_vocab_size: int = 1000,
tokens: Path | None = None,
) -> None:
self.output = output
......@@ -65,16 +58,6 @@ class ImageDownloader:
self.max_height = max_height
self.image_extension = image_extension
self.unknown_token = unknown_token
self.tokens = parse_tokens(tokens) if tokens else {}
self.subword_vocab_size = subword_vocab_size
self.mapping = LMTokenMapping()
self.language_corpus = defaultdict(list)
self.language_tokens = []
self.language_lexicon = defaultdict(list)
# Load split file
split_file = self.output / "split.json" if self.output else None
self.split: Dict = (
......@@ -94,6 +77,10 @@ class ImageDownloader:
)
)
# Add unknown token to charset
self.unknown_token = unknown_token
self.charset.add(self.unknown_token)
def check_extraction(self, values: dict) -> str | None:
# Check dataset_id parameter
if values.get("dataset_id") is None:
......@@ -273,62 +260,6 @@ class ImageDownloader:
logger.error(f"Failed to download {len(failed_downloads)} image(s).")
print(*list(map(": ".join, failed_downloads)), sep="\n")
def format_lm_files(self) -> None:
"""
Convert charset to a LM-compatible charset. Ensure that special LM tokens do not appear in the charset.
"""
logger.info("Preparing language resources")
# Add unknown token to charset
self.charset.add(self.unknown_token)
# Build LM tokens
for token in sorted(list(self.charset)):
assert (
token not in self.mapping.encode.values()
), f"Special token {token} is reserved for language modeling."
self.language_tokens.append(
self.mapping.encode[token]
) if token in self.mapping.encode else self.language_tokens.append(token)
self.language_tokens.append(self.mapping.ctc.encoded)
# Build LM corpus
train_corpus = [
values["text"].replace(
self.mapping.linebreak.display, self.mapping.space.display
)
for values in self.split[TRAIN_NAME].values()
]
tokenizer = Tokenizer(
training_corpus=train_corpus,
charset=self.language_tokens,
unknown_token=self.unknown_token,
outdir=self.output / LANGUAGE_DIR,
mapping=self.mapping,
tokens=self.tokens,
subword_vocab_size=self.subword_vocab_size,
)
if not tokenizer.sentencepiece_model:
return
for level, tokenize in (
("characters", tokenizer.char_tokenize),
("words", tokenizer.word_tokenize),
("subwords", tokenizer.subword_tokenize),
):
self.language_corpus[level] = list(map(tokenize, train_corpus))
# Build LM lexicon
self.language_lexicon["characters"] = [
f"{token} {token}" for token in self.language_tokens
]
for level in ["words", "subwords"]:
self.language_lexicon[level] = [
f"{token} {tokenizer.char_tokenize(token)}"
for token in get_vocabulary(self.language_corpus[level])
]
def export(self) -> None:
"""
Writes a `labels.json` file containing a mapping of the images that have been correctly uploaded (identified by its path)
......@@ -342,16 +273,6 @@ class ImageDownloader:
)
)
for level in ["characters", "words", "subwords"]:
(self.output / LANGUAGE_DIR / f"corpus_{level}.txt").write_text(
"\n".join(self.language_corpus[level])
)
(self.output / LANGUAGE_DIR / f"lexicon_{level}.txt").write_text(
"\n".join(self.language_lexicon[level])
)
(self.output / LANGUAGE_DIR / "tokens.txt").write_text(
"\n".join(self.language_tokens)
)
(self.output / "charset.pkl").write_bytes(
pickle.dumps(sorted(list(self.charset)))
)
......@@ -364,7 +285,6 @@ class ImageDownloader:
"""
tasks: List[Dict[str, str]] = self.build_tasks()
self.download_images(tasks)
self.format_lm_files()
self.export()
......@@ -374,8 +294,6 @@ def run(
max_height: int | None,
image_format: str,
unknown_token: str,
subword_vocab_size: int,
tokens: Path | None,
):
"""
Download the missing images from a `split.json` file and build a `labels.json` file containing
......@@ -387,17 +305,11 @@ def run(
:param max_height: Images larger than this height will be resized to this height
:param image_format: Images will be saved under this format
:param unknown_token: The token used to replace unknown characters.
:param subword_vocab_size: The size of the subword vocabulary.
:param tokens: Mapping between starting tokens and end tokens to extract text with their entities.
"""
(output / LANGUAGE_DIR).mkdir(parents=True, exist_ok=True)
ImageDownloader(
output=output,
max_width=max_width,
max_height=max_height,
image_extension=image_format,
unknown_token=unknown_token,
subword_vocab_size=subword_vocab_size,
tokens=tokens,
).run()
......@@ -2,18 +2,11 @@
# This code is licensed under CeCILL-C
# -*- coding: utf-8 -*-
import itertools
import logging
import operator
from dataclasses import dataclass, field
from io import BytesIO
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import List
import requests
import sentencepiece as spm
from nltk import wordpunct_tokenize
from PIL import Image, ImageOps
from tenacity import (
retry,
......@@ -22,8 +15,6 @@ from tenacity import (
wait_exponential,
)
from dan.utils import EntityType, LMTokenMapping
logger = logging.getLogger(__name__)
# See http://docs.python-requests.org/en/master/user/advanced/#timeouts
......@@ -89,130 +80,3 @@ def get_bbox(polygon: List[List[int]]) -> str:
x, y = min(all_x), min(all_y)
width, height = max(all_x) - x, max(all_y) - y
return ",".join(list(map(str, [int(x), int(y), int(width), int(height)])))
def get_vocabulary(tokenized_text: List[str]) -> set[str]:
"""
Compute the vocabulary set from tokenized text.
:param tokenized_text: List of tokenized text.
"""
return sorted(set([token for doc in tokenized_text for token in doc.split()]))
@dataclass
class Tokenizer:
"""
A multi-level tokenizer (char, subword, word), where the subword tokenizer is trained using sentencepiece.
:param training_corpus: List of training text.
:param outdir: Path to save the subword tokenizer.
:param mapping: Mapping between displayed and encoded versions of special characters.
:param tokens: Start and end tokens used to represent named entities.
:param subword_vocab_size: Size of the vocabulary used to train the subword tokenizer.
"""
training_corpus: List[str]
charset: List[str]
unknown_token: str
outdir: Path
mapping: LMTokenMapping
tokens: EntityType | None = None
subword_vocab_size: int = 1000
sentencepiece_model: spm.SentencePieceProcessor = field(init=False)
@property
def prefix(self) -> Path:
return self.outdir / "subword_tokenizer"
@property
def ner_tokens(self) -> List[str]:
if self.tokens is None:
return []
return list(
itertools.chain(
map(operator.attrgetter("start"), self.tokens.values()),
filter(
operator.truth,
map(operator.attrgetter("end"), self.tokens.values()),
),
)
)
@property
def mapping_tokens(self) -> List[str]:
return [token.encoded for token in self.mapping]
@property
def special_tokens(self) -> List[str]:
return list(set(itertools.chain(self.mapping_tokens, self.ner_tokens)))
def __post_init__(self) -> None:
"""
Train a sentencepiece model on the training corpus.
"""
# Write the corpus in a text file
logger.info("Training a sentencepiece model for subword tokenization")
with NamedTemporaryFile(dir=self.outdir, suffix=".txt", mode="w") as tmp_file:
tmp_file.write("\n".join(self.training_corpus))
tmp_file.flush()
try:
spm.SentencePieceTrainer.train(
input=tmp_file.name,
vocab_size=self.subword_vocab_size,
model_prefix=self.prefix,
user_defined_symbols=self.special_tokens,
minloglevel=1,
)
except Exception as e:
logger.warning(
f"Failed to train a sentencepiece model for subword tokenization: {e} "
"Try again by editing the `--subword-vocab-size` parameter."
)
self.sentencepiece_model = None
return
# Load the model
self.sentencepiece_model = spm.SentencePieceProcessor(
model_file=str(self.prefix.with_suffix(".model"))
)
def subword_tokenize(self, text: str) -> str:
"""
Tokenize into subwords. Sampling is disabled to ensure reproducibility.
"""
tokens = self.sentencepiece_model.encode(text, out_type=str)
return " ".join(map("".join, map(self.encode, tokens)))
def word_tokenize(self, text: str) -> str:
"""
Tokenize text into a string of space-separated words. Spaces (⎵) and NER tokens are considered as words.
:param text: Text to be tokenized.
"""
words = list(map("".join, map(self.encode, wordpunct_tokenize(text))))
return " ".join(
[
f"{word} {self.mapping.space.encoded}"
if (i != len(words) - 1 and word not in self.ner_tokens)
else word
for i, word in enumerate(words)
]
)
def char_tokenize(self, text: str) -> str:
"""
Tokenize text into a string of space-separated characters.
:param text: Text to be tokenized.
"""
return " ".join(
[
char if char in self.charset else self.unknown_token
for char in self.encode(text)
]
)
def encode(self, text: List[str]) -> List[str]:
"""
Encode special tokens.
:param text: Text to be encoded.
"""
return map(self.mapping.encode_token, text)
# Copyright Teklia (contact@teklia.com) & Denis Coquenet
# This code is licensed under CeCILL-C
# -*- coding: utf-8 -*-
"""
Build all resources needed for the language model from a split extracted by DAN
"""
import pathlib
from dan.datasets.language_model.build import run
def add_language_model_parser(subcommands) -> None:
parser = subcommands.add_parser(
"language-model",
description=__doc__,
help=__doc__,
)
# Required arguments.
parser.add_argument(
"--output",
type=pathlib.Path,
help="Path where the `labels.json` and `charset.pkl` files are stored and where the data will be generated.",
required=True,
)
# Formatting arguments
parser.add_argument(
"--subword-vocab-size",
type=int,
help="Size of the vocabulary to train the sentencepiece subword tokenizer needed for language model.",
default=1000,
)
parser.add_argument(
"--unknown-token",
type=str,
default="",
help="Token to use to replace character in the validation/test sets that is not included in the training set.",
)
parser.add_argument(
"--tokens",
type=pathlib.Path,
help="Mapping between starting tokens and end tokens to extract text with their entities.",
required=False,
)
parser.set_defaults(func=run)
# Copyright Teklia (contact@teklia.com) & Denis Coquenet
# This code is licensed under CeCILL-C
# -*- coding: utf-8 -*-
import json
import logging
import pickle
from collections import defaultdict
from pathlib import Path
from typing import Dict
from dan.datasets.extract.arkindex import TRAIN_NAME
from dan.datasets.language_model.utils import (
Tokenizer,
get_vocabulary,
)
from dan.utils import LMTokenMapping, parse_tokens
LANGUAGE_DIR = "language_model" # Subpath to the language model directory.
logger = logging.getLogger(__name__)
class LanguageModelBuilder:
"""
Build a language model from extracted data
"""
def __init__(
self,
output: Path | None = None,
subword_vocab_size: int = 1000,
unknown_token: str = "",
tokens: Path | None = None,
) -> None:
self.output = output
self.unknown_token = unknown_token
self.tokens = parse_tokens(tokens) if tokens else {}
self.subword_vocab_size = subword_vocab_size
self.mapping = LMTokenMapping()
self.language_corpus = defaultdict(list)
self.language_tokens = []
self.language_lexicon = defaultdict(list)
# Load labels file
labels_file = self.output / "labels.json" if self.output else None
self.labels: Dict = (
json.loads(labels_file.read_text())
if labels_file and labels_file.is_file()
else {}
)
# Load charset file
charset_file = self.output / "charset.pkl" if self.output else None
self.charset: Dict = (
pickle.loads(charset_file.read_bytes())
if charset_file and charset_file.is_file()
else {}
)
def format_lm_files(self) -> None:
"""
Convert charset to a LM-compatible charset. Ensure that special LM tokens do not appear in the charset.
"""
logger.info("Preparing language resources")
# Build LM tokens
for token in sorted(list(self.charset)):
assert (
token not in self.mapping.encode.values()
), f"Special token {token} is reserved for language modeling."
self.language_tokens.append(
self.mapping.encode[token]
) if token in self.mapping.encode else self.language_tokens.append(token)
self.language_tokens.append(self.mapping.ctc.encoded)
# Build LM corpus
train_corpus = [
value.replace(self.mapping.linebreak.display, self.mapping.space.display)
for value in self.labels[TRAIN_NAME].values()
]
tokenizer = Tokenizer(
training_corpus=train_corpus,
charset=self.language_tokens,
unknown_token=self.unknown_token,
outdir=self.output / LANGUAGE_DIR,
mapping=self.mapping,
tokens=self.tokens,
subword_vocab_size=self.subword_vocab_size,
)
if not tokenizer.sentencepiece_model:
return
for level, tokenize in (
("characters", tokenizer.char_tokenize),
("words", tokenizer.word_tokenize),
("subwords", tokenizer.subword_tokenize),
):
self.language_corpus[level] = list(map(tokenize, train_corpus))
# Build LM lexicon
self.language_lexicon["characters"] = [
f"{token} {token}" for token in self.language_tokens
]
for level in ["words", "subwords"]:
self.language_lexicon[level] = [
f"{token} {tokenizer.char_tokenize(token)}"
for token in get_vocabulary(self.language_corpus[level])
]
def export(self) -> None:
"""
Writes all files needed for the language model
"""
for level in ["characters", "words", "subwords"]:
(self.output / LANGUAGE_DIR / f"corpus_{level}.txt").write_text(
"\n".join(self.language_corpus[level])
)
(self.output / LANGUAGE_DIR / f"lexicon_{level}.txt").write_text(
"\n".join(self.language_lexicon[level])
)
(self.output / LANGUAGE_DIR / "tokens.txt").write_text(
"\n".join(self.language_tokens)
)
def run(self) -> None:
"""
Build and write all files needed for the language model
"""
self.format_lm_files()
self.export()
def run(
output: Path,
subword_vocab_size: int,
unknown_token: str,
tokens: Path | None,
):
"""
Build and write all files needed for the language model
:param output: Path where the `labels.json` and `charset.pkl` files are stored and where the data will be generated
:param subword_vocab_size: The size of the subword vocabulary.
:param unknown_token: The token used to replace unknown characters.
:param tokens: Mapping between starting tokens and end tokens to extract text with their entities.
"""
(output / LANGUAGE_DIR).mkdir(parents=True, exist_ok=True)
LanguageModelBuilder(
output=output,
subword_vocab_size=subword_vocab_size,
unknown_token=unknown_token,
tokens=tokens,
).run()
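For reference, here is a minimal sketch of driving `LanguageModelBuilder` programmatically, mirroring the `run` helper above; the `data/` path and the vocabulary size are placeholders, and the output directory must already contain the `labels.json` and `charset.pkl` files produced by the download command.

```python
from pathlib import Path

from dan.datasets.language_model.build import LANGUAGE_DIR, LanguageModelBuilder

output = Path("data")  # placeholder: must contain labels.json and charset.pkl
(output / LANGUAGE_DIR).mkdir(parents=True, exist_ok=True)

LanguageModelBuilder(
    output=output,
    subword_vocab_size=1000,
    tokens=None,  # or a Path to a tokens.yml file to keep NER tokens
).run()
```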
# Copyright Teklia (contact@teklia.com) & Denis Coquenet
# This code is licensed under CeCILL-C
# -*- coding: utf-8 -*-
import itertools
import logging
import operator
from dataclasses import dataclass, field
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import List
import sentencepiece as spm
from nltk import wordpunct_tokenize
from dan.utils import EntityType, LMTokenMapping
logger = logging.getLogger(__name__)
def get_vocabulary(tokenized_text: List[str]) -> set[str]:
"""
Compute the vocabulary set from tokenized text.
:param tokenized_text: List of tokenized text.
"""
return sorted(set([token for doc in tokenized_text for token in doc.split()]))
@dataclass
class Tokenizer:
"""
A multi-level tokenizer (char, subword, word), where the subword tokenizer is trained using sentencepiece.
:param training_corpus: List of training text.
:param outdir: Path to save the subword tokenizer.
:param mapping: Mapping between displayed and encoded versions of special characters.
:param tokens: Start and end tokens used to represent named entities.
:param subword_vocab_size: Size of the vocabulary used to train the subword tokenizer.
"""
training_corpus: List[str]
charset: List[str]
unknown_token: str
outdir: Path
mapping: LMTokenMapping
tokens: EntityType | None = None
subword_vocab_size: int = 1000
sentencepiece_model: spm.SentencePieceProcessor = field(init=False)
@property
def prefix(self) -> Path:
return self.outdir / "subword_tokenizer"
@property
def ner_tokens(self) -> List[str]:
if self.tokens is None:
return []
return list(
itertools.chain(
map(operator.attrgetter("start"), self.tokens.values()),
filter(
operator.truth,
map(operator.attrgetter("end"), self.tokens.values()),
),
)
)
@property
def mapping_tokens(self) -> List[str]:
return [token.encoded for token in self.mapping]
@property
def special_tokens(self) -> List[str]:
return list(set(itertools.chain(self.mapping_tokens, self.ner_tokens)))
def __post_init__(self) -> None:
"""
Train a sentencepiece model on the training corpus.
"""
# Write the corpus in a text file
logger.info("Training a sentencepiece model for subword tokenization")
with NamedTemporaryFile(dir=self.outdir, suffix=".txt", mode="w") as tmp_file:
tmp_file.write("\n".join(self.training_corpus))
tmp_file.flush()
try:
spm.SentencePieceTrainer.train(
input=tmp_file.name,
vocab_size=self.subword_vocab_size,
model_prefix=self.prefix,
user_defined_symbols=self.special_tokens,
minloglevel=1,
)
except Exception as e:
logger.warning(
f"Failed to train a sentencepiece model for subword tokenization: {e} "
"Try again by editing the `--subword-vocab-size` parameter."
)
self.sentencepiece_model = None
return
# Load the model
self.sentencepiece_model = spm.SentencePieceProcessor(
model_file=str(self.prefix.with_suffix(".model"))
)
def subword_tokenize(self, text: str) -> str:
"""
Tokenize into subwords. Sampling is disabled to ensure reproducibility.
"""
tokens = self.sentencepiece_model.encode(text, out_type=str)
return " ".join(map("".join, map(self.encode, tokens)))
def word_tokenize(self, text: str) -> str:
"""
Tokenize text into a string of space-separated words. Spaces (⎵) and NER tokens are considered as words.
:param text: Text to be tokenized.
"""
words = list(map("".join, map(self.encode, wordpunct_tokenize(text))))
return " ".join(
[
f"{word} {self.mapping.space.encoded}"
if (i != len(words) - 1 and word not in self.ner_tokens)
else word
for i, word in enumerate(words)
]
)
def char_tokenize(self, text: str) -> str:
"""
Tokenize text into a string of space-separated characters.
:param text: Text to be tokenized.
"""
return " ".join(
[
char if char in self.charset else self.unknown_token
for char in self.encode(text)
]
)
def encode(self, text: List[str]) -> List[str]:
"""
Encode special tokens.
:param text: Text to be encoded.
"""
return map(self.mapping.encode_token, text)
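To illustrate how `LanguageModelBuilder` drives this class, here is a minimal hedged sketch; the toy corpus, charset and output directory are placeholders, and the subword split you obtain depends on the trained sentencepiece model.

```python
from pathlib import Path

from dan.datasets.language_model.utils import Tokenizer
from dan.utils import LMTokenMapping

outdir = Path("language_model")  # placeholder output directory
outdir.mkdir(parents=True, exist_ok=True)

tokenizer = Tokenizer(
    training_corpus=["ⓢCiret ⓕMarie ⓑ28", "ⓢCiret ⓕMarie ⓑ2"],  # toy corpus
    charset=["ⓢ", "ⓕ", "ⓑ", "C", "M", "a", "e", "i", "r", "t", "▁", "2", "8"],
    unknown_token="⁇",
    outdir=outdir,
    mapping=LMTokenMapping(),
    subword_vocab_size=30,  # small value for a tiny corpus
)
# Sentencepiece training may fail on a corpus this small; the model is then None.
if tokenizer.sentencepiece_model:
    print(tokenizer.char_tokenize("ⓢCiret ⓕMarie ⓑ28"))
    print(tokenizer.word_tokenize("ⓢCiret ⓕMarie ⓑ28"))
    print(tokenizer.subword_tokenize("ⓢCiret ⓕMarie ⓑ28"))
```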
......@@ -51,6 +51,14 @@ The library already has all the documents needed to run the [dataset download co
teklia-dan dataset download --output .
```
#### Dataset language-model command
The library already has all the documents needed to run the [dataset language-model command](../usage/datasets/language_model.md) on a minimalist dataset. In the `tests/data/prediction` directory, you can run the following command and add any extra parameters you need:
```shell
teklia-dan dataset language-model --output . --subword-vocab-size 45
```
#### Dataset analyze command
The library already has all the documents needed to run the [dataset analyze command](../usage/datasets/analyze.md) on a minimalist dataset. In the `tests/data/training/training_dataset` directory, you can run the following command and add any extra parameters you need:
......
......@@ -10,6 +10,7 @@ To extract the data, DAN uses an Arkindex export database in SQLite format. You
1. [Export the project](https://doc.arkindex.org/howto/export/) in SQLite format.
1. Extract the data with the [extract command](../usage/datasets/extract.md).
1. Download images with the [download command](../usage/datasets/download.md).
1. Build language model resources with the [language-model command](../usage/datasets/language_model.md).
These commands will extract and format the images and labels needed to train DAN. They will also tokenize the training corpus at character, subword, and word levels, allowing you to combine DAN with an explicit statistical language model to improve performance.
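As a hedged illustration of these three levels, the corpus files roughly contain lines like the following for a toy transcription with NER tokens (spaces are materialized by the ▁ symbol; the subword split is only indicative, since it depends on the trained sentencepiece model and on `--subword-vocab-size`):

```python
# Hedged illustration of the three tokenization levels for one toy line.
line = "ⓢCiret ⓕMarie ⓑ28"

char_level = "ⓢ C i r e t ▁ ⓕ M a r i e ▁ ⓑ 2 8"     # corpus_characters.txt
word_level = "ⓢ Ciret ▁ ⓕ Marie ▁ ⓑ 28"               # corpus_words.txt
subword_level = "▁ ⓢ C i re t ▁ ⓕ M a r ie ▁ ⓑ 2 8"   # corpus_subwords.txt (indicative)
```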
......
# Build
::: dan.datasets.language_model.build
# Language model
::: dan.datasets.language_model
# Utils
::: dan.datasets.language_model.utils
......@@ -5,21 +5,18 @@
Use the `teklia-dan dataset download` command to download images of a dataset from a split extracted by DAN. This will:
- Store the set of characters encountered in the dataset (in the `charset.pkl` file),
- Generate the resources needed to build a n-gram language model at character, subword or word-level with [kenlm](https://github.com/kpu/kenlm) (in the `language_model/` folder).
- Generate the images of each element (in the `images/` folder),
- Create the mapping of the images that have been correctly uploaded (identified by its path) to the ground-truth transcription (with NER tokens if needed) (in the `labels.json` file).
If an image download fails for any reason, the image won't appear in the transcriptions file and the reason will be printed to stdout at the end of the process. Before downloading an image, the command checks whether it was already downloaded, so it is safe to run this command twice if a few images failed.
| Parameter | Description | Type | Default |
| ---------------------- | ------------------------------------------------------------------------------------------------------------------- | -------------- | ------- |
| `--output` | Path where the `split.json` file is stored and where the data will be generated. | `pathlib.Path` | |
| `--max-width` | Images larger than this width will be resized to this width. | `int` | |
| `--max-height` | Images larger than this height will be resized to this height. | `int` | |
| `--image-format` | Images will be saved under this format. | `str` | `.jpg` |
| `--unknown-token` | Token to use to replace character in the validation/test sets that is not included in the training set. | `str` | `⁇` |
| `--tokens` | Mapping between starting tokens and end tokens to extract text with their entities. | `pathlib.Path` | |
| `--subword-vocab-size` | Size of the vocabulary used to train the sentencepiece subword tokenizer used to train the optional language model. | `int` | `1000` |
| Parameter | Description | Type | Default |
| ----------------- | ------------------------------------------------------------------------------------------------------- | -------------- | ------- |
| `--output` | Path where the `split.json` file is stored and where the data will be generated. | `pathlib.Path` | |
| `--max-width` | Images larger than this width will be resized to this width. | `int` | |
| `--max-height` | Images larger than this height will be resized to this height. | `int` | |
| `--image-format` | Images will be saved under this format. | `str` | `.jpg` |
| `--unknown-token` | Token to use to replace character in the validation/test sets that is not included in the training set. | `str` | `⁇` |
The `--output` directory should contain a `split.json` JSON-formatted file with a specific format: a mapping of the elements (identified by their ID) to the image information and the ground-truth transcription (with NER tokens if needed). This file can be generated by the `teklia-dan dataset extract` command. More details in the [dedicated page](./extract.md).
......@@ -46,32 +43,6 @@ The `--output` directory should have a `split.json` JSON-formatted file with a s
}
```
The `--tokens` argument expects a YAML-formatted file with a specific format. A list of entries with each entry describing a NER entity. The label of the entity is the key to a dict mapping the starting and ending tokens respectively. This file can be generated by the `teklia-dan dataset tokens` command. More details in the [dedicated page](./tokens.md).
```yaml
INTITULE: # Type of the entity on Arkindex
start: # Starting token for this entity
end: # Optional ending token for this entity
DATE:
start:
end:
COTE_SERIE:
start:
end:
ANALYSE_COMPL.:
start:
end:
PRECISIONS_SUR_COTE:
start:
end:
COTE_ARTICLE:
start:
end:
CLASSEMENT:
start:
end:
```
## Examples
### Download full images
......
# Datasets
Two operations are available through subcommands:
Several operations are available through subcommands:
`teklia-dan dataset entities`
: To extract entities from an [Arkindex export](https://doc.arkindex.org/howto/export/). More details in the [dedicated page](./entities.md).
......@@ -14,5 +14,8 @@ Two operations are available through subcommands:
`teklia-dan dataset download`
: To download images of a dataset. More details in the [dedicated page](./download.md).
`teklia-dan dataset language-model`
: To build language model resources of a dataset. More details in the [dedicated page](./language_model.md).
`teklia-dan dataset analyze`
: To analyze datasets and display statistics. More details in the [dedicated page](./analyze.md).
# Dataset language model
## Description
Use the `teklia-dan dataset language-model` command to build language model resources for a dataset from a split extracted by DAN. This will:
- Generate the resources needed to build an n-gram language model at character, subword or word-level with [kenlm](https://github.com/kpu/kenlm) (in the `language_model/` folder).
| Parameter | Description | Type | Default |
| ---------------------- | ------------------------------------------------------------------------------------------------------------------- | -------------- | ------- |
| `--output` | Path where the `labels.json` and `charset.pkl` files are stored and where the data will be generated. | `pathlib.Path` | |
| `--subword-vocab-size` | Size of the vocabulary used to train the sentencepiece subword tokenizer used to train the optional language model. | `int` | `1000` |
| `--unknown-token` | Token to use to replace character in the validation/test sets that is not included in the training set. | `str` | `⁇` |
| `--tokens` | Mapping between starting tokens and end tokens to extract text with their entities. | `pathlib.Path` | |
The `--output` directory should have:
- A `charset.pkl` file of the set of characters encountered in the dataset,
- A `labels.json` JSON-formatted file with a specific format: a mapping of the images (identified by their path) to the ground-truth transcription (with NER tokens if needed).
These files can be generated by the `teklia-dan dataset download` command. More details in the [dedicated page](./download.md).
```json
{
"train": {
"<image_path>": "\u24e2Coufet \u24d5Bouis \u24d107.12.14"
},
"val": {},
"test": {}
}
```
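If you need to assemble these two inputs by hand (for a quick experiment, for instance), a minimal hedged sketch mirroring what the download command writes could look like this; the paths and transcriptions are placeholders.

```python
import json
import pickle
from pathlib import Path

output = Path("data")  # placeholder output directory
output.mkdir(parents=True, exist_ok=True)

labels = {
    "train": {"<image_path>": "ⓢLaulont ⓕFrancois ⓑ8"},  # placeholder entry
    "val": {},
    "test": {},
}
(output / "labels.json").write_text(json.dumps(labels))

# The charset is the sorted set of characters seen in the training set,
# pickled as a list, as the download command does.
charset = sorted(set("".join(labels["train"].values())))
(output / "charset.pkl").write_bytes(pickle.dumps(charset))
```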
The `--tokens` argument expects a YAML-formatted file with a specific format: a list of entries, each describing a NER entity. The label of the entity is the key to a dict mapping the starting and ending tokens respectively. This file can be generated by the `teklia-dan dataset tokens` command. More details in the [dedicated page](./tokens.md).
```yaml
INTITULE: # Type of the entity on Arkindex
start: # Starting token for this entity
end: # Optional ending token for this entity
DATE:
start:
end:
COTE_SERIE:
start:
end:
ANALYSE_COMPL.:
start:
end:
PRECISIONS_SUR_COTE:
start:
end:
COTE_ARTICLE:
start:
end:
CLASSEMENT:
start:
end:
```
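The resulting mapping can also be loaded programmatically with the `parse_tokens` helper from `dan.utils`; a minimal hedged sketch (the file name is a placeholder):

```python
from pathlib import Path

from dan.utils import parse_tokens

tokens = parse_tokens(Path("tokens.yml"))  # placeholder path
for entity_name, entity_type in tokens.items():
    # Each entry exposes a starting token and an optional ending token.
    print(entity_name, entity_type.start, entity_type.end)
```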
## Examples
### HTR and NER data
To build language model resources with NER data, please use the following:
```shell
teklia-dan dataset language-model \
--output data \
--tokens tokens.yml
```
### HTR data
To build language model resources without NER data, please use the following:
```shell
teklia-dan dataset language-model \
--output data
```
......@@ -9,7 +9,7 @@ To build the language model, you first need to install and compile [kenlm](https
## Build the language model
The `teklia-dan dataset extract` automatically generate the files required to train a language model either at character, subword or word-level in `my_dataset/language_model/`.
The `teklia-dan dataset language-model` command automatically generates the files required to train a language model at character, subword or word level in `my_dataset/language_model/`.
Note that linebreaks are replaced by spaces in the language model.
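Once the corpus files exist, the n-gram model itself is trained with kenlm's `lmplz` binary. Below is a minimal hedged sketch driving it from Python; the 3-gram order and the paths are assumptions, and `--discount_fallback` is only useful for very small corpora.

```python
import subprocess
from pathlib import Path

corpus = Path("my_dataset/language_model/corpus_characters.txt")
arpa = Path("my_dataset/language_model/model_characters.arpa")

# lmplz reads the corpus on stdin and writes the ARPA model to stdout.
with corpus.open() as stdin, arpa.open("w") as stdout:
    subprocess.run(
        ["lmplz", "--order", "3", "--discount_fallback"],
        stdin=stdin,
        stdout=stdout,
        check=True,
    )
```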
......
......@@ -65,6 +65,7 @@ nav:
- Dataset tokens: usage/datasets/tokens.md
- Dataset extraction: usage/datasets/extract.md
- Dataset download: usage/datasets/download.md
- Dataset language model: usage/datasets/language_model.md
- Dataset analysis: usage/datasets/analyze.md
- Training:
- usage/train/index.md
......@@ -98,6 +99,10 @@ nav:
- Utils: ref/datasets/extract/utils.md
- Database management: ref/datasets/extract/db.md
- Exceptions: ref/datasets/extract/exceptions.md
- Language model:
- ref/datasets/language_model/index.md
- Build: ref/datasets/language_model/build.md
- Utils: ref/datasets/language_model/utils.md
- Tokens:
- ref/datasets/tokens/index.md
- Generate: ref/datasets/tokens/generate.md
......
......@@ -5,7 +5,6 @@
import json
import logging
import pickle
import re
from operator import attrgetter, methodcaller
from pathlib import Path
......@@ -20,9 +19,6 @@ from tests import FIXTURES, change_split_content
EXTRACTION_DATA_PATH = FIXTURES / "extraction"
ENTITY_TOKEN_SPACE = re.compile(r"[ⓢ|ⓕ|ⓑ] ")
TWO_SPACES_LM_REGEX = re.compile(r"▁ ▁")
@pytest.mark.parametrize(
"max_width, max_height, width, height, resize",
......@@ -46,140 +42,21 @@ def test_get_iiif_size_arg(max_width, max_height, width, height, resize, tmp_pat
)
@pytest.mark.parametrize("load_entities", [True, False])
@pytest.mark.parametrize("keep_spaces", [True, False])
@pytest.mark.parametrize(
"load_entities,keep_spaces,transcription_entities_worker_version,expected_subword_language_corpus,subword_vocab_size",
(
(
True,
True,
"worker_version_id",
"""▁ ⓢ l a u l ont ▁ ⓕ f r an c oi s ▁ ⓑ 8
▁ ⓢ c i re t ▁ ⓕ an t oi ne ▁ ⓑ 2 7
▁ ⓢ c i re t ▁ ⓕ m a r ie ▁ ⓑ 2 8
▁ ⓢ c i re t ▁ ⓕ m a r ie ▁ ⓑ 2
▁ ⓢ e u re s t on ▁ ⓕ so l an g e ▁ ⓑ 1 0
▁ ⓢ t e r ont u s s ie u x ▁ ⓕ j e an ▁ ⓑ 2
▁ ⓢ p re s s on e t ▁ ⓕ m a r ie ▁ ⓑ 1 2""",
40,
),
(
True,
False,
"worker_version_id",
"""▁ ⓢ l a u l ont ▁ ⓕ f r an c oi s ▁ ⓑ 8
▁ ⓢ c i re t ▁ ⓕ an t oi ne ▁ ⓑ 2 7
▁ ⓢ c i re t ▁ ⓕ m a r ie ▁ ⓑ 2 8
▁ ⓢ c i re t ▁ ⓕ m a r ie ▁ ⓑ 2
▁ ⓢ e u re s t on ▁ ⓕ so l an g e ▁ ⓑ 1 0
▁ ⓢ t e r ont u s s ie u x ▁ ⓕ j e an ▁ ⓑ 2
▁ ⓢ p re s s on e t ▁ ⓕ m a r ie ▁ ⓑ 1 2""",
40,
),
(
False,
True,
"worker_version_id",
"""▁ la u l ont ▁ f r an c oi s ▁ 8
▁ c i re t ▁ an t oi ne ▁ 2 7
▁ c i re t ▁ m a r ie ▁ 2 8
▁ c i re t ▁ m a r ie ▁ 2
▁ e u res t on ▁ so l an g e ▁ 1 0
▁ t e r ont u ss ie u x ▁ j e an ▁ 2
▁ p res so ne t ▁ m a r ie ▁ 1 2""",
40,
),
(
False,
False,
"worker_version_id",
"""▁ la u l ont ▁ f r an c oi s ▁ 8
▁ c i re t ▁ an t oi ne ▁ 2 7
▁ c i re t ▁ m a r ie ▁ 2 8
▁ c i re t ▁ m a r ie ▁ 2
▁ e u res t on ▁ so l an g e ▁ 1 0
▁ t e r ont u ss ie u x ▁ j e an ▁ 2
▁ p res so ne t ▁ m a r ie ▁ 1 2""",
40,
),
(
True,
True,
False,
"""▁ ⓢ L a u l o n t ▁ ⓕ F r a n c o i s ▁ ⓑ 8
▁ ⓢ C i r e t ▁ ⓕ A n t o i n e ▁ ⓑ 2 7
▁ ⓢ C i r e t ▁ ⓕ M a r ie ▁ ⓑ 2 8
▁ ⓢ C i r e t ▁ ⓕ M a r ie ▁ ⓑ 2
▁ ⓢ E u r e s t o n ▁ ⓕ S o l a n g e ▁ ⓑ 1 0
▁ ⓢ T e r o n t u s s ie u x ▁ ⓕ J e a n ▁ ⓑ 2
▁ ⓢ P r e s s o n e t ▁ ⓕ M a r ie ▁ ⓑ 1 2""",
40,
),
(
True,
True,
False,
"""▁ ⓢ L a u l ont ▁ ⓕ F r an c oi s ▁ ⓑ 8
▁ ⓢ C i re t ▁ ⓕ A n t oi n e ▁ ⓑ 2 7
▁ ⓢ C i re t ▁ ⓕ M a r ie ▁ ⓑ 2 8
▁ ⓢ C i re t ▁ ⓕ M a r ie ▁ ⓑ 2
▁ ⓢ E u re s t on ▁ ⓕ S o l an g e ▁ ⓑ 1 0
▁ ⓢ T e r ont u s s ie u x ▁ ⓕ J e an ▁ ⓑ 2
▁ ⓢ P re s s on e t ▁ ⓕ M a r ie ▁ ⓑ 1 2""",
45,
),
(
True,
False,
False,
"""▁ ⓢ L a u l o n t ▁ ⓕ F r a n c o i s ▁ ⓑ 8
▁ ⓢ C i r e t ▁ ⓕ A n t o i n e ▁ ⓑ 2 7
▁ ⓢ C i r e t ▁ ⓕ M a r ie ▁ ⓑ 2 8
▁ ⓢ C i r e t ▁ ⓕ M a r ie ▁ ⓑ 2
▁ ⓢ E u r e s t o n ▁ ⓕ S o l a n g e ▁ ⓑ 1 0
▁ ⓢ T e r o n t u s s ie u x ▁ ⓕ J e a n ▁ ⓑ 2
▁ ⓢ P r e s s o n e t ▁ ⓕ M a r ie ▁ ⓑ 1 2""",
40,
),
(
False,
True,
False,
"""▁ L a u l ont ▁ F r an c oi s ▁ 8
▁ C i re t ▁ A n t oi n e ▁ 2 7
▁ C i re t ▁ M a r ie ▁ 2 8
▁ C i re t ▁ M a r ie ▁ 2
▁ E u re s t on ▁ S o l an g e ▁ 1 0
▁ T e r ont u s s ie u x ▁ J e an ▁ 2
▁ P re s s on e t ▁ M a r ie ▁ 1 2""",
40,
),
(
False,
False,
False,
"""▁ L a u l ont ▁ F r an c oi s ▁ 8
▁ C i re t ▁ A n t oi n e ▁ 2 7
▁ C i re t ▁ M a r ie ▁ 2 8
▁ C i re t ▁ M a r ie ▁ 2
▁ E u re s t on ▁ S o l an g e ▁ 1 0
▁ T e r ont u s s ie u x ▁ J e an ▁ 2
▁ P re s s on e t ▁ M a r ie ▁ 1 2""",
40,
),
),
"transcription_entities_worker_version", ["worker_version_id", False]
)
def test_download(
load_entities,
keep_spaces,
transcription_entities_worker_version,
expected_subword_language_corpus,
subword_vocab_size,
split_content,
monkeypatch,
tmp_path,
):
output = tmp_path / "download"
(output / "language_model").mkdir(parents=True, exist_ok=True)
output.mkdir(parents=True, exist_ok=True)
# Mock tokens
tokens_path = EXTRACTION_DATA_PATH / "tokens.yml"
......@@ -236,8 +113,6 @@ def test_download(
extractor = ImageDownloader(
output=output,
image_extension=".jpg",
tokens=tokens_path if load_entities else None,
subword_vocab_size=subword_vocab_size,
)
# Mock build_image_url to simply return the path to the image
extractor.build_iiif_url = mock_build_image_url
......@@ -271,16 +146,6 @@ def test_download(
VAL_DIR / "val-page_1-line_2.jpg",
VAL_DIR / "val-page_1-line_3.jpg",
output / "labels.json",
# Language resources
output / "language_model" / "corpus_characters.txt",
output / "language_model" / "corpus_subwords.txt",
output / "language_model" / "corpus_words.txt",
output / "language_model" / "lexicon_characters.txt",
output / "language_model" / "lexicon_subwords.txt",
output / "language_model" / "lexicon_words.txt",
output / "language_model" / "subword_tokenizer.model",
output / "language_model" / "subword_tokenizer.vocab",
output / "language_model" / "tokens.txt",
output / "split.json",
]
assert sorted(filter(methodcaller("is_file"), output.rglob("*"))) == expected_paths
......@@ -298,97 +163,6 @@ def test_download(
# Check "labels.json"
assert json.loads((output / "labels.json").read_text()) == expected_labels
# Check "language_corpus.txt"
expected_char_language_corpus = """ⓢ L a u l o n t ▁ ▁ ⓕ F r a n c o i s ▁ ▁ ⓑ 8
ⓢ C i r e t ▁ ▁ ⓕ A n t o i n e ▁ ▁ ⓑ 2 7
ⓢ C i r e t ▁ ▁ ⓕ M a r i e ▁ ▁ ⓑ 2 8
ⓢ C i r e t ▁ ▁ ⓕ M a r i e ▁ ▁ ⓑ 2
ⓢ E u r e s t o n ▁ ▁ ⓕ S o l a n g e ▁ ▁ ⓑ 1 0
ⓢ T e r o n t u s s i e u x ▁ ▁ ⓕ J e a n ▁ ▁ ⓑ 2
ⓢ P r e s s o n e t ▁ ▁ ⓕ M a r i e ▁ ▁ ⓑ 1 2"""
expected_word_language_corpus = """ⓢ Laulont ▁ ⓕ Francois ▁ ⓑ 8
ⓢ Ciret ▁ ⓕ Antoine ▁ ⓑ 27
ⓢ Ciret ▁ ⓕ Marie ▁ ⓑ 28
ⓢ Ciret ▁ ⓕ Marie ▁ ⓑ 2
ⓢ Eureston ▁ ⓕ Solange ▁ ⓑ 10
ⓢ Terontussieux ▁ ⓕ Jean ▁ ⓑ 2
ⓢ Pressonet ▁ ⓕ Marie ▁ ⓑ 12"""
# Transcriptions with worker version are in lowercase
if transcription_entities_worker_version:
expected_char_language_corpus = expected_char_language_corpus.lower()
expected_word_language_corpus = expected_word_language_corpus.lower()
expected_subword_language_corpus = expected_subword_language_corpus.lower()
# If we do not load entities, remove tokens
if not load_entities:
expected_char_language_corpus = ENTITY_TOKEN_SPACE.sub(
"", expected_char_language_corpus
)
expected_word_language_corpus = ENTITY_TOKEN_SPACE.sub(
"", expected_word_language_corpus
)
expected_subword_language_corpus = ENTITY_TOKEN_SPACE.sub(
"", expected_subword_language_corpus
)
# Replace double spaces with regular space
if not keep_spaces:
expected_char_language_corpus = TWO_SPACES_LM_REGEX.sub(
"", expected_char_language_corpus
)
expected_word_language_corpus = TWO_SPACES_LM_REGEX.sub(
"", expected_word_language_corpus
)
expected_subword_language_corpus = TWO_SPACES_LM_REGEX.sub(
"", expected_subword_language_corpus
)
assert (
output / "language_model" / "corpus_characters.txt"
).read_text() == expected_char_language_corpus
assert (
output / "language_model" / "corpus_words.txt"
).read_text() == expected_word_language_corpus
assert (
output / "language_model" / "corpus_subwords.txt"
).read_text() == expected_subword_language_corpus
# Check "language_tokens.txt"
expected_language_tokens = [
"" if t.isspace() else t for t in sorted(list(expected_charset))
]
expected_language_tokens.append("")
assert (output / "language_model" / "tokens.txt").read_text() == "\n".join(
expected_language_tokens
)
# Check "language_lexicon.txt"
expected_language_char_lexicon = [f"{t} {t}" for t in expected_language_tokens]
assert (
output / "language_model" / "lexicon_characters.txt"
).read_text() == "\n".join(expected_language_char_lexicon)
word_vocab = set([word for word in expected_word_language_corpus.split()])
expected_language_word_lexicon = [
f"{word} {' '.join(word)}" for word in sorted(word_vocab)
]
assert (output / "language_model" / "lexicon_words.txt").read_text() == "\n".join(
expected_language_word_lexicon
)
subword_vocab = set(
[subword for subword in expected_subword_language_corpus.split()]
)
expected_language_subword_lexicon = [
f"{subword} {' '.join(subword)}" for subword in sorted(subword_vocab)
]
assert (
output / "language_model" / "lexicon_subwords.txt"
).read_text() == "\n".join(expected_language_subword_lexicon)
# Check cropped images
for expected_path in expected_paths:
if expected_path.suffix != ".jpg":
......
# Copyright Teklia (contact@teklia.com) & Denis Coquenet
# This code is licensed under CeCILL-C
# -*- coding: utf-8 -*-
import json
import pickle
import re
from operator import methodcaller
import pytest
from dan.datasets.language_model.build import LanguageModelBuilder
from dan.utils import parse_tokens
from tests import FIXTURES, change_split_content
EXTRACTION_DATA_PATH = FIXTURES / "extraction"
ENTITY_TOKEN_SPACE = re.compile(r"[ⓢ|ⓕ|ⓑ] ")
TWO_SPACES_LM_REGEX = re.compile(r"▁ ▁")
@pytest.mark.parametrize(
"load_entities,transcription_entities_worker_version,expected_subword_language_corpus",
(
(
True,
"worker_version_id",
"""▁ ⓢ l a u l ont ▁ ⓕ f r an c oi s ▁ ⓑ 8
▁ ⓢ c i re t ▁ ⓕ an t oi ne ▁ ⓑ 2 7
▁ ⓢ c i re t ▁ ⓕ m a r ie ▁ ⓑ 2 8
▁ ⓢ c i re t ▁ ⓕ m a r ie ▁ ⓑ 2
▁ ⓢ e u re s t on ▁ ⓕ so l an g e ▁ ⓑ 1 0
▁ ⓢ t e r ont u s s ie u x ▁ ⓕ j e an ▁ ⓑ 2
▁ ⓢ p re s s on e t ▁ ⓕ m a r ie ▁ ⓑ 1 2""",
),
(
False,
"worker_version_id",
"""▁ la u l ont ▁ f r an c oi s ▁ 8
▁ c i re t ▁ an t oi ne ▁ 2 7
▁ c i re t ▁ m a r ie ▁ 2 8
▁ c i re t ▁ m a r ie ▁ 2
▁ e u res t on ▁ so l an g e ▁ 1 0
▁ t e r ont u ss ie u x ▁ j e an ▁ 2
▁ p res so ne t ▁ m a r ie ▁ 1 2""",
),
(
True,
False,
"""▁ ⓢ L a u l o n t ▁ ⓕ F r a n c o i s ▁ ⓑ 8
▁ ⓢ C i r e t ▁ ⓕ A n t o i n e ▁ ⓑ 2 7
▁ ⓢ C i r e t ▁ ⓕ M a r ie ▁ ⓑ 2 8
▁ ⓢ C i r e t ▁ ⓕ M a r ie ▁ ⓑ 2
▁ ⓢ E u r e s t o n ▁ ⓕ S o l a n g e ▁ ⓑ 1 0
▁ ⓢ T e r o n t u s s ie u x ▁ ⓕ J e a n ▁ ⓑ 2
▁ ⓢ P r e s s o n e t ▁ ⓕ M a r ie ▁ ⓑ 1 2""",
),
(
False,
False,
"""▁ L a u l ont ▁ F r an c oi s ▁ 8
▁ C i re t ▁ A n t oi n e ▁ 2 7
▁ C i re t ▁ M a r ie ▁ 2 8
▁ C i re t ▁ M a r ie ▁ 2
▁ E u re s t on ▁ S o l an g e ▁ 1 0
▁ T e r ont u s s ie u x ▁ J e an ▁ 2
▁ P re s s on e t ▁ M a r ie ▁ 1 2""",
),
),
)
@pytest.mark.parametrize("keep_spaces", [True, False])
def test_language_model(
load_entities,
keep_spaces,
transcription_entities_worker_version,
expected_subword_language_corpus,
split_content,
tmp_path,
):
output = tmp_path / "build"
(output / "language_model").mkdir(parents=True, exist_ok=True)
# Mock tokens
tokens_path = EXTRACTION_DATA_PATH / "tokens.yml"
tokens = [
token
for entity_type in parse_tokens(tokens_path).values()
for token in [entity_type.start, entity_type.end]
if token
]
# Mock "labels.json"
_, labels_content = change_split_content(
load_entities,
transcription_entities_worker_version,
keep_spaces,
split_content,
tokens,
{
"test": {
"images/test/dataset_id/test-page_1-line_1.jpg": "ⓢLeunaut ⓕClau⁇e ⓑ⁇⁇",
"images/test/dataset_id/test-page_1-line_2.jpg": "ⓢ⁇aurac⁇o ⓕClau⁇ine ⓑ⁇⁇",
"images/test/dataset_id/test-page_1-line_3.jpg": "ⓢLaurent ⓕJac⁇use ⓑ21",
"images/test/dataset_id/test-page_2-line_1.jpg": "ⓢ⁇alette ⓕElisa⁇et⁇ ⓑ7⁇",
"images/test/dataset_id/test-page_2-line_2.jpg": "ⓢTan⁇ol ⓕJean ⓑ7⁇",
"images/test/dataset_id/test-page_2-line_3.jpg": "ⓢ⁇auret ⓕJean ⓑ⁇⁇",
},
"train": {
"images/train/dataset_id/train-page_1-line_1.jpg": "ⓢLaulont ⓕFrancois ⓑ8",
"images/train/dataset_id/train-page_1-line_2.jpg": "ⓢCiret ⓕAntoine ⓑ27",
"images/train/dataset_id/train-page_1-line_3.jpg": "ⓢCiret ⓕMarie ⓑ28",
"images/train/dataset_id/train-page_1-line_4.jpg": "ⓢCiret ⓕMarie ⓑ2",
"images/train/dataset_id/train-page_2-line_1.jpg": "ⓢEureston ⓕSolange ⓑ10",
"images/train/dataset_id/train-page_2-line_2.jpg": "ⓢTerontussieux ⓕJean ⓑ2",
"images/train/dataset_id/train-page_2-line_3.jpg": "ⓢPressonet ⓕMarie ⓑ12",
},
"val": {
"images/val/dataset_id/val-page_1-line_1.jpg": "ⓢCirau⁇ ⓕAntoine ⓑ⁇⁇",
"images/val/dataset_id/val-page_1-line_2.jpg": "ⓢCirau⁇ ⓕPriser ⓑ⁇⁇",
"images/val/dataset_id/val-page_1-line_3.jpg": "ⓢCirau⁇ ⓕElisa⁇et⁇ ⓑ⁇⁇",
},
},
)
(output / "labels.json").write_text(json.dumps(labels_content))
# Mock "charset.pkl"
expected_charset = {""}
for value in labels_content["train"].values():
expected_charset.update(set(value))
if load_entities:
expected_charset.update(tokens)
(output / "charset.pkl").write_bytes(pickle.dumps(sorted(list(expected_charset))))
extractor = LanguageModelBuilder(
output=output,
tokens=tokens_path if load_entities else None,
subword_vocab_size=40,
)
extractor.run()
# Check files
expected_paths = [
# Previous files
output / "charset.pkl",
output / "labels.json",
# Language resources
output / "language_model" / "corpus_characters.txt",
output / "language_model" / "corpus_subwords.txt",
output / "language_model" / "corpus_words.txt",
output / "language_model" / "lexicon_characters.txt",
output / "language_model" / "lexicon_subwords.txt",
output / "language_model" / "lexicon_words.txt",
output / "language_model" / "subword_tokenizer.model",
output / "language_model" / "subword_tokenizer.vocab",
output / "language_model" / "tokens.txt",
]
assert sorted(filter(methodcaller("is_file"), output.rglob("*"))) == expected_paths
# Check "language_corpus.txt"
expected_char_language_corpus = """ⓢ L a u l o n t ▁ ▁ ⓕ F r a n c o i s ▁ ▁ ⓑ 8
ⓢ C i r e t ▁ ▁ ⓕ A n t o i n e ▁ ▁ ⓑ 2 7
ⓢ C i r e t ▁ ▁ ⓕ M a r i e ▁ ▁ ⓑ 2 8
ⓢ C i r e t ▁ ▁ ⓕ M a r i e ▁ ▁ ⓑ 2
ⓢ E u r e s t o n ▁ ▁ ⓕ S o l a n g e ▁ ▁ ⓑ 1 0
ⓢ T e r o n t u s s i e u x ▁ ▁ ⓕ J e a n ▁ ▁ ⓑ 2
ⓢ P r e s s o n e t ▁ ▁ ⓕ M a r i e ▁ ▁ ⓑ 1 2"""
expected_word_language_corpus = """ⓢ Laulont ▁ ⓕ Francois ▁ ⓑ 8
ⓢ Ciret ▁ ⓕ Antoine ▁ ⓑ 27
ⓢ Ciret ▁ ⓕ Marie ▁ ⓑ 28
ⓢ Ciret ▁ ⓕ Marie ▁ ⓑ 2
ⓢ Eureston ▁ ⓕ Solange ▁ ⓑ 10
ⓢ Terontussieux ▁ ⓕ Jean ▁ ⓑ 2
ⓢ Pressonet ▁ ⓕ Marie ▁ ⓑ 12"""
# Transcriptions with worker version are in lowercase
if transcription_entities_worker_version:
expected_char_language_corpus = expected_char_language_corpus.lower()
expected_word_language_corpus = expected_word_language_corpus.lower()
expected_subword_language_corpus = expected_subword_language_corpus.lower()
# If we do not load entities, remove tokens
if not load_entities:
expected_char_language_corpus = ENTITY_TOKEN_SPACE.sub(
"", expected_char_language_corpus
)
expected_word_language_corpus = ENTITY_TOKEN_SPACE.sub(
"", expected_word_language_corpus
)
expected_subword_language_corpus = ENTITY_TOKEN_SPACE.sub(
"", expected_subword_language_corpus
)
# Replace double spaces with regular space
if not keep_spaces:
expected_char_language_corpus = TWO_SPACES_LM_REGEX.sub(
"", expected_char_language_corpus
)
expected_word_language_corpus = TWO_SPACES_LM_REGEX.sub(
"", expected_word_language_corpus
)
expected_subword_language_corpus = TWO_SPACES_LM_REGEX.sub(
"", expected_subword_language_corpus
)
assert (
output / "language_model" / "corpus_characters.txt"
).read_text() == expected_char_language_corpus
assert (
output / "language_model" / "corpus_words.txt"
).read_text() == expected_word_language_corpus
assert (
output / "language_model" / "corpus_subwords.txt"
).read_text() == expected_subword_language_corpus
# Check "language_tokens.txt"
expected_language_tokens = [
"" if t.isspace() else t for t in sorted(list(expected_charset))
]
expected_language_tokens.append("")
assert (output / "language_model" / "tokens.txt").read_text() == "\n".join(
expected_language_tokens
)
# Check "language_lexicon.txt"
expected_language_char_lexicon = [f"{t} {t}" for t in expected_language_tokens]
assert (
output / "language_model" / "lexicon_characters.txt"
).read_text() == "\n".join(expected_language_char_lexicon)
word_vocab = set([word for word in expected_word_language_corpus.split()])
expected_language_word_lexicon = [
f"{word} {' '.join(word)}" for word in sorted(word_vocab)
]
assert (output / "language_model" / "lexicon_words.txt").read_text() == "\n".join(
expected_language_word_lexicon
)
subword_vocab = set(
[subword for subword in expected_subword_language_corpus.split()]
)
expected_language_subword_lexicon = [
f"{subword} {' '.join(subword)}" for subword in sorted(subword_vocab)
]
assert (
output / "language_model" / "lexicon_subwords.txt"
).read_text() == "\n".join(expected_language_subword_lexicon)
@pytest.mark.parametrize(
"expected_subword_language_corpus,subword_vocab_size",
(
(
"""▁ ⓢ L a u l o n t ▁ ⓕ F r a n c o i s ▁ ⓑ 8
▁ ⓢ C i r e t ▁ ⓕ A n t o i n e ▁ ⓑ 2 7
▁ ⓢ C i r e t ▁ ⓕ M a r ie ▁ ⓑ 2 8
▁ ⓢ C i r e t ▁ ⓕ M a r ie ▁ ⓑ 2
▁ ⓢ E u r e s t o n ▁ ⓕ S o l a n g e ▁ ⓑ 1 0
▁ ⓢ T e r o n t u s s ie u x ▁ ⓕ J e a n ▁ ⓑ 2
▁ ⓢ P r e s s o n e t ▁ ⓕ M a r ie ▁ ⓑ 1 2""",
40,
),
(
"""▁ ⓢ L a u l ont ▁ ⓕ F r an c oi s ▁ ⓑ 8
▁ ⓢ C i re t ▁ ⓕ A n t oi n e ▁ ⓑ 2 7
▁ ⓢ C i re t ▁ ⓕ M a r ie ▁ ⓑ 2 8
▁ ⓢ C i re t ▁ ⓕ M a r ie ▁ ⓑ 2
▁ ⓢ E u re s t on ▁ ⓕ S o l an g e ▁ ⓑ 1 0
▁ ⓢ T e r ont u s s ie u x ▁ ⓕ J e an ▁ ⓑ 2
▁ ⓢ P re s s on e t ▁ ⓕ M a r ie ▁ ⓑ 1 2""",
45,
),
),
)
@pytest.mark.parametrize("keep_spaces", [True, False])
def test_language_model_subword_vocab_size(
keep_spaces,
expected_subword_language_corpus,
subword_vocab_size,
split_content,
tmp_path,
):
output = tmp_path / "build"
(output / "language_model").mkdir(parents=True, exist_ok=True)
# Mock tokens
tokens_path = EXTRACTION_DATA_PATH / "tokens.yml"
tokens = [
token
for entity_type in parse_tokens(tokens_path).values()
for token in [entity_type.start, entity_type.end]
if token
]
# Mock "labels.json"
_, labels_content = change_split_content(
True,
False,
keep_spaces,
split_content,
tokens,
{
"test": {
"images/test/dataset_id/test-page_1-line_1.jpg": "ⓢLeunaut ⓕClau⁇e ⓑ⁇⁇",
"images/test/dataset_id/test-page_1-line_2.jpg": "ⓢ⁇aurac⁇o ⓕClau⁇ine ⓑ⁇⁇",
"images/test/dataset_id/test-page_1-line_3.jpg": "ⓢLaurent ⓕJac⁇use ⓑ21",
"images/test/dataset_id/test-page_2-line_1.jpg": "ⓢ⁇alette ⓕElisa⁇et⁇ ⓑ7⁇",
"images/test/dataset_id/test-page_2-line_2.jpg": "ⓢTan⁇ol ⓕJean ⓑ7⁇",
"images/test/dataset_id/test-page_2-line_3.jpg": "ⓢ⁇auret ⓕJean ⓑ⁇⁇",
},
"train": {
"images/train/dataset_id/train-page_1-line_1.jpg": "ⓢLaulont ⓕFrancois ⓑ8",
"images/train/dataset_id/train-page_1-line_2.jpg": "ⓢCiret ⓕAntoine ⓑ27",
"images/train/dataset_id/train-page_1-line_3.jpg": "ⓢCiret ⓕMarie ⓑ28",
"images/train/dataset_id/train-page_1-line_4.jpg": "ⓢCiret ⓕMarie ⓑ2",
"images/train/dataset_id/train-page_2-line_1.jpg": "ⓢEureston ⓕSolange ⓑ10",
"images/train/dataset_id/train-page_2-line_2.jpg": "ⓢTerontussieux ⓕJean ⓑ2",
"images/train/dataset_id/train-page_2-line_3.jpg": "ⓢPressonet ⓕMarie ⓑ12",
},
"val": {
"images/val/dataset_id/val-page_1-line_1.jpg": "ⓢCirau⁇ ⓕAntoine ⓑ⁇⁇",
"images/val/dataset_id/val-page_1-line_2.jpg": "ⓢCirau⁇ ⓕPriser ⓑ⁇⁇",
"images/val/dataset_id/val-page_1-line_3.jpg": "ⓢCirau⁇ ⓕElisa⁇et⁇ ⓑ⁇⁇",
},
},
)
(output / "labels.json").write_text(json.dumps(labels_content))
# Mock "charset.pkl"
expected_charset = {""}
for value in labels_content["train"].values():
expected_charset.update(set(value))
expected_charset.update(tokens)
(output / "charset.pkl").write_bytes(pickle.dumps(sorted(list(expected_charset))))
extractor = LanguageModelBuilder(
output=output,
tokens=tokens_path,
subword_vocab_size=subword_vocab_size,
)
extractor.run()
# Check files
expected_paths = [
# Previous files
output / "charset.pkl",
output / "labels.json",
# Language resources
output / "language_model" / "corpus_characters.txt",
output / "language_model" / "corpus_subwords.txt",
output / "language_model" / "corpus_words.txt",
output / "language_model" / "lexicon_characters.txt",
output / "language_model" / "lexicon_subwords.txt",
output / "language_model" / "lexicon_words.txt",
output / "language_model" / "subword_tokenizer.model",
output / "language_model" / "subword_tokenizer.vocab",
output / "language_model" / "tokens.txt",
]
assert sorted(filter(methodcaller("is_file"), output.rglob("*"))) == expected_paths
# Check "language_corpus.txt"
expected_char_language_corpus = """ⓢ L a u l o n t ▁ ▁ ⓕ F r a n c o i s ▁ ▁ ⓑ 8
ⓢ C i r e t ▁ ▁ ⓕ A n t o i n e ▁ ▁ ⓑ 2 7
ⓢ C i r e t ▁ ▁ ⓕ M a r i e ▁ ▁ ⓑ 2 8
ⓢ C i r e t ▁ ▁ ⓕ M a r i e ▁ ▁ ⓑ 2
ⓢ E u r e s t o n ▁ ▁ ⓕ S o l a n g e ▁ ▁ ⓑ 1 0
ⓢ T e r o n t u s s i e u x ▁ ▁ ⓕ J e a n ▁ ▁ ⓑ 2
ⓢ P r e s s o n e t ▁ ▁ ⓕ M a r i e ▁ ▁ ⓑ 1 2"""
expected_word_language_corpus = """ⓢ Laulont ▁ ⓕ Francois ▁ ⓑ 8
ⓢ Ciret ▁ ⓕ Antoine ▁ ⓑ 27
ⓢ Ciret ▁ ⓕ Marie ▁ ⓑ 28
ⓢ Ciret ▁ ⓕ Marie ▁ ⓑ 2
ⓢ Eureston ▁ ⓕ Solange ▁ ⓑ 10
ⓢ Terontussieux ▁ ⓕ Jean ▁ ⓑ 2
ⓢ Pressonet ▁ ⓕ Marie ▁ ⓑ 12"""
# Replace double spaces with regular space
if not keep_spaces:
expected_char_language_corpus = TWO_SPACES_LM_REGEX.sub(
"", expected_char_language_corpus
)
expected_word_language_corpus = TWO_SPACES_LM_REGEX.sub(
"", expected_word_language_corpus
)
expected_subword_language_corpus = TWO_SPACES_LM_REGEX.sub(
"", expected_subword_language_corpus
)
assert (
output / "language_model" / "corpus_characters.txt"
).read_text() == expected_char_language_corpus
assert (
output / "language_model" / "corpus_words.txt"
).read_text() == expected_word_language_corpus
assert (
output / "language_model" / "corpus_subwords.txt"
).read_text() == expected_subword_language_corpus
# Check "language_tokens.txt"
expected_language_tokens = [
"" if t.isspace() else t for t in sorted(list(expected_charset))
]
expected_language_tokens.append("")
assert (output / "language_model" / "tokens.txt").read_text() == "\n".join(
expected_language_tokens
)
# Check "language_lexicon.txt"
expected_language_char_lexicon = [f"{t} {t}" for t in expected_language_tokens]
assert (
output / "language_model" / "lexicon_characters.txt"
).read_text() == "\n".join(expected_language_char_lexicon)
word_vocab = set([word for word in expected_word_language_corpus.split()])
expected_language_word_lexicon = [
f"{word} {' '.join(word)}" for word in sorted(word_vocab)
]
assert (output / "language_model" / "lexicon_words.txt").read_text() == "\n".join(
expected_language_word_lexicon
)
subword_vocab = set(
[subword for subword in expected_subword_language_corpus.split()]
)
expected_language_subword_lexicon = [
f"{subword} {' '.join(subword)}" for subword in sorted(subword_vocab)
]
assert (
output / "language_model" / "lexicon_subwords.txt"
).read_text() == "\n".join(expected_language_subword_lexicon)