Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: atr/dan
Commits on Source (6)
Showing 437 additions and 104 deletions
*.gif filter=lfs diff=lfs merge=lfs -text
**/*.pt filter=lfs diff=lfs merge=lfs -text
tests/data/prediction/language_model.arpa filter=lfs diff=lfs merge=lfs -text
@@ -56,7 +56,7 @@ To apply DAN to an image, one needs to first add a few imports and to load an im
```python
import cv2
from dan.ocr.predict.prediction import DAN
from dan.ocr.predict.inference import DAN
image = cv2.cvtColor(cv2.imread(IMAGE_PATH), cv2.COLOR_BGR2RGB)
```
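Once the image is loaded, the model can be loaded and applied. The sketch below is illustrative only: `MODEL_PATH` is a placeholder, and the `load`/`predict` call signatures are assumptions rather than the documented API, so refer to the prediction documentation for the exact calls.

```python
# Hedged sketch only: the load()/predict() signatures below are assumptions.
model = DAN("cpu")                   # target device
model.load(MODEL_PATH, mode="eval")  # hypothetical loading call
text, confidences = model.predict(image, confidences=True)  # hypothetical prediction call
print(text)
```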
@@ -84,16 +84,16 @@ This package provides three subcommands. To get more information about any subco
### Get started
See the [dedicated section](https://atr.pages.teklia.com/dan/get_started/training/) on the official DAN documentation.
See the [dedicated page](https://atr.pages.teklia.com/dan/get_started/training/) on the official DAN documentation.
### Data extraction from Arkindex
See the [dedicated section](https://atr.pages.teklia.com/dan/usage/datasets/extract/) on the official DAN documentation.
See the [dedicated page](https://atr.pages.teklia.com/dan/usage/datasets/extract/) on the official DAN documentation.
### Model training
See the [dedicated section](https://atr.pages.teklia.com/dan/usage/train/) on the official DAN documentation.
See the [dedicated page](https://atr.pages.teklia.com/dan/usage/train/) on the official DAN documentation.
### Model prediction
See the [dedicated section](https://atr.pages.teklia.com/dan/usage/predict/) on the official DAN documentation.
See the [dedicated page](https://atr.pages.teklia.com/dan/usage/predict/) on the official DAN documentation.
@@ -4,7 +4,9 @@ Preprocess datasets for training.
"""
from dan.datasets.analyze import add_analyze_parser
from dan.datasets.entities import add_entities_parser
from dan.datasets.extract import add_extract_parser
from dan.datasets.tokens import add_tokens_parser
def add_dataset_parser(subcommands) -> None:
@@ -17,3 +19,5 @@ def add_dataset_parser(subcommands) -> None:
add_extract_parser(subcommands)
add_analyze_parser(subcommands)
add_entities_parser(subcommands)
add_tokens_parser(subcommands)
# -*- coding: utf-8 -*-
"""
Extract entities from Arkindex using a corpus export.
"""
from pathlib import Path
from dan.datasets.entities.extract import run
def add_entities_parser(subcommands) -> None:
parser = subcommands.add_parser(
"entities",
description=__doc__,
help=__doc__,
)
parser.add_argument(
"database",
type=Path,
help="Path where the data were exported from Arkindex.",
)
parser.add_argument(
"--output-file",
type=Path,
default=Path("entities.yml"),
required=False,
help="Path to a YAML file to save the extracted entities.",
)
parser.set_defaults(func=run)
# -*- coding: utf-8 -*-
from operator import itemgetter
from pathlib import Path
import yaml
from arkindex_export import EntityType, open_database
def run(database: Path, output_file: Path) -> None:
# Load SQLite database
open_database(database)
# Extract and save entities to YAML
entities = list(
map(itemgetter(0), EntityType.select(EntityType.name).distinct().tuples())
)
output_file.write_text(
yaml.safe_dump({"entities": entities}, explicit_start=True, allow_unicode=True)
)
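For illustration, the YAML written by this command can be read back as a flat list of entity type names (the file name below is the default `--output-file` value; the printed values are made up):

```python
# Read back the entities written by the command above (default output name).
import yaml
from pathlib import Path

entities = yaml.safe_load(Path("entities.yml").read_text())["entities"]
print(entities)  # e.g. ['firstname', 'surname'] -- depends on the Arkindex export
```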
# -*- coding: utf-8 -*-
"""
Extract dataset from Arkindex using API.
Extract dataset from Arkindex using a corpus export.
"""
import argparse
import pathlib
from uuid import UUID
from dan.datasets.extract.extract import run
from dan.datasets.extract.arkindex import run
MANUAL_SOURCE = "manual"
@@ -144,7 +144,14 @@ def add_extract_parser(subcommands) -> None:
parser.add_argument(
"--max-height",
type=int,
help="Images larger than this height will be resized to this width.",
help="Images larger than this height will be resized to this height.",
)
parser.add_argument(
"--subword-vocab-size",
type=int,
default=1000,
help="Size of the vocabulary to train the sentencepiece subword tokenizer needed for language model.",
)
# Formatting arguments
......
@@ -30,8 +30,10 @@ from dan.datasets.extract.exceptions import (
UnknownTokenInText,
)
from dan.datasets.extract.utils import (
Tokenizer,
download_image,
get_bbox,
get_vocabulary,
insert_token,
normalize_linebreaks,
normalize_spaces,
@@ -77,6 +79,7 @@ class ArkindexExtractor:
keep_spaces: bool = False,
image_extension: str = "",
allow_empty: bool = False,
subword_vocab_size: int = 1000,
) -> None:
self.folders = folders
self.element_type = element_type
@@ -92,14 +95,14 @@ class ArkindexExtractor:
self.image_extension = image_extension
self.allow_empty = allow_empty
self.mapping = LMTokenMapping()
self.keep_spaces = keep_spaces
self.subword_vocab_size = subword_vocab_size
self.data: Dict = defaultdict(dict)
self.charset = set()
self.language_corpus = []
self.language_corpus = defaultdict(list)
self.language_tokens = []
self.language_lexicon = []
self.language_lexicon = defaultdict(list)
# Image download tasks to process
self.tasks: List[Dict[str, str]] = []
@@ -275,12 +278,6 @@ class ArkindexExtractor:
)
return text.strip()
def format_text_language_model(self, text: str):
"""
Format text for the language model. Return the text tokenized at character-level.
"""
return " ".join(map(self.mapping.encode_token, list(text.strip())))
def process_element(
self,
element: Element,
@@ -319,10 +316,6 @@ class ArkindexExtractor:
self.data[split][str(image_path)] = text
self.charset = self.charset.union(set(text))
# Language model should be built using only text from the training set
if split == "train":
self.language_corpus.append(self.format_text_language_model(text))
def process_parent(
self,
pbar,
@@ -361,6 +354,11 @@ class ArkindexExtractor:
"""
Convert charset to a LM-compatible charset. Ensure that special LM tokens do not appear in the charset.
"""
logger.info("Preparing language resources")
# Add unknown token to charset
self.charset.add(self.unknown_token)
# Build LM tokens
for token in sorted(list(self.charset)):
assert (
token not in self.mapping.encode.values()
@@ -368,15 +366,40 @@
self.language_tokens.append(
self.mapping.encode[token]
) if token in self.mapping.encode else self.language_tokens.append(token)
# Add the special blank token
self.language_tokens.append(self.mapping.ctc.encoded)
# Build lexicon
assert all(
[len(token) == 1 for token in self.language_lexicon]
), "Tokens should be single characters."
self.language_lexicon = [f"{token} {token}" for token in self.language_tokens]
# Build LM corpus
train_corpus = [
text.replace(self.mapping.linebreak.display, self.mapping.space.display)
for text in self.data["train"].values()
]
tokenizer = Tokenizer(
training_corpus=train_corpus,
charset=self.language_tokens,
unknown_token=self.unknown_token,
outdir=self.output / "language_model",
mapping=self.mapping,
tokens=self.tokens,
subword_vocab_size=self.subword_vocab_size,
)
for level, tokenize in (
("characters", tokenizer.char_tokenize),
("words", tokenizer.word_tokenize),
("subwords", tokenizer.subword_tokenize),
):
self.language_corpus[level] = list(map(tokenize, train_corpus))
# Build LM lexicon
self.language_lexicon["characters"] = [
f"{token} {token}" for token in self.language_tokens
]
for level in ["words", "subwords"]:
self.language_lexicon[level] = [
f"{token} {tokenizer.char_tokenize(token)}"
for token in get_vocabulary(self.language_corpus[level])
]
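As an aside, each word- and subword-level lexicon line pairs a vocabulary entry with its character tokenization. A standalone sketch of that format, using a simplified stand-in for `tokenizer.char_tokenize` and made-up vocabulary:

```python
# Standalone illustration of the lexicon format built above (made-up vocabulary).
vocabulary = ["Paris", "1889"]
char_tokenize = lambda word: " ".join(word)  # simplified stand-in for tokenizer.char_tokenize
lexicon = [f"{word} {char_tokenize(word)}" for word in vocabulary]
print(lexicon)  # ['Paris P a r i s', '1889 1 8 8 9']
```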
def export(self):
(self.output / "labels.json").write_text(
......@@ -386,15 +409,16 @@ class ArkindexExtractor:
indent=4,
)
)
(self.output / "language_model" / "corpus.txt").write_text(
"\n".join(self.language_corpus)
)
for level in ["characters", "words", "subwords"]:
(self.output / "language_model" / f"corpus_{level}.txt").write_text(
"\n".join(self.language_corpus[level])
)
(self.output / "language_model" / f"lexicon_{level}.txt").write_text(
"\n".join(self.language_lexicon[level])
)
(self.output / "language_model" / "tokens.txt").write_text(
"\n".join(self.language_tokens)
)
(self.output / "language_model" / "lexicon.txt").write_text(
"\n".join(self.language_lexicon)
)
(self.output / "charset.pkl").write_bytes(
pickle.dumps(sorted(list(self.charset)))
)
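Since the charset is pickled as a sorted list of characters, it can be loaded back as follows (the `output` directory name is illustrative and corresponds to whatever output folder was passed to the extraction command):

```python
# Read back the pickled charset written by export(); "output" is an assumed directory name.
import pickle
from pathlib import Path

charset = pickle.loads((Path("output") / "charset.pkl").read_bytes())
print(len(charset), charset[:10])
```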
@@ -477,6 +501,7 @@ def run(
image_format: str,
keep_spaces: bool,
allow_empty: bool,
subword_vocab_size: int,
):
assert database.exists(), f"No file found @ {database}"
open_database(path=database)
@@ -503,4 +528,5 @@ def run(
keep_spaces=keep_spaces,
image_extension=image_format,
allow_empty=allow_empty,
subword_vocab_size=subword_vocab_size,
).run()
# -*- coding: utf-8 -*-
import itertools
import logging
import operator
import re
from dataclasses import dataclass, field
from io import BytesIO
from typing import List
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import Iterator, List, Optional, Union
import requests
import sentencepiece as spm
from nltk import wordpunct_tokenize
from PIL import Image, ImageOps
from tenacity import (
retry,
@@ -13,7 +20,7 @@ from tenacity import (
wait_exponential,
)
from dan.utils import EntityType
from dan.utils import EntityType, LMTokenMapping
logger = logging.getLogger(__name__)
@@ -117,3 +124,117 @@ def get_bbox(polygon: List[List[int]]) -> str:
x, y = min(all_x), min(all_y)
width, height = max(all_x) - x, max(all_y) - y
return ",".join(list(map(str, [int(x), int(y), int(width), int(height)])))
def get_vocabulary(tokenized_text: List[str]) -> List[str]:
"""
Compute the sorted vocabulary of a tokenized text corpus.
:param tokenized_text: List of tokenized documents (space-separated tokens).
"""
return sorted(set([token for doc in tokenized_text for token in doc.split()]))
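A quick illustration of `get_vocabulary` on a toy tokenized corpus:

```python
from dan.datasets.extract.utils import get_vocabulary

# Each document is a space-separated token string; the result is the sorted token set.
print(get_vocabulary(["le chat", "le chien"]))  # ['chat', 'chien', 'le']
```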
@dataclass
class Tokenizer:
"""
A multi-level tokenizer (char, subword, word), where the subword tokenizer is trained using sentencepiece.
:param training_corpus: List of training text.
:param charset: List of tokens (characters) allowed in the language model.
:param unknown_token: Token used to replace characters that are not part of the charset.
:param outdir: Path to save the subword tokenizer.
:param mapping: Mapping between displayed and encoded versions of special characters.
:param tokens: Start and end tokens used to represent named entities.
:param subword_vocab_size: Size of the vocabulary used to train the subword tokenizer.
"""
training_corpus: List[str]
charset: List[str]
unknown_token: str
outdir: Path
mapping: LMTokenMapping
tokens: Optional[EntityType] = None
subword_vocab_size: int = 1000
sentencepiece_model: spm.SentencePieceProcessor = field(init=False)
@property
def prefix(self):
return self.outdir / "subword_tokenizer"
@property
def ner_tokens(self) -> Union[List[str], Iterator[str]]:
if self.tokens is None:
return []
return itertools.chain(
map(operator.attrgetter("start"), self.tokens.values()),
filter(
operator.truth, map(operator.attrgetter("end"), self.tokens.values())
),
)
@property
def mapping_tokens(self) -> List[str]:
return [token.encoded for token in self.mapping]
@property
def special_tokens(self) -> List[str]:
return list(set(itertools.chain(self.mapping_tokens, self.ner_tokens)))
def __post_init__(self) -> None:
"""
Train a sentencepiece model on the training corpus.
"""
# Write the corpus in a text file
logger.info("Training a sentencepiece model for subword tokenization")
with NamedTemporaryFile(dir=self.outdir, suffix=".txt", mode="w") as tmp:
tmp.write("\n".join(self.training_corpus))
tmp.flush()
spm.SentencePieceTrainer.train(
input=tmp.name,
vocab_size=self.subword_vocab_size,
model_prefix=self.prefix,
user_defined_symbols=self.special_tokens,
)
# Load the model
self.sentencepiece_model = spm.SentencePieceProcessor(
model_file=str(self.prefix.with_suffix(".model"))
)
def subword_tokenize(self, text: str) -> str:
"""
Tokenize into subwords. Sampling is disabled to ensure reproducibility.
"""
tokens = self.sentencepiece_model.encode(text, out_type=str)
return " ".join(map("".join, map(self.encode, tokens)))
def word_tokenize(self, text: str) -> str:
"""
Tokenize text into a string of space-separated words. Spaces (⎵) and NER tokens are considered as words.
:param text: Text to be tokenized.
"""
words = list(map("".join, map(self.encode, wordpunct_tokenize(text))))
return " ".join(
[
word + f" {self.mapping.space.encoded}"
if (i != len(words) - 1 and word not in self.ner_tokens)
else word
for i, word in enumerate(words)
]
)
def char_tokenize(self, text: str) -> str:
"""
Tokenize text into a string of space-separated characters.
:param text: Text to be tokenized.
"""
return " ".join(
[
char if char in self.charset else self.unknown_token
for char in self.encode(text)
]
)
def encode(self, text: List[str]) -> Iterator[str]:
"""
Encode special tokens.
:param text: Text to be encoded.
"""
return map(self.mapping.encode_token, text)
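The word level builds on NLTK's `wordpunct_tokenize`, which splits words and punctuation apart before `word_tokenize` re-inserts the encoded space token; for instance:

```python
from nltk import wordpunct_tokenize

# Punctuation is split off as separate tokens before spaces are re-encoded.
print(wordpunct_tokenize("Jean-Pierre, 1889."))  # ['Jean', '-', 'Pierre', ',', '1889', '.']
```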
# -*- coding: utf-8 -*-
"""
Generate the YAML file containing entities and their token(s) to train a DAN model
"""
from pathlib import Path
from dan.datasets.tokens.generate import run
def add_tokens_parser(subcommands) -> None:
parser = subcommands.add_parser(
"tokens",
description=__doc__,
help=__doc__,
)
parser.add_argument(
"entities",
type=Path,
help="Path to a YAML file containing the extracted entities.",
)
parser.add_argument(
"--end-tokens",
action="store_true",
help="Whether to generate end tokens along with starting tokens.",
)
parser.add_argument(
"--output-file",
type=Path,
default=Path("tokens.yml"),
required=False,
help="Path to a YAML file to save the entities and their token(s).",
)
parser.set_defaults(func=run)
# -*- coding: utf-8 -*-
from pathlib import Path
from typing import Iterable
import yaml
OFFSET = 86
LIMIT = 160
STARTING_TOKEN = "\u2460"
def get_token() -> Iterable[str]:
offset = OFFSET
while offset < LIMIT:
yield chr(ord(STARTING_TOKEN) + offset % LIMIT)
offset += 1
raise Exception(f"More than {LIMIT} tokens asked")
def run(entities: Path, end_tokens: bool, output_file: Path) -> None:
# Load extracted entities
entities = yaml.safe_load(entities.read_text())
# Generate associated starting/ending token
token_generator = get_token()
tokens = {}
for entity in entities.get("entities", []):
tokens[entity] = {
"start": next(token_generator),
"end": next(token_generator) if end_tokens else "",
}
# Save entities & tokens to YAML
output_file.write_text(
yaml.safe_dump(tokens, explicit_start=True, allow_unicode=True, sort_keys=False)
)
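To see which symbols are generated: with `OFFSET = 86`, the first yielded code point is `chr(0x2460 + 86)`, i.e. the circled capital letters. So for two entities extracted with `--end-tokens`, the first entity would map to start `Ⓐ` / end `Ⓑ` and the second to start `Ⓒ` / end `Ⓓ`.

```python
# The generator starts at chr(0x2460 + 86) == "Ⓐ" and walks through the circled letters.
from dan.datasets.tokens.generate import get_token

generator = get_token()
print(next(generator), next(generator), next(generator), next(generator))  # Ⓐ Ⓑ Ⓒ Ⓓ
```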
@@ -6,7 +6,7 @@ Predict on an image using a trained DAN model.
import pathlib
from dan.ocr.predict.attention import Level
from dan.ocr.predict.prediction import run
from dan.ocr.predict.inference import run
from dan.utils import parse_tokens
@@ -70,7 +70,7 @@ def add_predict_parser(subcommands) -> None:
"--temperature",
type=float,
default=1.0,
help="Temperature scaling scalar parameter",
help="Temperature scaling scalar parameter.",
required=False,
)
parser.add_argument(
@@ -104,7 +104,7 @@ def add_predict_parser(subcommands) -> None:
"--attention-map-scale",
type=float,
default=0.5,
help="Image scaling factor before creating the GIF",
help="Image scaling factor before creating the GIF.",
required=False,
)
parser.add_argument(
......
@@ -356,62 +356,62 @@ def process_batch(
logger.info("Prediction parsing...")
for idx, image_path in enumerate(image_batch):
predicted_text = prediction["text"][idx]
result = {"text": predicted_text}
# Return LM results
if use_language_model:
result["language_model"] = {
"text": prediction["language_model"]["text"][idx],
"confidence": prediction["language_model"]["confidence"][idx],
}
# Return extracted objects (coordinates, text, confidence)
if predict_objects:
result["objects"] = prediction["objects"][idx]
# Return mean confidence score
if confidence_score:
result["confidences"] = {}
char_confidences = prediction["confidences"][idx]
result["confidences"]["total"] = np.around(np.mean(char_confidences), 2)
for level in confidence_score_levels:
result["confidences"][level.value] = []
texts, confidences, _ = split_text_and_confidences(
predicted_text,
char_confidences,
level,
word_separators,
line_separators,
tokens,
)
for text, conf in zip(texts, confidences):
result["confidences"][level.value].append(
{"text": text, "confidence": conf}
result = {"text": predicted_text, "confidences": {}, "language_model": {}}
if predicted_text:
# Return LM results
if use_language_model:
result["language_model"] = {
"text": prediction["language_model"]["text"][idx],
"confidence": prediction["language_model"]["confidence"][idx],
}
# Return extracted objects (coordinates, text, confidence)
if predict_objects:
result["objects"] = prediction["objects"][idx]
# Return mean confidence score
if confidence_score:
char_confidences = prediction["confidences"][idx]
result["confidences"]["total"] = np.around(np.mean(char_confidences), 2)
for level in confidence_score_levels:
result["confidences"][level.value] = []
texts, confidences, _ = split_text_and_confidences(
predicted_text,
char_confidences,
level,
word_separators,
line_separators,
tokens,
)
# Save gif with attention map
if attention_map:
attentions = prediction["attentions"][idx]
gif_filename = f"{output}/{image_path.stem}_{attention_map_level}.gif"
logger.info(f"Creating attention GIF in {gif_filename}")
plot_attention(
image=visu_tensor[idx],
text=predicted_text,
weights=attentions,
level=attention_map_level,
scale=attention_map_scale,
word_separators=word_separators,
line_separators=line_separators,
tokens=tokens,
display_polygons=predict_objects,
threshold_method=threshold_method,
threshold_value=threshold_value,
max_object_height=max_object_height,
outname=gif_filename,
)
result["attention_gif"] = gif_filename
for text, conf in zip(texts, confidences):
result["confidences"][level.value].append(
{"text": text, "confidence": conf}
)
# Save gif with attention map
if attention_map:
attentions = prediction["attentions"][idx]
gif_filename = f"{output}/{image_path.stem}_{attention_map_level}.gif"
logger.info(f"Creating attention GIF in {gif_filename}")
plot_attention(
image=visu_tensor[idx],
text=predicted_text,
weights=attentions,
level=attention_map_level,
scale=attention_map_scale,
word_separators=word_separators,
line_separators=line_separators,
tokens=tokens,
display_polygons=predict_objects,
threshold_method=threshold_method,
threshold_value=threshold_value,
max_object_height=max_object_height,
outname=gif_filename,
)
result["attention_gif"] = gif_filename
json_filename = Path(output, image_path.stem).with_suffix(".json")
logger.info(f"Saving JSON prediction in {json_filename}")
......
@@ -28,12 +28,20 @@ from torchvision.transforms.functional import resize as resize_tensor
class Preprocessing(str, Enum):
# If the image is bigger than the given size, resize it while keeping the original ratio
MaxResize = "max_resize"
# Resize the height to a fixed value while keeping the original ratio
"""
If the image is bigger than the given size, resize it while keeping the original ratio
"""
FixedHeightResize = "fixed_height_resize"
# Resize the width to a fixed value while keeping the original ratio
"""
Resize the height to a fixed value while keeping the original ratio
"""
FixedWidthResize = "fixed_width_resize"
"""
Resize the width to a fixed value while keeping the original ratio
"""
class FixedHeightResize:
......
@@ -22,7 +22,7 @@ class Token(NamedTuple):
class LMTokenMapping(NamedTuple):
space: Token = Token("▁", " ")
space: Token = Token("⎵", " ")
linebreak: Token = Token("↵", "\n")
ctc: Token = Token("◌", "<ctc>")
@@ -139,7 +139,9 @@ def parse_tokens(filename: str) -> Dict[str, EntityType]:
def read_yaml(yaml_path: str) -> Dict:
"""
Read YAML tokens file
Read YAML tokens file.
:param yaml_path: Path of the YAML file to read.
:return: The content of the read file.
"""
filename = Path(yaml_path)
assert filename.exists(), f"{yaml_path} does not resolve."
@@ -152,6 +154,8 @@ def read_yaml(yaml_path: str) -> Dict:
def read_json(json_path: str) -> Dict:
"""
Read labels JSON file
:param json_path: Path of the JSON file to read.
:return: The content of the read file.
"""
filename = Path(json_path)
assert filename.exists(), f"{json_path} does not resolve."
......
# Get started
To use DAN in your own environment, you need to first clone via:
## Installation
To use DAN in your own environment, you need to install it as a dependency or manually.
### As a dependency
To install DAN as a dependency, you need to first add the following line to your `requirements.txt` file:
```shell
teklia-dan @ git+ssh://git@gitlab.teklia.com/atr/dan.git
```
Then you can install it via pip:
```shell
pip install -r requirements.txt
```
### Manually
To install DAN manually, you need to first clone via:
```shell
git clone git@gitlab.teklia.com:atr/dan.git
@@ -9,9 +29,11 @@ git clone git@gitlab.teklia.com:atr/dan.git
Then you can install it via pip:
```shell
pip install -e .
pip install .
```
---
To learn more about the newly installed `teklia-dan` command, make sure to run:
```shell
......
@@ -4,13 +4,15 @@ There are several steps to follow when training a DAN model.
## 1. Extract data
The data must be extracted and formatted for training. To extract the data, DAN uses an Arkindex export database in SQLite format. You will need to:
To extract the data, DAN uses an Arkindex export database in SQLite format. You will need to:
1. Structure the data into folders (`train` / `val` / `test`) in [Arkindex](https://demo.arkindex.org/).
1. [Export the project](https://doc.arkindex.org/howto/export/) in SQLite format.
1. Extract the data with the [extract command](../usage/datasets/extract.md).
At the end, you should have a tree structure like this:
This command will extract and format the images and labels needed to train DAN. It will also tokenize the training corpus at character, subword, and word levels, allowing you to combine DAN with an explicit statistical language model to improve performance.
At the end, you should get the following tree structure:
```
output/
@@ -21,8 +23,14 @@ output/
│ ├── val
│ └── test
├── language_model
│ ├── corpus.txt
│ ├── lexicon.txt
│ ├── corpus_characters.txt
│ ├── lexicon_characters.txt
│ ├── corpus_subwords.txt
│ ├── lexicon_subwords.txt
│ ├── corpus_words.txt
│ ├── lexicon_words.txt
│ ├── subword_tokenizer.model
│ ├── subword_tokenizer.vocab
│ └── tokens.txt
```
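To give a sense of what the corpora contain, here is an illustrative (hypothetical) first line of each file for the training text `le chat`, where `⎵` stands for the encoded space token; the actual subword pieces depend on the trained sentencepiece model:

```python
# Hypothetical corpus lines for the training text "le chat".
examples = {
    "corpus_characters.txt": "l e ⎵ c h a t",
    "corpus_words.txt": "le ⎵ chat",
    "corpus_subwords.txt": "▁le ▁chat",  # pieces depend on the trained subword tokenizer
}
for filename, line in examples.items():
    print(f"{filename}: {line}")
```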
......
# Command Line Interface
::: dan.cli
# Analysis
::: dan.datasets.analyze
# Extract
::: dan.datasets.entities.extract
# Entities
::: dan.datasets.entities