Compare revisions
Commits on Source (4)
Showing 406 additions and 92 deletions
{
  "dataset": {
    "datasets": {
      "training": "tests/data/training/training_dataset"
    },
    "train": {
      "name": "training-train",
      "datasets": [
        ["training", "train"]
      ]
    },
    "val": {
      "training-val": [
        ["training", "val"]
      ]
    },
    "test": {
      "training-test": [
        ["training", "test"]
      ]
    },
    "max_char_prediction": 30,
    "tokens": null
  },
  "model": {
    "transfered_charset": true,
    "additional_tokens": 1,
    "encoder": {
      "dropout": 0.5,
      "nb_layers": 5
    },
    "h_max": 500,
    "w_max": 1000,
    "decoder": {
      "l_max": 15000,
      "dec_num_layers": 8,
      "dec_num_heads": 4,
      "dec_res_dropout": 0.1,
      "dec_pred_dropout": 0.1,
      "dec_att_dropout": 0.1,
      "dec_dim_feedforward": 256,
      "attention_win": 100,
      "enc_dim": 256
    }
  },
  "training": {
    "data": {
      "batch_size": 2,
      "load_in_memory": true,
      "worker_per_gpu": 4,
      "preprocessings": [
        {
          "type": "max_resize",
          "max_width": 2000,
          "max_height": 2000
        }
      ],
      "augmentation": true
    },
    "device": {
      "use_ddp": false,
      "ddp_port": "20027",
      "use_amp": true,
      "nb_gpu": 0,
      "force": "cpu"
    },
    "metrics": {
      "train": [
        "loss_ce",
        "cer",
        "wer",
        "wer_no_punct"
      ],
      "eval": [
        "cer",
        "wer",
        "wer_no_punct"
      ]
    },
    "validation": {
      "eval_on_valid": true,
      "eval_on_valid_interval": 2,
      "set_name_focus_metric": "training-val"
    },
    "output_folder": "tests/data/evaluate",
    "gradient_clipping": {},
    "max_nb_epochs": 4,
    "load_epoch": "best",
    "optimizers": {
      "all": {
        "args": {
          "lr": 0.0001,
          "amsgrad": false
        }
      }
    },
    "lr_schedulers": null,
    "label_noise_scheduler": {
      "min_error_rate": 0.2,
      "max_error_rate": 0.2,
      "total_num_steps": 5e4
    },
    "transfer_learning": null
  }
}
@@ -3,7 +3,7 @@ import argparse
 import errno

 from dan.datasets import add_dataset_parser
-from dan.ocr import add_predict_parser, add_train_parser
+from dan.ocr import add_evaluate_parser, add_predict_parser, add_train_parser


 def get_parser():
@@ -12,6 +12,7 @@ def get_parser():
     add_dataset_parser(subcommands)
     add_train_parser(subcommands)
+    add_evaluate_parser(subcommands)
     add_predict_parser(subcommands)

     return parser
......
@@ -384,6 +384,9 @@ class ArkindexExtractor:
             subword_vocab_size=self.subword_vocab_size,
         )
+        if not tokenizer.sentencepiece_model:
+            return
+
         for level, tokenize in (
             ("characters", tokenizer.char_tokenize),
             ("words", tokenizer.word_tokenize),
@@ -478,6 +481,11 @@ class ArkindexExtractor:
             pbar.update()
             pbar.refresh()

+        if not self.data:
+            raise Exception(
+                "No data was extracted using the provided export database and parameters."
+            )
+
         self.download_images()
         self.format_lm_files()
         self.export()
......
@@ -186,12 +186,22 @@ class Tokenizer:
         with NamedTemporaryFile(dir=self.outdir, suffix=".txt", mode="w") as tmp:
             tmp.write("\n".join(self.training_corpus))
             tmp.flush()
-            spm.SentencePieceTrainer.train(
-                input=tmp.name,
-                vocab_size=self.subword_vocab_size,
-                model_prefix=self.prefix,
-                user_defined_symbols=self.special_tokens,
-            )
+            try:
+                spm.SentencePieceTrainer.train(
+                    input=tmp.name,
+                    vocab_size=self.subword_vocab_size,
+                    model_prefix=self.prefix,
+                    user_defined_symbols=self.special_tokens,
+                    minloglevel=1,
+                )
+            except Exception as e:
+                logger.warning(
+                    f"Failed to train a sentencepiece model for subword tokenization: {e} "
+                    "Try again by editing the `--subword-vocab-size` parameter."
+                )
+                self.sentencepiece_model = None
+                return

         # Load the model
         self.sentencepiece_model = spm.SentencePieceProcessor(
......
@@ -3,6 +3,8 @@
 Train a new DAN model.
 """

+from dan.ocr.evaluate import add_evaluate_parser  # noqa
 from dan.ocr.predict import add_predict_parser  # noqa
 from dan.ocr.train import run
 from dan.utils import read_json
......
# -*- coding: utf-8 -*-
"""
Evaluate a trained DAN model.
"""

import logging
import random

import numpy as np
import torch
import torch.multiprocessing as mp

from dan.ocr.manager.training import Manager
from dan.ocr.utils import update_config
from dan.utils import read_json

logger = logging.getLogger(__name__)


def add_evaluate_parser(subcommands) -> None:
    parser = subcommands.add_parser(
        "evaluate",
        description=__doc__,
        help=__doc__,
    )

    parser.add_argument(
        "--config",
        type=read_json,
        required=True,
        help="Configuration file.",
    )

    parser.set_defaults(func=run)


def eval(rank, config, mlflow_logging):
    torch.manual_seed(0)
    torch.cuda.manual_seed(0)
    np.random.seed(0)
    random.seed(0)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    config["training"]["device"]["ddp_rank"] = rank

    # Load best checkpoint
    config["training"]["load_epoch"] = "best"

    model = Manager(config)
    model.load_model()

    metrics = ["cer", "wer", "wer_no_punct", "time"]
    for dataset_name in config["dataset"]["datasets"]:
        for set_name in ["test", "val", "train"]:
            logger.info(f"Evaluating on set `{set_name}`")
            model.evaluate(
                "{}-{}".format(dataset_name, set_name),
                [
                    (dataset_name, set_name),
                ],
                metrics,
                output=True,
                mlflow_logging=mlflow_logging,
            )


def run(config: dict):
    update_config(config)

    mlflow_logging = bool(config.get("mlflow"))
    if mlflow_logging:
        logger.info("MLflow logging enabled")

    if (
        config["training"]["device"]["use_ddp"]
        and config["training"]["device"]["force"] in [None, "cuda"]
        and torch.cuda.is_available()
    ):
        mp.spawn(
            eval,
            args=(config, mlflow_logging),
            nprocs=config["training"]["device"]["nb_gpu"],
        )
    else:
        eval(0, config, mlflow_logging)
@@ -109,8 +109,8 @@ class DAN:
         )
         self.mean, self.std = (
-            torch.tensor(parameters["mean"]) / 255,
-            torch.tensor(parameters["std"]) / 255,
+            torch.tensor(parameters["mean"]) / 255 if "mean" in parameters else None,
+            torch.tensor(parameters["std"]) / 255 if "std" in parameters else None,
         )
         self.preprocessing_transforms = get_preprocessing_transforms(
             parameters.get("preprocessings", [])
@@ -124,11 +124,21 @@ class DAN:
         """
         image = read_image(path)
         preprocessed_image = self.preprocessing_transforms(image)
-        normalized_image = torch.zeros(preprocessed_image.shape)
-        for ch in range(preprocessed_image.shape[0]):
+
+        if self.mean is None and self.std is None:
+            return preprocessed_image, preprocessed_image
+
+        size = preprocessed_image.shape
+        normalized_image = torch.zeros(size)
+
+        mean = self.mean if self.mean is not None else torch.zeros(size[0])
+        std = self.std if self.std is not None else torch.ones(size[0])
+
+        for ch in range(size[0]):
             normalized_image[ch, :, :] = (
-                preprocessed_image[ch, :, :] - self.mean[ch]
-            ) / self.std[ch]
+                preprocessed_image[ch, :, :] - mean[ch]
+            ) / std[ch]
+
         return preprocessed_image, normalized_image

     def predict(
......
@@ -3,18 +3,14 @@ import json
 import logging
 import random
 from copy import deepcopy
-from pathlib import Path

 import numpy as np
 import torch
 import torch.multiprocessing as mp
-from torch.optim import Adam

-from dan.ocr.decoder import GlobalHTADecoder
-from dan.ocr.encoder import FCN_Encoder
 from dan.ocr.manager.training import Manager
 from dan.ocr.mlflow import MLFLOW_AVAILABLE
-from dan.ocr.transforms import Preprocessing
+from dan.ocr.utils import update_config
 from dan.utils import MLflowNotInstalled

 if MLFLOW_AVAILABLE:
@@ -26,7 +22,7 @@ if MLFLOW_AVAILABLE:
 logger = logging.getLogger(__name__)


-def train_and_test(rank, params, mlflow_logging=False):
+def train(rank, params, mlflow_logging=False):
     torch.manual_seed(0)
     torch.cuda.manual_seed(0)
     np.random.seed(0)
@@ -43,67 +39,6 @@ def train_and_test(rank, params, mlflow_logging=False):
     model.train(mlflow_logging=mlflow_logging)

-    # load weights giving best CER on valid set
-    model.params["training"]["load_epoch"] = "best"
-    model.load_model()
-
-    metrics = ["cer", "wer", "wer_no_punct", "time"]
-    for dataset_name in params["dataset"]["datasets"]:
-        for set_name in ["test", "val", "train"]:
-            model.evaluate(
-                "{}-{}".format(dataset_name, set_name),
-                [
-                    (dataset_name, set_name),
-                ],
-                metrics,
-                output=True,
-                mlflow_logging=mlflow_logging,
-            )
-
-
-def update_config(config: dict):
-    """
-    Update some fields for easier
-    """
-    # .dataset.datasets cast all values to Path
-    config["dataset"]["datasets"] = {
-        name: Path(path) for name, path in config["dataset"]["datasets"].items()
-    }
-
-    # .model.encoder.class = FCN_ENCODER
-    config["model"]["encoder"]["class"] = FCN_Encoder
-
-    # .model.decoder.class = GlobalHTADecoder
-    config["model"]["decoder"]["class"] = GlobalHTADecoder
-
-    # Update preprocessing type
-    for prepro in config["training"]["data"]["preprocessings"]:
-        prepro["type"] = Preprocessing(prepro["type"])
-
-    # .training.output_folder to Path
-    config["training"]["output_folder"] = Path(config["training"]["output_folder"])
-
-    if config["training"]["transfer_learning"]:
-        # .training.transfer_learning.encoder[1]
-        config["training"]["transfer_learning"]["encoder"][1] = Path(
-            config["training"]["transfer_learning"]["encoder"][1]
-        )
-
-        # .training.transfer_learning.decoder[1]
-        config["training"]["transfer_learning"]["decoder"][1] = Path(
-            config["training"]["transfer_learning"]["decoder"][1]
-        )
-
-    # Parse optimizers
-    for optimizer_setup in config["training"]["optimizers"].values():
-        # Only supported optimizer is Adam
-        optimizer_setup["class"] = Adam
-
-    # set nb_gpu if not present
-    if config["training"]["device"]["nb_gpu"] is None:
-        config["training"]["device"]["nb_gpu"] = torch.cuda.device_count()


 def serialize_config(config):
     """
@@ -150,12 +85,12 @@ def start_training(config, mlflow_logging: bool) -> None:
         and torch.cuda.is_available()
     ):
         mp.spawn(
-            train_and_test,
+            train,
             args=(config, mlflow_logging),
             nprocs=config["training"]["device"]["nb_gpu"],
         )
     else:
-        train_and_test(0, config, mlflow_logging)
+        train(0, config, mlflow_logging)


 def run(config: dict):
......
# -*- coding: utf-8 -*-
from pathlib import Path

import torch
from torch.optim import Adam

from dan.ocr.decoder import GlobalHTADecoder
from dan.ocr.encoder import FCN_Encoder
from dan.ocr.transforms import Preprocessing


def update_config(config: dict):
    """
    Complete the fields that are not JSON serializable.
    """
    # .dataset.datasets cast all values to Path
    config["dataset"]["datasets"] = {
        name: Path(path) for name, path in config["dataset"]["datasets"].items()
    }

    # .model.encoder.class = FCN_ENCODER
    config["model"]["encoder"]["class"] = FCN_Encoder

    # .model.decoder.class = GlobalHTADecoder
    config["model"]["decoder"]["class"] = GlobalHTADecoder

    # Update preprocessing type
    for prepro in config["training"]["data"]["preprocessings"]:
        prepro["type"] = Preprocessing(prepro["type"])

    # .training.output_folder to Path
    config["training"]["output_folder"] = Path(config["training"]["output_folder"])

    if config["training"]["transfer_learning"]:
        # .training.transfer_learning.encoder[1]
        config["training"]["transfer_learning"]["encoder"][1] = Path(
            config["training"]["transfer_learning"]["encoder"][1]
        )

        # .training.transfer_learning.decoder[1]
        config["training"]["transfer_learning"]["decoder"][1] = Path(
            config["training"]["transfer_learning"]["decoder"][1]
        )

    # Parse optimizers
    for optimizer_setup in config["training"]["optimizers"].values():
        # Only supported optimizer is Adam
        optimizer_setup["class"] = Adam

    # set nb_gpu if not present
    if config["training"]["device"]["nb_gpu"] is None:
        config["training"]["device"]["nb_gpu"] = torch.cuda.device_count()
@@ -46,6 +46,12 @@ teklia-dan predict \
   --output /tmp/dan-predict
 ```

+The library already has all the documents needed to run the [evaluation command](../usage/evaluate/index.md) on a minimalist dataset. You can use the configuration available at `configs/eval.json`. It is already populated with the parameters used in the unit tests.
+
+```shell
+teklia-dan evaluate --config configs/eval.json
+```

## Documentation
This documentation uses [Sphinx](http://www.sphinx-doc.org/) and was generated using [MkDocs](https://mkdocs.org/) and [mkdocstrings](https://mkdocstrings.github.io/).
......
# Evaluation
::: dan.ocr.evaluate
# Evaluation
Use the `teklia-dan evaluate` command to evaluate a trained DAN model.
To evaluate DAN on your dataset:
1. Create a JSON configuration file. You can base it on the training configuration file. Refer to the [dedicated page](../train/config.md) for a description of parameters.
1. Run `teklia-dan evaluate --config path/to/your/config.json`.
1. Evaluation results for every split are available in the `results` subfolder of the output folder indicated in your configuration.
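
As a complement, evaluation can also be driven from Python. The following is only a minimal sketch: it relies on `evaluate.run` and `read_json` as shown in this changeset, and assumes the results layout used in the unit tests, where the dataset is named `training` and each split produces a `predict_training-<split>_0.yaml` file in the `results` subfolder.

```python
from pathlib import Path

import yaml

from dan.ocr import evaluate
from dan.utils import read_json

# Load the evaluation configuration (the same file passed to `teklia-dan evaluate --config`)
config = read_json("configs/eval.json")

# Run the evaluation on the train, val and test splits of every dataset in the config
evaluate.run(config)

# Read back the per-split results; the file name pattern assumes a dataset named
# "training", as in the unit tests
results_dir = Path(config["training"]["output_folder"]) / "results"
for split in ["train", "val", "test"]:
    with (results_dir / f"predict_training-{split}_0.yaml").open() as f:
        print(split, yaml.safe_load(f))
```

The CER and WER values stored in these YAML files are the ones checked by `test_evaluate`.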
@@ -8,5 +8,8 @@ When `teklia-dan` is installed in your environment, you may use the following commands:
 `teklia-dan train`
 : To train a new DAN model. More details in the [dedicated page](./train/index.md).

+`teklia-dan evaluate`
+: To evaluate a trained DAN model. More details in the [dedicated page](./evaluate/index.md).
+
 `teklia-dan predict`
 : To predict an image using a trained DAN model. More details in the [dedicated page](./predict/index.md).
-# Predict
+# Prediction

 Use the `teklia-dan predict` command to apply a trained DAN model on an image.
......
 # Train

-Use the `teklia-dan train` command to train a new DAN model. It is able to train a DAN model at line or document-level and evaluate it.
+Use the `teklia-dan train` command to train a new DAN model. It is able to train a DAN model at line or document-level.

 To train DAN on your dataset:

 1. Create a training JSON configuration file. Refer to the [dedicated page](config.md) for a description of parameters.
 1. Run `teklia-dan train --config path/to/your/config.json`.
-1. Look into evaluation results in the output folder indicated in your configuration:
-   - `checkpoints` contains model weights for the last trained epoch and for the epoch giving the best valid CER.
-   - `results` contains the tensorboard log file, the parameters file, and the evaluation results for the best epoch.
 1. (Optional) Train a language model. Refer to the [dedicated page](language_model.md).
+1. Look into the training results in the output folder indicated in your configuration:
+   - `checkpoints` contains model weights for the last trained epoch and for the epoch giving the best valid CER.
+   - `results` contains the tensorboard log file and the parameters file.

 ## Additional pages
......
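
Training can likewise be launched programmatically through the `run` entry point imported in `dan/ocr/__init__.py`. This is a minimal sketch under stated assumptions: the configuration schema matches the one shown in `configs/eval.json`, and the `configs/train.json` path below is purely illustrative, not a file referenced elsewhere in this changeset.

```python
from dan.ocr.train import run
from dan.utils import read_json

# Load a training configuration (dataset, model and training sections,
# as in configs/eval.json). The path below is an example only.
config = read_json("configs/train.json")

# Launch training; this mirrors what `teklia-dan train --config ...` does.
# Checkpoints and tensorboard logs are written under config["training"]["output_folder"].
run(config)
```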
@@ -71,7 +71,8 @@ nav:
       - Data augmentation: usage/train/augmentation.md
       - Language model: usage/train/language_model.md
       - Jean Zay tutorial: usage/train/jeanzay.md
-    - Predict: usage/predict/index.md
+    - Evaluation: usage/evaluate/index.md
+    - Prediction: usage/predict/index.md
   - Python Reference:
     - Datasets:
@@ -101,6 +102,7 @@ nav:
       - OCR managers: ref/ocr/managers/ocr.md
       - Training managers: ref/ocr/managers/training.md
      - Training: ref/ocr/train.md
+      - Evaluation: ref/ocr/evaluate.md
       - Prediction:
         - ref/ocr/predict/index.md
         - Inference: ref/ocr/predict/inference.md
......
@@ -19,7 +19,6 @@ from arkindex_export import (
     WorkerVersion,
     database,
 )
-from dan.ocr.train import update_config
 from tests import FIXTURES
@@ -184,9 +183,12 @@ def mock_database(tmp_path_factory):
 @pytest.fixture
 def training_config():
-    config = json.loads((FIXTURES.parent.parent / "configs" / "tests.json").read_text())
-    update_config(config)
-    return config
+    return json.loads((FIXTURES.parent.parent / "configs" / "tests.json").read_text())


+@pytest.fixture
+def evaluate_config():
+    return json.loads((FIXTURES.parent.parent / "configs" / "eval.json").read_text())
+
+
 @pytest.fixture
......
Two source diffs could not be displayed: they are stored in LFS.
# -*- coding: utf-8 -*-
import shutil

import pytest
import yaml

from dan.ocr import evaluate
from tests import FIXTURES


@pytest.mark.parametrize(
    "training_res, val_res, test_res",
    (
        (
            {
                "nb_chars": 43,
                "cer": 1.3023,
                "nb_words": 9,
                "wer": 1.0,
                "nb_words_no_punct": 9,
                "wer_no_punct": 1.0,
                "nb_samples": 2,
            },
            {
                "nb_chars": 41,
                "cer": 1.2683,
                "nb_words": 9,
                "wer": 1.0,
                "nb_words_no_punct": 9,
                "wer_no_punct": 1.0,
                "nb_samples": 2,
            },
            {
                "nb_chars": 49,
                "cer": 1.1224,
                "nb_words": 9,
                "wer": 1.0,
                "nb_words_no_punct": 9,
                "wer_no_punct": 1.0,
                "nb_samples": 2,
            },
        ),
    ),
)
def test_evaluate(training_res, val_res, test_res, evaluate_config):
    # Use the tmp_path as base folder
    evaluate_config["training"]["output_folder"] = FIXTURES / "evaluate"

    evaluate.run(evaluate_config)

    # Check that the evaluation results are correct
    for split_name, expected_res in zip(
        ["train", "val", "test"], [training_res, val_res, test_res]
    ):
        filename = (
            evaluate_config["training"]["output_folder"]
            / "results"
            / f"predict_training-{split_name}_0.yaml"
        )

        with filename.open() as f:
            # Remove the times from the results as they vary
            res = {
                metric: value
                for metric, value in yaml.safe_load(f).items()
                if "time" not in metric
            }
            assert res == expected_res

    # Remove results files
    shutil.rmtree(evaluate_config["training"]["output_folder"] / "results")