Compare revisions
Commits on Source (4)
Showing 406 additions and 92 deletions
{
  "dataset": {
    "datasets": {
      "training": "tests/data/training/training_dataset"
    },
    "train": {
      "name": "training-train",
      "datasets": [
        ["training", "train"]
      ]
    },
    "val": {
      "training-val": [
        ["training", "val"]
      ]
    },
    "test": {
      "training-test": [
        ["training", "test"]
      ]
    },
    "max_char_prediction": 30,
    "tokens": null
  },
  "model": {
    "transfered_charset": true,
    "additional_tokens": 1,
    "encoder": {
      "dropout": 0.5,
      "nb_layers": 5
    },
    "h_max": 500,
    "w_max": 1000,
    "decoder": {
      "l_max": 15000,
      "dec_num_layers": 8,
      "dec_num_heads": 4,
      "dec_res_dropout": 0.1,
      "dec_pred_dropout": 0.1,
      "dec_att_dropout": 0.1,
      "dec_dim_feedforward": 256,
      "attention_win": 100,
      "enc_dim": 256
    }
  },
  "training": {
    "data": {
      "batch_size": 2,
      "load_in_memory": true,
      "worker_per_gpu": 4,
      "preprocessings": [
        {
          "type": "max_resize",
          "max_width": 2000,
          "max_height": 2000
        }
      ],
      "augmentation": true
    },
    "device": {
      "use_ddp": false,
      "ddp_port": "20027",
      "use_amp": true,
      "nb_gpu": 0,
      "force": "cpu"
    },
    "metrics": {
      "train": [
        "loss_ce",
        "cer",
        "wer",
        "wer_no_punct"
      ],
      "eval": [
        "cer",
        "wer",
        "wer_no_punct"
      ]
    },
    "validation": {
      "eval_on_valid": true,
      "eval_on_valid_interval": 2,
      "set_name_focus_metric": "training-val"
    },
    "output_folder": "tests/data/evaluate",
    "gradient_clipping": {},
    "max_nb_epochs": 4,
    "load_epoch": "best",
    "optimizers": {
      "all": {
        "args": {
          "lr": 0.0001,
          "amsgrad": false
        }
      }
    },
    "lr_schedulers": null,
    "label_noise_scheduler": {
      "min_error_rate": 0.2,
      "max_error_rate": 0.2,
      "total_num_steps": 5e4
    },
    "transfer_learning": null
  }
}
@@ -3,7 +3,7 @@ import argparse
 import errno

 from dan.datasets import add_dataset_parser
-from dan.ocr import add_predict_parser, add_train_parser
+from dan.ocr import add_evaluate_parser, add_predict_parser, add_train_parser


 def get_parser():
@@ -12,6 +12,7 @@ def get_parser():
     add_dataset_parser(subcommands)
     add_train_parser(subcommands)
+    add_evaluate_parser(subcommands)
     add_predict_parser(subcommands)

     return parser
......
@@ -384,6 +384,9 @@ class ArkindexExtractor:
             subword_vocab_size=self.subword_vocab_size,
         )
+        if not tokenizer.sentencepiece_model:
+            return
+
         for level, tokenize in (
             ("characters", tokenizer.char_tokenize),
             ("words", tokenizer.word_tokenize),
@@ -478,6 +481,11 @@ class ArkindexExtractor:
             pbar.update()
             pbar.refresh()

+        if not self.data:
+            raise Exception(
+                "No data was extracted using the provided export database and parameters."
+            )
+
         self.download_images()
         self.format_lm_files()
         self.export()
......
@@ -186,12 +186,22 @@ class Tokenizer:
         with NamedTemporaryFile(dir=self.outdir, suffix=".txt", mode="w") as tmp:
             tmp.write("\n".join(self.training_corpus))
             tmp.flush()
-            spm.SentencePieceTrainer.train(
-                input=tmp.name,
-                vocab_size=self.subword_vocab_size,
-                model_prefix=self.prefix,
-                user_defined_symbols=self.special_tokens,
-            )
+            try:
+                spm.SentencePieceTrainer.train(
+                    input=tmp.name,
+                    vocab_size=self.subword_vocab_size,
+                    model_prefix=self.prefix,
+                    user_defined_symbols=self.special_tokens,
+                    minloglevel=1,
+                )
+            except Exception as e:
+                logger.warning(
+                    f"Failed to train a sentencepiece model for subword tokenization: {e} "
+                    "Try again by editing the `--subword-vocab-size` parameter."
+                )
+                self.sentencepiece_model = None
+                return

         # Load the model
         self.sentencepiece_model = spm.SentencePieceProcessor(
......
@@ -3,6 +3,8 @@
 Train a new DAN model.
 """

+from dan.ocr.evaluate import add_evaluate_parser  # noqa
 from dan.ocr.predict import add_predict_parser  # noqa
 from dan.ocr.train import run
 from dan.utils import read_json
......
# -*- coding: utf-8 -*-
"""
Evaluate a trained DAN model.
"""

import logging
import random

import numpy as np
import torch
import torch.multiprocessing as mp

from dan.ocr.manager.training import Manager
from dan.ocr.utils import update_config
from dan.utils import read_json

logger = logging.getLogger(__name__)


def add_evaluate_parser(subcommands) -> None:
    parser = subcommands.add_parser(
        "evaluate",
        description=__doc__,
        help=__doc__,
    )

    parser.add_argument(
        "--config",
        type=read_json,
        required=True,
        help="Configuration file.",
    )

    parser.set_defaults(func=run)


def eval(rank, config, mlflow_logging):
    torch.manual_seed(0)
    torch.cuda.manual_seed(0)
    np.random.seed(0)
    random.seed(0)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    config["training"]["device"]["ddp_rank"] = rank

    # Load best checkpoint
    config["training"]["load_epoch"] = "best"

    model = Manager(config)
    model.load_model()

    metrics = ["cer", "wer", "wer_no_punct", "time"]
    for dataset_name in config["dataset"]["datasets"]:
        for set_name in ["test", "val", "train"]:
            logger.info(f"Evaluating on set `{set_name}`")
            model.evaluate(
                "{}-{}".format(dataset_name, set_name),
                [
                    (dataset_name, set_name),
                ],
                metrics,
                output=True,
                mlflow_logging=mlflow_logging,
            )


def run(config: dict):
    update_config(config)

    mlflow_logging = bool(config.get("mlflow"))
    if mlflow_logging:
        logger.info("MLflow logging enabled")

    if (
        config["training"]["device"]["use_ddp"]
        and config["training"]["device"]["force"] in [None, "cuda"]
        and torch.cuda.is_available()
    ):
        mp.spawn(
            eval,
            args=(config, mlflow_logging),
            nprocs=config["training"]["device"]["nb_gpu"],
        )
    else:
        eval(0, config, mlflow_logging)
@@ -109,8 +109,8 @@ class DAN:
         )
         self.mean, self.std = (
-            torch.tensor(parameters["mean"]) / 255,
-            torch.tensor(parameters["std"]) / 255,
+            torch.tensor(parameters["mean"]) / 255 if "mean" in parameters else None,
+            torch.tensor(parameters["std"]) / 255 if "std" in parameters else None,
         )
         self.preprocessing_transforms = get_preprocessing_transforms(
             parameters.get("preprocessings", [])
@@ -124,11 +124,21 @@ class DAN:
         """
         image = read_image(path)
         preprocessed_image = self.preprocessing_transforms(image)
-        normalized_image = torch.zeros(preprocessed_image.shape)
-        for ch in range(preprocessed_image.shape[0]):
+
+        if self.mean is None and self.std is None:
+            return preprocessed_image, preprocessed_image
+
+        size = preprocessed_image.shape
+        normalized_image = torch.zeros(size)
+
+        mean = self.mean if self.mean is not None else torch.zeros(size[0])
+        std = self.std if self.std is not None else torch.ones(size[0])
+
+        for ch in range(size[0]):
             normalized_image[ch, :, :] = (
-                preprocessed_image[ch, :, :] - self.mean[ch]
-            ) / self.std[ch]
+                preprocessed_image[ch, :, :] - mean[ch]
+            ) / std[ch]
+
         return preprocessed_image, normalized_image

     def predict(
......
@@ -3,18 +3,14 @@ import json
 import logging
 import random
 from copy import deepcopy
-from pathlib import Path

 import numpy as np
 import torch
 import torch.multiprocessing as mp
-from torch.optim import Adam

-from dan.ocr.decoder import GlobalHTADecoder
-from dan.ocr.encoder import FCN_Encoder
 from dan.ocr.manager.training import Manager
 from dan.ocr.mlflow import MLFLOW_AVAILABLE
-from dan.ocr.transforms import Preprocessing
+from dan.ocr.utils import update_config
 from dan.utils import MLflowNotInstalled

 if MLFLOW_AVAILABLE:
@@ -26,7 +22,7 @@ if MLFLOW_AVAILABLE:
 logger = logging.getLogger(__name__)


-def train_and_test(rank, params, mlflow_logging=False):
+def train(rank, params, mlflow_logging=False):
     torch.manual_seed(0)
     torch.cuda.manual_seed(0)
     np.random.seed(0)
@@ -43,67 +39,6 @@ def train_and_test(rank, params, mlflow_logging=False):
     model.train(mlflow_logging=mlflow_logging)

-    # load weights giving best CER on valid set
-    model.params["training"]["load_epoch"] = "best"
-    model.load_model()
-
-    metrics = ["cer", "wer", "wer_no_punct", "time"]
-    for dataset_name in params["dataset"]["datasets"]:
-        for set_name in ["test", "val", "train"]:
-            model.evaluate(
-                "{}-{}".format(dataset_name, set_name),
-                [
-                    (dataset_name, set_name),
-                ],
-                metrics,
-                output=True,
-                mlflow_logging=mlflow_logging,
-            )
-
-
-def update_config(config: dict):
-    """
-    Update some fields for easier
-    """
-    # .dataset.datasets cast all values to Path
-    config["dataset"]["datasets"] = {
-        name: Path(path) for name, path in config["dataset"]["datasets"].items()
-    }
-
-    # .model.encoder.class = FCN_ENCODER
-    config["model"]["encoder"]["class"] = FCN_Encoder
-
-    # .model.decoder.class = GlobalHTADecoder
-    config["model"]["decoder"]["class"] = GlobalHTADecoder
-
-    # Update preprocessing type
-    for prepro in config["training"]["data"]["preprocessings"]:
-        prepro["type"] = Preprocessing(prepro["type"])
-
-    # .training.output_folder to Path
-    config["training"]["output_folder"] = Path(config["training"]["output_folder"])
-
-    if config["training"]["transfer_learning"]:
-        # .training.transfer_learning.encoder[1]
-        config["training"]["transfer_learning"]["encoder"][1] = Path(
-            config["training"]["transfer_learning"]["encoder"][1]
-        )
-
-        # .training.transfer_learning.decoder[1]
-        config["training"]["transfer_learning"]["decoder"][1] = Path(
-            config["training"]["transfer_learning"]["decoder"][1]
-        )
-
-    # Parse optimizers
-    for optimizer_setup in config["training"]["optimizers"].values():
-        # Only supported optimizer is Adam
-        optimizer_setup["class"] = Adam
-
-    # set nb_gpu if not present
-    if config["training"]["device"]["nb_gpu"] is None:
-        config["training"]["device"]["nb_gpu"] = torch.cuda.device_count()


 def serialize_config(config):
     """
@@ -150,12 +85,12 @@ def start_training(config, mlflow_logging: bool) -> None:
         and torch.cuda.is_available()
     ):
         mp.spawn(
-            train_and_test,
+            train,
             args=(config, mlflow_logging),
             nprocs=config["training"]["device"]["nb_gpu"],
         )
     else:
-        train_and_test(0, config, mlflow_logging)
+        train(0, config, mlflow_logging)


 def run(config: dict):
......
# -*- coding: utf-8 -*-
from pathlib import Path

import torch
from torch.optim import Adam

from dan.ocr.decoder import GlobalHTADecoder
from dan.ocr.encoder import FCN_Encoder
from dan.ocr.transforms import Preprocessing


def update_config(config: dict):
    """
    Complete the fields that are not JSON serializable.
    """
    # .dataset.datasets cast all values to Path
    config["dataset"]["datasets"] = {
        name: Path(path) for name, path in config["dataset"]["datasets"].items()
    }

    # .model.encoder.class = FCN_ENCODER
    config["model"]["encoder"]["class"] = FCN_Encoder

    # .model.decoder.class = GlobalHTADecoder
    config["model"]["decoder"]["class"] = GlobalHTADecoder

    # Update preprocessing type
    for prepro in config["training"]["data"]["preprocessings"]:
        prepro["type"] = Preprocessing(prepro["type"])

    # .training.output_folder to Path
    config["training"]["output_folder"] = Path(config["training"]["output_folder"])

    if config["training"]["transfer_learning"]:
        # .training.transfer_learning.encoder[1]
        config["training"]["transfer_learning"]["encoder"][1] = Path(
            config["training"]["transfer_learning"]["encoder"][1]
        )

        # .training.transfer_learning.decoder[1]
        config["training"]["transfer_learning"]["decoder"][1] = Path(
            config["training"]["transfer_learning"]["decoder"][1]
        )

    # Parse optimizers
    for optimizer_setup in config["training"]["optimizers"].values():
        # Only supported optimizer is Adam
        optimizer_setup["class"] = Adam

    # set nb_gpu if not present
    if config["training"]["device"]["nb_gpu"] is None:
        config["training"]["device"]["nb_gpu"] = torch.cuda.device_count()
@@ -46,6 +46,12 @@ teklia-dan predict \
   --output /tmp/dan-predict
 ```

+The library already has all the documents needed to run the [evaluation command](../usage/evaluate/index.md) on a minimalist dataset. You can use the configuration available at `configs/eval.json`. It is already populated with the parameters used in the unit tests.
+
+```shell
+teklia-dan evaluate --config configs/eval.json
+```

## Documentation
This documentation uses [Sphinx](http://www.sphinx-doc.org/) and was generated using [MkDocs](https://mkdocs.org/) and [mkdocstrings](https://mkdocstrings.github.io/).
......
# Evaluation
::: dan.ocr.evaluate
# Evaluation
Use the `teklia-dan evaluate` command to evaluate a trained DAN model.
To evaluate DAN on your dataset:
1. Create a JSON configuration file. You can base it on the training configuration file. Refer to the [dedicated page](../train/config.md) for a description of parameters.
1. Run `teklia-dan evaluate --config path/to/your/config.json`.
1. Evaluation results for every split are available in the `results` subfolder of the output folder indicated in your configuration.
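
As a complement, evaluation can also be driven from Python. The following is only a minimal sketch: it relies on `evaluate.run` and `read_json` as shown in this changeset, and assumes the results layout used in the unit tests, where the dataset is named `training` and each split produces a `predict_training-<split>_0.yaml` file in the `results` subfolder.

```python
from pathlib import Path

import yaml

from dan.ocr import evaluate
from dan.utils import read_json

# Load the evaluation configuration (the same file passed to `teklia-dan evaluate --config`)
config = read_json("configs/eval.json")

# Run the evaluation on the train, val and test splits of every dataset in the config
evaluate.run(config)

# Read back the per-split results; the file name pattern assumes a dataset named
# "training", as in the unit tests
results_dir = Path(config["training"]["output_folder"]) / "results"
for split in ["train", "val", "test"]:
    with (results_dir / f"predict_training-{split}_0.yaml").open() as f:
        print(split, yaml.safe_load(f))
```

The CER and WER values stored in these YAML files are the ones checked by `test_evaluate`.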
@@ -8,5 +8,8 @@ When `teklia-dan` is installed in your environment, you may use the following commands:
 `teklia-dan train`
 : To train a new DAN model. More details in the [dedicated page](./train/index.md).

+`teklia-dan evaluate`
+: To evaluate a trained DAN model. More details in the [dedicated page](./evaluate/index.md).
+
 `teklia-dan predict`
 : To predict an image using a trained DAN model. More details in the [dedicated page](./predict/index.md).
-# Predict
+# Prediction

 Use the `teklia-dan predict` command to apply a trained DAN model on an image.
......
 # Train

-Use the `teklia-dan train` command to train a new DAN model. It is able to train a DAN model at line or document-level and evaluate it.
+Use the `teklia-dan train` command to train a new DAN model. It is able to train a DAN model at line or document-level.

 To train DAN on your dataset:

 1. Create a training JSON configuration file. Refer to the [dedicated page](config.md) for a description of parameters.
 1. Run `teklia-dan train --config path/to/your/config.json`.
-1. Look into evaluation results in the output folder indicated in your configuration:
-   - `checkpoints` contains model weights for the last trained epoch and for the epoch giving the best valid CER.
-   - `results` contains the tensorboard log file, the parameters file, and the evaluation results for the best epoch.
 1. (Optional) Train a language model. Refer to the [dedicated page](language_model.md).
+1. Look into the training results in the output folder indicated in your configuration:
+   - `checkpoints` contains model weights for the last trained epoch and for the epoch giving the best valid CER.
+   - `results` contains the tensorboard log file and the parameters file.

 ## Additional pages
......
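
Training can likewise be launched programmatically through the `run` entry point imported in `dan/ocr/__init__.py`. This is a minimal sketch under stated assumptions: the configuration schema matches the one shown in `configs/eval.json`, and the `configs/train.json` path below is purely illustrative, not a file referenced elsewhere in this changeset.

```python
from dan.ocr.train import run
from dan.utils import read_json

# Load a training configuration (dataset, model and training sections,
# as in configs/eval.json). The path below is an example only.
config = read_json("configs/train.json")

# Launch training; this mirrors what `teklia-dan train --config ...` does.
# Checkpoints and tensorboard logs are written under config["training"]["output_folder"].
run(config)
```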
@@ -71,7 +71,8 @@ nav:
       - Data augmentation: usage/train/augmentation.md
       - Language model: usage/train/language_model.md
       - Jean Zay tutorial: usage/train/jeanzay.md
-    - Predict: usage/predict/index.md
+    - Evaluation: usage/evaluate/index.md
+    - Prediction: usage/predict/index.md
   - Python Reference:
     - Datasets:
@@ -101,6 +102,7 @@ nav:
       - OCR managers: ref/ocr/managers/ocr.md
       - Training managers: ref/ocr/managers/training.md
      - Training: ref/ocr/train.md
+      - Evaluation: ref/ocr/evaluate.md
       - Prediction:
         - ref/ocr/predict/index.md
         - Inference: ref/ocr/predict/inference.md
......
@@ -19,7 +19,6 @@ from arkindex_export import (
     WorkerVersion,
     database,
 )
-from dan.ocr.train import update_config
 from tests import FIXTURES
@@ -184,9 +183,12 @@ def mock_database(tmp_path_factory):
 @pytest.fixture
 def training_config():
-    config = json.loads((FIXTURES.parent.parent / "configs" / "tests.json").read_text())
-    update_config(config)
-    return config
+    return json.loads((FIXTURES.parent.parent / "configs" / "tests.json").read_text())


+@pytest.fixture
+def evaluate_config():
+    return json.loads((FIXTURES.parent.parent / "configs" / "eval.json").read_text())
+
+
 @pytest.fixture
......
Two source diffs could not be displayed: they are stored in LFS.
# -*- coding: utf-8 -*-
import shutil

import pytest
import yaml

from dan.ocr import evaluate
from tests import FIXTURES


@pytest.mark.parametrize(
    "training_res, val_res, test_res",
    (
        (
            {
                "nb_chars": 43,
                "cer": 1.3023,
                "nb_words": 9,
                "wer": 1.0,
                "nb_words_no_punct": 9,
                "wer_no_punct": 1.0,
                "nb_samples": 2,
            },
            {
                "nb_chars": 41,
                "cer": 1.2683,
                "nb_words": 9,
                "wer": 1.0,
                "nb_words_no_punct": 9,
                "wer_no_punct": 1.0,
                "nb_samples": 2,
            },
            {
                "nb_chars": 49,
                "cer": 1.1224,
                "nb_words": 9,
                "wer": 1.0,
                "nb_words_no_punct": 9,
                "wer_no_punct": 1.0,
                "nb_samples": 2,
            },
        ),
    ),
)
def test_evaluate(training_res, val_res, test_res, evaluate_config):
    # Use the tmp_path as base folder
    evaluate_config["training"]["output_folder"] = FIXTURES / "evaluate"

    evaluate.run(evaluate_config)

    # Check that the evaluation results are correct
    for split_name, expected_res in zip(
        ["train", "val", "test"], [training_res, val_res, test_res]
    ):
        filename = (
            evaluate_config["training"]["output_folder"]
            / "results"
            / f"predict_training-{split_name}_0.yaml"
        )

        with filename.open() as f:
            # Remove the times from the results as they vary
            res = {
                metric: value
                for metric, value in yaml.safe_load(f).items()
                if "time" not in metric
            }
            assert res == expected_res

    # Remove results files
    shutil.rmtree(evaluate_config["training"]["output_folder"] / "results")