diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000000000000000000000000000000000000..7d6f8a61515402d96d9d86de1546165d8d529a87
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "nerval"]
+	path = nerval
+	url = ../../ner/nerval.git
diff --git a/Dockerfile b/Dockerfile
index 5e12de4fdfa493b77667ff763b19c35a1a7dc9b2..7d50ce7bc4a70b549a6830ebeb334681e8e1d6a7 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -7,7 +7,12 @@ RUN apt-get -y update && \
 
 WORKDIR /src
 
-# Install DAN as a package
+# Copy submodule data
+COPY nerval nerval
+
+# Copy DAN data
 COPY dan dan
 COPY requirements.txt *-requirements.txt setup.py VERSION README.md ./
+
+# Install DAN as a package
 RUN pip install . --no-cache-dir
diff --git a/dan/ocr/evaluate.py b/dan/ocr/evaluate.py
index 7d514a40714df6c6eb846f7e7463a570affc6c29..079b0e5d11455426132c410adf0f5383e9329e1f 100644
--- a/dan/ocr/evaluate.py
+++ b/dan/ocr/evaluate.py
@@ -5,17 +5,42 @@ Evaluate a trained DAN model.
 
 import logging
 import random
+from argparse import ArgumentTypeError
+from pathlib import Path
+from typing import Dict, List
 
 import numpy as np
 import torch
 import torch.multiprocessing as mp
 
+from dan.bio import convert
+from dan.ocr.manager.metrics import Inference
 from dan.ocr.manager.training import Manager
 from dan.ocr.utils import add_metrics_table_row, create_metrics_table, update_config
-from dan.utils import read_json
+from dan.utils import parse_tokens, read_json
+from nerval.evaluate import evaluate
+from nerval.parse import parse_bio
+from nerval.utils import print_results
 
 logger = logging.getLogger(__name__)
 
+NERVAL_THRESHOLD = 0.30
+
+
+def parse_threshold(value: str) -> float:
+    """
+    Check that the string passed as parameter is a correct floating point number between 0 and 1
+    """
+    try:
+        value = float(value)
+    except ValueError:
+        raise ArgumentTypeError("Must be a floating point number.")
+
+    if value < 0 or value > 1:
+        raise ArgumentTypeError("Must be between 0 and 1.")
+
+    return value
+
 
 def add_evaluate_parser(subcommands) -> None:
     parser = subcommands.add_parser(
@@ -31,10 +56,55 @@ def add_evaluate_parser(subcommands) -> None:
         help="Configuration file.",
     )
 
+    parser.add_argument(
+        "--nerval-threshold",
+        help="Distance threshold for the match between gold and predicted entity during Nerval evaluation.",
+        default=NERVAL_THRESHOLD,
+        type=parse_threshold,
+    )
+
     parser.set_defaults(func=run)
 
 
-def eval(rank, config, mlflow_logging):
+def eval_nerval(
+    all_inferences: Dict[str, List[Inference]],
+    tokens: Path,
+    threshold: float,
+):
+    print("\n#### Nerval evaluation")
+
+    def inferences_to_parsed_bio(attr: str):
+        bio_values = []
+        for inference in inferences:
+            value = getattr(inference, attr)
+            bio_value = convert(value, ner_tokens=tokens)
+            bio_values.extend(bio_value.split("\n"))
+
+        # Parse this BIO format
+        return parse_bio(bio_values)
+
+    # Evaluate with Nerval
+    tokens = parse_tokens(tokens)
+    for split_name, inferences in all_inferences.items():
+        ground_truths = inferences_to_parsed_bio("ground_truth")
+        predictions = inferences_to_parsed_bio("prediction")
+
+        if not (ground_truths and predictions):
+            continue
+
+        scores = {
+            key: {
+                k: round(value * 100, 2) if k in ["P", "R", "F1"] else value
+                for k, value in values.items()
+            }
+            for key, values in evaluate(ground_truths, predictions, threshold).items()
+        }
+
+        print(f"\n##### {split_name}\n")
+        print_results(scores)
+
+
+def eval(rank, config: dict, nerval_threshold: float, mlflow_logging: bool):
     torch.manual_seed(0)
     torch.cuda.manual_seed(0)
     np.random.seed(0)
@@ -62,10 +132,12 @@ def eval(rank, config, mlflow_logging):
         metric_names.append("ner")
 
     metrics_table = create_metrics_table(metric_names)
+    all_inferences = {}
+
     for dataset_name in config["dataset"]["datasets"]:
         for set_name in ["train", "val", "test"]:
             logger.info(f"Evaluating on set `{set_name}`")
-            metrics = model.evaluate(
+            metrics, inferences = model.evaluate(
                 "{}-{}".format(dataset_name, set_name),
                 [
                     (dataset_name, set_name),
@@ -75,11 +147,20 @@ def eval(rank, config, mlflow_logging):
             )
             add_metrics_table_row(metrics_table, set_name, metrics)
 
+            all_inferences[set_name] = inferences
 
+    print("\n#### DAN evaluation\n")
     print(metrics_table)
 
+    if "ner" in metric_names:
+        eval_nerval(
+            all_inferences,
+            tokens=config["dataset"]["tokens"],
+            threshold=nerval_threshold,
+        )
+
 
-def run(config: dict):
+def run(config: dict, nerval_threshold: float):
     update_config(config)
 
     mlflow_logging = bool(config.get("mlflow"))
@@ -94,8 +175,8 @@ def run(config: dict):
     ):
         mp.spawn(
             eval,
-            args=(config, mlflow_logging),
+            args=(config, nerval_threshold, mlflow_logging),
             nprocs=config["training"]["device"]["nb_gpu"],
         )
     else:
-        eval(0, config, mlflow_logging)
+        eval(0, config, nerval_threshold, mlflow_logging)
diff --git a/dan/ocr/manager/metrics.py b/dan/ocr/manager/metrics.py
index 102fddafbaa334845dc9251c630df18a6ae353dc..988c90d073c95cb80a682cf3a916972da540233f 100644
--- a/dan/ocr/manager/metrics.py
+++ b/dan/ocr/manager/metrics.py
@@ -3,7 +3,7 @@ import re
 from collections import defaultdict
 from operator import attrgetter
 from pathlib import Path
-from typing import Dict, List
+from typing import Dict, List, NamedTuple
 
 import editdistance
 import numpy as np
@@ -23,6 +23,16 @@ REGEX_ONLY_ONE_SPACE = re.compile(r"\s+")
 METRICS_KEYWORD = {"cer": "chars", "wer": "words", "ner": "tokens"}
 
 
+class Inference(NamedTuple):
+    """
+    Store a prediction with its ground truth to avoid
+    inferring again when we need to compute new metrics
+    """
+
+    ground_truth: str
+    prediction: str
+
+
 class MetricManager:
     def __init__(self, metric_names: List[str], dataset_name: str, tokens: Path | None):
         self.dataset_name: str = dataset_name
diff --git a/dan/ocr/manager/training.py b/dan/ocr/manager/training.py
index ba5160fa1d3ad9ca1037077292b3530861520231..ad35f823e312c51a21975229fff0da38bac0545d 100644
--- a/dan/ocr/manager/training.py
+++ b/dan/ocr/manager/training.py
@@ -6,7 +6,7 @@ from copy import deepcopy
 from enum import Enum
 from pathlib import Path
 from time import time
-from typing import Dict
+from typing import Dict, List, Tuple
 
 import numpy as np
 import torch
@@ -20,7 +20,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.tensorboard import SummaryWriter
 from tqdm import tqdm
 
-from dan.ocr.manager.metrics import MetricManager
+from dan.ocr.manager.metrics import Inference, MetricManager
 from dan.ocr.manager.ocr import OCRDatasetManager
 from dan.ocr.mlflow import MLFLOW_AVAILABLE, logging_metrics, logging_tags_metrics
 from dan.ocr.schedulers import DropoutScheduler
@@ -750,7 +750,7 @@ class GenericTrainingManager:
 
     def evaluate(
         self, custom_name, sets_list, metric_names, mlflow_logging=False
-    ) -> Dict[str, int | float]:
+    ) -> Tuple[Dict[str, int | float], List[Inference]]:
         """
         Main loop for evaluation
         """
@@ -768,6 +768,9 @@ class GenericTrainingManager:
             tokens=self.tokens,
         )
 
+        # Keep inferences in memory to evaluate with Nerval
+        inferences = []
+
         with tqdm(total=len(loader.dataset)) as pbar:
             pbar.set_description("Evaluation")
             with torch.no_grad():
@@ -792,6 +795,10 @@ class GenericTrainingManager:
                     pbar.set_postfix(values=str(display_values))
                     pbar.update(len(batch_data["names"]) * self.nb_workers)
 
+                    inferences.extend(
+                        map(Inference, batch_values["str_y"], batch_values["str_x"])
+                    )
+
             # log metrics in MLflow
             logging_name = custom_name.split("-")[1]
             logging_tags_metrics(
@@ -810,7 +817,7 @@ class GenericTrainingManager:
                 # Log mlflow artifacts
                 mlflow.log_artifact(path, "predictions")
 
-        return metrics
+        return metrics, inferences
 
     def output_pred(self, name):
         path = self.paths["results"] / "predict_{}_{}.yaml".format(
diff --git a/docs/usage/datasets/analyze.md b/docs/usage/datasets/analyze.md
index fec46d59b16ae9441ac06521df3d81f60e080bf8..799774e8fbb23e9de00c78308d3cdcdffaeb05c1 100644
--- a/docs/usage/datasets/analyze.md
+++ b/docs/usage/datasets/analyze.md
@@ -4,8 +4,6 @@ Use the `teklia-dan dataset analyze` command to analyze a dataset.
 
 This will display statistics in [Markdown](https://www.markdownguide.org/) format.
 
-The available arguments are:
-
 | Parameter | Description | Type | Default |
 | --------------- | -------------------------------- | -------------- | ------- |
 | `--labels` | Path to the `labels.json` file. | `pathlib.Path` | |
diff --git a/docs/usage/evaluate/index.md b/docs/usage/evaluate/index.md
index f1cc273be210825c853219f5182625f0dc19f21a..2153ebd776c0767b8e55edf8ffaa9cec2422f307 100644
--- a/docs/usage/evaluate/index.md
+++ b/docs/usage/evaluate/index.md
@@ -1,18 +1,70 @@
 # Evaluation
 
+## Description
+
 Use the `teklia-dan evaluate` command to evaluate a trained DAN model.
 
 To evaluate DAN on your dataset:
 
 1. Create a JSON configuration file. You can base the configuration file off the training one. Refer to the [dedicated page](../train/config.md) for a description of parameters.
 1. Run `teklia-dan evaluate --config path/to/your/config.json`.
-1. Evaluation results for every split are available in the `results` subfolder of the output folder indicated in your configuration.
-1. A metrics Markdown table, providing results for each evaluated split, is also printed in the console (see table example below).
 
-### Example output - Metrics Markdown table
+This will, for each evaluated split:
+
+1. Create a YAML file with the evaluation results in the `results` subfolder of the `training.output_folder` indicated in your configuration.
+1. Print in the console a metrics Markdown table (see [table example below](#htr-evaluation)).
+1. Print in the console a [Nerval](https://gitlab.teklia.com/ner/nerval) metrics Markdown table, if the `dataset.tokens` parameter in your configuration is defined (see [table example below](#htr-and-ner-evaluation)).
+
+| Parameter | Description | Type | Default |
+| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------- | ------- |
+| `--config` | Path to the configuration file. | `pathlib.Path` | |
+| `--nerval-threshold` | Distance threshold for the match between gold and predicted entity during Nerval evaluation. `0` would impose perfect matches, `1` would allow completely different strings to be considered as a match. | `float` | `0.3` |
+
+## Example output
+
+### HTR evaluation
+
+```
+#### DAN evaluation
+
+| Split | CER (HTR-NER) | CER (HTR) | WER (HTR-NER) | WER (HTR) | WER (HTR no punct) |
+| :---: | :-----------: | :-------: | :-----------: | :-------: | :----------------: |
+| train | x | x | x | x | x |
+| val | x | x | x | x | x |
+| test | x | x | x | x | x |
+```
+
+### HTR and NER evaluation
+
+```
+#### DAN evaluation
 
 | Split | CER (HTR-NER) | CER (HTR) | WER (HTR-NER) | WER (HTR) | WER (HTR no punct) | NER |
 | :---: | :-----------: | :-------: | :-----------: | :-------: | :----------------: | :-: |
 | train | x | x | x | x | x | x |
 | val | x | x | x | x | x | x |
 | test | x | x | x | x | x | x |
+
+#### Nerval evaluation
+
+##### train
+
+| tag | predicted | matched | Precision | Recall | F1 | Support |
+| :-----: | :-------: | :-----: | :-------: | :----: | :-: | :-----: |
+| Surname | x | x | x | x | x | x |
+| All | x | x | x | x | x | x |
+
+##### val
+
+| tag | predicted | matched | Precision | Recall | F1 | Support |
+| :-----: | :-------: | :-----: | :-------: | :----: | :-: | :-----: |
+| Surname | x | x | x | x | x | x |
+| All | x | x | x | x | x | x |
+
+##### test
+
+| tag | predicted | matched | Precision | Recall | F1 | Support |
+| :-----: | :-------: | :-----: | :-------: | :----: | :-: | :-----: |
+| Surname | x | x | x | x | x | x |
+| All | x | x | x | x | x | x |
+```
diff --git a/docs/usage/predict/index.md b/docs/usage/predict/index.md
index b96149dbafaba253ec694f2d16cf16629aa6fc67..823399de80299417f9e9fa28a66ed0aaf67fe771 100644
--- a/docs/usage/predict/index.md
+++ b/docs/usage/predict/index.md
@@ -1,9 +1,9 @@
 # Prediction
 
-Use the `teklia-dan predict` command to apply a trained DAN model on an image.
-
 ## Description
 
+Use the `teklia-dan predict` command to apply a trained DAN model on an image.
+
 | Parameter | Description | Type | Default |
 | --------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- | -------------- | ------------- |
 | `--image-dir` | Path to the folder where the images to predict are stored. Must not be provided with `--image`. | `pathlib.Path` | |
diff --git a/nerval b/nerval
new file mode 160000
index 0000000000000000000000000000000000000000..525c1a9e6d5a33075669085148247e2604dd092f
--- /dev/null
+++ b/nerval
@@ -0,0 +1 @@
+Subproject commit 525c1a9e6d5a33075669085148247e2604dd092f
diff --git a/requirements.txt b/requirements.txt
index 445189ae68ef6febce26b477a5360a308208a7e3..d065f01db77260ee824b2c39ea3e778332bb93a8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
+-e ./nerval
 albumentations==1.3.1
 arkindex-export==0.1.9
 boto3==1.26.124
-editdistance==0.6.2
 flashlight-text==0.0.4
 imageio==2.26.1
 imagesize==1.4.1
@@ -9,7 +9,6 @@ lxml==4.9.3
 mdutils==1.6.0
 nltk==3.8.1
 numpy==1.24.3
-prettytable==3.8.0
 PyYAML==6.0
 scipy==1.10.1
 sentencepiece==0.1.99
diff --git a/tests/data/evaluate/metrics_table.md b/tests/data/evaluate/metrics_table.md
index d67456d844071ebb8a9332638a2f527d93c5e405..107bef4188e44a6f8e5509c74b7ceb8d4ded4625 100644
--- a/tests/data/evaluate/metrics_table.md
+++ b/tests/data/evaluate/metrics_table.md
@@ -1,5 +1,49 @@
+#### DAN evaluation
+
 | Split | CER (HTR-NER) | CER (HTR) | WER (HTR-NER) | WER (HTR) | WER (HTR no punct) | NER |
 |:-----:|:-------------:|:---------:|:-------------:|:---------:|:------------------:|:----:|
 | train | 18.89 | 21.05 | 26.67 | 26.67 | 26.67 | 7.14 |
 | val | 8.82 | 11.54 | 50.0 | 50.0 | 50.0 | 0.0 |
 | test | 2.78 | 3.33 | 14.29 | 14.29 | 14.29 | 0.0 |
+
+#### Nerval evaluation
+
+##### train
+
+| tag | predicted | matched | Precision | Recall | F1 | Support |
+|:---------:|:---------:|:-------:|:---------:|:------:|:-----:|:-------:|
+| Surname | 2 | 2 | 100.0 | 100.0 | 100.0 | 2 |
+| Patron | 2 | 0 | 0.0 | 0.0 | 0 | 1 |
+| Operai | 2 | 2 | 100.0 | 100.0 | 100.0 | 2 |
+| Louche | 2 | 1 | 50.0 | 50.0 | 50.0 | 2 |
+| Koala | 2 | 2 | 100.0 | 100.0 | 100.0 | 2 |
+| Firstname | 2 | 2 | 100.0 | 100.0 | 100.0 | 2 |
+| Chalumeau | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
+| Batiment | 2 | 2 | 100.0 | 100.0 | 100.0 | 2 |
+| All | 15 | 12 | 80.0 | 85.71 | 82.76 | 14 |
+
+##### val
+
+| tag | predicted | matched | Precision | Recall | F1 | Support |
+|:---------:|:---------:|:-------:|:---------:|:------:|:-----:|:-------:|
+| Surname | 1 | 0 | 0.0 | 0.0 | 0 | 1 |
+| Patron | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
+| Operai | 1 | 0 | 0.0 | 0.0 | 0 | 1 |
+| Louche | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
+| Koala | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
+| Firstname | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
+| Chalumeau | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
+| Batiment | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
+| All | 8 | 6 | 75.0 | 75.0 | 75.0 | 8 |
+
+##### test
+
+| tag | predicted | matched | Precision | Recall | F1 | Support |
+|:---------:|:---------:|:-------:|:---------:|:------:|:-----:|:-------:|
+| Surname | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
+| Louche | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
+| Koala | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
+| Firstname | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
+| Chalumeau | 1 | 0 | 0.0 | 0.0 | 0 | 1 |
+| Batiment | 1 | 1 | 100.0 | 100.0 | 100.0 | 1 |
+| All | 6 | 5 | 83.33 | 83.33 | 83.33 | 6 |
diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py
index 98fbd93898f09b75377b3b8e9cd21ddbc1b7521b..0bdf51968cf985a7c1fa1dcac00aedbe4b474091 100644
--- a/tests/test_evaluate.py
+++ b/tests/test_evaluate.py
@@ -103,7 +103,7 @@ def test_evaluate(capsys, training_res, val_res, test_res, evaluate_config):
     # Use the tmp_path as base folder
     evaluate_config["training"]["output_folder"] = FIXTURES / "evaluate"
 
-    evaluate.run(evaluate_config)
+    evaluate.run(evaluate_config, evaluate.NERVAL_THRESHOLD)
 
     # Check that the evaluation results are correct
     for split_name, expected_res in zip(
@@ -129,7 +129,7 @@ def test_evaluate(capsys, training_res, val_res, test_res, evaluate_config):
 
     # Check the metrics Markdown table
     captured_std = capsys.readouterr()
-    last_printed_lines = captured_std.out.split("\n")[-6:]
+    last_printed_lines = captured_std.out.split("\n")[10:]
     assert (
         "\n".join(last_printed_lines)
         == Path(FIXTURES / "evaluate" / "metrics_table.md").read_text()
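For readers of the patch, here is a minimal, hypothetical sketch of the Nerval scoring flow that the new `eval_nerval` helper implements. The tokens file path and the example transcriptions are placeholders, while the `convert`, `parse_bio`, `evaluate` and `print_results` calls are the ones used in the patch itself.

```python
# Sketch only: "tokens.yml" and the two transcriptions below are hypothetical;
# the library calls mirror the eval_nerval helper added by this patch.
from pathlib import Path

from dan.bio import convert
from dan.ocr.manager.metrics import Inference
from dan.utils import parse_tokens
from nerval.evaluate import evaluate
from nerval.parse import parse_bio
from nerval.utils import print_results

# One ground-truth/prediction pair, as kept in memory by model.evaluate()
inferences = [Inference(ground_truth="ⓢDubois ⓕMarie", prediction="ⓢDubois ⓕMaria")]

# NER tokens definition used during training (hypothetical path)
tokens = parse_tokens(Path("tokens.yml"))


def to_parsed_bio(attr: str):
    # Convert each transcription to BIO lines, then let Nerval parse them
    bio_values = []
    for inference in inferences:
        bio_values.extend(convert(getattr(inference, attr), ner_tokens=tokens).split("\n"))
    return parse_bio(bio_values)


ground_truths = to_parsed_bio("ground_truth")
predictions = to_parsed_bio("prediction")

# 0.30 is the default distance threshold exposed as --nerval-threshold
print_results(evaluate(ground_truths, predictions, 0.30))
```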