diff --git a/dan/ocr/evaluate.py b/dan/ocr/evaluate.py
index 7c100d730401056cd2fa0c4f6ec2137d80414094..9a5805b9943811b0aed880a1a2ead99bf60f3648 100644
--- a/dan/ocr/evaluate.py
+++ b/dan/ocr/evaluate.py
@@ -6,6 +6,7 @@
 Evaluate a trained DAN model.
 """
 
+import json
 import logging
 import random
 from argparse import ArgumentTypeError
@@ -73,6 +74,13 @@ def add_evaluate_parser(subcommands) -> None:
         type=parse_threshold,
     )
 
+    parser.add_argument(
+        "--output-json",
+        help="Where to save evaluation results in JSON format.",
+        default=None,
+        type=Path,
+    )
+
     parser.set_defaults(func=run)
 
 
@@ -161,7 +169,13 @@ def eval_nerval(
         print_results(scores)
 
 
-def eval(rank, config: dict, nerval_threshold: float, mlflow_logging: bool):
+def eval(
+    rank,
+    config: dict,
+    nerval_threshold: float,
+    output_json: Path | None,
+    mlflow_logging: bool,
+):
     torch.manual_seed(0)
     torch.cuda.manual_seed(0)
     np.random.seed(0)
@@ -218,8 +232,12 @@ def eval(rank, config: dict, nerval_threshold: float, mlflow_logging: bool):
 
     print_worst_predictions(all_inferences)
 
+    # Save to JSON
+    if output_json is not None:
+        output_json.write_text(json.dumps(all_inferences, indent=2))
+
 
-def run(config: dict, nerval_threshold: float):
+def run(config: dict, nerval_threshold: float, output_json: Path | None):
     update_config(config)
 
     mlflow_logging = bool(config.get("mlflow"))
@@ -234,8 +252,8 @@ def run(config: dict, nerval_threshold: float):
     ):
         mp.spawn(
             eval,
-            args=(config, nerval_threshold, mlflow_logging),
+            args=(config, nerval_threshold, output_json, mlflow_logging),
             nprocs=config["training"]["device"]["nb_gpu"],
         )
     else:
-        eval(0, config, nerval_threshold, mlflow_logging)
+        eval(0, config, nerval_threshold, output_json, mlflow_logging)
diff --git a/docs/usage/evaluate/index.md b/docs/usage/evaluate/index.md
index 94d57d18c812acbe184c721c0de3e52ad196d8a2..3f77356b3248ca1b288a9a994e8261d622b0d10a 100644
--- a/docs/usage/evaluate/index.md
+++ b/docs/usage/evaluate/index.md
@@ -23,6 +23,7 @@ This will, for each evaluated split:
 | -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------- | ------- |
 | `--config`           | Path to the configuration file.                                                                                                                                                                          | `pathlib.Path` |         |
 | `--nerval-threshold` | Distance threshold for the match between gold and predicted entity during Nerval evaluation. `0` would impose perfect matches, `1` would allow completely different strings to be considered as a match. | `float`        | `0.3`   |
+| `--output-json`      | Where to save evaluation results in JSON format.                                                                                                                                                         | `pathlib.Path` | `None`  |
 
 ## Examples
 
diff --git a/tests/data/evaluate/inference.json b/tests/data/evaluate/inference.json
new file mode 100644
index 0000000000000000000000000000000000000000..6aed3409128701223d850f182504b950550f5f64
--- /dev/null
+++ b/tests/data/evaluate/inference.json
@@ -0,0 +1,36 @@
+{
+    "train": [
+        [
+            "0a56e8b3-95cd-4fa5-a17b-5b0ff9e6ea84.png",
+            "\u24c8Bellisson \u24bbGeorges \u24b791 \u24c1P \u24b8M \u24c0Ch \u24c4Plombier \u24c512241",
+            "\u24c8Bellisson \u24bbGeorges \u24b791 \u24c1P \u24b8M \u24c0Ch \u24c4Plombier \u24c5Patron?12241",
+            "",
+            0.125
+        ],
+        [
+            "0dfe8bcd-ed0b-453e-bf19-cc697012296e.png",
+            "\u24c8Templi\u00e9 \u24bbMarcelle \u24b793 \u24c1J \u24c0ch \u24c4E dachyle",
+            "\u24c8Templi\u00e9 \u24bbMarcelle \u24b793 \u24c1S \u24c0ch \u24c4E dactylo \u24c518376",
+            "",
+            0.4286
+        ]
+    ],
+    "val": [
+        [
+            "2c242f5c-e979-43c4-b6f2-a6d4815b651d.png",
+            "\u24c8A \u24bbCharles \u24b711 \u24c1P \u24b8C \u24c0F \u24c4A \u24c514331",
+            "\u24c8d \u24bbCharles \u24b711 \u24c1P \u24b8C \u24c0F \u24c4d \u24c514 31",
+            "",
+            0.5
+        ]
+    ],
+    "test": [
+        [
+            "ffdec445-7f14-4f5f-be44-68d0844d0df1.png",
+            "\u24c8Naudin \u24bbMarie \u24b753 \u24c1S \u24b8V \u24c0Belle m\u00e8re",
+            "\u24c8Naudin \u24bbMarie \u24b753 \u24c1S \u24b8v \u24c0Belle m\u00e8re",
+            "",
+            0.1429
+        ]
+    ]
+}
diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py
index 26b3c1a43b4f9457fa7c05155ed452534428a502..77a9c806dc0a9b5eb4855d11f4f066d9b18ce881 100644
--- a/tests/test_evaluate.py
+++ b/tests/test_evaluate.py
@@ -3,6 +3,7 @@
 
 # -*- coding: utf-8 -*-
 
+import json
 import shutil
 from pathlib import Path
 
@@ -199,11 +200,23 @@ def test_eval_nerval(capsys, evaluate_config):
         ),
     ),
 )
-def test_evaluate(capsys, training_res, val_res, test_res, evaluate_config):
+@pytest.mark.parametrize("is_output_json", (True, False))
+def test_evaluate(
+    capsys, training_res, val_res, test_res, is_output_json, evaluate_config, tmp_path
+):
+    evaluate_path = FIXTURES / "evaluate"
+
-    # Use the tmp_path as base folder
-    evaluate_config["training"]["output_folder"] = FIXTURES / "evaluate"
+    # Use the evaluate fixtures as the output folder
+    evaluate_config["training"]["output_folder"] = evaluate_path
 
-    evaluate.run(evaluate_config, evaluate.NERVAL_THRESHOLD)
+    output_json = tmp_path / "inference.json" if is_output_json else None
+
+    evaluate.run(evaluate_config, evaluate.NERVAL_THRESHOLD, output_json=output_json)
+
+    if is_output_json:
+        assert json.loads(output_json.read_text()) == json.loads(
+            (evaluate_path / "inference.json").read_text()
+        )
 
     # Check that the evaluation results are correct
     for split_name, expected_res in zip(
@@ -365,7 +378,7 @@ def test_evaluate_language_model(
         "weight": language_model_weight,
     }
 
-    evaluate.run(evaluate_config, evaluate.NERVAL_THRESHOLD)
+    evaluate.run(evaluate_config, evaluate.NERVAL_THRESHOLD, output_json=None)
 
     # Check that the evaluation results are correct
     for split_name, expected_res in [
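
For context, a minimal sketch of how a file written through the new --output-json option could be inspected after evaluation. The results/inference.json path is a placeholder, and the per-entry field layout is an assumption inferred from the tests/data/evaluate/inference.json fixture above, not something defined by this patch:

import json
from pathlib import Path

# Assumed location: pass the same path to --output-json when running the evaluation.
results_path = Path("results/inference.json")

# Each split ("train", "val", "test") maps to a list of entries; judging from the
# fixture, an entry looks like [image_name, ground_truth, prediction, "", score].
results = json.loads(results_path.read_text())
for split, entries in results.items():
    for image_name, ground_truth, prediction, _, score in entries:
        print(f"[{split}] {image_name}: score={score}")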