diff --git a/dan/datasets/extract/__init__.py b/dan/datasets/extract/__init__.py
index bb9fb9ddca02cedc9ba6a0150c86efe2489fc313..93c2fe2f6dfd44c2ec947449b0f73ceaa317a0fd 100644
--- a/dan/datasets/extract/__init__.py
+++ b/dan/datasets/extract/__init__.py
@@ -83,7 +83,14 @@ def add_extract_parser(subcommands) -> None:
 
     # Optional arguments.
     parser.add_argument(
-        "--load-entities", action="store_true", help="Extract text with their entities."
+        "--load-entities",
+        action="store_true",
+        help="Extract text with their entities.",
+    )
+    parser.add_argument(
+        "--only-entities",
+        action="store_true",
+        help="Remove all text that does not belong to the tokens.",
     )
     parser.add_argument(
         "--allow-unknown-entities",
diff --git a/dan/datasets/extract/exceptions.py b/dan/datasets/extract/exceptions.py
index 22c47a6c0faafe0f8aa1796fceae1f40a7c22fd4..da8fba65541f8df9c075cba963b3e23481f6aae0 100644
--- a/dan/datasets/extract/exceptions.py
+++ b/dan/datasets/extract/exceptions.py
@@ -62,3 +62,18 @@ class UnknownLabelError(ProcessingError):
 
     def __str__(self) -> str:
         return f"Label `{self.label}` is missing in the NER configuration."
+
+
+class NoEndTokenError(ProcessingError):
+    """
+    Raised when the specified label has no end token and there is potentially additional text around the labels, so the end of the entity cannot be marked
+    """
+
+    label: str
+
+    def __init__(self, label: str, *args: object) -> None:
+        super().__init__(*args)
+        self.label = label
+
+    def __str__(self) -> str:
+        return f"Label `{self.label}` has no end token."
diff --git a/dan/datasets/extract/extract.py b/dan/datasets/extract/extract.py
index bbededdcd1062a270ece83e5d6de2052546c9f07..71160fda4a35b6ec2c1982618e59a0cf25809c85 100644
--- a/dan/datasets/extract/extract.py
+++ b/dan/datasets/extract/extract.py
@@ -17,11 +17,13 @@ from dan.datasets.extract.db import (
     get_transcriptions,
 )
 from dan.datasets.extract.exceptions import (
+    NoEndTokenError,
     NoTranscriptionError,
     ProcessingError,
     UnknownLabelError,
 )
 from dan.datasets.extract.utils import (
+    EntityType,
     Subset,
     download_image,
     insert_token,
@@ -34,6 +36,8 @@ IMAGES_DIR = "images"  # Subpath to the images directory.
 LABELS_DIR = "labels"  # Subpath to the labels directory.
 SPLIT_NAMES = ["train", "val", "test"]
 
+EMPTY_CHARS = [" ", "\n", "\t", "\r"]
+
 
 class ArkindexExtractor:
     """
@@ -48,6 +52,7 @@ class ArkindexExtractor:
         output: Path = None,
         load_entities: bool = False,
         allow_unknown_entities: bool = False,
+        only_entities: bool = False,
         tokens: Path = None,
         use_existing_split: bool = None,
         transcription_worker_version: Optional[Union[str, bool]] = None,
@@ -62,6 +67,7 @@ class ArkindexExtractor:
         self.output = output
         self.load_entities = load_entities
         self.allow_unknown_entities = allow_unknown_entities
+        self.only_entities = only_entities
         self.tokens = parse_tokens(tokens) if self.load_entities else None
         self.use_existing_split = use_existing_split
         self.transcription_worker_version = transcription_worker_version
@@ -100,31 +106,46 @@ class ArkindexExtractor:
     def get_random_split(self):
         return next(self._assign_random_split())
 
-    def reconstruct_text(self, text: str, entities):
+    def reconstruct_text(self, full_text: str, entities):
         """
         Insert tokens delimiting the start/end of each entity on the transcription.
         """
+        text, text_offset = "", 0
+        for entity in entities:
+            # Text before entity
+            if not self.only_entities:
+                text += full_text[text_offset : entity.offset]
 
-        # Filter entities
-        for entity in entities.copy():
-            # Tokens known for this entity
-            if entity.type in self.tokens:
-                continue
-            # Tokens unknown for this entity
-            if not self.allow_unknown_entities:
+            entity_type: Optional[EntityType] = self.tokens.get(entity.type)
+            if not entity_type and not self.allow_unknown_entities:
                 raise UnknownLabelError(entity.type)
-            entities.remove(entity)
-
-        # Only keep text of the filtered entities
-        return " ".join(
-            insert_token(
-                text,
-                entity_type=self.tokens[entity.type],
-                offset=entity.offset,
-                length=entity.length,
-            )
-            for entity in entities
-        )
+            if entity_type and not entity_type.end and not self.only_entities:
+                raise NoEndTokenError(entity.type)
+
+            # Entity text (optionally with tokens)
+            if entity_type or not self.only_entities:
+                text += insert_token(
+                    full_text,
+                    entity_type,
+                    offset=entity.offset,
+                    length=entity.length,
+                )
+            text_offset = entity.offset + entity.length
+
+            # Keep the first whitespace character found after the entity (a space, a line break, etc.)
+            if self.only_entities:
+                separator = next(
+                    (char for char in full_text[text_offset:] if char in EMPTY_CHARS),
+                    "",
+                )
+                text += separator
+
+        # Remaining text
+        if not self.only_entities:
+            text += full_text[text_offset:]
+
+        # Strip leading/trailing whitespace
+        return text.strip("".join(EMPTY_CHARS))
 
     def extract_transcription(self, element: Element):
         """
@@ -139,7 +160,7 @@ class ArkindexExtractor:
 
         transcription = random.choice(transcriptions)
 
-        if self.load_entities:
+        if self.load_entities or self.only_entities:
             entities = get_transcription_entities(
                 transcription.id, self.entity_worker_version
             )
@@ -228,6 +249,7 @@ def run(
     output: Path,
     load_entities: bool,
     allow_unknown_entities: bool,
+    only_entities: bool,
     tokens: Path,
     use_existing_split: bool,
     train_folder: UUID,
@@ -280,6 +302,7 @@ def run(
         output=output,
         load_entities=load_entities,
         allow_unknown_entities=allow_unknown_entities,
+        only_entities=only_entities,
         tokens=tokens,
         use_existing_split=use_existing_split,
         transcription_worker_version=transcription_worker_version,
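
To make the rewritten `reconstruct_text` concrete, here is a hedged usage sketch (mirroring the instantiation used in the tests below; the `Entity` stand-in is hypothetical and only exposes the attributes the method reads). With `only_entities=True`, the exact separator found after an entity, a newline here, is preserved while all other text is dropped:

```python
from collections import namedtuple

from dan.datasets.extract.extract import ArkindexExtractor
from dan.datasets.extract.utils import EntityType

# Stand-in for the transcription entity objects (offset/length/type are read).
Entity = namedtuple("Entity", ["offset", "length", "type", "value"])

extractor = ArkindexExtractor(allow_unknown_entities=True, only_entities=True)
extractor.tokens = {
    "P": EntityType(start="ⓟ", end="Ⓟ"),
    "D": EntityType(start="ⓓ", end="Ⓓ"),
}

text = "n°1\nx 16 janvier 1611"
entities = [
    Entity(offset=0, length=3, type="P", value="n°1"),
    Entity(offset=6, length=15, type="D", value="16 janvier 1611"),
]

# The "x " between the entities is removed; the newline right after the first
# entity is kept as the separator.
assert extractor.reconstruct_text(text, entities) == "ⓟn°1Ⓟ\nⓓ16 janvier 1611Ⓓ"
```
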
diff --git a/dan/datasets/extract/utils.py b/dan/datasets/extract/utils.py
index 3a12533709bc63ea0071bfae83e600130b8438f5..3194cccd86112bff93754ef7e7e5f12c2c3a87c2 100644
--- a/dan/datasets/extract/utils.py
+++ b/dan/datasets/extract/utils.py
@@ -80,11 +80,11 @@ def insert_token(text: str, entity_type: EntityType, offset: int, length: int) -
     """
     return (
         # Starting token
-        entity_type.start
+        (entity_type.start if entity_type else "")
         # Entity
         + text[offset : offset + length]
         # End token
-        + entity_type.end
+        + (entity_type.end if entity_type else "")
     )
 
 
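The relaxed `insert_token` now tolerates `entity_type=None` (an unknown entity kept when `--allow-unknown-entities` is set). A small sketch of both cases, with assumed example values:

```python
from dan.datasets.extract.utils import EntityType, insert_token

text = "n°1 x 16 janvier 1611"

# Known entity type: the entity text is wrapped in its start/end tokens.
assert insert_token(text, EntityType(start="ⓟ", end="Ⓟ"), offset=0, length=3) == "ⓟn°1Ⓟ"

# Unknown entity type (None): the bare entity text is returned untouched.
assert insert_token(text, None, offset=0, length=3) == "n°1"
```
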
diff --git a/docs/usage/datasets/extract.md b/docs/usage/datasets/extract.md
index c8aa34163ba389a004252b418e8afd11854ec800..41df7a7ba48e05dee1baabeaf4fb1db6ec6abaf6 100644
--- a/docs/usage/datasets/extract.md
+++ b/docs/usage/datasets/extract.md
@@ -12,6 +12,7 @@ Use the `teklia-dan dataset extract` command to extract a dataset from an Arkind
 | `--parent-element-type`          | Type of the parent element containing the data.                                     | `str`           | `page`  |
 | `--output`                       | Folder where the data will be generated.                                            | `Path`          |         |
 | `--load-entities`                | Extract text with their entities. Needed for NER tasks.                             | `bool`          | `False` |
+| `--only-entities`                | Remove all text that does not belong to the entity tokens.                          | `bool`          | `False` |
 | `--allow-unknown-entities`       | Ignore entities that do not appear in the list of tokens.                           | `bool`          | `False` |
 | `--tokens`                       | Mapping between starting tokens and end tokens. Needed for NER tasks.               | `Path`          |         |
 | `--use-existing-split`           | Use the specified folder IDs for the dataset split.                                 | `bool`          |         |
diff --git a/tests/test_extract.py b/tests/test_extract.py
index 7f1b363e1879926b719b28f9704228ec17a4cc43..68b08a1de765a1266e7ddb2e8b13cccaeaa3f2fa 100644
--- a/tests/test_extract.py
+++ b/tests/test_extract.py
@@ -25,14 +25,23 @@ def test_insert_token(text, offset, length, expected):
 
 
 @pytest.mark.parametrize(
-    "tokens,text,entities,expected",
+    "only_entities,expected",
     (
-        (
-            {
-                "P": EntityType(start="â“Ÿ", end="â“…"),
-                "D": EntityType(start="â““", end="â’¹"),
-            },
-            "n°1 x 16 janvier 1611",
+        (False, "ⓟn°1Ⓟ x ⓓ16 janvier 1611Ⓓ x Michou"),
+        (True, "ⓟn°1Ⓟ ⓓ16 janvier 1611Ⓓ"),
+    ),
+)
+def test_reconstruct_text(only_entities, expected):
+    arkindex_extractor = ArkindexExtractor(
+        allow_unknown_entities=True, only_entities=only_entities
+    )
+    arkindex_extractor.tokens = {
+        "P": EntityType(start="â“Ÿ", end="â“…"),
+        "D": EntityType(start="â““", end="â’¹"),
+    }
+    assert (
+        arkindex_extractor.reconstruct_text(
+            "n°1 x 16 janvier 1611 x Michou",
             [
                 Entity(
                     offset=0,
@@ -46,33 +55,13 @@ def test_insert_token(text, offset, length, expected):
                     type="D",
                     value="16 janvier 1611",
                 ),
-            ],
-            "ⓟn°1Ⓟ ⓓ16 janvier 1611Ⓓ",
-        ),
-        (
-            {
-                "P": EntityType(start="â“Ÿ", end="â“…"),
-            },
-            "n°1 x 16 janvier 1611",
-            [
                 Entity(
-                    offset=0,
-                    length=3,
-                    type="P",
-                    value="n°1",
-                ),
-                Entity(
-                    offset=6,
-                    length=15,
-                    type="D",
-                    value="16 janvier 1611",
+                    offset=24,
+                    length=6,
+                    type="N",
+                    value="Michou",
                 ),
             ],
-            "ⓟn°1Ⓟ",
-        ),
-    ),
-)
-def test_reconstruct_text(tokens, text, entities, expected):
-    arkindex_extractor = ArkindexExtractor(allow_unknown_entities=True)
-    arkindex_extractor.tokens = tokens
-    assert arkindex_extractor.reconstruct_text(text, entities) == expected
+        )
+        == expected
+    )
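
The reworked test exercises both `only_entities` modes; the new `NoEndTokenError` path is not covered here. A complementary sketch (not part of this patch, with a hypothetical `Entity` stand-in) could look like:

```python
from collections import namedtuple

import pytest

from dan.datasets.extract.exceptions import NoEndTokenError
from dan.datasets.extract.extract import ArkindexExtractor
from dan.datasets.extract.utils import EntityType

Entity = namedtuple("Entity", ["offset", "length", "type", "value"])


def test_reconstruct_text_no_end_token():
    extractor = ArkindexExtractor(allow_unknown_entities=True, only_entities=False)
    # "P" declares a start token but no end token: with surrounding text kept,
    # the end of the entity cannot be marked, so reconstruct_text must raise.
    extractor.tokens = {"P": EntityType(start="ⓟ", end="")}
    with pytest.raises(NoEndTokenError, match="has no end token"):
        extractor.reconstruct_text(
            "n°1 x", [Entity(offset=0, length=3, type="P", value="n°1")]
        )
```
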