diff --git a/dan/datasets/extract/utils.py b/dan/datasets/extract/utils.py
index 538e6ef20925e11a5f57df0b0e9f3ea86ab3ff03..cf6ae8896446e0d28e26da428463033d8b6533dd 100644
--- a/dan/datasets/extract/utils.py
+++ b/dan/datasets/extract/utils.py
@@ -17,10 +17,14 @@ from dan.utils import EntityType, LMTokenMapping
 
 logger = logging.getLogger(__name__)
 
-# replace \t with regular space and consecutive spaces
+# Collapse tabs and consecutive spaces into a single space
 TRIM_SPACE_REGEX = re.compile(r"[\t ]+")
 TRIM_RETURN_REGEX = re.compile(r"[\r\n]+")
 
+# Remove invalid characters to build a valid XML tag name
+SLUG_PATTERN = re.compile(r"\W+")
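+# e.g. '"Name" (1)' -> '_Name_1_', 'Person /!\' -> 'Person_'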
+
 # Some characters are encoded in XML but we don't want them encoded in the end
 ENCODING_MAP = {
     "
": "\r",
@@ -174,9 +177,10 @@ class Tokenizer:
 
 def slugify(text: str):
     """
-    Replace space in text to underscores to use it as XML tag.
+    Replace invalid characters in text with underscores so it can be used as an XML tag.
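+    e.g. "Arkindex's entity" becomes "Arkindex_s_entity".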
     """
-    return text.replace(" ", "_")
+    return SLUG_PATTERN.sub("_", text)
 
 
 def get_translation_map(tokens: Dict[str, EntityType]) -> Dict[str, str] | None:
diff --git a/tests/conftest.py b/tests/conftest.py
index f6f4b36a2cb1fbeacca8550db64958f2a74540bc..fd2a6ddb5fb42155b97e8258149ca78f3cc92dcd 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -213,6 +213,7 @@ def mock_database(tmp_path_factory):
         element=Element.select().first(),
     )
 
+    # Create worker versions
     WorkerVersion.bulk_create(
         [
             WorkerVersion(
@@ -223,11 +224,12 @@ def mock_database(tmp_path_factory):
                 revision="main",
                 type="worker",
             )
-            for nestation in ("nested", "non-nested")
+            for nestation in ("nested", "non-nested", "special-chars")
         ]
     )
 
-    entities = [
+    # Create entities
+    for entity in [
         # Non-nested entities
         {
             "worker_version": "non-nested-id",
@@ -266,8 +268,26 @@ def mock_database(tmp_path_factory):
             "name": "us",
             "offset": 43,
         },
-    ]
-    for entity in entities:
+        # Special characters
+        {
+            "worker_version": "special-chars-id",
+            "type": "Arkindex's entity",
+            "name": "great",
+            "offset": 4,
+        },
+        {
+            "worker_version": "special-chars-id",
+            "type": '"Name" (1)',
+            "name": "Charles",
+            "offset": 15,
+        },
+        {
+            "worker_version": "special-chars-id",
+            "type": "Person /!\\",
+            "name": "us",
+            "offset": 43,
+        },
+    ]:
         create_transcription_entity(transcription=transcription, **entity)
 
     return database_path
diff --git a/tests/data/entities.yml b/tests/data/entities.yml
index c690cbc0d122060622d8172ef1c57ff0700765ba..4bf7513a40cbeec0b6ce5367fcf0925848277d79 100644
--- a/tests/data/entities.yml
+++ b/tests/data/entities.yml
@@ -1,5 +1,8 @@
 ---
 entities:
+- '"Name" (1)'
+- Arkindex's entity
+- Person /!\
 - adj
 - birthdate
 - firstname
diff --git a/tests/data/tokens/end_tokens.yml b/tests/data/tokens/end_tokens.yml
index c55ca9946d22e52e0259045727f988240ba8c812..c660b7788fc5fafd48d197be29a55b5dd10e21fa 100644
--- a/tests/data/tokens/end_tokens.yml
+++ b/tests/data/tokens/end_tokens.yml
@@ -1,22 +1,31 @@
 ---
-adj:
+'"Name" (1)':
   start: Ⓐ
   end: Ⓑ
-birthdate:
+Arkindex's entity:
   start: Ⓒ
   end: Ⓓ
-firstname:
+Person /!\:
   start: Ⓔ
   end: Ⓕ
-fullname:
+adj:
   start: Ⓖ
   end: Ⓗ
-name:
+birthdate:
   start: Ⓘ
   end: Ⓙ
-person:
+firstname:
   start: Ⓚ
   end: Ⓛ
-surname:
+fullname:
   start: Ⓜ
   end: Ⓝ
+name:
+  start: Ⓞ
+  end: Ⓟ
+person:
+  start: Ⓠ
+  end: Ⓡ
+surname:
+  start: Ⓢ
+  end: Ⓣ
diff --git a/tests/data/tokens/no_end_tokens.yml b/tests/data/tokens/no_end_tokens.yml
index 49ab427c21e83e276130886c15fec8596e2fd7fb..a19597267d1952aad03b58c66dafa5fe43e5f52b 100644
--- a/tests/data/tokens/no_end_tokens.yml
+++ b/tests/data/tokens/no_end_tokens.yml
@@ -1,22 +1,31 @@
 ---
-adj:
+'"Name" (1)':
   start: Ⓐ
   end: ''
-birthdate:
+Arkindex's entity:
   start: Ⓑ
   end: ''
-firstname:
+Person /!\:
   start: Ⓒ
   end: ''
-fullname:
+adj:
   start: Ⓓ
   end: ''
-name:
+birthdate:
   start: Ⓔ
   end: ''
-person:
+firstname:
   start: Ⓕ
   end: ''
-surname:
+fullname:
   start: Ⓖ
   end: ''
+name:
+  start: Ⓗ
+  end: ''
+person:
+  start: Ⓘ
+  end: ''
+surname:
+  start: Ⓙ
+  end: ''
diff --git a/tests/test_extract.py b/tests/test_extract.py
index 1bf92c3ac1e318a51a15e421408e74d5db83e3eb..733449a56975c2865716c7394c1cc10c9e62df16 100644
--- a/tests/test_extract.py
+++ b/tests/test_extract.py
@@ -478,6 +478,12 @@ def test_extract_transcription_no_translation(mock_database, tokens):
             "<root><fullname><name>Charles</name> III</fullname>\n<person>us</person></root>",
             ["\n", " "],
         ),
+        # Special characters in entities
+        (
+            "special-chars-id",
+            "<root>The <Arkindex_s_entity>great</Arkindex_s_entity> king <_Name_1_>Charles</_Name_1_> III has eaten \nwith <Person_>us</Person_>.</root>",
+            None,
+        ),
     ),
 )
 def test_entities_to_xml(mock_database, nestation, xml_output, separators):
@@ -488,7 +494,15 @@ def test_entities_to_xml(mock_database, nestation, xml_output, separators):
             predictions=get_transcription_entities(
                 transcription_id="tr-with-entities",
                 entity_worker_versions=[nestation],
-                supported_types=["name", "fullname", "person", "adj"],
+                supported_types=[
+                    "name",
+                    "fullname",
+                    "person",
+                    "adj",
+                    "Arkindex's entity",
+                    '"Name" (1)',
+                    "Person /!\\",
+                ],
             ),
             entity_separators=separators,
         )