From 1a59fadf4587c5c90f99a4f7756790e70ffa3b86 Mon Sep 17 00:00:00 2001
From: Manon Blanco <blanco@teklia.com>
Date: Fri, 2 Feb 2024 12:45:31 +0000
Subject: [PATCH] Remove invalid characters to build valid XML tag

---
 dan/datasets/extract/utils.py       |  9 ++++++---
 tests/conftest.py                   | 28 ++++++++++++++++++++++++----
 tests/data/entities.yml             |  3 +++
 tests/data/tokens/end_tokens.yml    | 23 ++++++++++++++++-------
 tests/data/tokens/no_end_tokens.yml | 23 ++++++++++++++++-------
 tests/test_extract.py               | 16 +++++++++++++++-
 6 files changed, 80 insertions(+), 22 deletions(-)

diff --git a/dan/datasets/extract/utils.py b/dan/datasets/extract/utils.py
index 538e6ef2..cf6ae889 100644
--- a/dan/datasets/extract/utils.py
+++ b/dan/datasets/extract/utils.py
@@ -17,10 +17,13 @@ from dan.utils import EntityType, LMTokenMapping
 
 logger = logging.getLogger(__name__)
 
-# replace \t with regular space and consecutive spaces
+# Replace \t with regular space and consecutive spaces
 TRIM_SPACE_REGEX = re.compile(r"[\t ]+")
 TRIM_RETURN_REGEX = re.compile(r"[\r\n]+")
 
+# Remove invalid characters to build valid XML tag name
+SLUG_PATTERN = re.compile(r"[\W]+")
+
 # Some characters are encoded in XML but we don't want them encoded in the end
 ENCODING_MAP = {
     "&#13;": "\r",
@@ -174,9 +177,9 @@ class Tokenizer:
 
 def slugify(text: str):
     """
-    Replace space in text to underscores to use it as XML tag.
+    Replace invalid characters in text to underscores to use it as XML tag.
     """
-    return text.replace(" ", "_")
+    return SLUG_PATTERN.sub("_", text)
 
 
 def get_translation_map(tokens: Dict[str, EntityType]) -> Dict[str, str] | None:
diff --git a/tests/conftest.py b/tests/conftest.py
index f6f4b36a..fd2a6ddb 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -213,6 +213,7 @@ def mock_database(tmp_path_factory):
         element=Element.select().first(),
     )
 
+    # Create worker version
     WorkerVersion.bulk_create(
         [
             WorkerVersion(
@@ -223,11 +224,12 @@ def mock_database(tmp_path_factory):
                 revision="main",
                 type="worker",
             )
-            for nestation in ("nested", "non-nested")
+            for nestation in ("nested", "non-nested", "special-chars")
         ]
     )
 
-    entities = [
+    # Create entities
+    for entity in [
         # Non-nested entities
         {
             "worker_version": "non-nested-id",
@@ -266,8 +268,26 @@ def mock_database(tmp_path_factory):
             "name": "us",
             "offset": 43,
         },
-    ]
-    for entity in entities:
+        # Special characters
+        {
+            "worker_version": "special-chars-id",
+            "type": "Arkindex's entity",
+            "name": "great",
+            "offset": 4,
+        },
+        {
+            "worker_version": "special-chars-id",
+            "type": '"Name" (1)',
+            "name": "Charles",
+            "offset": 15,
+        },
+        {
+            "worker_version": "special-chars-id",
+            "type": "Person /!\\",
+            "name": "us",
+            "offset": 43,
+        },
+    ]:
         create_transcription_entity(transcription=transcription, **entity)
 
     return database_path
diff --git a/tests/data/entities.yml b/tests/data/entities.yml
index c690cbc0..4bf7513a 100644
--- a/tests/data/entities.yml
+++ b/tests/data/entities.yml
@@ -1,5 +1,8 @@
 ---
 entities:
+- '"Name" (1)'
+- Arkindex's entity
+- Person /!\
 - adj
 - birthdate
 - firstname
diff --git a/tests/data/tokens/end_tokens.yml b/tests/data/tokens/end_tokens.yml
index c55ca994..c660b778 100644
--- a/tests/data/tokens/end_tokens.yml
+++ b/tests/data/tokens/end_tokens.yml
@@ -1,22 +1,31 @@
 ---
-adj:
+'"Name" (1)':
   start: Ⓐ
   end: Ⓑ
-birthdate:
+Arkindex's entity:
   start: Ⓒ
   end: Ⓓ
-firstname:
+Person /!\:
   start: Ⓔ
   end: Ⓕ
-fullname:
+adj:
   start: Ⓖ
   end: Ⓗ
-name:
+birthdate:
   start: Ⓘ
   end: Ⓙ
-person:
+firstname:
   start: Ⓚ
   end: Ⓛ
-surname:
+fullname:
   start: Ⓜ
   end: Ⓝ
+name:
+  start: Ⓞ
+  end: Ⓟ
+person:
+  start: Ⓠ
+  end: Ⓡ
+surname:
+  start: Ⓢ
+  end: Ⓣ
diff --git a/tests/data/tokens/no_end_tokens.yml b/tests/data/tokens/no_end_tokens.yml
index 49ab427c..a1959726 100644
--- a/tests/data/tokens/no_end_tokens.yml
+++ b/tests/data/tokens/no_end_tokens.yml
@@ -1,22 +1,31 @@
 ---
-adj:
+'"Name" (1)':
   start: Ⓐ
   end: ''
-birthdate:
+Arkindex's entity:
   start: Ⓑ
   end: ''
-firstname:
+Person /!\:
   start: Ⓒ
   end: ''
-fullname:
+adj:
   start: Ⓓ
   end: ''
-name:
+birthdate:
   start: Ⓔ
   end: ''
-person:
+firstname:
   start: Ⓕ
   end: ''
-surname:
+fullname:
   start: Ⓖ
   end: ''
+name:
+  start: Ⓗ
+  end: ''
+person:
+  start: Ⓘ
+  end: ''
+surname:
+  start: Ⓙ
+  end: ''
diff --git a/tests/test_extract.py b/tests/test_extract.py
index 1bf92c3a..733449a5 100644
--- a/tests/test_extract.py
+++ b/tests/test_extract.py
@@ -478,6 +478,12 @@ def test_extract_transcription_no_translation(mock_database, tokens):
             "<root><fullname><name>Charles</name> III</fullname>\n<person>us</person></root>",
             ["\n", " "],
         ),
+        # Special characters in entities
+        (
+            "special-chars-id",
+            "<root>The <Arkindex_s_entity>great</Arkindex_s_entity> king <_Name_1_>Charles</_Name_1_> III has eaten \nwith <Person_>us</Person_>.</root>",
+            None,
+        ),
     ),
 )
 def test_entities_to_xml(mock_database, nestation, xml_output, separators):
@@ -488,7 +494,15 @@ def test_entities_to_xml(mock_database, nestation, xml_output, separators):
             predictions=get_transcription_entities(
                 transcription_id="tr-with-entities",
                 entity_worker_versions=[nestation],
-                supported_types=["name", "fullname", "person", "adj"],
+                supported_types=[
+                    "name",
+                    "fullname",
+                    "person",
+                    "adj",
+                    "Arkindex's entity",
+                    '"Name" (1)',
+                    "Person /!\\",
+                ],
             ),
             entity_separators=separators,
         )
-- 
GitLab