Skip to content
Snippets Groups Projects
Commit 1a59fadf authored by Manon Blanco's avatar Manon Blanco Committed by Yoann Schneider
Browse files

Remove invalid characters to build valid XML tag

parent 88024ccd
No related branches found
No related tags found
1 merge request!350Remove invalid characters to build valid XML tag
...@@ -17,10 +17,13 @@ from dan.utils import EntityType, LMTokenMapping ...@@ -17,10 +17,13 @@ from dan.utils import EntityType, LMTokenMapping
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# replace \t with regular space and consecutive spaces # Replace \t with regular space and consecutive spaces
TRIM_SPACE_REGEX = re.compile(r"[\t ]+") TRIM_SPACE_REGEX = re.compile(r"[\t ]+")
TRIM_RETURN_REGEX = re.compile(r"[\r\n]+") TRIM_RETURN_REGEX = re.compile(r"[\r\n]+")
# Remove invalid characters to build a valid XML tag name
SLUG_PATTERN = re.compile(r"[\W]+")
# Some characters are encoded in XML but we don't want them encoded in the end # Some characters are encoded in XML but we don't want them encoded in the end
ENCODING_MAP = { ENCODING_MAP = {
"
": "\r", "
": "\r",
...@@ -174,9 +177,9 @@ class Tokenizer: ...@@ -174,9 +177,9 @@ class Tokenizer:
def slugify(text: str): def slugify(text: str):
""" """
Replace spaces in text with underscores to use it as an XML tag. Replace invalid characters in text with underscores to use it as an XML tag.
""" """
return text.replace(" ", "_") return SLUG_PATTERN.sub("_", text)
def get_translation_map(tokens: Dict[str, EntityType]) -> Dict[str, str] | None: def get_translation_map(tokens: Dict[str, EntityType]) -> Dict[str, str] | None:
......
...@@ -213,6 +213,7 @@ def mock_database(tmp_path_factory): ...@@ -213,6 +213,7 @@ def mock_database(tmp_path_factory):
element=Element.select().first(), element=Element.select().first(),
) )
# Create worker version
WorkerVersion.bulk_create( WorkerVersion.bulk_create(
[ [
WorkerVersion( WorkerVersion(
...@@ -223,11 +224,12 @@ def mock_database(tmp_path_factory): ...@@ -223,11 +224,12 @@ def mock_database(tmp_path_factory):
revision="main", revision="main",
type="worker", type="worker",
) )
for nestation in ("nested", "non-nested") for nestation in ("nested", "non-nested", "special-chars")
] ]
) )
entities = [ # Create entities
for entity in [
# Non-nested entities # Non-nested entities
{ {
"worker_version": "non-nested-id", "worker_version": "non-nested-id",
...@@ -266,8 +268,26 @@ def mock_database(tmp_path_factory): ...@@ -266,8 +268,26 @@ def mock_database(tmp_path_factory):
"name": "us", "name": "us",
"offset": 43, "offset": 43,
}, },
] # Special characters
for entity in entities: {
"worker_version": "special-chars-id",
"type": "Arkindex's entity",
"name": "great",
"offset": 4,
},
{
"worker_version": "special-chars-id",
"type": '"Name" (1)',
"name": "Charles",
"offset": 15,
},
{
"worker_version": "special-chars-id",
"type": "Person /!\\",
"name": "us",
"offset": 43,
},
]:
create_transcription_entity(transcription=transcription, **entity) create_transcription_entity(transcription=transcription, **entity)
return database_path return database_path
......
--- ---
entities: entities:
- '"Name" (1)'
- Arkindex's entity
- Person /!\
- adj - adj
- birthdate - birthdate
- firstname - firstname
......
--- ---
adj: '"Name" (1)':
start: start:
end: end:
birthdate: Arkindex's entity:
start: start:
end: end:
firstname: Person /!\:
start: start:
end: end:
fullname: adj:
start: start:
end: end:
name: birthdate:
start: start:
end: end:
person: firstname:
start: start:
end: end:
surname: fullname:
start: start:
end: end:
name:
start:
end:
person:
start:
end:
surname:
start:
end:
--- ---
adj: '"Name" (1)':
start: start:
end: '' end: ''
birthdate: Arkindex's entity:
start: start:
end: '' end: ''
firstname: Person /!\:
start: start:
end: '' end: ''
fullname: adj:
start: start:
end: '' end: ''
name: birthdate:
start: start:
end: '' end: ''
person: firstname:
start: start:
end: '' end: ''
surname: fullname:
start: start:
end: '' end: ''
name:
start:
end: ''
person:
start:
end: ''
surname:
start:
end: ''
...@@ -478,6 +478,12 @@ def test_extract_transcription_no_translation(mock_database, tokens): ...@@ -478,6 +478,12 @@ def test_extract_transcription_no_translation(mock_database, tokens):
"<root><fullname><name>Charles</name> III</fullname>\n<person>us</person></root>", "<root><fullname><name>Charles</name> III</fullname>\n<person>us</person></root>",
["\n", " "], ["\n", " "],
), ),
# Special characters in entities
(
"special-chars-id",
"<root>The <Arkindex_s_entity>great</Arkindex_s_entity> king <_Name_1_>Charles</_Name_1_> III has eaten \nwith <Person_>us</Person_>.</root>",
None,
),
), ),
) )
def test_entities_to_xml(mock_database, nestation, xml_output, separators): def test_entities_to_xml(mock_database, nestation, xml_output, separators):
...@@ -488,7 +494,15 @@ def test_entities_to_xml(mock_database, nestation, xml_output, separators): ...@@ -488,7 +494,15 @@ def test_entities_to_xml(mock_database, nestation, xml_output, separators):
predictions=get_transcription_entities( predictions=get_transcription_entities(
transcription_id="tr-with-entities", transcription_id="tr-with-entities",
entity_worker_versions=[nestation], entity_worker_versions=[nestation],
supported_types=["name", "fullname", "person", "adj"], supported_types=[
"name",
"fullname",
"person",
"adj",
"Arkindex's entity",
'"Name" (1)',
"Person /!\\",
],
), ),
entity_separators=separators, entity_separators=separators,
) )
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment