Skip to content
Snippets Groups Projects
Commit 1a59fadf authored by Manon Blanco's avatar Manon Blanco Committed by Yoann Schneider
Browse files

Remove invalid characters to build valid XML tag

parent 88024ccd
No related branches found
No related tags found
1 merge request!350Remove invalid characters to build valid XML tag
...@@ -17,10 +17,13 @@ from dan.utils import EntityType, LMTokenMapping ...@@ -17,10 +17,13 @@ from dan.utils import EntityType, LMTokenMapping
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# replace \t with regular space and consecutive spaces # Replace \t with regular space and consecutive spaces
TRIM_SPACE_REGEX = re.compile(r"[\t ]+") TRIM_SPACE_REGEX = re.compile(r"[\t ]+")
TRIM_RETURN_REGEX = re.compile(r"[\r\n]+") TRIM_RETURN_REGEX = re.compile(r"[\r\n]+")
# Remove invalid characters to build a valid XML tag name
SLUG_PATTERN = re.compile(r"[\W]+")
# Some characters are encoded in XML but we don't want them encoded in the end # Some characters are encoded in XML but we don't want them encoded in the end
ENCODING_MAP = { ENCODING_MAP = {
"
": "\r", "
": "\r",
...@@ -174,9 +177,9 @@ class Tokenizer: ...@@ -174,9 +177,9 @@ class Tokenizer:
def slugify(text: str): def slugify(text: str):
""" """
Replace spaces in text with underscores to use it as an XML tag. Replace invalid characters in text with underscores to use it as an XML tag.
""" """
return text.replace(" ", "_") return SLUG_PATTERN.sub("_", text)
def get_translation_map(tokens: Dict[str, EntityType]) -> Dict[str, str] | None: def get_translation_map(tokens: Dict[str, EntityType]) -> Dict[str, str] | None:
......
...@@ -213,6 +213,7 @@ def mock_database(tmp_path_factory): ...@@ -213,6 +213,7 @@ def mock_database(tmp_path_factory):
element=Element.select().first(), element=Element.select().first(),
) )
# Create worker version
WorkerVersion.bulk_create( WorkerVersion.bulk_create(
[ [
WorkerVersion( WorkerVersion(
...@@ -223,11 +224,12 @@ def mock_database(tmp_path_factory): ...@@ -223,11 +224,12 @@ def mock_database(tmp_path_factory):
revision="main", revision="main",
type="worker", type="worker",
) )
for nestation in ("nested", "non-nested") for nestation in ("nested", "non-nested", "special-chars")
] ]
) )
entities = [ # Create entities
for entity in [
# Non-nested entities # Non-nested entities
{ {
"worker_version": "non-nested-id", "worker_version": "non-nested-id",
...@@ -266,8 +268,26 @@ def mock_database(tmp_path_factory): ...@@ -266,8 +268,26 @@ def mock_database(tmp_path_factory):
"name": "us", "name": "us",
"offset": 43, "offset": 43,
}, },
] # Special characters
for entity in entities: {
"worker_version": "special-chars-id",
"type": "Arkindex's entity",
"name": "great",
"offset": 4,
},
{
"worker_version": "special-chars-id",
"type": '"Name" (1)',
"name": "Charles",
"offset": 15,
},
{
"worker_version": "special-chars-id",
"type": "Person /!\\",
"name": "us",
"offset": 43,
},
]:
create_transcription_entity(transcription=transcription, **entity) create_transcription_entity(transcription=transcription, **entity)
return database_path return database_path
......
--- ---
entities: entities:
- '"Name" (1)'
- Arkindex's entity
- Person /!\
- adj - adj
- birthdate - birthdate
- firstname - firstname
......
--- ---
adj: '"Name" (1)':
start: start:
end: end:
birthdate: Arkindex's entity:
start: start:
end: end:
firstname: Person /!\:
start: start:
end: end:
fullname: adj:
start: start:
end: end:
name: birthdate:
start: start:
end: end:
person: firstname:
start: start:
end: end:
surname: fullname:
start: start:
end: end:
name:
start:
end:
person:
start:
end:
surname:
start:
end:
--- ---
adj: '"Name" (1)':
start: start:
end: '' end: ''
birthdate: Arkindex's entity:
start: start:
end: '' end: ''
firstname: Person /!\:
start: start:
end: '' end: ''
fullname: adj:
start: start:
end: '' end: ''
name: birthdate:
start: start:
end: '' end: ''
person: firstname:
start: start:
end: '' end: ''
surname: fullname:
start: start:
end: '' end: ''
name:
start:
end: ''
person:
start:
end: ''
surname:
start:
end: ''
...@@ -478,6 +478,12 @@ def test_extract_transcription_no_translation(mock_database, tokens): ...@@ -478,6 +478,12 @@ def test_extract_transcription_no_translation(mock_database, tokens):
"<root><fullname><name>Charles</name> III</fullname>\n<person>us</person></root>", "<root><fullname><name>Charles</name> III</fullname>\n<person>us</person></root>",
["\n", " "], ["\n", " "],
), ),
# Special characters in entities
(
"special-chars-id",
"<root>The <Arkindex_s_entity>great</Arkindex_s_entity> king <_Name_1_>Charles</_Name_1_> III has eaten \nwith <Person_>us</Person_>.</root>",
None,
),
), ),
) )
def test_entities_to_xml(mock_database, nestation, xml_output, separators): def test_entities_to_xml(mock_database, nestation, xml_output, separators):
...@@ -488,7 +494,15 @@ def test_entities_to_xml(mock_database, nestation, xml_output, separators): ...@@ -488,7 +494,15 @@ def test_entities_to_xml(mock_database, nestation, xml_output, separators):
predictions=get_transcription_entities( predictions=get_transcription_entities(
transcription_id="tr-with-entities", transcription_id="tr-with-entities",
entity_worker_versions=[nestation], entity_worker_versions=[nestation],
supported_types=["name", "fullname", "person", "adj"], supported_types=[
"name",
"fullname",
"person",
"adj",
"Arkindex's entity",
'"Name" (1)',
"Person /!\\",
],
), ),
entity_separators=separators, entity_separators=separators,
) )
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment