diff --git a/dan/datasets/extract/utils.py b/dan/datasets/extract/utils.py index 538e6ef20925e11a5f57df0b0e9f3ea86ab3ff03..cf6ae8896446e0d28e26da428463033d8b6533dd 100644 --- a/dan/datasets/extract/utils.py +++ b/dan/datasets/extract/utils.py @@ -17,10 +17,13 @@ from dan.utils import EntityType, LMTokenMapping logger = logging.getLogger(__name__) -# replace \t with regular space and consecutive spaces +# Replace \t with regular space and consecutive spaces TRIM_SPACE_REGEX = re.compile(r"[\t ]+") TRIM_RETURN_REGEX = re.compile(r"[\r\n]+") +# Remove invalid characters to build valid XML tag name +SLUG_PATTERN = re.compile(r"[\W]+") + # Some characters are encoded in XML but we don't want them encoded in the end ENCODING_MAP = { " ": "\r", @@ -174,9 +177,9 @@ class Tokenizer: def slugify(text: str): """ - Replace space in text to underscores to use it as XML tag. + Replace invalid characters in text to underscores to use it as XML tag. """ - return text.replace(" ", "_") + return SLUG_PATTERN.sub("_", text) def get_translation_map(tokens: Dict[str, EntityType]) -> Dict[str, str] | None: diff --git a/tests/conftest.py b/tests/conftest.py index f6f4b36a2cb1fbeacca8550db64958f2a74540bc..fd2a6ddb5fb42155b97e8258149ca78f3cc92dcd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -213,6 +213,7 @@ def mock_database(tmp_path_factory): element=Element.select().first(), ) + # Create worker version WorkerVersion.bulk_create( [ WorkerVersion( @@ -223,11 +224,12 @@ def mock_database(tmp_path_factory): revision="main", type="worker", ) - for nestation in ("nested", "non-nested") + for nestation in ("nested", "non-nested", "special-chars") ] ) - entities = [ + # Create entities + for entity in [ # Non-nested entities { "worker_version": "non-nested-id", @@ -266,8 +268,26 @@ def mock_database(tmp_path_factory): "name": "us", "offset": 43, }, - ] - for entity in entities: + # Special characters + { + "worker_version": "special-chars-id", + "type": "Arkindex's entity", + 
+                "name": "great", +                "offset": 4, +            }, +            { +                "worker_version": "special-chars-id", +                "type": '"Name" (1)', +                "name": "Charles", +                "offset": 15, +            }, +            { +                "worker_version": "special-chars-id", +                "type": "Person /!\\", +                "name": "us", +                "offset": 43, +            }, +        ]: create_transcription_entity(transcription=transcription, **entity) return database_path diff --git a/tests/data/entities.yml b/tests/data/entities.yml index c690cbc0d122060622d8172ef1c57ff0700765ba..4bf7513a40cbeec0b6ce5367fcf0925848277d79 100644 --- a/tests/data/entities.yml +++ b/tests/data/entities.yml @@ -1,5 +1,8 @@ --- entities: +- '"Name" (1)' +- Arkindex's entity +- Person /!\ - adj - birthdate - firstname diff --git a/tests/data/tokens/end_tokens.yml b/tests/data/tokens/end_tokens.yml index c55ca9946d22e52e0259045727f988240ba8c812..c660b7788fc5fafd48d197be29a55b5dd10e21fa 100644 --- a/tests/data/tokens/end_tokens.yml +++ b/tests/data/tokens/end_tokens.yml @@ -1,22 +1,31 @@ --- -adj: +'"Name" (1)': start: Ⓐ end: Ⓑ -birthdate: +Arkindex's entity: start: Ⓒ end: Ⓓ -firstname: +Person /!\: start: Ⓔ end: Ⓕ -fullname: +adj: start: Ⓖ end: Ⓗ -name: +birthdate: start: Ⓘ end: Ⓙ -person: +firstname: start: Ⓚ end: Ⓛ-surname: +fullname: start: Ⓜ end: Ⓝ +name: + start: Ⓞ + end: Ⓟ +person: + start: Ⓠ + end: Ⓡ +surname: + start: Ⓢ + end: Ⓣ diff --git a/tests/data/tokens/no_end_tokens.yml b/tests/data/tokens/no_end_tokens.yml index 49ab427c21e83e276130886c15fec8596e2fd7fb..a19597267d1952aad03b58c66dafa5fe43e5f52b 100644 --- a/tests/data/tokens/no_end_tokens.yml +++ b/tests/data/tokens/no_end_tokens.yml @@ -1,22 +1,31 @@ --- -adj: +'"Name" (1)': start: Ⓐ end: '' -birthdate: +Arkindex's entity: start: Ⓑ end: '' -firstname: +Person /!\: start: Ⓒ end: '' -fullname: +adj: start: Ⓓ end: '' -name: +birthdate: start: Ⓔ end: '' -person: +firstname: start: Ⓕ end: '' -surname: +fullname: start: Ⓖ end: '' +name: + start: Ⓗ + end: '' +person: + start: Ⓘ + end: '' +surname: + start: Ⓙ + end: '' diff --git a/tests/test_extract.py 
b/tests/test_extract.py index 1bf92c3ac1e318a51a15e421408e74d5db83e3eb..733449a56975c2865716c7394c1cc10c9e62df16 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -478,6 +478,12 @@ def test_extract_transcription_no_translation(mock_database, tokens): "<root><fullname><name>Charles</name> III</fullname>\n<person>us</person></root>", ["\n", " "], ), + # Special characters in entities + ( + "special-chars-id", + "<root>The <Arkindex_s_entity>great</Arkindex_s_entity> king <_Name_1_>Charles</_Name_1_> III has eaten \nwith <Person_>us</Person_>.</root>", + None, + ), ), ) def test_entities_to_xml(mock_database, nestation, xml_output, separators): @@ -488,7 +494,15 @@ def test_entities_to_xml(mock_database, nestation, xml_output, separators): predictions=get_transcription_entities( transcription_id="tr-with-entities", entity_worker_versions=[nestation], - supported_types=["name", "fullname", "person", "adj"], + supported_types=[ + "name", + "fullname", + "person", + "adj", + "Arkindex's entity", + '"Name" (1)', + "Person /!\\", + ], ), entity_separators=separators, )