Fix decoding issue on transkribus XML import
Refs https://gitlab.com/arkindex/requests/-/issues/627
These files trigger the issue:
Download these files into a folder, write crash.py as shown below,
create a folder in a corpus you own on preprod or on your own instance, update the references in the script (such as corpus_id and folder_id) to match, and run it.
When running the following commands (the contents of crash.py are listed right after them), the import crashes:
export TRANSKRIBUS_WORKER_VERSION=71a8170fef50d399e4b7268819e9a7104d296e98
export ARKINDEX_API_URL=https://preprod.arkindex.teklia.com/api/v1
export ARKINDEX_API_TOKEN=..
python crash.py
from arkindex_tasks.import_transkribus import TranskribusElement
from zipfile import ZipFile
te = TranskribusElement(
archive=ZipFile("test.zip"),
image_filename="0019_3603842.png",
xml_filename="0019_3603842.xml",
page_id="0019_3603842",
page_number=1,
corpus_id="c36dae80-6ab7-4890-9f84-f5f657eb552d",
folder_id="ba8f7b3b-12c9-4e2b-b39a-743c780950dd",
element_type={"slug": "page"},
paragraph_type={"slug": "paragraph"},
line_type={"slug": "text_line"},
)
te.run()
Full output and traceback:
[INFO] Numpy was not imported, continuing without requires()
[INFO] Extracting image 0019_3603842.png
[INFO] Creating image for page 0019_3603842
[INFO] Validating image 8dd6adec-79d0-42cb-baa4-85cb96c16297
[INFO] Creating page 0019_3603842
[INFO] Importing transcript 0019_3603842.xml to page f305d000-2fbb-4461-8ce2-2eec4ee12199
Traceback (most recent call last):
File "crash.py", line 16, in <module>
te.run()
File "/home/bastien/dev/ark/tasks/arkindex_tasks/import_transkribus.py", line 233, in run
return self.upload_transcriptions()
File "/home/bastien/dev/ark/tasks/arkindex_tasks/import_transkribus.py", line 145, in upload_transcriptions
parser = PageXmlParser(etree.fromstring(xml_text), WORKER_VERSION)
File "/home/bastien/dev/ark/tasks/arkindex_tasks/pagexml.py", line 14, in __init__
self.pagexml_page = PageXmlPage(path_or_xml)
File "/home/bastien/Envs/tasks/lib/python3.8/site-packages/transkribus/pagexml.py", line 40, in __init__
super().__init__(path, namespaces=namespaces)
File "/home/bastien/Envs/tasks/lib/python3.8/site-packages/transkribus/xml.py", line 27, in __init__
self.data = self.parse()
File "/home/bastien/Envs/tasks/lib/python3.8/site-packages/transkribus/pagexml.py", line 164, in parse
"page": self.get_instance(PageElement, "page:Page"),
File "/home/bastien/Envs/tasks/lib/python3.8/site-packages/transkribus/xml.py", line 75, in get_instance
return cls(child, **kwargs)
File "/home/bastien/Envs/tasks/lib/python3.8/site-packages/transkribus/pagexml.py", line 40, in __init__
super().__init__(path, namespaces=namespaces)
File "/home/bastien/Envs/tasks/lib/python3.8/site-packages/transkribus/xml.py", line 27, in __init__
self.data = self.parse()
File "/home/bastien/Envs/tasks/lib/python3.8/site-packages/transkribus/pagexml.py", line 128, in parse
"text_regions": self.get_instance(TextRegion, "page:TextRegion", many=True),
File "/home/bastien/Envs/tasks/lib/python3.8/site-packages/transkribus/xml.py", line 73, in get_instance
return list(cls(item, **kwargs) for item in child)
File "/home/bastien/Envs/tasks/lib/python3.8/site-packages/transkribus/xml.py", line 73, in <genexpr>
return list(cls(item, **kwargs) for item in child)
File "/home/bastien/Envs/tasks/lib/python3.8/site-packages/transkribus/pagexml.py", line 40, in __init__
super().__init__(path, namespaces=namespaces)
File "/home/bastien/Envs/tasks/lib/python3.8/site-packages/transkribus/xml.py", line 27, in __init__
self.data = self.parse()
File "/home/bastien/Envs/tasks/lib/python3.8/site-packages/transkribus/pagexml.py", line 90, in parse
lines=self.get_instance(TextLine, "page:TextLine", many=True),
File "/home/bastien/Envs/tasks/lib/python3.8/site-packages/transkribus/xml.py", line 73, in get_instance
return list(cls(item, **kwargs) for item in child)
File "/home/bastien/Envs/tasks/lib/python3.8/site-packages/transkribus/xml.py", line 73, in <genexpr>
return list(cls(item, **kwargs) for item in child)
File "/home/bastien/Envs/tasks/lib/python3.8/site-packages/transkribus/pagexml.py", line 40, in __init__
super().__init__(path, namespaces=namespaces)
File "/home/bastien/Envs/tasks/lib/python3.8/site-packages/transkribus/xml.py", line 27, in __init__
self.data = self.parse()
File "/home/bastien/Envs/tasks/lib/python3.8/site-packages/transkribus/pagexml.py", line 72, in parse
data = super().parse()
File "/home/bastien/Envs/tasks/lib/python3.8/site-packages/transkribus/pagexml.py", line 64, in parse
"tags": Tag.build(self.get_text("@custom")),
File "/home/bastien/Envs/tasks/lib/python3.8/site-packages/transkribus/pagexml.py", line 34, in build
tags.append(cls(*match.groups()))
File "/home/bastien/Envs/tasks/lib/python3.8/site-packages/transkribus/pagexml.py", line 22, in __init__
prop_value = prop_value.encode("latin1").decode("unicode_escape")
UnicodeEncodeError: 'latin-1' codec can't encode character '\u2019' in position 31: ordinal not in range(256)