Solene Tarride
--- a/dan/datasets/extract/arkindex.py

+ 7

− 7
+++ b/dan/datasets/extract/arkindex.py

+ 7

− 7
 @@ -36,13 +36,13 @@ from dan.datasets.extract.utils import (
    normalize_linebreaks,
    normalize_spaces,
 )
-from dan.utils import LM_MAPPING, EntityType, parse_tokens
-from line_image_extractor.extractor import extract
 from line_image_extractor.image_utils import (
    BoundingBox,
    Extraction,
    polygon_to_bbox,
 )
+from dan.utils import EntityType, LMTokenMapping, parse_tokens
+from line_image_extractor.extractor import extract

 IMAGES_DIR = "images"  # Subpath to the images directory.
 LANGUAGE_DIR = "language_model"  # Subpath to the language model directory.
 @@ -281,7 +281,7 @@ class ArkindexExtractor:
        """
        return " ".join(
            [
-                LM_MAPPING[token] if token in LM_MAPPING else token
+                self.mapping.encode[token] if token in self.mapping.encode else token
                for token in list(text.strip())
            ]
        )
 @@ -370,14 +370,14 @@ class ArkindexExtractor:
        """
        for token in sorted(list(self.charset)):
            assert (
-                token not in LM_MAPPING.values()
+                token not in self.mapping.encode.values()
            ), f"Special token {token} is reserved for language modeling."
            self.language_tokens.append(
-                LM_MAPPING[token]
-            ) if token in LM_MAPPING else self.language_tokens.append(token)
+                self.mapping.encode[token]
+            ) if token in self.mapping.encode else self.language_tokens.append(token)

        # Add the special blank token
-        self.language_tokens.append(LM_MAPPING["<ctc>"])
+        self.language_tokens.append(self.mapping.ctc.encoded)

        # Build lexicon
        assert all(