Skip to content
Snippets Groups Projects

Support subword and word language models

Merged: Solene Tarride requested to merge subword-and-word-lm into main
All threads resolved!
3 files
+ 39
- 22
Compare changes
  • Side-by-side
  • Inline
Files
3
@@ -36,13 +36,13 @@ from dan.datasets.extract.utils import (
normalize_linebreaks,
normalize_spaces,
)
from dan.utils import LM_MAPPING, EntityType, parse_tokens
from line_image_extractor.extractor import extract
from line_image_extractor.image_utils import (
BoundingBox,
Extraction,
polygon_to_bbox,
)
from dan.utils import EntityType, LMTokenMapping, parse_tokens
from line_image_extractor.extractor import extract
IMAGES_DIR = "images" # Subpath to the images directory.
LANGUAGE_DIR = "language_model" # Subpath to the language model directory.
@@ -281,7 +281,7 @@ class ArkindexExtractor:
"""
return " ".join(
[
LM_MAPPING[token] if token in LM_MAPPING else token
self.mapping.encode[token] if token in self.mapping else token
for token in list(text.strip())
]
)
@@ -370,14 +370,14 @@ class ArkindexExtractor:
"""
for token in sorted(list(self.charset)):
assert (
token not in LM_MAPPING.values()
token not in self.mapping.encode.values()
), f"Special token {token} is reserved for language modeling."
self.language_tokens.append(
LM_MAPPING[token]
) if token in LM_MAPPING else self.language_tokens.append(token)
self.mapping.encode[token]
) if token in self.mapping.encode else self.language_tokens.append(token)
# Add the special blank token
self.language_tokens.append(LM_MAPPING["<ctc>"])
self.language_tokens.append(self.mapping.ctc.encoded)
# Build lexicon
assert all(
Loading