Skip to content
Snippets Groups Projects

Support subword and word language models

Merged Solene Tarride requested to merge subword-and-word-lm into main
7 files
+ 29
20
Compare changes
  • Side-by-side
  • Inline
Files
7
@@ -279,12 +279,7 @@ class ArkindexExtractor:
"""
Format text for the language model. Return the text tokenized at character-level.
"""
return " ".join(
[
self.mapping.encode[token] if token in self.mapping.encode else token
for token in list(text.strip())
]
)
return " ".join(map(self.mapping.encode_token, list(text.strip())))
def process_element(
self,
@@ -323,6 +318,8 @@ class ArkindexExtractor:
self.data[split][str(image_path)] = text
self.charset = self.charset.union(set(text))
# Language model should be built using only text from the training set
if split == "train":
self.language_corpus.append(self.format_text_language_model(text))
Loading