Skip to content
Snippets Groups Projects

Charset should only include training characters

Merged Manon Blanco requested to merge training-charset into main
2 files
+6 -6
Compare changes
  • Side-by-side
  • Inline
Files
2
@@ -277,7 +277,7 @@
if self.unknown_token in text:
raise UnknownTokenInText(element_id=element.id)
image_path = Path(self.output, IMAGES_DIR, split, element.id).with_suffix(
self.image_extension
)
@@ -293,7 +293,7 @@
 }
 )
-self.data[split][str(image_path)] = self.format_text(
+text = self.format_text(
     text,
     # Do not replace unknown characters in train split
     charset=self.charset if split != TRAIN_NAME else None,
 )
+if split == TRAIN_NAME:
+    self.charset = self.charset.union(set(text))
+self.data[split][str(image_path)] = text
-self.charset = self.charset.union(set(text))
def process_parent(
self,
Loading