Manon Blanco
--- a/dan/datasets/extract/extract.py

+ 3

− 3
+++ b/dan/datasets/extract/extract.py

+ 3

− 3
 @@ -277,7 +277,7 @@

        if self.unknown_token in text:
            raise UnknownTokenInText(element_id=element.id)

        image_path = Path(self.output, IMAGES_DIR, split, element.id).with_suffix(
            self.image_extension
        )
 @@ -293,7 +293,7 @@
                }
            )

-        self.data[split][str(image_path)] = self.format_text(
+        text = self.format_text(
            text,
            # Do not replace unknown characters in train split
            charset=self.charset if split != TRAIN_NAME else None,
        )

-        if split == TRAIN_NAME:
-            self.charset = self.charset.union(set(text))
+        self.data[split][str(image_path)] = text
+        self.charset = self.charset.union(set(text))

    def process_parent(
        self,