diff --git a/dan/datasets/extract/arkindex.py b/dan/datasets/extract/arkindex.py
index 6b13c5d8ff1ece325f4c3d617a8ef856f5c9c979..88975536ce4a725b94e9ecafe3671d0d63d393f3 100644
--- a/dan/datasets/extract/arkindex.py
+++ b/dan/datasets/extract/arkindex.py
@@ -371,8 +371,11 @@ class ArkindexExtractor:
             text.replace(self.mapping.linebreak.display, self.mapping.space.display)
             for text in self.data["train"].values()
         ]
+
         tokenizer = Tokenizer(
             training_corpus=train_corpus,
+            charset=self.language_tokens,
+            unknown_token=self.unknown_token,
             outdir=self.output / "language_model",
             mapping=self.mapping,
             tokens=self.tokens,
diff --git a/dan/datasets/extract/utils.py b/dan/datasets/extract/utils.py
index f0371b6aed7d4912de048f0358e0c372c0bfd7ee..60e597eceb191e5c9c5c53e58838f4427dca00b2 100644
--- a/dan/datasets/extract/utils.py
+++ b/dan/datasets/extract/utils.py
@@ -131,9 +131,7 @@ def get_vocabulary(tokenized_text: List[str]) -> set[str]:
     Compute set of vocabulary from tokenzied text.
     :param tokenized_text: List of tokenized text.
     """
-    return sorted(
-        set([token for doc in tokenized_text for token in doc.split() if token != ""])
-    )
+    return sorted(set([token for doc in tokenized_text for token in doc.split()]))
 
 
 @dataclass
@@ -148,6 +146,8 @@ class Tokenizer:
     """
 
     training_corpus: List[str]
+    charset: List[str]
+    unknown_token: str
     outdir: Path
     mapping: LMTokenMapping
     tokens: Optional[EntityType] = None
@@ -225,7 +225,11 @@
         Tokenize text into a string of space-separated characters.
         :param text: Text to be tokenized.
         """
-        return " ".join(self.encode(list(text)))
+        return " ".join(
+            self.encode(
+                [char if char in self.charset else self.unknown_token for char in text]
+            )
+        )
 
     def encode(self, text: List[str]) -> List[str]:
         """