Skip to content
Snippets Groups Projects
Commit 0f64242b authored by Solene Tarride's avatar Solene Tarride
Browse files

Map unknown characters

parent 6cf23a2d
No related branches found
No related tags found
1 merge request: !287 Support subword and word language models
......@@ -371,8 +371,11 @@ class ArkindexExtractor:
text.replace(self.mapping.linebreak.display, self.mapping.space.display)
for text in self.data["train"].values()
]
tokenizer = Tokenizer(
training_corpus=train_corpus,
charset=self.language_tokens,
unknown_token=self.unknown_token,
outdir=self.output / "language_model",
mapping=self.mapping,
tokens=self.tokens,
......
......@@ -131,9 +131,7 @@ def get_vocabulary(tokenized_text: List[str]) -> set[str]:
Compute set of vocabulary from tokenized text.
:param tokenized_text: List of tokenized text.
"""
return sorted(
set([token for doc in tokenized_text for token in doc.split() if token != ""])
)
return sorted(set([token for doc in tokenized_text for token in doc.split()]))
@dataclass
......@@ -148,6 +146,8 @@ class Tokenizer:
"""
training_corpus: List[str]
charset: List[str]
unknown_token: str
outdir: Path
mapping: LMTokenMapping
tokens: Optional[EntityType] = None
......@@ -225,7 +225,11 @@ class Tokenizer:
Tokenize text into a string of space-separated characters.
:param text: Text to be tokenized.
"""
return " ".join(self.encode(list(text)))
return " ".join(
self.encode(
[char if char in self.charset else self.unknown_token for char in text]
)
)
def encode(self, text: List[str]) -> List[str]:
"""
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment