Commit 0f64242b authored by Solene Tarride

Map unknown characters

parent 6cf23a2d
This commit is part of merge request !287: Support subword and word language models.
@@ -371,8 +371,11 @@ class ArkindexExtractor:
             text.replace(self.mapping.linebreak.display, self.mapping.space.display)
             for text in self.data["train"].values()
         ]
         tokenizer = Tokenizer(
             training_corpus=train_corpus,
+            charset=self.language_tokens,
+            unknown_token=self.unknown_token,
             outdir=self.output / "language_model",
             mapping=self.mapping,
             tokens=self.tokens,
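
For orientation, here is a minimal self-contained sketch of the extended constructor call, using a stub dataclass limited to the fields this commit touches; the charset and unknown-token values are hypothetical stand-ins for self.language_tokens and self.unknown_token:

from dataclasses import dataclass
from typing import List

@dataclass
class TokenizerSketch:
    # Stub mirroring only the fields added in this commit; the real
    # Tokenizer also carries outdir, mapping and tokens.
    training_corpus: List[str]
    charset: List[str]
    unknown_token: str

tokenizer = TokenizerSketch(
    training_corpus=["a b c", "c a b"],
    charset=["a", "b", "c", " "],  # stands in for self.language_tokens
    unknown_token="⁇",             # stands in for self.unknown_token
)
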
@@ -131,9 +131,7 @@ def get_vocabulary(tokenized_text: List[str]) -> set[str]:
     Compute set of vocabulary from tokenized text.
     :param tokenized_text: List of tokenized text.
     """
-    return sorted(
-        set([token for doc in tokenized_text for token in doc.split() if token != ""])
-    )
+    return sorted(set([token for doc in tokenized_text for token in doc.split()]))
 
 
 @dataclass
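
This simplification is safe because str.split() without a separator splits on runs of whitespace and never yields empty strings, so the removed if token != "" guard was dead code:

# str.split() with no argument collapses whitespace runs and drops
# empty strings, which makes the removed filter redundant.
assert "a  b \n c".split() == ["a", "b", "c"]
assert "".split() == []
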
@@ -148,6 +146,8 @@ class Tokenizer:
     """
     training_corpus: List[str]
+    charset: List[str]
+    unknown_token: str
     outdir: Path
     mapping: LMTokenMapping
     tokens: Optional[EntityType] = None
@@ -225,7 +225,11 @@ class Tokenizer:
         Tokenize text into a string of space-separated characters.
         :param text: Text to be tokenized.
         """
-        return " ".join(self.encode(list(text)))
+        return " ".join(
+            self.encode(
+                [char if char in self.charset else self.unknown_token for char in text]
+            )
+        )
 
     def encode(self, text: List[str]) -> List[str]:
         """
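
This hunk is the heart of the commit: before encoding, every character that is not in the known charset is replaced with the unknown token. A minimal sketch of that mapping step in isolation, with a made-up charset and unknown token and the encode step elided:

# Hypothetical charset and unknown token, for illustration only.
charset = {"a", "b", "c", " "}
unknown_token = "⁇"

def map_unknown(text: str) -> list:
    # Replace characters the language model has never seen with the
    # unknown token, mirroring the list comprehension in char_tokenize.
    return [char if char in charset else unknown_token for char in text]

print(" ".join(map_unknown("ab!c")))  # a b ⁇ c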