
Support subword and word language models

Merged Solene Tarride requested to merge subword-and-word-lm into main
All threads resolved!
2 files changed: +18 −28
@@ -124,8 +124,12 @@ def get_bbox(polygon: List[List[int]]) -> str:
 class Tokenizer:
     """
-    A multi-level tokenizer (char, subword, word)
-    Subword tokenizer is trained using sentencepiece.
+    A multi-level tokenizer (char, subword, word), where the subword tokenizer is trained using sentencepiece.
+    :param training_corpus: List of training text.
+    :param outdir: Path to save the subword tokenizer.
+    :param mapping: Mapping between displayed and encoded versions of special characters.
+    :param tokens: Start and end tokens used to represent named entities.
+    :param subword_vocab_size: Size of the vocabulary used to train the subword tokenizer.
     """
     def __init__(
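
For reference, a minimal usage sketch of the class as documented above. The full __init__ signature is cut off by this diff, so the keyword names below simply mirror the docstring, and the mapping and ner_tokens values are placeholders:

    from pathlib import Path

    # Hypothetical instantiation: argument names mirror the docstring; the real
    # signature is not fully visible in this diff.
    tokenizer = Tokenizer(
        training_corpus=["his majesty the king", "the king of spain"],
        outdir=Path("output/tokenizer"),  # where the sentencepiece model is written
        mapping=mapping,                  # displayed -> encoded special characters (e.g. space -> ⎵)
        tokens=ner_tokens,                # start/end tokens marking named entities
        subword_vocab_size=1000,
    )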
@@ -169,7 +173,7 @@ class Tokenizer:
         corpus_file = Path(self.outdir / "tmp.txt")
         corpus_file.write_text("\n".join(self.corpus))
-        # Train the tokenizer and load it
+        # Train the tokenizer
         logger.info("Training sentencepiece model for subword tokenization")
         spm.SentencePieceTrainer.train(
             input=str(corpus_file),
@@ -178,27 +182,24 @@ class Tokenizer:
             user_defined_symbols=self.special_tokens,
         )
-        # Delete the corpus file
+        # Delete the corpus file and load the model
         corpus_file.unlink()
-        # Load the model and return it
         return spm.SentencePieceProcessor(model_file=f"{self.prefix}.model")
 
-    def subword_tokenize(self, text: str) -> List[str]:
+    def subword_tokenize(self, text: str) -> str:
         """
         Tokenize into subwords. Sampling is disabled to ensure reproducibility.
         """
         tokens = self.sentencepiece_model.encode(text, out_type=str)
-        # Return encoded tokenized text
         return " ".join(["".join(self.encode(subword)) for subword in tokens])
 
-    def word_tokenize(self, text: str) -> List[str]:
+    def word_tokenize(self, text: str) -> str:
         """
-        Tokenize text into words
-        Spaces (⎵) and NER tokens are considered as words.
+        Tokenize text into a string of space-separated words. Spaces (⎵) and NER tokens are considered as words.
         :param text: Text to be tokenized.
         """
         words = ["".join(self.encode(word)) for word in wordpunct_tokenize(text)]
-        words = " ".join(
+        return " ".join(
             [
                 word + f" {self.mapping.space.encoded}"
                 if (i != len(words) - 1 and word not in self.ner_tokens)
@@ -206,16 +207,17 @@ class Tokenizer:
                 for i, word in enumerate(words)
             ]
         )
-        return words
 
-    def char_tokenize(self, text: str) -> List[str]:
+    def char_tokenize(self, text: str) -> str:
         """
-        Tokenize text into characters
+        Tokenize text into a string of space-separated characters.
+        :param text: Text to be tokenized.
         """
         return " ".join(self.encode(list(text)))
 
     def encode(self, text: List[str]) -> List[str]:
         """
-        Encode special tokens
+        Encode special tokens.
+        :param text: Text to be encoded.
         """
         return map(self.mapping.encode_token, text)
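
With these changes, all three tokenize methods return a single space-separated string rather than a list of tokens, the line format that n-gram language-modeling toolkits typically consume. A sketch of the expected behaviour, reusing the hypothetical tokenizer from the instantiation sketch above; the exact outputs depend on the trained sentencepiece model and on the mapping, so they are illustrative only:

    text = "the king"

    tokenizer.char_tokenize(text)
    # e.g. "t h e ⎵ k i n g": every character, with the space encoded as ⎵

    tokenizer.word_tokenize(text)
    # e.g. "the ⎵ king": words from wordpunct_tokenize, joined by the encoded space

    tokenizer.subword_tokenize(text)
    # e.g. "▁the ▁k ing": subword pieces from the sentencepiece model; output is
    # deterministic because sampling is disabled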