Commit 5a228234 authored by Solene Tarride

Update docstring

parent 0b1af313
This commit is part of merge request !287.
@@ -124,8 +124,12 @@ def get_bbox(polygon: List[List[int]]) -> str:
 class Tokenizer:
     """
-    A multi-level tokenizer (char, subword, word)
-    Subword tokenizer is trained using sentencepiece.
+    A multi-level tokenizer (char, subword, word), where the subword tokenizer is trained using sentencepiece.
+    :param training_corpus: List of training text.
+    :param outdir: Path to save the subword tokenizer.
+    :param mapping: Mapping between displayed and encoded versions of special characters.
+    :param tokens: Start and end tokens used to represent named entities.
+    :param subword_vocab_size: Size of the vocabulary to use to train the subword tokenizer.
     """
     def __init__(
@@ -169,7 +173,7 @@ class Tokenizer:
         corpus_file = Path(self.outdir / "tmp.txt")
         corpus_file.write_text("\n".join(self.corpus))
-        # Train the tokenizer and load it
+        # Train the tokenizer
         logger.info("Training sentencepiece model for subword tokenization")
         spm.SentencePieceTrainer.train(
             input=str(corpus_file),
@@ -178,27 +182,24 @@ class Tokenizer:
             user_defined_symbols=self.special_tokens,
         )
-        # Delete the corpus file
+        # Delete the corpus file and load the model
         corpus_file.unlink()
-        # Load the model and return it
         return spm.SentencePieceProcessor(model_file=f"{self.prefix}.model")
-    def subword_tokenize(self, text: str) -> List[str]:
+    def subword_tokenize(self, text: str) -> str:
         """
         Tokenize into subwords. Sampling is disabled to ensure reproducibility.
         """
         tokens = self.sentencepiece_model.encode(text, out_type=str)
         # Return encoded tokenized text
         return " ".join(["".join(self.encode(subword)) for subword in tokens])
-    def word_tokenize(self, text: str) -> List[str]:
+    def word_tokenize(self, text: str) -> str:
         """
-        Tokenize text into words
-        Spaces (⎵) and NER tokens are considered as words.
+        Tokenize text into a string of space-separated words. Spaces (⎵) and NER tokens are considered as words.
+        :param text: Text to be tokenized.
         """
         words = ["".join(self.encode(word)) for word in wordpunct_tokenize(text)]
-        words = " ".join(
+        return " ".join(
             [
                 word + f" {self.mapping.space.encoded}"
                 if (i != len(words) - 1 and word not in self.ner_tokens)
@@ -206,16 +207,17 @@ class Tokenizer:
                 for i, word in enumerate(words)
             ]
         )
-        return words
-    def char_tokenize(self, text: str) -> List[str]:
+    def char_tokenize(self, text: str) -> str:
         """
-        Tokenize text into characters
+        Tokenize text into a string of space-separated characters.
+        :param text: Text to be tokenized.
         """
         return " ".join(self.encode(list(text)))
     def encode(self, text: List[str]) -> List[str]:
         """
-        Encode special tokens
+        Encode special tokens.
+        :param text: Text to be encoded.
         """
         return map(self.mapping.encode_token, text)
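
For readers unfamiliar with sentencepiece, the sketch below reproduces, outside the class, the round trip that the training code in this diff performs: write the corpus to a temporary file, train a subword model, delete the file, and reload the resulting .model. The toy corpus, file names, vocab_size and the ⎵ symbol used here are illustrative assumptions, not values taken from this repository.

import sentencepiece as spm
from pathlib import Path

# Illustrative toy corpus; the real class receives its text via training_corpus.
corpus = [
    "the quick brown fox jumps over the lazy dog",
    "pack my box with five dozen liquor jugs",
    "how vexingly quick daft zebras jump",
]

# Write the corpus to a temporary file, as the class does with outdir / "tmp.txt".
corpus_file = Path("tmp.txt")
corpus_file.write_text("\n".join(corpus))

spm.SentencePieceTrainer.train(
    input=str(corpus_file),
    model_prefix="subword_tokenizer",  # produces subword_tokenizer.model / .vocab
    vocab_size=40,                     # assumed value; keep small for a tiny corpus
    hard_vocab_limit=False,            # treat vocab_size as a soft limit so a tiny corpus cannot fail training
    user_defined_symbols=["⎵"],        # symbols that must never be split, like self.special_tokens
)

# Delete the corpus file and load the model, mirroring the updated comment above.
corpus_file.unlink()
model = spm.SentencePieceProcessor(model_file="subword_tokenizer.model")

# Subword tokenization with sampling disabled (the default), as in Tokenizer.subword_tokenize.
print(model.encode("the quick brown fox", out_type=str))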