diff --git a/dan/datasets/extract/utils.py b/dan/datasets/extract/utils.py
index 5cd8f5333f3043cb97fe9113e7c96c0731e15b1e..9ed48b583bce63590f89930fd9f3cfc33c9b9baa 100644
--- a/dan/datasets/extract/utils.py
+++ b/dan/datasets/extract/utils.py
@@ -124,8 +124,12 @@ def get_bbox(polygon: List[List[int]]) -> str:
 
 class Tokenizer:
     """
-    A multi-level tokenizer (char, subword, word)
-    Subword tokenizer is trained using sentencepiece.
+    A multi-level tokenizer (char, subword, word), where the subword tokenizer is trained using sentencepiece.
+    :param training_corpus: List of training texts.
+    :param outdir: Path to save the subword tokenizer.
+    :param mapping: Mapping between displayed and encoded versions of special characters.
+    :param tokens: Start and end tokens used to represent named entities.
+    :param subword_vocab_size: Size of the vocabulary to use to train the subword tokenizer.
     """
 
     def __init__(
@@ -169,7 +173,7 @@ class Tokenizer:
         corpus_file = Path(self.outdir / "tmp.txt")
         corpus_file.write_text("\n".join(self.corpus))
 
-        # Train the tokenizer and load it
+        # Train the tokenizer
         logger.info("Training sentencepiece model for subword tokenization")
         spm.SentencePieceTrainer.train(
             input=str(corpus_file),
@@ -178,27 +182,24 @@ class Tokenizer:
             user_defined_symbols=self.special_tokens,
         )
 
-        # Delete the corpus file
+        # Delete the corpus file and load the model
        corpus_file.unlink()
-
-        # Load the model and return it
         return spm.SentencePieceProcessor(model_file=f"{self.prefix}.model")
 
-    def subword_tokenize(self, text: str) -> List[str]:
+    def subword_tokenize(self, text: str) -> str:
         """
         Tokenize into subwords. Sampling is disabled to ensure reproducibility.
         """
         tokens = self.sentencepiece_model.encode(text, out_type=str)
-        # Return encoded tokenized text
         return " ".join(["".join(self.encode(subword)) for subword in tokens])
 
-    def word_tokenize(self, text: str) -> List[str]:
+    def word_tokenize(self, text: str) -> str:
         """
-        Tokenize text into words
-        Spaces (⎵) and NER tokens are considered as words.
+        Tokenize text into a string of space-separated words. Spaces (⎵) and NER tokens are considered as words.
+        :param text: Text to be tokenized.
         """
         words = ["".join(self.encode(word)) for word in wordpunct_tokenize(text)]
-        words = " ".join(
+        return " ".join(
             [
                 word + f" {self.mapping.space.encoded}"
                 if (i != len(words) - 1 and word not in self.ner_tokens)
@@ -206,16 +207,17 @@ class Tokenizer:
                 for i, word in enumerate(words)
             ]
         )
-        return words
 
-    def char_tokenize(self, text: str) -> List[str]:
+    def char_tokenize(self, text: str) -> str:
         """
-        Tokenize text into characters
+        Tokenize text into a string of space-separated characters.
+        :param text: Text to be tokenized.
         """
         return " ".join(self.encode(list(text)))
 
     def encode(self, text: List[str]) -> List[str]:
         """
-        Encode special tokens
+        Encode special tokens.
+        :param text: Text to be encoded.
         """
         return map(self.mapping.encode_token, text)
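
For context, a minimal usage sketch of the reworked interface (not part of the diff): after this change, char_tokenize, word_tokenize, and subword_tokenize each return a single space-separated string instead of a List[str]. The keyword arguments mirror the docstring added above; the corpus, output directory, vocabulary size, and the mapping/tokens objects below are placeholder assumptions, not values taken from the repository.

from pathlib import Path

from dan.datasets.extract.utils import Tokenizer

# Placeholder inputs: `mapping` (displayed <-> encoded special characters) and
# `tokens` (start/end tokens for named entities) must come from the extraction
# configuration; they are assumed to exist here.
tokenizer = Tokenizer(
    training_corpus=["John Doe was here", "Jane Smith was not"],
    outdir=Path("subword_tokenizer"),
    mapping=mapping,
    tokens=tokens,
    subword_vocab_size=55,
)

text = "John Doe was here"
print(tokenizer.char_tokenize(text))     # space-separated characters, specials encoded
print(tokenizer.word_tokenize(text))     # words joined with the encoded space symbol
print(tokenizer.subword_tokenize(text))  # space-separated sentencepiece subwords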