diff --git a/dan/datasets/extract/utils.py b/dan/datasets/extract/utils.py index 7d5111d70883456c00c7c2abeebff305756a1eb1..b4ecf88be1645b54206c01ed29cefa2a720ffe2c 100644 --- a/dan/datasets/extract/utils.py +++ b/dan/datasets/extract/utils.py @@ -116,8 +116,12 @@ def get_bbox(polygon: List[List[int]]) -> str: class Tokenizer: """ - A multi-level tokenizer (char, subword, word) - Subword tokenizer is trained using sentencepiece. + A multi-level tokenizer (char, subword, word), where the subword tokenizer is trained using sentencepiece. + :param training_corpus: List of training text. + :param outdir: Path to save the subword tokenizer. + :param mapping: Mapping between displayed and encoded versions of special characters. + :param tokens: Start and end tokens used to represent named entities. + :param subword_vocab_size: Size of the vocabulary to use to train the subword tokenizer. """ def __init__( @@ -161,7 +165,7 @@ class Tokenizer: corpus_file = Path(self.outdir / "tmp.txt") corpus_file.write_text("\n".join(self.corpus)) - # Train the tokenizer and load it + # Train the tokenizer logger.info("Training sentencepiece model for subword tokenization") spm.SentencePieceTrainer.train( input=str(corpus_file), @@ -170,27 +174,24 @@ class Tokenizer: user_defined_symbols=self.special_tokens, ) - # Delete the corpus file + # Delete the corpus file and load the model corpus_file.unlink() - - # Load the model and return it return spm.SentencePieceProcessor(model_file=f"{self.prefix}.model") - def subword_tokenize(self, text: str) -> List[str]: + def subword_tokenize(self, text: str) -> str: """ Tokenize into subwords. Sampling is disabled to ensure reproducibility. 
""" tokens = self.sentencepiece_model.encode(text, out_type=str) - # Return encoded tokenized text return " ".join(["".join(self.encode(subword)) for subword in tokens]) - def word_tokenize(self, text: str) -> List[str]: + def word_tokenize(self, text: str) -> str: """ - Tokenize text into words - Spaces (⎵) and NER tokens are considered as words. + Tokenize text into a string of space-separated words. Spaces (⎵) and NER tokens are considered as words. + :param text: Text to be tokenized. """ words = ["".join(self.encode(word)) for word in wordpunct_tokenize(text)] - words = " ".join( + return " ".join( [ word + f" {self.mapping.space.encoded}" if (i != len(words) - 1 and word not in self.ner_tokens) @@ -198,16 +199,17 @@ class Tokenizer: for i, word in enumerate(words) ] ) - return words - def char_tokenize(self, text: str) -> List[str]: + def char_tokenize(self, text: str) -> str: """ - Tokenize text into characters + Tokenize text into a string of space-separated characters. + :param text: Text to be tokenized. """ return " ".join(self.encode(list(text))) def encode(self, text: List[str]) -> List[str]: """ - Encode special tokens + Encode special tokens. + :param text: Text to be encoded. """ return map(self.mapping.encode_token, text) diff --git a/docs/usage/predict/examples.md b/docs/usage/predict/examples.md index b7bc086c910af28cfcca51cd8e6397ec53bd91fd..84170d7d241b6c2964c46cdf921dd77ebbcbf023 100644 --- a/docs/usage/predict/examples.md +++ b/docs/usage/predict/examples.md @@ -19,13 +19,7 @@ It will create the following JSON file named `dan_humu_page/predict/example.json ```json { "text": "Hansteensgt. 2 IV 28/4 - 19\nKj\u00e6re Gerhard.\nTak for Brevet om Boken og Haven\nog Crokus og Blaaveis og tak fordi\nDu vilde be mig derut sammen\nmed Kris og Ragna. Men vet Du\nda ikke, at Kris reiste med sin S\u00f8-\nster Fru Cr\u00f8ger til Lillehammer\nnogle Dage efter Begravelsen? Hen\ndes Address er Amtsingeni\u00f8r\nCr\u00f8ger. 
Hun skriver at de blir\nder til lidt ut i Mai. Nu er hun\nnoksaa medtat skj\u00f8nner jeg af Sorg\nog af L\u00e6ngsel, skriver saameget r\u00f8-\nrende om Oluf. Ragna har det\nherligt, skriver hun. Hun er bare\ngla, og det vet jeg, at \"Oluf er gla over,\nder hvor han nu er. Jeg har saa in-\nderlig ondt af hende, og om Du skrev\net Par Ord tror jeg det vilde gj\u00f8re\nhende godt. - Jeg gl\u00e6der mig over,\nat Du har skrevet en Bok, og\njeg er vis paa, at den er god.", -<<<<<<< HEAD - "confidences": { - "total": 0.99 - } -======= "confidence": 0.99 ->>>>>>> c0d6f93 (Fix doc) } ``` @@ -78,13 +72,7 @@ It will create the following JSON file named `dan_humu_page/predict/example.json ```json { "text": "Hansteensgt. 2 IV 28/4 - 19\nKj\u00e6re Gerhard.\nTak for Brevet om Boken og Haven\nog Crokus og Blaaveis og tak fordi\nDu vilde be mig derut sammen\nmed Kris og Ragna. Men vet Du\nda ikke, at Kris reiste med sin S\u00f8-\nster Fru Cr\u00f8ger til Lillehammer\nnogle Dage efter Begravelsen? Hen\ndes Address er Amtsingeni\u00f8r\nCr\u00f8ger. Hun skriver at de blir\nder til lidt ut i Mai. Nu er hun\nnoksaa medtat skj\u00f8nner jeg af Sorg\nog af L\u00e6ngsel, skriver saameget r\u00f8-\nrende om Oluf. Ragna har det\nherligt, skriver hun. Hun er bare\ngla, og det vet jeg, at \"Oluf er gla over,\nder hvor han nu er. Jeg har saa in-\nderlig ondt af hende, og om Du skrev\net Par Ord tror jeg det vilde gj\u00f8re\nhende godt. - Jeg gl\u00e6der mig over,\nat Du har skrevet en Bok, og\njeg er vis paa, at den er god.", -<<<<<<< HEAD - "confidences": { - "total": 0.99 - }, -======= "confidence": 0.99, ->>>>>>> c0d6f93 (Fix doc) "attention_gif": "dan_humu_page/predict/example_word.gif" } ```