Commit 369b51c6 authored by Solene Tarride

Update docstring

parent 619e1c5c
This commit is part of merge request !287.
@@ -116,8 +116,12 @@ def get_bbox(polygon: List[List[int]]) -> str:

 class Tokenizer:
     """
-    A multi-level tokenizer (char, subword, word)
-    Subword tokenizer is trained using sentencepiece.
+    A multi-level tokenizer (char, subword, word), where the subword tokenizer is trained using sentencepiece.
+    :param training_corpus: List of training text.
+    :param outdir: Path to save the subword tokenizer.
+    :param mapping: Mapping between displayed and encoded versions of special characters.
+    :param tokens: Start and end tokens used to represent named entities.
+    :param subword_vocab_size: Size of the vocabulary to use to train the subword tokenizer.
     """

     def __init__(
@@ -161,7 +165,7 @@ class Tokenizer:
         corpus_file = Path(self.outdir / "tmp.txt")
         corpus_file.write_text("\n".join(self.corpus))

-        # Train the tokenizer and load it
+        # Train the tokenizer
         logger.info("Training sentencepiece model for subword tokenization")
         spm.SentencePieceTrainer.train(
             input=str(corpus_file),
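For context on the training flow in this hunk, here is a minimal, self-contained sketch of the same sentencepiece steps: write the corpus to a temporary file, train, delete the file, and load the model. The corpus, output directory, vocabulary size, and special symbols are illustrative stand-ins (assumptions) for the class attributes `self.corpus`, `self.outdir`, `self.subword_vocab_size`, and `self.special_tokens`:

```python
import sentencepiece as spm
from pathlib import Path

# Illustrative stand-ins for the Tokenizer's attributes (assumptions,
# not the project's real values).
corpus = ["Kjære Gerhard.", "Tak for Brevet om Boken og Haven.", "Ragna har det herligt."]
outdir = Path("subword_tokenizer")
outdir.mkdir(exist_ok=True)

# Write the training corpus to a temporary file, as in the hunk above.
corpus_file = outdir / "tmp.txt"
corpus_file.write_text("\n".join(corpus))

prefix = outdir / "spm"
spm.SentencePieceTrainer.train(
    input=str(corpus_file),
    model_prefix=str(prefix),
    vocab_size=60,             # assumption: a small vocabulary for a toy corpus
    hard_vocab_limit=False,    # tolerate a corpus too small for the requested size
    user_defined_symbols=["Ⓢ", "Ⓔ"],  # assumption: placeholder special tokens
)

# Delete the corpus file and load the model, mirroring the diff.
corpus_file.unlink()
sp = spm.SentencePieceProcessor(model_file=f"{prefix}.model")
print(sp.vocab_size())
```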
@@ -170,27 +174,24 @@ class Tokenizer:
             user_defined_symbols=self.special_tokens,
         )

-        # Delete the corpus file
+        # Delete the corpus file and load the model
         corpus_file.unlink()
-        # Load the model and return it
         return spm.SentencePieceProcessor(model_file=f"{self.prefix}.model")

-    def subword_tokenize(self, text: str) -> List[str]:
+    def subword_tokenize(self, text: str) -> str:
         """
         Tokenize into subwords. Sampling is disabled to ensure reproducibility.
         """
         tokens = self.sentencepiece_model.encode(text, out_type=str)
-        # Return encoded tokenized text
         return " ".join(["".join(self.encode(subword)) for subword in tokens])

-    def word_tokenize(self, text: str) -> List[str]:
+    def word_tokenize(self, text: str) -> str:
         """
-        Tokenize text into words
-        Spaces (⎵) and NER tokens are considered as words.
+        Tokenize text into a string of space-separated words. Spaces (⎵) and NER tokens are considered as words.
+        :param text: Text to be tokenized.
         """
         words = ["".join(self.encode(word)) for word in wordpunct_tokenize(text)]
-        words = " ".join(
+        return " ".join(
             [
                 word + f" {self.mapping.space.encoded}"
                 if (i != len(words) - 1 and word not in self.ner_tokens)
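As the hunk above shows, `subword_tokenize` now returns a single space-separated string rather than a list. A short sketch of the underlying call, reusing the `sp` processor from the previous snippet (the real method additionally remaps each piece through `self.encode`, which is omitted here):

```python
# encode(..., out_type=str) returns subword pieces as strings; sampling is off
# by default, so the output is deterministic, matching the docstring's
# reproducibility note.
pieces = sp.encode("Tak for Brevet", out_type=str)
print(pieces)            # e.g. ['▁Tak', '▁for', '▁B', 're', 'vet'] (pieces vary with training)
print(" ".join(pieces))  # the method returns this joined form as one string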
@@ -198,16 +199,17 @@ class Tokenizer:
                 for i, word in enumerate(words)
             ]
         )
-        return words

-    def char_tokenize(self, text: str) -> List[str]:
+    def char_tokenize(self, text: str) -> str:
         """
-        Tokenize text into characters
+        Tokenize text into a string of space-separated characters.
+        :param text: Text to be tokenized.
         """
         return " ".join(self.encode(list(text)))

     def encode(self, text: List[str]) -> List[str]:
         """
-        Encode special tokens
+        Encode special tokens.
+        :param text: Text to be encoded.
         """
         return map(self.mapping.encode_token, text)
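The two hunks above also switch the declared return type of the word- and character-level methods to `str`. A compact, runnable sketch of both flows, assuming an identity character mapping in place of the project's `self.mapping.encode_token` and an empty NER token set:

```python
from nltk.tokenize import wordpunct_tokenize  # same tokenizer the class uses

SPACE = "⎵"          # the encoded space symbol named in the docstring
NER_TOKENS = set()   # assumption: no NER tokens in this toy example

def encode(chars):
    # Identity stand-in for self.mapping.encode_token applied to each element.
    return map(lambda c: c, chars)

def word_tokenize(text: str) -> str:
    # Split into words/punctuation, then rejoin with an explicit space symbol
    # between words, as in the diff above.
    words = ["".join(encode(word)) for word in wordpunct_tokenize(text)]
    return " ".join(
        word + f" {SPACE}" if (i != len(words) - 1 and word not in NER_TOKENS) else word
        for i, word in enumerate(words)
    )

def char_tokenize(text: str) -> str:
    # Split into characters and join with spaces.
    return " ".join(encode(list(text)))

print(word_tokenize("Kjære Gerhard."))  # Kjære ⎵ Gerhard ⎵ .
print(char_tokenize("Tak"))             # T a k
```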
@@ -19,13 +19,7 @@ It will create the following JSON file named `dan_humu_page/predict/example.json`
 ```json
 {
   "text": "Hansteensgt. 2 IV 28/4 - 19\nKj\u00e6re Gerhard.\nTak for Brevet om Boken og Haven\nog Crokus og Blaaveis og tak fordi\nDu vilde be mig derut sammen\nmed Kris og Ragna. Men vet Du\nda ikke, at Kris reiste med sin S\u00f8-\nster Fru Cr\u00f8ger til Lillehammer\nnogle Dage efter Begravelsen? Hen\ndes Address er Amtsingeni\u00f8r\nCr\u00f8ger. Hun skriver at de blir\nder til lidt ut i Mai. Nu er hun\nnoksaa medtat skj\u00f8nner jeg af Sorg\nog af L\u00e6ngsel, skriver saameget r\u00f8-\nrende om Oluf. Ragna har det\nherligt, skriver hun. Hun er bare\ngla, og det vet jeg, at \"Oluf er gla over,\nder hvor han nu er. Jeg har saa in-\nderlig ondt af hende, og om Du skrev\net Par Ord tror jeg det vilde gj\u00f8re\nhende godt. - Jeg gl\u00e6der mig over,\nat Du har skrevet en Bok, og\njeg er vis paa, at den er god.",
-<<<<<<< HEAD
-  "confidences": {
-    "total": 0.99
-  }
-=======
   "confidence": 0.99
->>>>>>> c0d6f93 (Fix doc)
 }
 ```
@@ -78,13 +72,7 @@ It will create the following JSON file named `dan_humu_page/predict/example.json`
 ```json
 {
   "text": "Hansteensgt. 2 IV 28/4 - 19\nKj\u00e6re Gerhard.\nTak for Brevet om Boken og Haven\nog Crokus og Blaaveis og tak fordi\nDu vilde be mig derut sammen\nmed Kris og Ragna. Men vet Du\nda ikke, at Kris reiste med sin S\u00f8-\nster Fru Cr\u00f8ger til Lillehammer\nnogle Dage efter Begravelsen? Hen\ndes Address er Amtsingeni\u00f8r\nCr\u00f8ger. Hun skriver at de blir\nder til lidt ut i Mai. Nu er hun\nnoksaa medtat skj\u00f8nner jeg af Sorg\nog af L\u00e6ngsel, skriver saameget r\u00f8-\nrende om Oluf. Ragna har det\nherligt, skriver hun. Hun er bare\ngla, og det vet jeg, at \"Oluf er gla over,\nder hvor han nu er. Jeg har saa in-\nderlig ondt af hende, og om Du skrev\net Par Ord tror jeg det vilde gj\u00f8re\nhende godt. - Jeg gl\u00e6der mig over,\nat Du har skrevet en Bok, og\njeg er vis paa, at den er god.",
-<<<<<<< HEAD
-  "confidences": {
-    "total": 0.99
-  },
-=======
   "confidence": 0.99,
->>>>>>> c0d6f93 (Fix doc)
   "attention_gif": "dan_humu_page/predict/example_word.gif"
 }
 ```
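After the conflict resolution above, the prediction file keeps the flat `confidence` key. A small sketch of consuming that output, assuming the file already exists at the documented path:

```python
import json
from pathlib import Path

# Path taken from the documentation above; assumes prediction has already run.
result = json.loads(Path("dan_humu_page/predict/example.json").read_text())
print(result["confidence"])         # 0.99 (flat key kept by the resolution)
print(result.get("attention_gif"))  # dan_humu_page/predict/example_word.gif, if present
```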