Commit 3ca6578c authored by Solene Tarride

Simplify code

parent 12928ec0
1 merge request: !287 Support subword and word language models
@@ -33,6 +33,7 @@ from dan.datasets.extract.utils import (
     Tokenizer,
     download_image,
     get_bbox,
+    get_vocabulary,
     insert_token,
     normalize_linebreaks,
     normalize_spaces,
@@ -363,14 +364,13 @@ class ArkindexExtractor:
             self.language_tokens.append(
                 self.mapping.encode[token]
             ) if token in self.mapping.encode else self.language_tokens.append(token)
         self.language_tokens.append(self.mapping.ctc.encoded)
-        assert all(
-            [len(token) == 1 for token in self.language_lexicon]
-        ), "Tokens should be single characters."

         # Build LM corpus
-        train_corpus = [text.replace("\n", " ") for text in self.data["train"].values()]
-
+        train_corpus = [
+            text.replace(self.mapping.linebreak.display, self.mapping.space.display)
+            for text in self.data["train"].values()
+        ]
         tokenizer = Tokenizer(
             train_corpus,
             outdir=self.output / "language_model",
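As an aside on the corpus hunk above: the commit replaces the hard-coded "\n" and " " with the mapping's named entries. A minimal sketch of why the two spellings should be equivalent, assuming linebreak.display is "\n" and space.display is " " (the mapping's actual values are not shown in this diff):

from types import SimpleNamespace

# Hypothetical stand-in for the extractor's mapping; only the .display
# attributes are assumed here, the real mapping object is not shown in this diff.
mapping = SimpleNamespace(
    linebreak=SimpleNamespace(display="\n"),
    space=SimpleNamespace(display=" "),
)

data_train = {"doc-1": "first line\nsecond line"}

# Old spelling (hard-coded characters) ...
old_corpus = [text.replace("\n", " ") for text in data_train.values()]
# ... and the new spelling from this commit, using the mapping's display values.
new_corpus = [
    text.replace(mapping.linebreak.display, mapping.space.display)
    for text in data_train.values()
]
assert old_corpus == new_corpus == ["first line second line"]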
@@ -388,36 +388,18 @@ class ArkindexExtractor:
             tokenizer.subword_tokenize(doc) for doc in train_corpus
         ]

-        # Build vocabulary
-        word_vocabulary = set(
-            [
-                word
-                for doc in self.language_corpus["words"]
-                for word in doc.split()
-                if word != ""
-            ]
-        )
-        subword_vocabulary = set(
-            [
-                subword
-                for doc in self.language_corpus["subwords"]
-                for subword in doc.split()
-                if subword != ""
-            ]
-        )
-
         # Build LM lexicon
         self.language_lexicon["characters"] = [
             f"{token} {token}" for token in self.language_tokens
         ]
         self.language_lexicon["words"] = [
             f"{word} {tokenizer.char_tokenize(word)}"
-            for word in sorted(word_vocabulary)
+            for word in get_vocabulary(self.language_corpus["words"])
             if word != ""
         ]
         self.language_lexicon["subwords"] = [
             f"{subword} {tokenizer.char_tokenize(subword)}"
-            for subword in sorted(subword_vocabulary)
+            for subword in get_vocabulary(self.language_corpus["subwords"])
         ]

     def export(self):
......
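The lexicon hunk above pairs every vocabulary entry with its character-level tokenization, with the vocabulary now coming from the get_vocabulary helper added in utils (see the second diff below). A minimal sketch of the resulting lexicon lines, using a hand-written vocabulary in place of get_vocabulary's output and assuming Tokenizer.char_tokenize simply space-separates characters (its implementation is not part of this diff):

def char_tokenize(word: str) -> str:
    # Assumed stand-in for Tokenizer.char_tokenize: space-separated characters.
    return " ".join(word)


# Stand-in for get_vocabulary(self.language_corpus["words"]): a sorted,
# deduplicated word list with empty tokens already dropped.
vocabulary = ["cat", "dog", "the"]

lexicon = [f"{word} {char_tokenize(word)}" for word in vocabulary if word != ""]
print("\n".join(lexicon))
# cat c a t
# dog d o g
# the t h e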
@@ -122,12 +122,22 @@ def get_bbox(polygon: List[List[int]]) -> str:
     return ",".join(list(map(str, [int(x), int(y), int(width), int(height)])))


+def get_vocabulary(tokenized_text: List[str]) -> set[str]:
+    """
+    Compute set of vocabulary from tokenzied text.
+    :param tokenized_text: List of tokenized text.
+    """
+    return sorted(
+        set([token for doc in tokenized_text for token in doc.split() if token != ""])
+    )
+
+
 class Tokenizer:
     """
     A multi-level tokenizer (char, subword, word), where the subword tokenizer is trained using sentencepiece.
     :param training_corpus: List of training text.
     :param outdir: Path to save the subword tokenizer.
-    :param mapping: Mapping between displayed and encoded versions of special characters
+    :param mapping: Mapping between displayed and encoded versions of special characters.
     :param tokens: Start and end tokens used to represent named entities.
     :param subword_vocab_size: Size of the vocabulary size to use to train the subword tokenizer.
     """
......
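A quick self-contained check of the new helper's behaviour, reusing its body from the hunk above: duplicates collapse, empty tokens are dropped, and the result comes back sorted. Note that sorted() returns a list, so callers actually receive a list even though the committed signature is annotated set[str].

from typing import List


def get_vocabulary(tokenized_text: List[str]) -> List[str]:
    # Body copied from the hunk above; annotated as List[str] in this sketch
    # because sorted() returns a list.
    return sorted(
        set([token for doc in tokenized_text for token in doc.split() if token != ""])
    )


corpus = ["the cat sat", "the  dog", ""]
print(get_vocabulary(corpus))
# ['cat', 'dog', 'sat', 'the']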