Commit 3ca6578c authored by Solene Tarride

Simplify code

parent 12928ec0
1 merge request: !287 Support subword and word language models
@@ -33,6 +33,7 @@ from dan.datasets.extract.utils import (
Tokenizer,
download_image,
get_bbox,
get_vocabulary,
insert_token,
normalize_linebreaks,
normalize_spaces,
@@ -363,14 +364,13 @@ class ArkindexExtractor:
self.language_tokens.append(
self.mapping.encode[token]
) if token in self.mapping.encode else self.language_tokens.append(token)
self.language_tokens.append(self.mapping.ctc.encoded)
assert all(
[len(token) == 1 for token in self.language_lexicon]
), "Tokens should be single characters."
# Build LM corpus
train_corpus = [text.replace("\n", " ") for text in self.data["train"].values()]
train_corpus = [
text.replace(self.mapping.linebreak.display, self.mapping.space.display)
for text in self.data["train"].values()
]
tokenizer = Tokenizer(
train_corpus,
outdir=self.output / "language_model",
@@ -388,36 +388,18 @@
tokenizer.subword_tokenize(doc) for doc in train_corpus
]
# Build vocabulary
word_vocabulary = set(
[
word
for doc in self.language_corpus["words"]
for word in doc.split()
if word != ""
]
)
subword_vocabulary = set(
[
subword
for doc in self.language_corpus["subwords"]
for subword in doc.split()
if subword != ""
]
)
# Build LM lexicon
self.language_lexicon["characters"] = [
f"{token} {token}" for token in self.language_tokens
]
self.language_lexicon["words"] = [
f"{word} {tokenizer.char_tokenize(word)}"
for word in sorted(word_vocabulary)
for word in get_vocabulary(self.language_corpus["words"])
if word != ""
]
self.language_lexicon["subwords"] = [
f"{subword} {tokenizer.char_tokenize(subword)}"
for subword in sorted(subword_vocabulary)
for subword in get_vocabulary(self.language_corpus["subwords"])
]
def export(self):
......
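Each lexicon line built above pairs a vocabulary entry with its space-separated character tokenization (f"{word} {tokenizer.char_tokenize(word)}"), the format commonly consumed by lexicon-based decoders. The following is a minimal standalone sketch of that construction, using hypothetical stand-ins for get_vocabulary and Tokenizer.char_tokenize rather than the real implementations from dan.datasets.extract.utils:

from typing import List


def build_vocabulary(tokenized_corpus: List[str]) -> List[str]:
    # Unique, non-empty tokens across all documents, sorted for reproducibility.
    return sorted({token for doc in tokenized_corpus for token in doc.split() if token})


def char_tokenize(entry: str) -> str:
    # Split an entry into space-separated characters, e.g. "cat" -> "c a t".
    return " ".join(entry)


# Toy word-level corpus standing in for self.language_corpus["words"].
word_corpus = ["the cat sat", "the dog sat"]

# One lexicon line per vocabulary entry, mirroring the list comprehension above.
word_lexicon = [f"{word} {char_tokenize(word)}" for word in build_vocabulary(word_corpus)]
print(word_lexicon)
# ['cat c a t', 'dog d o g', 'sat s a t', 'the t h e']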
@@ -122,12 +122,22 @@ def get_bbox(polygon: List[List[int]]) -> str:
return ",".join(list(map(str, [int(x), int(y), int(width), int(height)])))
def get_vocabulary(tokenized_text: List[str]) -> List[str]:
"""
Compute the set of vocabulary tokens from tokenized text.
:param tokenized_text: List of tokenized text.
"""
return sorted(
set([token for doc in tokenized_text for token in doc.split() if token != ""])
)
class Tokenizer:
"""
A multi-level tokenizer (char, subword, word), where the subword tokenizer is trained using sentencepiece.
:param training_corpus: List of training text.
:param outdir: Path to save the subword tokenizer.
:param mapping: Mapping between displayed and encoded versions of special characters
:param mapping: Mapping between displayed and encoded versions of special characters.
:param tokens: Start and end tokens used to represent named entities.
:param subword_vocab_size: Size of the vocabulary size to use to train the subword tokenizer.
"""
......
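As the docstring notes, the subword level is trained with sentencepiece. The sketch below is an illustration only: it uses the public sentencepiece API directly instead of the project's Tokenizer class, and the corpus, model_prefix, and vocabulary size are made up for the example.

import sentencepiece as spm

# Toy training corpus standing in for the training_corpus argument.
corpus = [
    "the quick brown fox jumps over the lazy dog",
    "pack my box with five dozen liquor jugs",
    "how vexingly quick daft zebras jump",
]

# Train a unigram subword model; vocab_size plays the role of subword_vocab_size.
# hard_vocab_limit=False lets sentencepiece shrink the vocabulary if this tiny
# corpus cannot support the requested number of pieces.
spm.SentencePieceTrainer.train(
    sentence_iterator=iter(corpus),
    model_prefix="subword_tokenizer",
    vocab_size=60,
    model_type="unigram",
    hard_vocab_limit=False,
)

# Load the trained model and split a document into subword pieces.
sp = spm.SentencePieceProcessor(model_file="subword_tokenizer.model")
print(sp.encode("the quick fox", out_type=str))
# e.g. ['▁the', '▁quick', '▁f', 'ox'] (exact pieces depend on the trained model)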