Commit c19f9607 authored by Solene Tarride

Simplify code

parent 5a228234
@@ -33,6 +33,7 @@ from dan.datasets.extract.utils import (
Tokenizer,
download_image,
get_bbox,
get_vocabulary,
insert_token,
normalize_linebreaks,
normalize_spaces,
@@ -363,14 +364,13 @@ class ArkindexExtractor:
self.language_tokens.append(
self.mapping.encode[token]
) if token in self.mapping.encode else self.language_tokens.append(token)
self.language_tokens.append(self.mapping.ctc.encoded)
assert all(
[len(token) == 1 for token in self.language_tokens]
), "Tokens should be single characters."
# Build LM corpus
train_corpus = [text.replace("\n", " ") for text in self.data["train"].values()]
train_corpus = [
text.replace(self.mapping.linebreak.display, self.mapping.space.display)
for text in self.data["train"].values()
]
tokenizer = Tokenizer(
train_corpus,
outdir=self.output / "language_model",
@@ -388,36 +388,18 @@ class ArkindexExtractor:
tokenizer.subword_tokenize(doc) for doc in train_corpus
]
# Build vocabulary
word_vocabulary = set(
[
word
for doc in self.language_corpus["words"]
for word in doc.split()
if word != ""
]
)
subword_vocabulary = set(
[
subword
for doc in self.language_corpus["subwords"]
for subword in doc.split()
if subword != ""
]
)
# Build LM lexicon
self.language_lexicon["characters"] = [
f"{token} {token}" for token in self.language_tokens
]
self.language_lexicon["words"] = [
f"{word} {tokenizer.char_tokenize(word)}"
for word in sorted(word_vocabulary)
for word in get_vocabulary(self.language_corpus["words"])
if word != ""
]
self.language_lexicon["subwords"] = [
f"{subword} {tokenizer.char_tokenize(subword)}"
for subword in sorted(subword_vocabulary)
for subword in get_vocabulary(self.language_corpus["subwords"])
]
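For reference, each lexicon entry built above pairs a vocabulary item with its character-level tokenization. A minimal sketch of the resulting format (not part of the commit), assuming char_tokenize simply joins the characters of a string with spaces:

# Sketch: expected lexicon line format "word w o r d".
def char_tokenize(text: str) -> str:
    return " ".join(text)

vocabulary = ["cat", "sat"]  # hypothetical vocabulary
lexicon = [f"{word} {char_tokenize(word)}" for word in vocabulary]
# lexicon == ["cat c a t", "sat s a t"]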
def export(self):
@@ -122,12 +122,22 @@ def get_bbox(polygon: List[List[int]]) -> str:
return ",".join(list(map(str, [int(x), int(y), int(width), int(height)])))
def get_vocabulary(tokenized_text: List[str]) -> List[str]:
"""
Compute the vocabulary (sorted set of tokens) from tokenized text.
:param tokenized_text: List of tokenized text.
"""
return sorted(
set([token for doc in tokenized_text for token in doc.split() if token != ""])
)
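A quick usage example for the new helper (illustrative, not part of the commit): it deduplicates and sorts the whitespace-separated tokens of a corpus, replacing the two inline set comprehensions removed above.

# Illustrative call on a tiny word-tokenized corpus (hypothetical data).
corpus = ["the cat sat", "the dog ran"]
vocabulary = get_vocabulary(corpus)
# vocabulary == ["cat", "dog", "ran", "sat", "the"]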
class Tokenizer:
"""
A multi-level tokenizer (char, subword, word), where the subword tokenizer is trained using sentencepiece.
:param training_corpus: List of training text.
:param outdir: Path to save the subword tokenizer.
:param mapping: Mapping between displayed and encoded versions of special characters
:param mapping: Mapping between displayed and encoded versions of special characters.
:param tokens: Start and end tokens used to represent named entities.
:param subword_vocab_size: Size of the vocabulary to use to train the subword tokenizer.
"""