Commit beee846d authored by Solene Tarride

Fix linting

parent 46d044cc
@@ -14,7 +14,6 @@ import cv2
 import numpy as np
 from PIL import Image
 from tqdm import tqdm
-from nltk.tokenize import wordpunct_tokenize

 from arkindex_export import open_database
 from dan.datasets.extract.db import (
@@ -31,12 +30,12 @@ from dan.datasets.extract.exceptions import (
     UnknownTokenInText,
 )
 from dan.datasets.extract.utils import (
+    Tokenizer,
     download_image,
     get_bbox,
     insert_token,
     normalize_linebreaks,
     normalize_spaces,
-    Tokenizer,
 )
 from dan.utils import EntityType, LMTokenMapping, parse_tokens
 from line_image_extractor.extractor import extract
@@ -371,20 +370,53 @@ class ArkindexExtractor:

         # Build LM corpus
         train_corpus = [text for text in self.data["train"].values()]
-        tokenizer = Tokenizer(train_corpus, outdir=self.output / "language_model", mapping=self.mapping, tokens=self.tokens)
-        tokenizer.train_subword_tokenizer()
-        self.language_corpus["characters"] = [tokenizer.char_tokenize(doc) for doc in train_corpus]
-        self.language_corpus["words"] = [tokenizer.word_tokenize(doc) for doc in train_corpus]
-        self.language_corpus["subwords"] = [tokenizer.subword_tokenize(doc) for doc in train_corpus]
+        tokenizer = Tokenizer(
+            train_corpus,
+            outdir=self.output / "language_model",
+            mapping=self.mapping,
+            tokens=self.tokens,
+        )
+        self.language_corpus["characters"] = [
+            tokenizer.char_tokenize(doc) for doc in train_corpus
+        ]
+        self.language_corpus["words"] = [
+            tokenizer.word_tokenize(doc) for doc in train_corpus
+        ]
+        self.language_corpus["subwords"] = [
+            tokenizer.subword_tokenize(doc) for doc in train_corpus
+        ]

         # Build vocabulary
-        word_vocabulary = set([word for doc in self.language_corpus["words"] for word in doc.split(" ")])
-        subword_vocabulary = set([subword for doc in self.language_corpus["subwords"] for subword in doc.split(" ")])
+        word_vocabulary = set(
+            [
+                word
+                for doc in self.language_corpus["words"]
+                for word in doc.split()
+                if word != ""
+            ]
+        )
+        subword_vocabulary = set(
+            [
+                subword
+                for doc in self.language_corpus["subwords"]
+                for subword in doc.split()
+                if subword != ""
+            ]
+        )

         # Build LM lexicon
-        self.language_lexicon["chars"] = [f"{token} {tokenizer.char_tokenize(token)}" for token in self.language_tokens]
-        self.language_lexicon["words"] = [f"{word} {tokenizer.char_tokenize(word)}" for word in word_vocabulary]
-        self.language_lexicon["subwords"] = [f"{subword} {tokenizer.char_tokenize(subword)}" for subword in subword_vocabulary]
+        self.language_lexicon["characters"] = [
+            f"{token} {token}" for token in self.language_tokens
+        ]
+        self.language_lexicon["words"] = [
+            f"{word} {tokenizer.char_tokenize(word)}"
+            for word in sorted(word_vocabulary)
+            if word != ""
+        ]
+        self.language_lexicon["subwords"] = [
+            f"{subword} {tokenizer.char_tokenize(subword)}"
+            for subword in sorted(subword_vocabulary)
+        ]

     def export(self):
         (self.output / "labels.json").write_text(
......
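For context on the hunk above: a minimal, illustrative sketch (not part of the diff) of the line formats that the "Build LM corpus" and "Build LM lexicon" blocks produce. The token strings are toy values standing in for real Tokenizer output, and ⎵ is assumed to be the encoded space symbol, as in the Tokenizer docstrings further down.

# Illustrative only, not part of the diff: the line formats built by the
# corpus/lexicon block above, with toy strings in place of real Tokenizer
# output ("⎵" stands for an encoded space).

# language_corpus: one space-separated token sequence per training document.
language_corpus = {
    "characters": "T h e ⎵ c a t",  # char_tokenize(doc)
    "words": "The ⎵ cat",           # word_tokenize(doc)
    "subwords": "Th e ⎵ ca t",      # subword_tokenize(doc); split depends on the trained model
}

# language_lexicon: one "<entry> <character-level split of the entry>" line
# per vocabulary item, e.g. for the "words" lexicon built above.
word_vocabulary = {"The", "cat", "⎵"}
words_lexicon = [f"{word} {' '.join(word)}" for word in sorted(word_vocabulary)]
print("\n".join(words_lexicon))
# The T h e
# cat c a t
# ⎵ ⎵

The real code uses tokenizer.char_tokenize(word) rather than a plain character join, so that special tokens are encoded consistently with the corpus.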
@@ -2,9 +2,12 @@
 import logging
 import re
 from io import BytesIO
+from pathlib import Path
 from typing import List

 import requests
+import sentencepiece as spm
+from nltk import wordpunct_tokenize
 from PIL import Image, ImageOps
 from tenacity import (
     retry,
@@ -12,10 +15,8 @@ from tenacity import (
     stop_after_attempt,
     wait_exponential,
 )
-from pathlib import Path
-from dan.utils import EntityType
-import sentencepiece as spm
-from nltk import wordpunct_tokenize
+
+from dan.utils import EntityType, LMTokenMapping

 logger = logging.getLogger(__name__)
@@ -111,68 +112,97 @@ def get_bbox(polygon: List[List[int]]) -> str:
     return ",".join(list(map(str, [int(x), int(y), int(width), int(height)])))


-class Tokenizer():
-    """
+class Tokenizer:
+    """
     A multi-level tokenizer (char, subword, word)
     Subword tokenizer is trained using sentencepiece.
     """
-    def __init__(self, training_corpus, outdir, mapping, tokens=[]) -> None:
+
+    def __init__(
+        self,
+        training_corpus: List[str],
+        outdir: Path,
+        mapping: LMTokenMapping,
+        tokens: EntityType = None,
+        subword_vocab_size: int = 1000,
+    ) -> None:
         self.corpus = training_corpus
         self.outdir = outdir
         self.prefix = f"{self.outdir}/subword_tokenizer"
-        self.sentencepiece_model = None
-        self.mapping = mapping
         self.tokens = tokens
+        self.mapping = mapping
+
+        # Train the subword tokenizer
+        self.user_subword_vocab_size = subword_vocab_size
+        self.sentencepiece_model = self.train_subword_tokenizer()

     @property
-    def ner_tokens(self):
-        return [entity.start for entity in self.tokens.values()] + [entity.end for entity in self.tokens.values() if entity.end != ""]
+    def ner_tokens(self) -> List[str]:
+        if self.tokens is None:
+            return []
+        return [entity.start for entity in self.tokens.values()] + [
+            entity.end for entity in self.tokens.values() if entity.end != ""
+        ]

     @property
-    def mapping_tokens(self):
+    def mapping_tokens(self) -> List[str]:
         return [token.encoded for token in self.mapping]

     @property
-    def special_tokens(self):
+    def special_tokens(self) -> List[str]:
         return list(set(self.ner_tokens + self.mapping_tokens))

+    @property
+    def subword_vocab_size(self):
+        n_words = len(set([word for doc in self.corpus for word in doc.split()]))
+        return min(self.user_subword_vocab_size, 3 * n_words)
+
     def train_subword_tokenizer(self):
-        """
+        """
         Train a sentencepiece model on the training corpus.
         """
         # Write the corpus in a text file
-        corpus_file = Path(self.outdir / f"tmp_training_corpus.txt")
+        corpus_file = Path(self.outdir / "tmp.txt")
         corpus_file.write_text("\n".join(self.corpus))

         # Train the tokenizer and load it
         logger.info("Training sentencepiece model for subword tokenization")
-        spm.SentencePieceTrainer.train(input=str(corpus_file), vocab_size=1000, model_prefix=self.prefix, user_defined_symbols=self.special_tokens)
+        spm.SentencePieceTrainer.train(
+            input=str(corpus_file),
+            vocab_size=self.subword_vocab_size,
+            model_prefix=self.prefix,
+            user_defined_symbols=self.special_tokens,
+        )

         # Delete the corpus file
         corpus_file.unlink()

-        # Load the corpus
-        self.sentencepiece_model = spm.SentencePieceProcessor(model_file=f"{self.prefix}.model")
+        # Load the model and return it
+        return spm.SentencePieceProcessor(model_file=f"{self.prefix}.model")

-    def subword_tokenize(self, text: str, enable_sampling=True, alpha=0.1, nbest_size=-1) -> List[str]:
-        """
-        Tokenize into subwords. As sampling is enabled, a text can be tokenized in different ways.
+    def subword_tokenize(self, text: str) -> List[str]:
+        """
+        Tokenize into subwords. Sampling is disabled to ensure reproducibility.
         """
-        tokens = self.sentencepiece_model.encode(text, out_type=str, enable_sampling=enable_sampling, alpha=alpha, nbest_size=nbest_size)
-        # Replace special sentencepiece space token
-        tokens = [t.replace("▁", "⎵") for t in tokens]
+        tokens = self.sentencepiece_model.encode(text, out_type=str)
         # Return encoded tokenized text
         return " ".join(["".join(self.encode(subword)) for subword in tokens])

     def word_tokenize(self, text: str) -> List[str]:
-        """
+        """
         Tokenize text into words
-        Spaces (⎵) and NER tokens are considered as distinct words.
+        Spaces (⎵) and NER tokens are considered as words.
         """
         words = ["".join(self.encode(word)) for word in wordpunct_tokenize(text)]
-        words = " ".join([word + " ⎵" if (i != len(words) - 1 and word not in self.ner_tokens) else word for i, word in enumerate(words)])
+        words = " ".join(
+            [
+                word + f" {self.mapping.space.encoded}"
+                if (i != len(words) - 1 and word not in self.ner_tokens)
+                else word
+                for i, word in enumerate(words)
+            ]
+        )
         return words

     def char_tokenize(self, text: str) -> List[str]:
         """
         Tokenize text into characters
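Before the final hunk, a short standalone sketch (not from the repository) of the subword_vocab_size cap introduced above, to make the arithmetic concrete: the sentencepiece vocabulary is limited to three times the number of distinct whitespace-separated words in the training corpus, so small corpora no longer request the full default of 1000 pieces.

# Standalone illustration of the subword_vocab_size property above.
corpus = ["the cat sat", "the dog sat"]  # toy training corpus
user_subword_vocab_size = 1000           # default from __init__

n_words = len(set(word for doc in corpus for word in doc.split()))
vocab_size = min(user_subword_vocab_size, 3 * n_words)
print(n_words, vocab_size)  # 4 12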
@@ -180,5 +210,7 @@ class Tokenizer():
         return " ".join(self.encode(list(text)))

     def encode(self, text: List[str]) -> List[str]:
+        """
+        Encode special tokens
+        """
         return map(self.mapping.encode_token, text)
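To close, a hedged end-to-end usage sketch of the reworked Tokenizer (training now happens in __init__ and subword tokenization is deterministic). This is not from the repository: it assumes the dan package is importable, that LMTokenMapping() can be constructed with its defaults, and that the toy corpus is large enough for sentencepiece to train the capped vocabulary.

# Hedged usage sketch, not from the repository. Assumes the dan package is
# installed, that LMTokenMapping() has usable defaults, and that the corpus
# is large enough for the capped sentencepiece vocabulary to be trainable.
from pathlib import Path

from dan.datasets.extract.utils import Tokenizer
from dan.utils import LMTokenMapping

outdir = Path("language_model")
outdir.mkdir(parents=True, exist_ok=True)

# Stand-in for self.data["train"].values()
corpus = [
    "the quick brown fox jumps over the lazy dog",
    "pack my box with five dozen liquor jugs",
    "how vexingly quick daft zebras jump",
] * 50

tokenizer = Tokenizer(
    corpus,
    outdir=outdir,
    mapping=LMTokenMapping(),
    tokens=None,              # no NER tokens in this toy run
    subword_vocab_size=1000,  # further capped to 3 * n_words by the property above
)

doc = corpus[0]
print(tokenizer.char_tokenize(doc))  # space-separated characters, spaces encoded
print(tokenizer.word_tokenize(doc))  # words separated by the encoded space symbol
# Deterministic now that sampling is disabled:
assert tokenizer.subword_tokenize(doc) == tokenizer.subword_tokenize(doc)

The constructor writes tmp.txt plus the subword_tokenizer.model and .vocab files under outdir and deletes the temporary corpus file afterwards, matching train_subword_tokenizer above.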