Commit 022f6f64 authored by Solene Tarride

Fix linting

parent d413e1ad
1 merge request: !287 Support subword and word language models
@@ -14,7 +14,6 @@ import cv2
import numpy as np
from PIL import Image
from tqdm import tqdm
from nltk.tokenize import wordpunct_tokenize
from arkindex_export import open_database
from dan.datasets.extract.db import (
@@ -31,12 +30,12 @@ from dan.datasets.extract.exceptions import (
UnknownTokenInText,
)
from dan.datasets.extract.utils import (
Tokenizer,
download_image,
get_bbox,
insert_token,
normalize_linebreaks,
normalize_spaces,
Tokenizer,
)
from dan.utils import EntityType, LMTokenMapping, parse_tokens
from line_image_extractor.extractor import extract
@@ -371,20 +370,53 @@ class ArkindexExtractor:
# Build LM corpus
train_corpus = [text for text in self.data["train"].values()]
tokenizer = Tokenizer(train_corpus, outdir=self.output / "language_model", mapping=self.mapping, tokens=self.tokens)
tokenizer.train_subword_tokenizer()
self.language_corpus["characters"] = [tokenizer.char_tokenize(doc) for doc in train_corpus]
self.language_corpus["words"] = [tokenizer.word_tokenize(doc) for doc in train_corpus]
self.language_corpus["subwords"] = [tokenizer.subword_tokenize(doc) for doc in train_corpus]
tokenizer = Tokenizer(
train_corpus,
outdir=self.output / "language_model",
mapping=self.mapping,
tokens=self.tokens,
)
self.language_corpus["characters"] = [
tokenizer.char_tokenize(doc) for doc in train_corpus
]
self.language_corpus["words"] = [
tokenizer.word_tokenize(doc) for doc in train_corpus
]
self.language_corpus["subwords"] = [
tokenizer.subword_tokenize(doc) for doc in train_corpus
]
# Build vocabulary
word_vocabulary = set([word for doc in self.language_corpus["words"] for word in doc.split(" ")])
subword_vocabulary = set([subword for doc in self.language_corpus["subwords"] for subword in doc.split(" ")])
word_vocabulary = set(
[
word
for doc in self.language_corpus["words"]
for word in doc.split()
if word != ""
]
)
subword_vocabulary = set(
[
subword
for doc in self.language_corpus["subwords"]
for subword in doc.split()
if subword != ""
]
)
# Build LM lexicon
self.language_lexicon["chars"] = [f"{token} {tokenizer.char_tokenize(token)}" for token in self.language_tokens]
self.language_lexicon["words"] = [f"{word} {tokenizer.char_tokenize(word)}" for word in word_vocabulary]
self.language_lexicon["subwords"] = [f"{subword} {tokenizer.char_tokenize(subword)}" for subword in subword_vocabulary]
self.language_lexicon["characters"] = [
f"{token} {token}" for token in self.language_tokens
]
self.language_lexicon["words"] = [
f"{word} {tokenizer.char_tokenize(word)}"
for word in sorted(word_vocabulary)
if word != ""
]
self.language_lexicon["subwords"] = [
f"{subword} {tokenizer.char_tokenize(subword)}"
for subword in sorted(subword_vocabulary)
]
def export(self):
(self.output / "labels.json").write_text(
......
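The hunk above builds the language-model corpus at three granularities (characters, words, subwords) and derives a lexicon that maps every vocabulary entry to its character tokenization. As a rough illustration only (not code from this commit), the resulting shapes look like the sketch below, assuming a toy two-document corpus, a stand-in char_tokenize helper, and the ⎵ space token from the LM mapping:

# Sketch only: the real code uses Tokenizer from dan.datasets.extract.utils.
train_corpus = ["the cat", "the dog"]

def char_tokenize(text: str) -> str:
    # Space-separated characters, with " " encoded as the ⎵ token.
    return " ".join("⎵" if c == " " else c for c in text)

corpus_characters = [char_tokenize(doc) for doc in train_corpus]
# ['t h e ⎵ c a t', 't h e ⎵ d o g']

corpus_words = [doc.replace(" ", " ⎵ ") for doc in train_corpus]
# ['the ⎵ cat', 'the ⎵ dog']

# Lexicon: one line per vocabulary entry, followed by its character tokenization.
word_vocabulary = sorted({w for doc in corpus_words for w in doc.split() if w})
lexicon_words = [f"{word} {char_tokenize(word)}" for word in word_vocabulary]
# ['cat c a t', 'dog d o g', 'the t h e', '⎵ ⎵']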
@@ -2,9 +2,12 @@
import logging
import re
from io import BytesIO
from pathlib import Path
from typing import List
import requests
import sentencepiece as spm
from nltk import wordpunct_tokenize
from PIL import Image, ImageOps
from tenacity import (
retry,
@@ -12,10 +15,8 @@ from tenacity import (
stop_after_attempt,
wait_exponential,
)
from pathlib import Path
from dan.utils import EntityType
import sentencepiece as spm
from nltk import wordpunct_tokenize
from dan.utils import EntityType, LMTokenMapping
logger = logging.getLogger(__name__)
@@ -119,68 +120,97 @@ def get_bbox(polygon: List[List[int]]) -> str:
return ",".join(list(map(str, [int(x), int(y), int(width), int(height)])))
class Tokenizer():
"""
class Tokenizer:
"""
A multi-level tokenizer (char, subword, word)
Subword tokenizer is trained using sentencepiece.
"""
def __init__(self, training_corpus, outdir, mapping, tokens=[]) -> None:
def __init__(
self,
training_corpus: List[str],
outdir: Path,
mapping: LMTokenMapping,
tokens: EntityType = None,
subword_vocab_size: int = 1000,
) -> None:
self.corpus = training_corpus
self.outdir = outdir
self.prefix = f"{self.outdir}/subword_tokenizer"
self.sentencepiece_model = None
self.mapping = mapping
self.tokens = tokens
self.mapping = mapping
# Train the subword tokenizer
self.user_subword_vocab_size = subword_vocab_size
self.sentencepiece_model = self.train_subword_tokenizer()
@property
def ner_tokens(self):
return [entity.start for entity in self.tokens.values()] + [entity.end for entity in self.tokens.values() if entity.end != ""]
def ner_tokens(self) -> List[str]:
if self.tokens is None:
return []
return [entity.start for entity in self.tokens.values()] + [
entity.end for entity in self.tokens.values() if entity.end != ""
]
@property
def mapping_tokens(self):
def mapping_tokens(self) -> List[str]:
return [token.encoded for token in self.mapping]
@property
def special_tokens(self):
def special_tokens(self) -> List[str]:
return list(set(self.ner_tokens + self.mapping_tokens))
@property
def subword_vocab_size(self):
n_words = len(set([word for doc in self.corpus for word in doc.split()]))
return min(self.user_subword_vocab_size, 3 * n_words)
def train_subword_tokenizer(self):
"""
"""
Train a sentencepiece model on the training corpus.
"""
# Write the corpus in a text file
corpus_file = Path(self.outdir / f"tmp_training_corpus.txt")
corpus_file = Path(self.outdir / "tmp.txt")
corpus_file.write_text("\n".join(self.corpus))
# Train the tokenizer and load it
logger.info("Training sentencepiece model for subword tokenization")
spm.SentencePieceTrainer.train(input=str(corpus_file), vocab_size=1000, model_prefix=self.prefix, user_defined_symbols=self.special_tokens)
spm.SentencePieceTrainer.train(
input=str(corpus_file),
vocab_size=self.subword_vocab_size,
model_prefix=self.prefix,
user_defined_symbols=self.special_tokens,
)
# Delete the corpus file
corpus_file.unlink()
# Load the corpus
self.sentencepiece_model = spm.SentencePieceProcessor(model_file=f"{self.prefix}.model")
# Load the model and return it
return spm.SentencePieceProcessor(model_file=f"{self.prefix}.model")
def subword_tokenize(self, text: str, enable_sampling=True, alpha=0.1, nbest_size=-1) -> List[str]:
"""
Tokenize into subwords. As sampling is enabled, a text can be tokenized in different ways.
def subword_tokenize(self, text: str) -> List[str]:
"""
Tokenize into subwords. Sampling is disabled to ensure reproducibility.
"""
tokens = self.sentencepiece_model.encode(text, out_type=str, enable_sampling=enable_sampling, alpha=alpha, nbest_size=nbest_size)
# Replace special sentencepiece space token
tokens = [t.replace("▁", "") for t in tokens]
tokens = self.sentencepiece_model.encode(text, out_type=str)
# Return encoded tokenized text
return " ".join(["".join(self.encode(subword)) for subword in tokens])
def word_tokenize(self, text: str) -> List[str]:
"""
"""
Tokenize text into words
Spaces (⎵) and NER tokens are considered as distinct words.
Spaces (⎵) and NER tokens are considered as words.
"""
words = ["".join(self.encode(word)) for word in wordpunct_tokenize(text)]
words = " ".join([word + "" if (i != len(words) - 1 and word not in self.ner_tokens) else word for i, word in enumerate(words)])
words = " ".join(
[
word + f" {self.mapping.space.encoded}"
if (i != len(words) - 1 and word not in self.ner_tokens)
else word
for i, word in enumerate(words)
]
)
return words
def char_tokenize(self, text: str) -> List[str]:
"""
Tokenize text into characters
@@ -188,5 +218,7 @@ class Tokenizer():
return " ".join(self.encode(list(text)))
def encode(self, text: List[str]) -> List[str]:
"""
Encode special tokens
"""
return map(self.mapping.encode_token, text)
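A usage sketch for the Tokenizer above (file names and paths are hypothetical; assumes LMTokenMapping can be instantiated with its defaults). The subword model is trained once at construction time, after which each tokenization level can be queried independently:

from pathlib import Path

from dan.datasets.extract.utils import Tokenizer
from dan.utils import LMTokenMapping

outdir = Path("language_model")
outdir.mkdir(parents=True, exist_ok=True)

# Hypothetical training file: one transcription per line.
corpus = Path("train_transcriptions.txt").read_text().splitlines()

tokenizer = Tokenizer(
    training_corpus=corpus,
    outdir=outdir,
    mapping=LMTokenMapping(),
    subword_vocab_size=1000,  # capped at 3x the number of distinct words in the corpus
)

line = corpus[0]
print(tokenizer.char_tokenize(line))     # space-separated characters, " " encoded as ⎵
print(tokenizer.word_tokenize(line))     # words separated by the ⎵ token
print(tokenizer.subword_tokenize(line))  # sentencepiece subwords, character-encoded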