Skip to content
Snippets Groups Projects
Commit 022f6f64 authored by Solene Tarride's avatar Solene Tarride
Browse files

Fix linting

parent d413e1ad
No related branches found
No related tags found
1 merge request!287Support subword and word language models
...@@ -14,7 +14,6 @@ import cv2 ...@@ -14,7 +14,6 @@ import cv2
import numpy as np import numpy as np
from PIL import Image from PIL import Image
from tqdm import tqdm from tqdm import tqdm
from nltk.tokenize import wordpunct_tokenize
from arkindex_export import open_database from arkindex_export import open_database
from dan.datasets.extract.db import ( from dan.datasets.extract.db import (
...@@ -31,12 +30,12 @@ from dan.datasets.extract.exceptions import ( ...@@ -31,12 +30,12 @@ from dan.datasets.extract.exceptions import (
UnknownTokenInText, UnknownTokenInText,
) )
from dan.datasets.extract.utils import ( from dan.datasets.extract.utils import (
Tokenizer,
download_image, download_image,
get_bbox, get_bbox,
insert_token, insert_token,
normalize_linebreaks, normalize_linebreaks,
normalize_spaces, normalize_spaces,
Tokenizer,
) )
from dan.utils import EntityType, LMTokenMapping, parse_tokens from dan.utils import EntityType, LMTokenMapping, parse_tokens
from line_image_extractor.extractor import extract from line_image_extractor.extractor import extract
...@@ -371,20 +370,53 @@ class ArkindexExtractor: ...@@ -371,20 +370,53 @@ class ArkindexExtractor:
# Build LM corpus # Build LM corpus
train_corpus = [text for text in self.data["train"].values()] train_corpus = [text for text in self.data["train"].values()]
tokenizer = Tokenizer(train_corpus, outdir=self.output / "language_model", mapping=self.mapping, tokens=self.tokens) tokenizer = Tokenizer(
tokenizer.train_subword_tokenizer() train_corpus,
self.language_corpus["characters"] = [tokenizer.char_tokenize(doc) for doc in train_corpus] outdir=self.output / "language_model",
self.language_corpus["words"] = [tokenizer.word_tokenize(doc) for doc in train_corpus] mapping=self.mapping,
self.language_corpus["subwords"] = [tokenizer.subword_tokenize(doc) for doc in train_corpus] tokens=self.tokens,
)
self.language_corpus["characters"] = [
tokenizer.char_tokenize(doc) for doc in train_corpus
]
self.language_corpus["words"] = [
tokenizer.word_tokenize(doc) for doc in train_corpus
]
self.language_corpus["subwords"] = [
tokenizer.subword_tokenize(doc) for doc in train_corpus
]
# Build vocabulary # Build vocabulary
word_vocabulary = set([word for doc in self.language_corpus["words"] for word in doc.split(" ")]) word_vocabulary = set(
subword_vocabulary = set([subword for doc in self.language_corpus["subwords"] for subword in doc.split(" ")]) [
word
for doc in self.language_corpus["words"]
for word in doc.split()
if word != ""
]
)
subword_vocabulary = set(
[
subword
for doc in self.language_corpus["subwords"]
for subword in doc.split()
if subword != ""
]
)
# Build LM lexicon # Build LM lexicon
self.language_lexicon["chars"] = [f"{token} {tokenizer.char_tokenize(token)}" for token in self.language_tokens] self.language_lexicon["characters"] = [
self.language_lexicon["words"] = [f"{word} {tokenizer.char_tokenize(word)}" for word in word_vocabulary] f"{token} {token}" for token in self.language_tokens
self.language_lexicon["subwords"] = [f"{subword} {tokenizer.char_tokenize(subword)}" for subword in subword_vocabulary] ]
self.language_lexicon["words"] = [
f"{word} {tokenizer.char_tokenize(word)}"
for word in sorted(word_vocabulary)
if word != ""
]
self.language_lexicon["subwords"] = [
f"{subword} {tokenizer.char_tokenize(subword)}"
for subword in sorted(subword_vocabulary)
]
def export(self): def export(self):
(self.output / "labels.json").write_text( (self.output / "labels.json").write_text(
......
...@@ -2,9 +2,12 @@ ...@@ -2,9 +2,12 @@
import logging import logging
import re import re
from io import BytesIO from io import BytesIO
from pathlib import Path
from typing import List from typing import List
import requests import requests
import sentencepiece as spm
from nltk import wordpunct_tokenize
from PIL import Image, ImageOps from PIL import Image, ImageOps
from tenacity import ( from tenacity import (
retry, retry,
...@@ -12,10 +15,8 @@ from tenacity import ( ...@@ -12,10 +15,8 @@ from tenacity import (
stop_after_attempt, stop_after_attempt,
wait_exponential, wait_exponential,
) )
from pathlib import Path
from dan.utils import EntityType from dan.utils import EntityType, LMTokenMapping
import sentencepiece as spm
from nltk import wordpunct_tokenize
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -119,68 +120,97 @@ def get_bbox(polygon: List[List[int]]) -> str: ...@@ -119,68 +120,97 @@ def get_bbox(polygon: List[List[int]]) -> str:
return ",".join(list(map(str, [int(x), int(y), int(width), int(height)]))) return ",".join(list(map(str, [int(x), int(y), int(width), int(height)])))
class Tokenizer:
    """
    A multi-level tokenizer (char, subword, word).

    The subword tokenizer is trained with sentencepiece on the training
    corpus; tokenized output is re-encoded through `mapping` so that special
    characters (e.g. spaces) use the language-model token encoding.
    """

    def __init__(
        self,
        training_corpus: List[str],
        outdir: Path,
        mapping: LMTokenMapping,
        tokens: EntityType = None,
        subword_vocab_size: int = 1000,
    ) -> None:
        """
        :param training_corpus: One text per training document.
        :param outdir: Directory where the sentencepiece model files are written.
        :param mapping: Token mapping used to encode special characters.
        :param tokens: Optional NER entity tokens (start/end markers); presumably
            a dict of EntityType values, since `ner_tokens` calls `.values()` —
            TODO confirm against callers.
        :param subword_vocab_size: Requested subword vocabulary size (may be
            capped by the actual corpus size, see `subword_vocab_size`).
        """
        self.corpus = training_corpus
        self.outdir = outdir
        self.prefix = f"{self.outdir}/subword_tokenizer"
        self.tokens = tokens
        self.mapping = mapping
        self.user_subword_vocab_size = subword_vocab_size
        # Train the subword tokenizer once, up front.
        self.sentencepiece_model = self.train_subword_tokenizer()

    @property
    def ner_tokens(self) -> List[str]:
        """All NER start tokens plus the non-empty end tokens."""
        if self.tokens is None:
            return []
        return [entity.start for entity in self.tokens.values()] + [
            entity.end for entity in self.tokens.values() if entity.end != ""
        ]

    @property
    def mapping_tokens(self) -> List[str]:
        """Encoded forms of the special tokens defined by the mapping."""
        return [token.encoded for token in self.mapping]

    @property
    def special_tokens(self) -> List[str]:
        """Deduplicated union of NER tokens and mapping tokens."""
        return list(set(self.ner_tokens + self.mapping_tokens))

    @property
    def subword_vocab_size(self) -> int:
        """
        Effective sentencepiece vocabulary size: the requested size, capped at
        three times the number of distinct words in the corpus (sentencepiece
        fails when the requested vocabulary exceeds what the corpus supports).
        """
        n_words = len({word for doc in self.corpus for word in doc.split()})
        return min(self.user_subword_vocab_size, 3 * n_words)

    def train_subword_tokenizer(self):
        """
        Train a sentencepiece model on the training corpus.

        :return: The trained ``spm.SentencePieceProcessor``.
        """
        # sentencepiece needs its input as a file on disk; write a temporary one.
        corpus_file = self.outdir / "tmp.txt"
        corpus_file.write_text("\n".join(self.corpus))

        logger.info("Training sentencepiece model for subword tokenization")
        spm.SentencePieceTrainer.train(
            input=str(corpus_file),
            vocab_size=self.subword_vocab_size,
            model_prefix=self.prefix,
            user_defined_symbols=self.special_tokens,
        )

        # The corpus file is only needed during training.
        corpus_file.unlink()

        # Load the trained model and return it
        return spm.SentencePieceProcessor(model_file=f"{self.prefix}.model")

    def subword_tokenize(self, text: str) -> str:
        """
        Tokenize into subwords. Sampling is disabled to ensure reproducibility.

        :return: Space-separated, mapping-encoded subword tokens.
        """
        tokens = self.sentencepiece_model.encode(text, out_type=str)
        # Strip the sentencepiece word-boundary meta-symbol U+2581 ("▁").
        # NOTE(review): the reviewed revision had `t.replace("", "")`, a no-op —
        # the meta-symbol was evidently lost in transcription; restored here.
        tokens = [t.replace("\u2581", "") for t in tokens]
        return " ".join(["".join(self.encode(subword)) for subword in tokens])

    def word_tokenize(self, text: str) -> str:
        """
        Tokenize text into words.

        Spaces (as their mapping-encoded token) and NER tokens are considered
        as words. An encoded space token is inserted after every word except
        the last one and except NER tokens.

        :return: Space-separated, mapping-encoded word tokens.
        """
        words = ["".join(self.encode(word)) for word in wordpunct_tokenize(text)]
        return " ".join(
            [
                word + f" {self.mapping.space.encoded}"
                if (i != len(words) - 1 and word not in self.ner_tokens)
                else word
                for i, word in enumerate(words)
            ]
        )

    def char_tokenize(self, text: str) -> str:
        """
        Tokenize text into characters.

        :return: Space-separated, mapping-encoded characters.
        """
        return " ".join(self.encode(list(text)))

    def encode(self, text: List[str]):
        """
        Encode special tokens through the mapping.

        :return: A lazy ``map`` over the encoded tokens (not a list).
        """
        return map(self.mapping.encode_token, text)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment