Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • atr/dan
1 result
Show changes
Commits on Source (29)
......@@ -30,6 +30,7 @@ from dan.datasets.extract.exceptions import (
UnknownTokenInText,
)
from dan.datasets.extract.utils import (
Tokenizer,
download_image,
get_bbox,
insert_token,
......@@ -97,9 +98,9 @@ class ArkindexExtractor:
self.data: Dict = defaultdict(dict)
self.charset = set()
self.language_corpus = []
self.language_corpus = defaultdict(list)
self.language_tokens = []
self.language_lexicon = []
self.language_lexicon = defaultdict(list)
# Image download tasks to process
self.tasks: List[Dict[str, str]] = []
......@@ -275,12 +276,6 @@ class ArkindexExtractor:
)
return text.strip()
def format_text_language_model(self, text: str):
"""
Format text for the language model. Return the text tokenized at character-level.
"""
return " ".join(map(self.mapping.encode_token, list(text.strip())))
def process_element(
self,
element: Element,
......@@ -319,10 +314,6 @@ class ArkindexExtractor:
self.data[split][str(image_path)] = text
self.charset = self.charset.union(set(text))
# Language model should be built using only text from the training set
if split == "train":
self.language_corpus.append(self.format_text_language_model(text))
def process_parent(
self,
pbar,
......@@ -361,6 +352,9 @@ class ArkindexExtractor:
"""
Convert charset to a LM-compatible charset. Ensure that special LM tokens do not appear in the charset.
"""
logger.info("Preparing language resources")
# Build LM tokens
for token in sorted(list(self.charset)):
assert (
token not in self.mapping.encode.values()
......@@ -369,14 +363,60 @@ class ArkindexExtractor:
self.mapping.encode[token]
) if token in self.mapping.encode else self.language_tokens.append(token)
# Add the special blank token
self.language_tokens.append(self.mapping.ctc.encoded)
# Build lexicon
assert all(
[len(token) == 1 for token in self.language_lexicon]
), "Tokens should be single characters."
self.language_lexicon = [f"{token} {token}" for token in self.language_tokens]
# Build LM corpus
train_corpus = [text for text in self.data["train"].values()]
tokenizer = Tokenizer(
train_corpus,
outdir=self.output / "language_model",
mapping=self.mapping,
tokens=self.tokens,
)
self.language_corpus["characters"] = [
tokenizer.char_tokenize(doc) for doc in train_corpus
]
self.language_corpus["words"] = [
tokenizer.word_tokenize(doc) for doc in train_corpus
]
self.language_corpus["subwords"] = [
tokenizer.subword_tokenize(doc) for doc in train_corpus
]
# Build vocabulary
word_vocabulary = set(
[
word
for doc in self.language_corpus["words"]
for word in doc.split()
if word != ""
]
)
subword_vocabulary = set(
[
subword
for doc in self.language_corpus["subwords"]
for subword in doc.split()
if subword != ""
]
)
# Build LM lexicon
self.language_lexicon["characters"] = [
f"{token} {token}" for token in self.language_tokens
]
self.language_lexicon["words"] = [
f"{word} {tokenizer.char_tokenize(word)}"
for word in sorted(word_vocabulary)
if word != ""
]
self.language_lexicon["subwords"] = [
f"{subword} {tokenizer.char_tokenize(subword)}"
for subword in sorted(subword_vocabulary)
]
def export(self):
(self.output / "labels.json").write_text(
......@@ -386,15 +426,16 @@ class ArkindexExtractor:
indent=4,
)
)
(self.output / "language_model" / "corpus.txt").write_text(
"\n".join(self.language_corpus)
)
for level in ["characters", "words", "subwords"]:
(self.output / "language_model" / f"corpus_{level}.txt").write_text(
"\n".join(self.language_corpus[level])
)
(self.output / "language_model" / f"lexicon_{level}.txt").write_text(
"\n".join(self.language_lexicon[level])
)
(self.output / "language_model" / "tokens.txt").write_text(
"\n".join(self.language_tokens)
)
(self.output / "language_model" / "lexicon.txt").write_text(
"\n".join(self.language_lexicon)
)
(self.output / "charset.pkl").write_bytes(
pickle.dumps(sorted(list(self.charset)))
)
......
......@@ -2,9 +2,12 @@
import logging
import re
from io import BytesIO
from pathlib import Path
from typing import List
import requests
import sentencepiece as spm
from nltk import wordpunct_tokenize
from PIL import Image, ImageOps
from tenacity import (
retry,
......@@ -13,7 +16,7 @@ from tenacity import (
wait_exponential,
)
from dan.utils import EntityType
from dan.utils import EntityType, LMTokenMapping
logger = logging.getLogger(__name__)
......@@ -92,18 +95,16 @@ def insert_token(text: str, entity_type: EntityType, offset: int, length: int) -
def normalize_linebreaks(text: str) -> str:
"""
Remove begin/ending linebreaks.
Replace \r with regular linebreak and consecutive linebreaks.
:param text: Text to normalize.
Remove begin/ending linebreaks
Replace \r with regular linebreak and consecutive linebreaks
"""
return TRIM_RETURN_REGEX.sub("\n", text.strip())
def normalize_spaces(text: str) -> str:
"""
Remove begin/ending spaces.
Replace \t with regular space and consecutive spaces.
:param text: Text to normalize.
Remove begin/ending spaces
Replace \t with regular space and consecutive spaces
"""
return TRIM_SPACE_REGEX.sub(" ", text.strip())
......@@ -117,3 +118,107 @@ def get_bbox(polygon: List[List[int]]) -> str:
x, y = min(all_x), min(all_y)
width, height = max(all_x) - x, max(all_y) - y
return ",".join(list(map(str, [int(x), int(y), int(width), int(height)])))
class Tokenizer:
"""
A multi-level tokenizer (char, subword, word)
Subword tokenizer is trained using sentencepiece.
"""
def __init__(
self,
training_corpus: List[str],
outdir: Path,
mapping: LMTokenMapping,
tokens: EntityType = None,
subword_vocab_size: int = 1000,
) -> None:
self.corpus = training_corpus
self.outdir = outdir
self.prefix = f"{self.outdir}/subword_tokenizer"
self.tokens = tokens
self.mapping = mapping
# Train the subword tokenizer
self.user_subword_vocab_size = subword_vocab_size
self.sentencepiece_model = self.train_subword_tokenizer()
@property
def ner_tokens(self) -> List[str]:
if self.tokens is None:
return []
return [entity.start for entity in self.tokens.values()] + [
entity.end for entity in self.tokens.values() if entity.end != ""
]
@property
def mapping_tokens(self) -> List[str]:
return [token.encoded for token in self.mapping]
@property
def special_tokens(self) -> List[str]:
return list(set(self.ner_tokens + self.mapping_tokens))
@property
def subword_vocab_size(self):
n_words = len(set([word for doc in self.corpus for word in doc.split()]))
return min(self.user_subword_vocab_size, 3 * n_words)
def train_subword_tokenizer(self):
"""
Train a sentencepiece model on the training corpus.
"""
# Write the corpus in a text file
corpus_file = Path(self.outdir / "tmp.txt")
corpus_file.write_text("\n".join(self.corpus))
# Train the tokenizer and load it
logger.info("Training sentencepiece model for subword tokenization")
spm.SentencePieceTrainer.train(
input=str(corpus_file),
vocab_size=self.subword_vocab_size,
model_prefix=self.prefix,
user_defined_symbols=self.special_tokens,
)
# Delete the corpus file
corpus_file.unlink()
# Load the model and return it
return spm.SentencePieceProcessor(model_file=f"{self.prefix}.model")
def subword_tokenize(self, text: str) -> List[str]:
"""
Tokenize into subwords. Sampling is disabled to ensure reproducibility.
"""
tokens = self.sentencepiece_model.encode(text, out_type=str)
# Return encoded tokenized text
return " ".join(["".join(self.encode(subword)) for subword in tokens])
def word_tokenize(self, text: str) -> List[str]:
"""
Tokenize text into words
Spaces (⎵) and NER tokens are considered as words.
"""
words = ["".join(self.encode(word)) for word in wordpunct_tokenize(text)]
words = " ".join(
[
word + f" {self.mapping.space.encoded}"
if (i != len(words) - 1 and word not in self.ner_tokens)
else word
for i, word in enumerate(words)
]
)
return words
def char_tokenize(self, text: str) -> List[str]:
"""
Tokenize text into characters
"""
return " ".join(self.encode(list(text)))
def encode(self, text: List[str]) -> List[str]:
"""
Encode special tokens
"""
return map(self.mapping.encode_token, text)
......@@ -470,13 +470,17 @@ class GlobalHTADecoder(Module):
class CTCLanguageDecoder:
"""
Initialize a CTC decoder with n-gram language modeling.
:param language_model_path: Path to a KenLM or ARPA language model.
:param lexicon_path: Path to a lexicon file containing the possible words and corresponding spellings.
Each line consists of a word and its space separated spelling. If `None`, uses lexicon-free decoding.
:param tokens_path: Path to a file containing valid tokens. If using a file, the expected
format is for tokens mapping to the same index to be on the same line.
:param language_model_weight: Weight of the language model.
:param temperature: Temperature for model calibreation.
Args:
language_model_path (str): path to a KenLM or ARPA language model
lexicon_path (str): path to a lexicon file containing the possible words and corresponding spellings.
Each line consists of a word and its space separated spelling. If `None`, uses lexicon-free
decoding.
tokens_path (str): path to a file containing valid tokens. If using a file, the expected
format is for tokens mapping to the same index to be on the same line
language_model_weight (float): weight of the language model.
blank_token (str): token representing the blank/ctc symbol
unk_token (str): token representing unknown characters
sil_token (str): token representing the space character
"""
def __init__(
......@@ -495,9 +499,6 @@ class CTCLanguageDecoder:
}
self.index_to_token = {i: token for token, i in self.tokens_to_index.items()}
self.blank_token_id = self.tokens_to_index[self.mapping.ctc.encoded]
# Torchaudio's decoder
# https://pytorch.org/audio/master/generated/torchaudio.models.decoder.ctc_decoder.html
self.decoder = ctc_decoder(
lm=language_model_path,
lexicon=lexicon_path,
......@@ -515,7 +516,7 @@ class CTCLanguageDecoder:
self, batch_features: torch.FloatTensor, batch_frames: torch.LongTensor
) -> tuple[torch.FloatTensor, torch.LongTensor]:
"""
Add CTC frames between each characters to avoid duplicate removal.
Add CTC frames between each characters to avoid duplicate removal
"""
high_prob = batch_features.max()
low_prob = batch_features.min()
......@@ -562,9 +563,6 @@ class CTCLanguageDecoder:
) -> Dict[str, List[Union[str, float]]]:
"""
Post-process hypotheses to output JSON. Exports only the best hypothesis for each image.
:param hypotheses: List of hypotheses returned by the decoder.
:param batch_sizes: Prediction length of size batch_size.
:return: A dictionary containing the hypotheses and their confidences.
"""
out = {}
# Replace <space> by an actual space and format string
......@@ -597,9 +595,11 @@ class CTCLanguageDecoder:
) -> Dict[str, List[Union[str, float]]]:
"""
Decode a feature vector using n-gram language modelling.
:param batch_features: Feature vector of size (batch_size, n_tokens, n_frames).
:param batch_frames: Prediction length of size batch_size.
:return: A dictionary containing the hypotheses and their confidences.
Args:
features: Feature vector of size (batch_size, n_tokens, n_frames).
batch_sizes: Prediction length of size (batch_size)
Returns:
a dictionary containing the hypotheses and their confidences
"""
# Reshape from (batch_size, n_tokens, n_frames) to (batch_size, n_frames, n_tokens)
batch_features = batch_features.permute((0, 2, 1))
......
......@@ -479,6 +479,7 @@ def run(
dan_model.load(
model, parameters, charset, mode="eval", use_language_model=use_language_model
)
batch_size = 1 if use_language_model else batch_size
# Do not use LM with invalid LM weight
use_language_model = dan_model.lm_decoder is not None
......
......@@ -22,7 +22,7 @@ class Token(NamedTuple):
class LMTokenMapping(NamedTuple):
space: Token = Token("", " ")
space: Token = Token("", " ")
linebreak: Token = Token("", "\n")
ctc: Token = Token("", "<ctc>")
......@@ -163,9 +163,7 @@ def read_json(json_path: str) -> Dict:
def read_txt(txt_path: str) -> str:
"""
Read TXT file.
:param txt_path: Path of the text file to read.
:return: The content of the read file.
Read TXT file
"""
filename = Path(txt_path)
assert filename.exists(), f"{txt_path} does not resolve."
......
......@@ -21,9 +21,9 @@ output/
│ ├── val
│ └── test
├── language_model
│ ├── corpus.txt
│ ├── lexicon.txt
│ └── tokens.txt
│ ├── language_corpus.txt
│ ├── language_lexicon.txt
│ └── language_tokens.txt
```
## 2. Train
......
# Predict
Use the `teklia-dan predict` command to apply a trained DAN model on an image.
## Description of parameters
| Parameter | Description | Type | Default |
| --------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | ------------- |
| `--image` | Path to the image to predict. Must not be provided with `--image-dir`. | `Path` | |
| `--image-dir` | Path to the folder where the images to predict are stored. Must not be provided with `--image`. | `Path` | |
| `--image-extension` | The extension of the images in the folder. Ignored if `--image-dir` is not provided. | `str` | .jpg |
| `--model` | Path to the model to use for prediction | `Path` | |
| `--parameters` | Path to the YAML parameters file. | `Path` | |
| `--charset` | Path to the charset file. | `Path` | |
| `--output` | Path to the output folder. Results will be saved in this directory. | `Path` | |
| `--confidence-score` | Whether to return confidence scores. | `bool` | `False` |
| `--confidence-score-levels` | Level to return confidence scores. Should be any combination of `["line", "word", "char"]`. | `str` | |
| `--attention-map` | Whether to plot attention maps. | `bool` | `False` |
| `--attention-map-scale` | Image scaling factor before creating the GIF. | `float` | `0.5` |
| `--attention-map-level` | Level to plot the attention maps. Should be in `["line", "word", "char"]`. | `str` | `"line"` |
| `--predict-objects` | Whether to return polygons coordinates. | `bool` | `False` |
| `--word-separators` | List of word separators. | `list` | `[" ", "\n"]` |
| `--line-separators` | List of line separators. | `list` | `["\n"]` |
| `--threshold-method` | Method to use for attention mask thresholding. Should be in `["otsu", "simple"]`. | `str` | `"otsu"` |
| `--threshold-value ` | Threshold to use for the "simple" thresholding method. | `int` | `0` |
| `--batch-size ` | Size of the batches for prediction. | `int` | `1` |
| `--start-token ` | Use a specific starting token at the beginning of the prediction. Useful when making predictions on different single pages. | `str` | `None` |
| `--use-language-model` | Whether to use an external n-gram language model to rescore hypotheses. See [the dedicated example](#predict-with-an-external-n-gram-language-model) for details. | `bool` | `False` |
## Examples
### Predict with confidence scores
To run a prediction with confidence scores, run this command:
```shell
teklia-dan predict \
--image dan_humu_page/example.jpg \
--model dan_humu_page/model.pt \
--parameters dan_humu_page/parameters.yml \
--charset dan_humu_page/charset.pkl \
--output dan_humu_page/predict/ \
--confidence-score
```
It will create the following JSON file named `dan_humu_page/predict/example.json`
```json
{
"text": "Hansteensgt. 2 IV 28/4 - 19\nKj\u00e6re Gerhard.\nTak for Brevet om Boken og Haven\nog Crokus og Blaaveis og tak fordi\nDu vilde be mig derut sammen\nmed Kris og Ragna. Men vet Du\nda ikke, at Kris reiste med sin S\u00f8-\nster Fru Cr\u00f8ger til Lillehammer\nnogle Dage efter Begravelsen? Hen\ndes Address er Amtsingeni\u00f8r\nCr\u00f8ger. Hun skriver at de blir\nder til lidt ut i Mai. Nu er hun\nnoksaa medtat skj\u00f8nner jeg af Sorg\nog af L\u00e6ngsel, skriver saameget r\u00f8-\nrende om Oluf. Ragna har det\nherligt, skriver hun. Hun er bare\ngla, og det vet jeg, at \"Oluf er gla over,\nder hvor han nu er. Jeg har saa in-\nderlig ondt af hende, og om Du skrev\net Par Ord tror jeg det vilde gj\u00f8re\nhende godt. - Jeg gl\u00e6der mig over,\nat Du har skrevet en Bok, og\njeg er vis paa, at den er god.",
"confidence": 0.99
}
```
### Predict with confidence scores and line-level attention maps
To run a prediction with confidence scores and plot line-level attention maps, run this command:
```shell
teklia-dan predict \
--image dan_humu_page/example.jpg \
--model dan_humu_page/model.pt \
--parameters dan_humu_page/parameters.yml \
--charset dan_humu_page/charset.pkl \
--output dan_humu_page/predict/ \
--confidence-score \
--attention-map \
```
It will create the following JSON file named `dan_humu_page/predict/example.json` and a GIF showing a word-level attention map `dan_humu_page/predict/example_line.gif`
```json
{
"text": "Hansteensgt. 2 IV 28/4 - 19\nKj\u00e6re Gerhard.\nTak for Brevet om Boken og Haven\nog Crokus og Blaaveis og tak fordi\nDu vilde be mig derut sammen\nmed Kris og Ragna. Men vet Du\nda ikke, at Kris reiste med sin S\u00f8-\nster Fru Cr\u00f8ger til Lillehammer\nnogle Dage efter Begravelsen? Hen\ndes Address er Amtsingeni\u00f8r\nCr\u00f8ger. Hun skriver at de blir\nder til lidt ut i Mai. Nu er hun\nnoksaa medtat skj\u00f8nner jeg af Sorg\nog af L\u00e6ngsel, skriver saameget r\u00f8-\nrende om Oluf. Ragna har det\nherligt, skriver hun. Hun er bare\ngla, og det vet jeg, at \"Oluf er gla over,\nder hvor han nu er. Jeg har saa in-\nderlig ondt af hende, og om Du skrev\net Par Ord tror jeg det vilde gj\u00f8re\nhende godt. - Jeg gl\u00e6der mig over,\nat Du har skrevet en Bok, og\njeg er vis paa, at den er god.",
"confidence": 0.99,
"attention_gif": "dan_humu_page/predict/example_line.gif"
}
```
<img src="../../assets/example_line.gif" />
### Predict with confidence scores and word-level attention maps
To run a prediction with confidence scores and plot word-level attention maps, run this command:
```shell
teklia-dan predict \
--image dan_humu_page/example.jpg \
--model dan_humu_page/model.pt \
--parameters dan_humu_page/parameters.yml \
--charset dan_humu_page/charset.pkl \
--output dan_humu_page/predict/ \
--confidence-score \
--attention-map \
--attention-map-level word \
--attention-map-scale 0.5
```
It will create the following JSON file named `dan_humu_page/predict/example.json` and a GIF showing a word-level attention map `dan_humu_page/predict/example_word.gif`.
```json
{
"text": "Hansteensgt. 2 IV 28/4 - 19\nKj\u00e6re Gerhard.\nTak for Brevet om Boken og Haven\nog Crokus og Blaaveis og tak fordi\nDu vilde be mig derut sammen\nmed Kris og Ragna. Men vet Du\nda ikke, at Kris reiste med sin S\u00f8-\nster Fru Cr\u00f8ger til Lillehammer\nnogle Dage efter Begravelsen? Hen\ndes Address er Amtsingeni\u00f8r\nCr\u00f8ger. Hun skriver at de blir\nder til lidt ut i Mai. Nu er hun\nnoksaa medtat skj\u00f8nner jeg af Sorg\nog af L\u00e6ngsel, skriver saameget r\u00f8-\nrende om Oluf. Ragna har det\nherligt, skriver hun. Hun er bare\ngla, og det vet jeg, at \"Oluf er gla over,\nder hvor han nu er. Jeg har saa in-\nderlig ondt af hende, og om Du skrev\net Par Ord tror jeg det vilde gj\u00f8re\nhende godt. - Jeg gl\u00e6der mig over,\nat Du har skrevet en Bok, og\njeg er vis paa, at den er god.",
"confidence": 0.99,
"attention_gif": "dan_humu_page/predict/example_word.gif"
}
```
<img src="../../assets/example_word.gif" >
### Predict with line-level attention maps and extract polygons
To run a prediction, plot line-level attention maps, and extract polygons, run this command:
```shell
teklia-dan predict \
--image dan_humu_page/example.jpg \
--model dan_humu_page/model.pt \
--parameters dan_humu_page/parameters.yml \
--charset dan_humu_page/charset.pkl \
--output dan_humu_page/predict/ \
--attention-map \
--predict-objects \
--threshold-method otsu
```
It will create the following JSON file named `dan_humu_page/predict/example.json` and a GIF showing a line-level attention map with extracted polygons `dan_humu_page/predict/example_line.gif`
```json
{
"text": "Oslo\n39 \nOresden den 24te Rasser!\nH\u00f8jst\u00e6redesherr Hartvig - assert!\nUllereder fra den f\u00f8rste tide da\njeg havder den tilfredsstillelser at vide den ar-\ndistiske ledelser af Kristiania theater i Deres\nhronder, har jeg g\u00e5t hernede med et stille\nh\u00e5b om fra Dem at modtage et forelag, sig -\nsende tils at lade \"K\u00e6rlighedens \u00abKomedie\u00bb\nopf\u00f8re fore det norske purblikum.\nEt s\u00e5dant forslag er imidlertid, imod\nforventning; ikke fremkommet, og jeg n\u00f8des der-\nfor tils self at grivbe initiativet, hvilket hervede\nsker, idet jeg\nbeder\nbet\nragte stigkket some ved denne\nskrivelse officielde indleveret til theatret. No-\nget exemplar af bogen vedlagger jeg ikke da\ndenne (i 2den udgave) med Lethed kan er -\nholdet deroppe.\nDe bet\u00e6nkeligheder, jeg i sin tid n\u00e6-\nrede mod stykkets opf\u00f8relse, er for l\u00e6nge si -\ndem forsvundne. Af mange begn er jeg kom-\nmen til den overbevisning at almenlreden\naru har f\u00e5tt sine \u00f8gne opladte for den sand -\nMed at dette arbejde i sin indersten id\u00e9 hviler\np\u00e5 et ubedinget meralsk grundlag, og brad\nstykkets hele kunstneriske struktuve ang\u00e5r,",
"objects": [
{
"confidence": 0.68,
"polygon": [
[
264,
118
],
[
410,
118
],
[
410,
185
],
[
264,
185
]
],
"text": "Oslo",
"text_confidence": 0.8
}
],
"attention_gif": "dan_humu_page/predict/example_line.gif"
}
```
<img src="../../assets/example_line_polygon.gif" >
### Predict with an external n-gram language model
#### Build the language model
A dataset extracted with the `teklia-dan dataset extract` command should contain the files required to build a language model (in the `language_model` folder). To refine DAN's predictions with a language model, follow these steps:
1. Install and build [kenlm](https://github.com/kpu/kenlm)
1. Build a 6-gram language model using the following command
```sh
bin/lmplz --order 6 \
--text my_dataset/language_model/corpus.txt \
--arpa my_dataset/language_model/model.arpa
```
1. Update `inference_parameters.yml`. The `weight` parameter defines how much weight to give to the language model. It should be set carefully (usually between 0.5 and 2.0) as it will affect the quality of the predictions.
```yaml
parameters:
...
language_model:
model: my_dataset/language_model/model.arpa
lexicon: my_dataset/language_model/lexicon.txt
tokens: my_dataset/language_model/tokens.txt
weight: 0.5
```
#### Predict
To run a prediction with the n-gram language model, run this command:
```shell
teklia-dan predict \
--image dan_humu_page/example.jpg \
--model dan_humu_page/model.pt \
--parameters dan_humu_page/parameters.yml \
--charset dan_humu_page/charset.pkl \
--use-language-model \
--output dan_humu_page/predict/
```
It will create the following JSON file named `dan_humu_page/predict/example.json`
```json
{
"text": "etc., some jeg netop idag\nholder Vask paa.\nLeien af Skj\u00f8rterne\nbestad i at jeg kj\u00f8bte\net Forkl\u00e6de til hver\naf de to Piger, some\nhavde laant os dem.\nResten var Vask af Hardan-\ngerskj\u00f8rter og et Forkl\u00e6de,\nsamt Fragt paa det Gods\n(N\u00f8i) some man sendte\nmig ubet\u00e6lt.\nIdag fik jeg hyggeligt\nFrimarkebrev fra Fosvold\nMed Hilsen\nDeres\nHulda Garborg",
"language_model": {
"text": "eet., some jeg netop idag\nholder Vask paa.\nLeien af Skj\u00f9rterne\nbestad i at jeg kj\u00f9bte\net Forkl\u00e7de til hver\naf de to Piger, some\nhavde laant os dem.\nResten var Vask af Hardan-\ngerskj\u00f9rter og et Forkl\u00e7de,\nsamt Fragt paa det Gods\n(N\u00f9i) some man sendte\nmig ubetalt.\nIdag fik jeg hyggeligt\nFrimarkebrev fra Fosvold\nMed Hilsen\nDeres\nHulda Garborg",
"confidence": 0.87
}
}
```
......@@ -6,10 +6,12 @@ flashlight-text==0.0.4
imageio==2.26.1
imagesize==1.4.1
mdutils==1.6.0
nltk==3.8.1
numpy==1.24.3
prettytable==3.8.0
PyYAML==6.0
scipy==1.10.1
sentencepiece==0.1.99
teklia-line-image-extractor==0.2.8rc4
tenacity==8.2.3
tensorboard==2.12.2
......
⎵ ⎵
▁ ▁
! !
" "
& &
......
!
"
&
......
......@@ -33,7 +33,7 @@ EXTRACTION_DATA_PATH = FIXTURES / "extraction"
TWO_SPACES_REGEX = re.compile(r" {2}")
ENTITY_TOKEN_SPACE = re.compile(r"[ⓢ|ⓕ|ⓑ] ")
TWO_SPACES_LM_REGEX = re.compile(r"⎵ ⎵")
TWO_SPACES_LM_REGEX = re.compile(r"▁ ▁")
# NamedTuple to mock actual database result
Entity = NamedTuple("Entity", offset=int, length=int, type=str, value=str)
......@@ -319,11 +319,11 @@ def test_process_element_unknown_token_in_text_error(mock_database, tmp_path):
arkindex_extractor.process_element(element, "val")
@pytest.mark.parametrize("load_entities", (True, False))
@pytest.mark.parametrize("keep_spaces", (True, False))
@pytest.mark.parametrize("load_entities", (True,)) # False))
@pytest.mark.parametrize("keep_spaces", (True,)) # False))
# Transcription and entities have the same worker version
@pytest.mark.parametrize(
"transcription_entities_worker_version", ("worker_version_id", False)
"transcription_entities_worker_version", (False,) # "worker_version_id",)#, False)
)
@patch("dan.datasets.extract.extract.download_image")
def test_extract(
......@@ -398,8 +398,14 @@ def test_extract(
VAL_DIR / "val-page_1-line_3.jpg",
output / "labels.json",
# Language resources
output / "language_model" / "corpus.txt",
output / "language_model" / "lexicon.txt",
output / "language_model" / "corpus_characters.txt",
output / "language_model" / "corpus_subwords.txt",
output / "language_model" / "corpus_words.txt",
output / "language_model" / "lexicon_characters.txt",
output / "language_model" / "lexicon_subwords.txt",
output / "language_model" / "lexicon_words.txt",
output / "language_model" / "subword_tokenizer.model",
output / "language_model" / "subword_tokenizer.vocab",
output / "language_model" / "tokens.txt",
]
assert sorted(filter(methodcaller("is_file"), output.rglob("*"))) == expected_paths
......@@ -466,36 +472,62 @@ def test_extract(
assert set(pickle.loads((output / "charset.pkl").read_bytes())) == expected_charset
# Check "language_corpus.txt"
expected_language_corpus = """ⓢ C a i l l e t ⎵ ⎵ ⓕ M a u r i c e ⎵ ⎵ ⓑ 2 8 . 9 . 0 6
ⓢ R e b o u l ⎵ ⎵ ⓕ J e a n ⎵ ⎵ ⓑ 3 0 . 9 . 0 2
ⓢ B a r e y r e ⎵ ⎵ ⓕ J e a n ⎵ ⎵ ⓑ 2 8 . 3 . 1 1
ⓢ R o u s s y ⎵ ⎵ ⓕ J e a n ⎵ ⎵ ⓑ 4 . 1 1 . 1 4
ⓢ M a r i n ⎵ ⎵ ⓕ M a r c e l ⎵ ⎵ ⓑ 1 0 . 8 . 0 6
ⓢ A m i c a l ⎵ ⎵ ⓕ E l o i ⎵ ⎵ ⓑ 1 1 . 1 0 . 0 4
ⓢ B i r o s ⎵ ⎵ ⓕ M a e l ⎵ ⎵ ⓑ 3 0 . 1 0 . 1 0"""
expected_char_language_corpus = """ⓢ C a i l l e t ▁ ▁ ⓕ M a u r i c e ▁ ▁ ⓑ 2 8 . 9 . 0 6
ⓢ R e b o u l ▁ ▁ ⓕ J e a n ▁ ▁ ⓑ 3 0 . 9 . 0 2
ⓢ B a r e y r e ▁ ▁ ⓕ J e a n ▁ ▁ ⓑ 2 8 . 3 . 1 1
ⓢ R o u s s y ▁ ▁ ⓕ J e a n ▁ ▁ ⓑ 4 . 1 1 . 1 4
ⓢ M a r i n ▁ ▁ ⓕ M a r c e l ▁ ▁ ⓑ 1 0 . 8 . 0 6
ⓢ A m i c a l ▁ ▁ ⓕ E l o i ▁ ▁ ⓑ 1 1 . 1 0 . 0 4
ⓢ B i r o s ▁ ▁ ⓕ M a e l ▁ ▁ ⓑ 3 0 . 1 0 . 1 0"""
expected_word_language_corpus = """ⓢ Caillet ▁ ⓕ Maurice ▁ ⓑ 28 ▁ . ▁ 9 ▁ . ▁ 06
ⓢ Reboul ▁ ⓕ Jean ▁ ⓑ 30 ▁ . ▁ 9 ▁ . ▁ 02
ⓢ Bareyre ▁ ⓕ Jean ▁ ⓑ 28 ▁ . ▁ 3 ▁ . ▁ 11
ⓢ Roussy ▁ ⓕ Jean ▁ ⓑ 4 ▁ . ▁ 11 ▁ . ▁ 14
ⓢ Marin ▁ ⓕ Marcel ▁ ⓑ 10 ▁ . ▁ 8 ▁ . ▁ 06
ⓢ Amical ▁ ⓕ Eloi ▁ ⓑ 11 ▁ . ▁ 10 ▁ . ▁ 04
ⓢ Biros ▁ ⓕ Mael ▁ ⓑ 30 ▁ . ▁ 10 ▁ . ▁ 10"""
expected_subword_language_corpus = """▁ ⓢ C a i l l e t ▁ ⓕ M a u ri ce ▁ ⓑ 28. 9.0 6
▁ ⓢ R e b ou l ▁ ⓕ J e a n ▁ ⓑ 30. 9.0 2
▁ ⓢ B a re y re ▁ ⓕ J e a n ▁ ⓑ 28. 3 .11
▁ ⓢ R ou s s y ▁ ⓕ J e a n ▁ ⓑ 4 . 11.1 4
▁ ⓢ Mar i n ▁ ⓕ Mar ce l ▁ ⓑ 10. 8. 0 6
▁ ⓢ A m ic a l ▁ ⓕ E l o i ▁ ⓑ 11.1 0 . 0 4
▁ ⓢ B i r o s ▁ ⓕ M a e l ▁ ⓑ 30. 10. 10"""
# Transcriptions with worker version are in lowercase
if transcription_entities_worker_version:
expected_language_corpus = expected_language_corpus.lower()
expected_char_language_corpus = expected_char_language_corpus.lower()
# If we do not load entities, remove tokens
if not load_entities:
token_translations = {f"{token} ": "" for token in tokens}
expected_language_corpus = ENTITY_TOKEN_SPACE.sub("", expected_language_corpus)
expected_char_language_corpus = ENTITY_TOKEN_SPACE.sub(
"", expected_char_language_corpus
)
# Replace double spaces with regular space
if not keep_spaces:
expected_language_corpus = TWO_SPACES_LM_REGEX.sub(
"", expected_language_corpus
expected_char_language_corpus = TWO_SPACES_LM_REGEX.sub(
"", expected_char_language_corpus
)
assert (
output / "language_model" / "corpus.txt"
).read_text() == expected_language_corpus
output / "language_model" / "corpus_characters.txt"
).read_text() == expected_char_language_corpus
assert (
output / "language_model" / "corpus_words.txt"
).read_text() == expected_word_language_corpus
assert (
output / "language_model" / "corpus_subwords.txt"
).read_text() == expected_subword_language_corpus
# Check "language_tokens.txt"
expected_language_tokens = [
t if t != " " else "" for t in sorted(list(expected_charset))
t if t != " " else "" for t in sorted(list(expected_charset))
]
expected_language_tokens.append("")
assert (output / "language_model" / "tokens.txt").read_text() == "\n".join(
......@@ -503,10 +535,28 @@ def test_extract(
)
# Check "language_lexicon.txt"
expected_language_lexicon = [f"{t} {t}" for t in expected_language_tokens]
assert (output / "language_model" / "lexicon.txt").read_text() == "\n".join(
expected_language_lexicon
expected_language_char_lexicon = [f"{t} {t}" for t in expected_language_tokens]
assert (
output / "language_model" / "lexicon_characters.txt"
).read_text() == "\n".join(expected_language_char_lexicon)
word_vocab = set([word for word in expected_word_language_corpus.split()])
expected_language_word_lexicon = [
f"{word} {' '.join(word)}" for word in sorted(word_vocab)
]
assert (output / "language_model" / "lexicon_words.txt").read_text() == "\n".join(
expected_language_word_lexicon
)
subword_vocab = set(
[subword for subword in expected_subword_language_corpus.split()]
)
expected_language_subword_lexicon = [
f"{subword} {' '.join(subword)}" for subword in sorted(subword_vocab)
]
assert (
output / "language_model" / "lexicon_subwords.txt"
).read_text() == "\n".join(expected_language_subword_lexicon)
# Check cropped images
for expected_path in expected_paths:
......
......@@ -589,10 +589,12 @@ def test_run_prediction_batch(
),
),
)
@pytest.mark.parametrize("batch_size", [1, 2])
def test_run_prediction_language_model(
image_names,
language_model_weight,
expected_predictions,
batch_size,
tmp_path,
):
# Make tmpdir and copy needed images inside
......