Commit 646805ae authored by Yoann Schneider

Merge branch 'language-model-command' into 'main'

Use a dedicated command to build language model resources

Closes #296

See merge request !439
parents e2f3d8f3 b04d27c6
Showing 914 additions and 510 deletions
......@@ -10,6 +10,7 @@ from dan.datasets.analyze import add_analyze_parser
from dan.datasets.download import add_download_parser
from dan.datasets.entities import add_entities_parser
from dan.datasets.extract import add_extract_parser
from dan.datasets.language_model import add_language_model_parser
from dan.datasets.tokens import add_tokens_parser
......@@ -26,3 +27,4 @@ def add_dataset_parser(subcommands) -> None:
add_analyze_parser(subcommands)
add_entities_parser(subcommands)
add_tokens_parser(subcommands)
add_language_model_parser(subcommands)
......@@ -60,18 +60,4 @@ def add_download_parser(subcommands) -> None:
help="Token to use to replace character in the validation/test sets that is not included in the training set.",
)
parser.add_argument(
"--subword-vocab-size",
type=int,
help="Size of the vocabulary to train the sentencepiece subword tokenizer needed for language model.",
default=1000,
)
parser.add_argument(
"--tokens",
type=pathlib.Path,
help="Mapping between starting tokens and end tokens to extract text with their entities.",
required=False,
)
parser.set_defaults(func=run)
......@@ -19,13 +19,10 @@ from tqdm import tqdm
from dan.datasets.download.exceptions import ImageDownloadError
from dan.datasets.download.utils import (
Tokenizer,
download_image,
get_bbox,
get_vocabulary,
)
from dan.datasets.extract.arkindex import TRAIN_NAME
from dan.utils import LMTokenMapping, parse_tokens
from line_image_extractor.extractor import extract
from line_image_extractor.image_utils import (
BoundingBox,
......@@ -33,8 +30,6 @@ from line_image_extractor.image_utils import (
polygon_to_bbox,
)
LANGUAGE_DIR = "language_model" # Subpath to the language model directory.
IMAGES_DIR = "images" # Subpath to the images directory.
IIIF_URL = "{image_url}/{bbox}/{size}/0/default.jpg"
......@@ -56,8 +51,6 @@ class ImageDownloader:
max_height: int | None = None,
image_extension: str = "",
unknown_token: str = "",
subword_vocab_size: int = 1000,
tokens: Path | None = None,
) -> None:
self.output = output
......@@ -65,16 +58,6 @@ class ImageDownloader:
self.max_height = max_height
self.image_extension = image_extension
self.unknown_token = unknown_token
self.tokens = parse_tokens(tokens) if tokens else {}
self.subword_vocab_size = subword_vocab_size
self.mapping = LMTokenMapping()
self.language_corpus = defaultdict(list)
self.language_tokens = []
self.language_lexicon = defaultdict(list)
# Load split file
split_file = self.output / "split.json" if self.output else None
self.split: Dict = (
......@@ -94,6 +77,10 @@ class ImageDownloader:
)
)
# Add unknown token to charset
self.unknown_token = unknown_token
self.charset.add(self.unknown_token)
def check_extraction(self, values: dict) -> str | None:
# Check dataset_id parameter
if values.get("dataset_id") is None:
......@@ -273,62 +260,6 @@ class ImageDownloader:
logger.error(f"Failed to download {len(failed_downloads)} image(s).")
print(*list(map(": ".join, failed_downloads)), sep="\n")
def format_lm_files(self) -> None:
"""
Convert charset to a LM-compatible charset. Ensure that special LM tokens do not appear in the charset.
"""
logger.info("Preparing language resources")
# Add unknown token to charset
self.charset.add(self.unknown_token)
# Build LM tokens
for token in sorted(list(self.charset)):
assert (
token not in self.mapping.encode.values()
), f"Special token {token} is reserved for language modeling."
self.language_tokens.append(
self.mapping.encode[token]
) if token in self.mapping.encode else self.language_tokens.append(token)
self.language_tokens.append(self.mapping.ctc.encoded)
# Build LM corpus
train_corpus = [
values["text"].replace(
self.mapping.linebreak.display, self.mapping.space.display
)
for values in self.split[TRAIN_NAME].values()
]
tokenizer = Tokenizer(
training_corpus=train_corpus,
charset=self.language_tokens,
unknown_token=self.unknown_token,
outdir=self.output / LANGUAGE_DIR,
mapping=self.mapping,
tokens=self.tokens,
subword_vocab_size=self.subword_vocab_size,
)
if not tokenizer.sentencepiece_model:
return
for level, tokenize in (
("characters", tokenizer.char_tokenize),
("words", tokenizer.word_tokenize),
("subwords", tokenizer.subword_tokenize),
):
self.language_corpus[level] = list(map(tokenize, train_corpus))
# Build LM lexicon
self.language_lexicon["characters"] = [
f"{token} {token}" for token in self.language_tokens
]
for level in ["words", "subwords"]:
self.language_lexicon[level] = [
f"{token} {tokenizer.char_tokenize(token)}"
for token in get_vocabulary(self.language_corpus[level])
]
def export(self) -> None:
"""
Writes a `labels.json` file containing a mapping of the images that have been correctly uploaded (identified by its path)
......@@ -342,16 +273,6 @@ class ImageDownloader:
)
)
for level in ["characters", "words", "subwords"]:
(self.output / LANGUAGE_DIR / f"corpus_{level}.txt").write_text(
"\n".join(self.language_corpus[level])
)
(self.output / LANGUAGE_DIR / f"lexicon_{level}.txt").write_text(
"\n".join(self.language_lexicon[level])
)
(self.output / LANGUAGE_DIR / "tokens.txt").write_text(
"\n".join(self.language_tokens)
)
(self.output / "charset.pkl").write_bytes(
pickle.dumps(sorted(list(self.charset)))
)
......@@ -364,7 +285,6 @@ class ImageDownloader:
"""
tasks: List[Dict[str, str]] = self.build_tasks()
self.download_images(tasks)
self.format_lm_files()
self.export()
......@@ -374,8 +294,6 @@ def run(
max_height: int | None,
image_format: str,
unknown_token: str,
subword_vocab_size: int,
tokens: Path | None,
):
"""
Download the missing images from a `split.json` file and build a `labels.json` file containing
......@@ -387,17 +305,11 @@ def run(
:param max_height: Images larger than this height will be resized to this height
:param image_format: Images will be saved under this format
:param unknown_token: The token used to replace unknown characters.
:param subword_vocab_size: The size of the subword vocabulary.
:param tokens: Mapping between starting tokens and end tokens to extract text with their entities.
"""
(output / LANGUAGE_DIR).mkdir(parents=True, exist_ok=True)
ImageDownloader(
output=output,
max_width=max_width,
max_height=max_height,
image_extension=image_format,
unknown_token=unknown_token,
subword_vocab_size=subword_vocab_size,
tokens=tokens,
).run()
......@@ -2,18 +2,11 @@
# This code is licensed under CeCILL-C
# -*- coding: utf-8 -*-
import itertools
import logging
import operator
from dataclasses import dataclass, field
from io import BytesIO
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import List
import requests
import sentencepiece as spm
from nltk import wordpunct_tokenize
from PIL import Image, ImageOps
from tenacity import (
retry,
......@@ -22,8 +15,6 @@ from tenacity import (
wait_exponential,
)
from dan.utils import EntityType, LMTokenMapping
logger = logging.getLogger(__name__)
# See http://docs.python-requests.org/en/master/user/advanced/#timeouts
......@@ -89,130 +80,3 @@ def get_bbox(polygon: List[List[int]]) -> str:
x, y = min(all_x), min(all_y)
width, height = max(all_x) - x, max(all_y) - y
return ",".join(list(map(str, [int(x), int(y), int(width), int(height)])))
def get_vocabulary(tokenized_text: List[str]) -> set[str]:
"""
Compute the vocabulary set from tokenized text.
:param tokenized_text: List of tokenized text.
"""
return sorted(set([token for doc in tokenized_text for token in doc.split()]))
@dataclass
class Tokenizer:
"""
A multi-level tokenizer (char, subword, word), where the subword tokenizer is trained using sentencepiece.
:param training_corpus: List of training text.
:param outdir: Path to save the subword tokenizer.
:param mapping: Mapping between displayed and encoded versions of special characters.
:param tokens: Start and end tokens used to represent named entities.
:param subword_vocab_size: Size of the vocabulary used to train the subword tokenizer.
"""
training_corpus: List[str]
charset: List[str]
unknown_token: str
outdir: Path
mapping: LMTokenMapping
tokens: EntityType | None = None
subword_vocab_size: int = 1000
sentencepiece_model: spm.SentencePieceProcessor = field(init=False)
@property
def prefix(self) -> Path:
return self.outdir / "subword_tokenizer"
@property
def ner_tokens(self) -> List[str]:
if self.tokens is None:
return []
return list(
itertools.chain(
map(operator.attrgetter("start"), self.tokens.values()),
filter(
operator.truth,
map(operator.attrgetter("end"), self.tokens.values()),
),
)
)
@property
def mapping_tokens(self) -> List[str]:
return [token.encoded for token in self.mapping]
@property
def special_tokens(self) -> List[str]:
return list(set(itertools.chain(self.mapping_tokens, self.ner_tokens)))
def __post_init__(self) -> None:
"""
Train a sentencepiece model on the training corpus.
"""
# Write the corpus in a text file
logger.info("Training a sentencepiece model for subword tokenization")
with NamedTemporaryFile(dir=self.outdir, suffix=".txt", mode="w") as tmp_file:
tmp_file.write("\n".join(self.training_corpus))
tmp_file.flush()
try:
spm.SentencePieceTrainer.train(
input=tmp_file.name,
vocab_size=self.subword_vocab_size,
model_prefix=self.prefix,
user_defined_symbols=self.special_tokens,
minloglevel=1,
)
except Exception as e:
logger.warning(
f"Failed to train a sentencepiece model for subword tokenization: {e} "
"Try again by editing the `--subword-vocab-size` parameter."
)
self.sentencepiece_model = None
return
# Load the model
self.sentencepiece_model = spm.SentencePieceProcessor(
model_file=str(self.prefix.with_suffix(".model"))
)
def subword_tokenize(self, text: str) -> str:
"""
Tokenize into subwords. Sampling is disabled to ensure reproducibility.
"""
tokens = self.sentencepiece_model.encode(text, out_type=str)
return " ".join(map("".join, map(self.encode, tokens)))
def word_tokenize(self, text: str) -> str:
"""
Tokenize text into a string of space-separated words. Spaces (⎵) and NER tokens are considered as words.
:param text: Text to be tokenized.
"""
words = list(map("".join, map(self.encode, wordpunct_tokenize(text))))
return " ".join(
[
f"{word} {self.mapping.space.encoded}"
if (i != len(words) - 1 and word not in self.ner_tokens)
else word
for i, word in enumerate(words)
]
)
def char_tokenize(self, text: str) -> str:
"""
Tokenize text into a string of space-separated characters.
:param text: Text to be tokenized.
"""
return " ".join(
[
char if char in self.charset else self.unknown_token
for char in self.encode(text)
]
)
def encode(self, text: List[str]) -> List[str]:
"""
Encode special tokens.
:param text: Text to be encoded.
"""
return map(self.mapping.encode_token, text)
# Copyright Teklia (contact@teklia.com) & Denis Coquenet
# This code is licensed under CeCILL-C
# -*- coding: utf-8 -*-
"""
Build all resources needed for the language model from a split extracted by DAN
"""
import pathlib
from dan.datasets.language_model.build import run
def add_language_model_parser(subcommands) -> None:
parser = subcommands.add_parser(
"language-model",
description=__doc__,
help=__doc__,
)
# Required arguments.
parser.add_argument(
"--output",
type=pathlib.Path,
help="Path where the `labels.json` and `charset.pkl` files are stored and where the data will be generated.",
required=True,
)
# Formatting arguments
parser.add_argument(
"--subword-vocab-size",
type=int,
help="Size of the vocabulary to train the sentencepiece subword tokenizer needed for language model.",
default=1000,
)
parser.add_argument(
"--unknown-token",
type=str,
default="",
help="Token to use to replace character in the validation/test sets that is not included in the training set.",
)
parser.add_argument(
"--tokens",
type=pathlib.Path,
help="Mapping between starting tokens and end tokens to extract text with their entities.",
required=False,
)
parser.set_defaults(func=run)
# Copyright Teklia (contact@teklia.com) & Denis Coquenet
# This code is licensed under CeCILL-C
# -*- coding: utf-8 -*-
import json
import logging
import pickle
from collections import defaultdict
from pathlib import Path
from typing import Dict
from dan.datasets.extract.arkindex import TRAIN_NAME
from dan.datasets.language_model.utils import (
Tokenizer,
get_vocabulary,
)
from dan.utils import LMTokenMapping, parse_tokens
LANGUAGE_DIR = "language_model" # Subpath to the language model directory.
logger = logging.getLogger(__name__)
class LanguageModelBuilder:
"""
Build a language model from extracted data
"""
def __init__(
self,
output: Path | None = None,
subword_vocab_size: int = 1000,
unknown_token: str = "",
tokens: Path | None = None,
) -> None:
self.output = output
self.unknown_token = unknown_token
self.tokens = parse_tokens(tokens) if tokens else {}
self.subword_vocab_size = subword_vocab_size
self.mapping = LMTokenMapping()
self.language_corpus = defaultdict(list)
self.language_tokens = []
self.language_lexicon = defaultdict(list)
# Load labels file
labels_file = self.output / "labels.json" if self.output else None
self.labels: Dict = (
json.loads(labels_file.read_text())
if labels_file and labels_file.is_file()
else {}
)
# Load charset file
charset_file = self.output / "charset.pkl" if self.output else None
self.charset: Dict = (
pickle.loads(charset_file.read_bytes())
if charset_file and charset_file.is_file()
else {}
)
def format_lm_files(self) -> None:
"""
Convert charset to a LM-compatible charset. Ensure that special LM tokens do not appear in the charset.
"""
logger.info("Preparing language resources")
# Build LM tokens
for token in sorted(list(self.charset)):
assert (
token not in self.mapping.encode.values()
), f"Special token {token} is reserved for language modeling."
self.language_tokens.append(
self.mapping.encode[token]
) if token in self.mapping.encode else self.language_tokens.append(token)
self.language_tokens.append(self.mapping.ctc.encoded)
# Build LM corpus
train_corpus = [
value.replace(self.mapping.linebreak.display, self.mapping.space.display)
for value in self.labels[TRAIN_NAME].values()
]
tokenizer = Tokenizer(
training_corpus=train_corpus,
charset=self.language_tokens,
unknown_token=self.unknown_token,
outdir=self.output / LANGUAGE_DIR,
mapping=self.mapping,
tokens=self.tokens,
subword_vocab_size=self.subword_vocab_size,
)
if not tokenizer.sentencepiece_model:
return
for level, tokenize in (
("characters", tokenizer.char_tokenize),
("words", tokenizer.word_tokenize),
("subwords", tokenizer.subword_tokenize),
):
self.language_corpus[level] = list(map(tokenize, train_corpus))
# Build LM lexicon
self.language_lexicon["characters"] = [
f"{token} {token}" for token in self.language_tokens
]
for level in ["words", "subwords"]:
self.language_lexicon[level] = [
f"{token} {tokenizer.char_tokenize(token)}"
for token in get_vocabulary(self.language_corpus[level])
]
def export(self) -> None:
"""
Writes all files needed for the language model
"""
for level in ["characters", "words", "subwords"]:
(self.output / LANGUAGE_DIR / f"corpus_{level}.txt").write_text(
"\n".join(self.language_corpus[level])
)
(self.output / LANGUAGE_DIR / f"lexicon_{level}.txt").write_text(
"\n".join(self.language_lexicon[level])
)
(self.output / LANGUAGE_DIR / "tokens.txt").write_text(
"\n".join(self.language_tokens)
)
def run(self) -> None:
"""
Build and write all files needed for the language model
"""
self.format_lm_files()
self.export()
def run(
output: Path,
subword_vocab_size: int,
unknown_token: str,
tokens: Path | None,
):
"""
Build and write all files needed for the language model
:param output: Path where the `labels.json` and `charset.pkl` files are stored and where the data will be generated
:param subword_vocab_size: The size of the subword vocabulary.
:param unknown_token: The token used to replace unknown characters.
:param tokens: Mapping between starting tokens and end tokens to extract text with their entities.
"""
(output / LANGUAGE_DIR).mkdir(parents=True, exist_ok=True)
LanguageModelBuilder(
output=output,
subword_vocab_size=subword_vocab_size,
unknown_token=unknown_token,
tokens=tokens,
).run()
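For reference, here is a minimal sketch of driving `LanguageModelBuilder` programmatically, mirroring the `run` helper above; the `data/` path and the vocabulary size are placeholders, and the output directory must already contain the `labels.json` and `charset.pkl` files produced by the download command.

```python
from pathlib import Path

from dan.datasets.language_model.build import LANGUAGE_DIR, LanguageModelBuilder

output = Path("data")  # placeholder: must contain labels.json and charset.pkl
(output / LANGUAGE_DIR).mkdir(parents=True, exist_ok=True)

LanguageModelBuilder(
    output=output,
    subword_vocab_size=1000,
    tokens=None,  # or a Path to a tokens.yml file to keep NER tokens
).run()
```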
# Copyright Teklia (contact@teklia.com) & Denis Coquenet
# This code is licensed under CeCILL-C
# -*- coding: utf-8 -*-
import itertools
import logging
import operator
from dataclasses import dataclass, field
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import List
import sentencepiece as spm
from nltk import wordpunct_tokenize
from dan.utils import EntityType, LMTokenMapping
logger = logging.getLogger(__name__)
def get_vocabulary(tokenized_text: List[str]) -> set[str]:
"""
Compute the vocabulary set from tokenized text.
:param tokenized_text: List of tokenized text.
"""
return sorted(set([token for doc in tokenized_text for token in doc.split()]))
@dataclass
class Tokenizer:
"""
A multi-level tokenizer (char, subword, word), where the subword tokenizer is trained using sentencepiece.
:param training_corpus: List of training text.
:param outdir: Path to save the subword tokenizer.
:param mapping: Mapping between displayed and encoded versions of special characters.
:param tokens: Start and end tokens used to represent named entities.
:param subword_vocab_size: Size of the vocabulary used to train the subword tokenizer.
"""
training_corpus: List[str]
charset: List[str]
unknown_token: str
outdir: Path
mapping: LMTokenMapping
tokens: EntityType | None = None
subword_vocab_size: int = 1000
sentencepiece_model: spm.SentencePieceProcessor = field(init=False)
@property
def prefix(self) -> Path:
return self.outdir / "subword_tokenizer"
@property
def ner_tokens(self) -> List[str]:
if self.tokens is None:
return []
return list(
itertools.chain(
map(operator.attrgetter("start"), self.tokens.values()),
filter(
operator.truth,
map(operator.attrgetter("end"), self.tokens.values()),
),
)
)
@property
def mapping_tokens(self) -> List[str]:
return [token.encoded for token in self.mapping]
@property
def special_tokens(self) -> List[str]:
return list(set(itertools.chain(self.mapping_tokens, self.ner_tokens)))
def __post_init__(self) -> None:
"""
Train a sentencepiece model on the training corpus.
"""
# Write the corpus in a text file
logger.info("Training a sentencepiece model for subword tokenization")
with NamedTemporaryFile(dir=self.outdir, suffix=".txt", mode="w") as tmp_file:
tmp_file.write("\n".join(self.training_corpus))
tmp_file.flush()
try:
spm.SentencePieceTrainer.train(
input=tmp_file.name,
vocab_size=self.subword_vocab_size,
model_prefix=self.prefix,
user_defined_symbols=self.special_tokens,
minloglevel=1,
)
except Exception as e:
logger.warning(
f"Failed to train a sentencepiece model for subword tokenization: {e} "
"Try again by editing the `--subword-vocab-size` parameter."
)
self.sentencepiece_model = None
return
# Load the model
self.sentencepiece_model = spm.SentencePieceProcessor(
model_file=str(self.prefix.with_suffix(".model"))
)
def subword_tokenize(self, text: str) -> str:
"""
Tokenize into subwords. Sampling is disabled to ensure reproducibility.
"""
tokens = self.sentencepiece_model.encode(text, out_type=str)
return " ".join(map("".join, map(self.encode, tokens)))
def word_tokenize(self, text: str) -> str:
"""
Tokenize text into a string of space-separated words. Spaces (⎵) and NER tokens are considered as words.
:param text: Text to be tokenized.
"""
words = list(map("".join, map(self.encode, wordpunct_tokenize(text))))
return " ".join(
[
f"{word} {self.mapping.space.encoded}"
if (i != len(words) - 1 and word not in self.ner_tokens)
else word
for i, word in enumerate(words)
]
)
def char_tokenize(self, text: str) -> str:
"""
Tokenize text into a string of space-separated characters.
:param text: Text to be tokenized.
"""
return " ".join(
[
char if char in self.charset else self.unknown_token
for char in self.encode(text)
]
)
def encode(self, text: List[str]) -> List[str]:
"""
Encode special tokens.
:param text: Text to be encoded.
"""
return map(self.mapping.encode_token, text)
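To illustrate how `LanguageModelBuilder` drives this class, here is a minimal hedged sketch; the toy corpus, charset and output directory are placeholders, and the subword split you obtain depends on the trained sentencepiece model.

```python
from pathlib import Path

from dan.datasets.language_model.utils import Tokenizer
from dan.utils import LMTokenMapping

outdir = Path("language_model")  # placeholder output directory
outdir.mkdir(parents=True, exist_ok=True)

tokenizer = Tokenizer(
    training_corpus=["ⓢCiret ⓕMarie ⓑ28", "ⓢCiret ⓕMarie ⓑ2"],  # toy corpus
    charset=["ⓢ", "ⓕ", "ⓑ", "C", "M", "a", "e", "i", "r", "t", "▁", "2", "8"],
    unknown_token="⁇",
    outdir=outdir,
    mapping=LMTokenMapping(),
    subword_vocab_size=30,  # small value for a tiny corpus
)
# Sentencepiece training may fail on a corpus this small; the model is then None.
if tokenizer.sentencepiece_model:
    print(tokenizer.char_tokenize("ⓢCiret ⓕMarie ⓑ28"))
    print(tokenizer.word_tokenize("ⓢCiret ⓕMarie ⓑ28"))
    print(tokenizer.subword_tokenize("ⓢCiret ⓕMarie ⓑ28"))
```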
......@@ -51,6 +51,14 @@ The library already has all the documents needed to run the [dataset download co
teklia-dan dataset download --output .
```
#### Dataset language-model command
The library already has all the documents needed to run the [dataset language-model command](../usage/datasets/language_model.md) on a minimalist dataset. In the `tests/data/prediction` directory, you can run the following command and add any extra parameters you need:
```shell
teklia-dan dataset language-model --output . --subword-vocab-size 45
```
#### Dataset analyze command
The library already has all the documents needed to run the [dataset analyze command](../usage/datasets/analyze.md) on a minimalist dataset. In the `tests/data/training/training_dataset` directory, you can run the following command and add any extra parameters you need:
......
......@@ -10,6 +10,7 @@ To extract the data, DAN uses an Arkindex export database in SQLite format. You
1. [Export the project](https://doc.arkindex.org/howto/export/) in SQLite format.
1. Extract the data with the [extract command](../usage/datasets/extract.md).
1. Download images with the [download command](../usage/datasets/download.md).
1. Build language model resources with the [language-model command](../usage/datasets/language_model.md).
These commands will extract and format the images and labels needed to train DAN. They will also tokenize the training corpus at character, subword, and word levels, allowing you to combine DAN with an explicit statistical language model to improve performance.
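As a hedged illustration of these three levels, the corpus files roughly contain lines like the following for a toy transcription with NER tokens (spaces are materialized by the ▁ symbol; the subword split is only indicative, since it depends on the trained sentencepiece model and on `--subword-vocab-size`):

```python
# Hedged illustration of the three tokenization levels for one toy line.
line = "ⓢCiret ⓕMarie ⓑ28"

char_level = "ⓢ C i r e t ▁ ⓕ M a r i e ▁ ⓑ 2 8"     # corpus_characters.txt
word_level = "ⓢ Ciret ▁ ⓕ Marie ▁ ⓑ 28"               # corpus_words.txt
subword_level = "▁ ⓢ C i re t ▁ ⓕ M a r ie ▁ ⓑ 2 8"   # corpus_subwords.txt (indicative)
```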
......
# Build
::: dan.datasets.language_model.build
# Language model
::: dan.datasets.language_model
# Utils
::: dan.datasets.language_model.utils
......@@ -5,21 +5,18 @@
Use the `teklia-dan dataset download` command to download images of a dataset from a split extracted by DAN. This will:
- Store the set of characters encountered in the dataset (in the `charset.pkl` file),
- Generate the resources needed to build a n-gram language model at character, subword or word-level with [kenlm](https://github.com/kpu/kenlm) (in the `language_model/` folder).
- Generate the images of each element (in the `images/` folder),
- Create the mapping of the images that have been correctly uploaded (identified by its path) to the ground-truth transcription (with NER tokens if needed) (in the `labels.json` file).
If an image download fails for any reason, the image won't appear in the transcriptions file and the reason will be printed to stdout at the end of the process. Before downloading an image, the command checks whether it was already downloaded, so it is safe to run this command twice if a few images failed.
| Parameter | Description | Type | Default |
| ---------------------- | ------------------------------------------------------------------------------------------------------------------- | -------------- | ------- |
| `--output` | Path where the `split.json` file is stored and where the data will be generated. | `pathlib.Path` | |
| `--max-width` | Images larger than this width will be resized to this width. | `int` | |
| `--max-height` | Images larger than this height will be resized to this height. | `int` | |
| `--image-format` | Images will be saved under this format. | `str` | `.jpg` |
| `--unknown-token` | Token to use to replace character in the validation/test sets that is not included in the training set. | `str` | `⁇` |
| `--tokens` | Mapping between starting tokens and end tokens to extract text with their entities. | `pathlib.Path` | |
| `--subword-vocab-size` | Size of the vocabulary used to train the sentencepiece subword tokenizer used to train the optional language model. | `int` | `1000` |
| Parameter | Description | Type | Default |
| ----------------- | ------------------------------------------------------------------------------------------------------- | -------------- | ------- |
| `--output` | Path where the `split.json` file is stored and where the data will be generated. | `pathlib.Path` | |
| `--max-width` | Images larger than this width will be resized to this width. | `int` | |
| `--max-height` | Images larger than this height will be resized to this height. | `int` | |
| `--image-format` | Images will be saved under this format. | `str` | `.jpg` |
| `--unknown-token` | Token to use to replace character in the validation/test sets that is not included in the training set. | `str` | `⁇` |
The `--output` directory should contain a `split.json` JSON-formatted file with a specific format: a mapping of the elements (identified by their ID) to the image information and the ground-truth transcription (with NER tokens if needed). This file can be generated by the `teklia-dan dataset extract` command. More details in the [dedicated page](./extract.md).
......@@ -46,32 +43,6 @@ The `--output` directory should have a `split.json` JSON-formatted file with a s
}
```
The `--tokens` argument expects a YAML-formatted file with a specific format. A list of entries with each entry describing a NER entity. The label of the entity is the key to a dict mapping the starting and ending tokens respectively. This file can be generated by the `teklia-dan dataset tokens` command. More details in the [dedicated page](./tokens.md).
```yaml
INTITULE: # Type of the entity on Arkindex
start: # Starting token for this entity
end: # Optional ending token for this entity
DATE:
start:
end:
COTE_SERIE:
start:
end:
ANALYSE_COMPL.:
start:
end:
PRECISIONS_SUR_COTE:
start:
end:
COTE_ARTICLE:
start:
end:
CLASSEMENT:
start:
end:
```
## Examples
### Download full images
......
# Datasets
Two operations are available through subcommands:
Several operations are available through subcommands:
`teklia-dan dataset entities`
: To extract entities from an [Arkindex export](https://doc.arkindex.org/howto/export/). More details in the [dedicated page](./entities.md).
......@@ -14,5 +14,8 @@ Two operations are available through subcommands:
`teklia-dan dataset download`
: To download images of a dataset. More details in the [dedicated page](./download.md).
`teklia-dan dataset language-model`
: To build language model resources of a dataset. More details in the [dedicated page](./language_model.md).
`teklia-dan dataset analyze`
: To analyze datasets and display statistics. More details in the [dedicated page](./analyze.md).
# Dataset language model
## Description
Use the `teklia-dan dataset language-model` command to build language model resources for a dataset from a split extracted by DAN. This will:
- Generate the resources needed to build an n-gram language model at character, subword or word-level with [kenlm](https://github.com/kpu/kenlm) (in the `language_model/` folder).
| Parameter | Description | Type | Default |
| ---------------------- | ------------------------------------------------------------------------------------------------------------------- | -------------- | ------- |
| `--output` | Path where the `labels.json` and `charset.pkl` files are stored and where the data will be generated. | `pathlib.Path` | |
| `--subword-vocab-size` | Size of the vocabulary used to train the sentencepiece subword tokenizer used to train the optional language model. | `int` | `1000` |
| `--unknown-token` | Token to use to replace character in the validation/test sets that is not included in the training set. | `str` | `⁇` |
| `--tokens` | Mapping between starting tokens and end tokens to extract text with their entities. | `pathlib.Path` | |
The `--output` directory should have:
- A `charset.pkl` file of the set of characters encountered in the dataset,
- A `labels.json` JSON-formatted file with a specific format: a mapping of the images (identified by their path) to the ground-truth transcription (with NER tokens if needed).
These files can be generated by the `teklia-dan dataset download` command. More details in the [dedicated page](./download.md).
```json
{
"train": {
"<image_path>": "\u24e2Coufet \u24d5Bouis \u24d107.12.14"
},
"val": {},
"test": {}
}
```
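If you need to assemble these two inputs by hand (for a quick experiment, for instance), a minimal hedged sketch mirroring what the download command writes could look like this; the paths and transcriptions are placeholders.

```python
import json
import pickle
from pathlib import Path

output = Path("data")  # placeholder output directory
output.mkdir(parents=True, exist_ok=True)

labels = {
    "train": {"<image_path>": "ⓢLaulont ⓕFrancois ⓑ8"},  # placeholder entry
    "val": {},
    "test": {},
}
(output / "labels.json").write_text(json.dumps(labels))

# The charset is the sorted set of characters seen in the training set,
# pickled as a list, as the download command does.
charset = sorted(set("".join(labels["train"].values())))
(output / "charset.pkl").write_bytes(pickle.dumps(charset))
```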
The `--tokens` argument expects a YAML-formatted file with a specific format: a list of entries, each describing a NER entity. The label of the entity is the key to a dict mapping the starting and ending tokens respectively. This file can be generated by the `teklia-dan dataset tokens` command. More details in the [dedicated page](./tokens.md).
```yaml
INTITULE: # Type of the entity on Arkindex
start: # Starting token for this entity
end: # Optional ending token for this entity
DATE:
start:
end:
COTE_SERIE:
start:
end:
ANALYSE_COMPL.:
start:
end:
PRECISIONS_SUR_COTE:
start:
end:
COTE_ARTICLE:
start:
end:
CLASSEMENT:
start:
end:
```
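The resulting mapping can also be loaded programmatically with the `parse_tokens` helper from `dan.utils`; a minimal hedged sketch (the file name is a placeholder):

```python
from pathlib import Path

from dan.utils import parse_tokens

tokens = parse_tokens(Path("tokens.yml"))  # placeholder path
for entity_name, entity_type in tokens.items():
    # Each entry exposes a starting token and an optional ending token.
    print(entity_name, entity_type.start, entity_type.end)
```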
## Examples
### HTR and NER data
To build language model resources with NER data, please use the following:
```shell
teklia-dan dataset language-model \
--output data \
--tokens tokens.yml
```
### HTR data
To build language model resources without NER data, please use the following:
```shell
teklia-dan dataset language-model \
--output data
```
......@@ -9,7 +9,7 @@ To build the language model, you first need to install and compile [kenlm](https
## Build the language model
The `teklia-dan dataset extract` automatically generate the files required to train a language model either at character, subword or word-level in `my_dataset/language_model/`.
The `teklia-dan dataset language-model` command automatically generates the files required to train a language model at character, subword or word level in `my_dataset/language_model/`.
Note that linebreaks are replaced by spaces in the language model.
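Once the corpus files exist, the n-gram model itself is trained with kenlm's `lmplz` binary. Below is a minimal hedged sketch driving it from Python; the 3-gram order and the paths are assumptions, and `--discount_fallback` is only useful for very small corpora.

```python
import subprocess
from pathlib import Path

corpus = Path("my_dataset/language_model/corpus_characters.txt")
arpa = Path("my_dataset/language_model/model_characters.arpa")

# lmplz reads the corpus on stdin and writes the ARPA model to stdout.
with corpus.open() as stdin, arpa.open("w") as stdout:
    subprocess.run(
        ["lmplz", "--order", "3", "--discount_fallback"],
        stdin=stdin,
        stdout=stdout,
        check=True,
    )
```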
......
......@@ -65,6 +65,7 @@ nav:
- Dataset tokens: usage/datasets/tokens.md
- Dataset extraction: usage/datasets/extract.md
- Dataset download: usage/datasets/download.md
- Dataset language model: usage/datasets/language_model.md
- Dataset analysis: usage/datasets/analyze.md
- Training:
- usage/train/index.md
......@@ -98,6 +99,10 @@ nav:
- Utils: ref/datasets/extract/utils.md
- Database management: ref/datasets/extract/db.md
- Exceptions: ref/datasets/extract/exceptions.md
- Language model:
- ref/datasets/language_model/index.md
- Build: ref/datasets/language_model/build.md
- Utils: ref/datasets/language_model/utils.md
- Tokens:
- ref/datasets/tokens/index.md
- Generate: ref/datasets/tokens/generate.md
......
......@@ -5,7 +5,6 @@
import json
import logging
import pickle
import re
from operator import attrgetter, methodcaller
from pathlib import Path
......@@ -20,9 +19,6 @@ from tests import FIXTURES, change_split_content
EXTRACTION_DATA_PATH = FIXTURES / "extraction"
ENTITY_TOKEN_SPACE = re.compile(r"[ⓢ|ⓕ|ⓑ] ")
TWO_SPACES_LM_REGEX = re.compile(r"▁ ▁")
@pytest.mark.parametrize(
"max_width, max_height, width, height, resize",
......@@ -46,140 +42,21 @@ def test_get_iiif_size_arg(max_width, max_height, width, height, resize, tmp_pat
)
@pytest.mark.parametrize("load_entities", [True, False])
@pytest.mark.parametrize("keep_spaces", [True, False])
@pytest.mark.parametrize(
"load_entities,keep_spaces,transcription_entities_worker_version,expected_subword_language_corpus,subword_vocab_size",
(
(
True,
True,
"worker_version_id",
"""▁ ⓢ l a u l ont ▁ ⓕ f r an c oi s ▁ ⓑ 8
▁ ⓢ c i re t ▁ ⓕ an t oi ne ▁ ⓑ 2 7
▁ ⓢ c i re t ▁ ⓕ m a r ie ▁ ⓑ 2 8
▁ ⓢ c i re t ▁ ⓕ m a r ie ▁ ⓑ 2
▁ ⓢ e u re s t on ▁ ⓕ so l an g e ▁ ⓑ 1 0
▁ ⓢ t e r ont u s s ie u x ▁ ⓕ j e an ▁ ⓑ 2
▁ ⓢ p re s s on e t ▁ ⓕ m a r ie ▁ ⓑ 1 2""",
40,
),
(
True,
False,
"worker_version_id",
"""▁ ⓢ l a u l ont ▁ ⓕ f r an c oi s ▁ ⓑ 8
▁ ⓢ c i re t ▁ ⓕ an t oi ne ▁ ⓑ 2 7
▁ ⓢ c i re t ▁ ⓕ m a r ie ▁ ⓑ 2 8
▁ ⓢ c i re t ▁ ⓕ m a r ie ▁ ⓑ 2
▁ ⓢ e u re s t on ▁ ⓕ so l an g e ▁ ⓑ 1 0
▁ ⓢ t e r ont u s s ie u x ▁ ⓕ j e an ▁ ⓑ 2
▁ ⓢ p re s s on e t ▁ ⓕ m a r ie ▁ ⓑ 1 2""",
40,
),
(
False,
True,
"worker_version_id",
"""▁ la u l ont ▁ f r an c oi s ▁ 8
▁ c i re t ▁ an t oi ne ▁ 2 7
▁ c i re t ▁ m a r ie ▁ 2 8
▁ c i re t ▁ m a r ie ▁ 2
▁ e u res t on ▁ so l an g e ▁ 1 0
▁ t e r ont u ss ie u x ▁ j e an ▁ 2
▁ p res so ne t ▁ m a r ie ▁ 1 2""",
40,
),
(
False,
False,
"worker_version_id",
"""▁ la u l ont ▁ f r an c oi s ▁ 8
▁ c i re t ▁ an t oi ne ▁ 2 7
▁ c i re t ▁ m a r ie ▁ 2 8
▁ c i re t ▁ m a r ie ▁ 2
▁ e u res t on ▁ so l an g e ▁ 1 0
▁ t e r ont u ss ie u x ▁ j e an ▁ 2
▁ p res so ne t ▁ m a r ie ▁ 1 2""",
40,
),
(
True,
True,
False,
"""▁ ⓢ L a u l o n t ▁ ⓕ F r a n c o i s ▁ ⓑ 8
▁ ⓢ C i r e t ▁ ⓕ A n t o i n e ▁ ⓑ 2 7
▁ ⓢ C i r e t ▁ ⓕ M a r ie ▁ ⓑ 2 8
▁ ⓢ C i r e t ▁ ⓕ M a r ie ▁ ⓑ 2
▁ ⓢ E u r e s t o n ▁ ⓕ S o l a n g e ▁ ⓑ 1 0
▁ ⓢ T e r o n t u s s ie u x ▁ ⓕ J e a n ▁ ⓑ 2
▁ ⓢ P r e s s o n e t ▁ ⓕ M a r ie ▁ ⓑ 1 2""",
40,
),
(
True,
True,
False,
"""▁ ⓢ L a u l ont ▁ ⓕ F r an c oi s ▁ ⓑ 8
▁ ⓢ C i re t ▁ ⓕ A n t oi n e ▁ ⓑ 2 7
▁ ⓢ C i re t ▁ ⓕ M a r ie ▁ ⓑ 2 8
▁ ⓢ C i re t ▁ ⓕ M a r ie ▁ ⓑ 2
▁ ⓢ E u re s t on ▁ ⓕ S o l an g e ▁ ⓑ 1 0
▁ ⓢ T e r ont u s s ie u x ▁ ⓕ J e an ▁ ⓑ 2
▁ ⓢ P re s s on e t ▁ ⓕ M a r ie ▁ ⓑ 1 2""",
45,
),
(
True,
False,
False,
"""▁ ⓢ L a u l o n t ▁ ⓕ F r a n c o i s ▁ ⓑ 8
▁ ⓢ C i r e t ▁ ⓕ A n t o i n e ▁ ⓑ 2 7
▁ ⓢ C i r e t ▁ ⓕ M a r ie ▁ ⓑ 2 8
▁ ⓢ C i r e t ▁ ⓕ M a r ie ▁ ⓑ 2
▁ ⓢ E u r e s t o n ▁ ⓕ S o l a n g e ▁ ⓑ 1 0
▁ ⓢ T e r o n t u s s ie u x ▁ ⓕ J e a n ▁ ⓑ 2
▁ ⓢ P r e s s o n e t ▁ ⓕ M a r ie ▁ ⓑ 1 2""",
40,
),
(
False,
True,
False,
"""▁ L a u l ont ▁ F r an c oi s ▁ 8
▁ C i re t ▁ A n t oi n e ▁ 2 7
▁ C i re t ▁ M a r ie ▁ 2 8
▁ C i re t ▁ M a r ie ▁ 2
▁ E u re s t on ▁ S o l an g e ▁ 1 0
▁ T e r ont u s s ie u x ▁ J e an ▁ 2
▁ P re s s on e t ▁ M a r ie ▁ 1 2""",
40,
),
(
False,
False,
False,
"""▁ L a u l ont ▁ F r an c oi s ▁ 8
▁ C i re t ▁ A n t oi n e ▁ 2 7
▁ C i re t ▁ M a r ie ▁ 2 8
▁ C i re t ▁ M a r ie ▁ 2
▁ E u re s t on ▁ S o l an g e ▁ 1 0
▁ T e r ont u s s ie u x ▁ J e an ▁ 2
▁ P re s s on e t ▁ M a r ie ▁ 1 2""",
40,
),
),
"transcription_entities_worker_version", ["worker_version_id", False]
)
def test_download(
load_entities,
keep_spaces,
transcription_entities_worker_version,
expected_subword_language_corpus,
subword_vocab_size,
split_content,
monkeypatch,
tmp_path,
):
output = tmp_path / "download"
(output / "language_model").mkdir(parents=True, exist_ok=True)
output.mkdir(parents=True, exist_ok=True)
# Mock tokens
tokens_path = EXTRACTION_DATA_PATH / "tokens.yml"
......@@ -236,8 +113,6 @@ def test_download(
extractor = ImageDownloader(
output=output,
image_extension=".jpg",
tokens=tokens_path if load_entities else None,
subword_vocab_size=subword_vocab_size,
)
# Mock build_image_url to simply return the path to the image
extractor.build_iiif_url = mock_build_image_url
......@@ -271,16 +146,6 @@ def test_download(
VAL_DIR / "val-page_1-line_2.jpg",
VAL_DIR / "val-page_1-line_3.jpg",
output / "labels.json",
# Language resources
output / "language_model" / "corpus_characters.txt",
output / "language_model" / "corpus_subwords.txt",
output / "language_model" / "corpus_words.txt",
output / "language_model" / "lexicon_characters.txt",
output / "language_model" / "lexicon_subwords.txt",
output / "language_model" / "lexicon_words.txt",
output / "language_model" / "subword_tokenizer.model",
output / "language_model" / "subword_tokenizer.vocab",
output / "language_model" / "tokens.txt",
output / "split.json",
]
assert sorted(filter(methodcaller("is_file"), output.rglob("*"))) == expected_paths
......@@ -298,97 +163,6 @@ def test_download(
# Check "labels.json"
assert json.loads((output / "labels.json").read_text()) == expected_labels
# Check "language_corpus.txt"
expected_char_language_corpus = """ⓢ L a u l o n t ▁ ▁ ⓕ F r a n c o i s ▁ ▁ ⓑ 8
ⓢ C i r e t ▁ ▁ ⓕ A n t o i n e ▁ ▁ ⓑ 2 7
ⓢ C i r e t ▁ ▁ ⓕ M a r i e ▁ ▁ ⓑ 2 8
ⓢ C i r e t ▁ ▁ ⓕ M a r i e ▁ ▁ ⓑ 2
ⓢ E u r e s t o n ▁ ▁ ⓕ S o l a n g e ▁ ▁ ⓑ 1 0
ⓢ T e r o n t u s s i e u x ▁ ▁ ⓕ J e a n ▁ ▁ ⓑ 2
ⓢ P r e s s o n e t ▁ ▁ ⓕ M a r i e ▁ ▁ ⓑ 1 2"""
expected_word_language_corpus = """ⓢ Laulont ▁ ⓕ Francois ▁ ⓑ 8
ⓢ Ciret ▁ ⓕ Antoine ▁ ⓑ 27
ⓢ Ciret ▁ ⓕ Marie ▁ ⓑ 28
ⓢ Ciret ▁ ⓕ Marie ▁ ⓑ 2
ⓢ Eureston ▁ ⓕ Solange ▁ ⓑ 10
ⓢ Terontussieux ▁ ⓕ Jean ▁ ⓑ 2
ⓢ Pressonet ▁ ⓕ Marie ▁ ⓑ 12"""
# Transcriptions with worker version are in lowercase
if transcription_entities_worker_version:
expected_char_language_corpus = expected_char_language_corpus.lower()
expected_word_language_corpus = expected_word_language_corpus.lower()
expected_subword_language_corpus = expected_subword_language_corpus.lower()
# If we do not load entities, remove tokens
if not load_entities:
expected_char_language_corpus = ENTITY_TOKEN_SPACE.sub(
"", expected_char_language_corpus
)
expected_word_language_corpus = ENTITY_TOKEN_SPACE.sub(
"", expected_word_language_corpus
)
expected_subword_language_corpus = ENTITY_TOKEN_SPACE.sub(
"", expected_subword_language_corpus
)
# Replace double spaces with regular space
if not keep_spaces:
expected_char_language_corpus = TWO_SPACES_LM_REGEX.sub(
"", expected_char_language_corpus
)
expected_word_language_corpus = TWO_SPACES_LM_REGEX.sub(
"", expected_word_language_corpus
)
expected_subword_language_corpus = TWO_SPACES_LM_REGEX.sub(
"", expected_subword_language_corpus
)
assert (
output / "language_model" / "corpus_characters.txt"
).read_text() == expected_char_language_corpus
assert (
output / "language_model" / "corpus_words.txt"
).read_text() == expected_word_language_corpus
assert (
output / "language_model" / "corpus_subwords.txt"
).read_text() == expected_subword_language_corpus
# Check "language_tokens.txt"
expected_language_tokens = [
"" if t.isspace() else t for t in sorted(list(expected_charset))
]
expected_language_tokens.append("")
assert (output / "language_model" / "tokens.txt").read_text() == "\n".join(
expected_language_tokens
)
# Check "language_lexicon.txt"
expected_language_char_lexicon = [f"{t} {t}" for t in expected_language_tokens]
assert (
output / "language_model" / "lexicon_characters.txt"
).read_text() == "\n".join(expected_language_char_lexicon)
word_vocab = set([word for word in expected_word_language_corpus.split()])
expected_language_word_lexicon = [
f"{word} {' '.join(word)}" for word in sorted(word_vocab)
]
assert (output / "language_model" / "lexicon_words.txt").read_text() == "\n".join(
expected_language_word_lexicon
)
subword_vocab = set(
[subword for subword in expected_subword_language_corpus.split()]
)
expected_language_subword_lexicon = [
f"{subword} {' '.join(subword)}" for subword in sorted(subword_vocab)
]
assert (
output / "language_model" / "lexicon_subwords.txt"
).read_text() == "\n".join(expected_language_subword_lexicon)
# Check cropped images
for expected_path in expected_paths:
if expected_path.suffix != ".jpg":
......
# Copyright Teklia (contact@teklia.com) & Denis Coquenet
# This code is licensed under CeCILL-C
# -*- coding: utf-8 -*-
import json
import pickle
import re
from operator import methodcaller
import pytest
from dan.datasets.language_model.build import LanguageModelBuilder
from dan.utils import parse_tokens
from tests import FIXTURES, change_split_content
EXTRACTION_DATA_PATH = FIXTURES / "extraction"
ENTITY_TOKEN_SPACE = re.compile(r"[ⓢ|ⓕ|ⓑ] ")
TWO_SPACES_LM_REGEX = re.compile(r"▁ ▁")
@pytest.mark.parametrize(
"load_entities,transcription_entities_worker_version,expected_subword_language_corpus",
(
(
True,
"worker_version_id",
"""▁ ⓢ l a u l ont ▁ ⓕ f r an c oi s ▁ ⓑ 8
▁ ⓢ c i re t ▁ ⓕ an t oi ne ▁ ⓑ 2 7
▁ ⓢ c i re t ▁ ⓕ m a r ie ▁ ⓑ 2 8
▁ ⓢ c i re t ▁ ⓕ m a r ie ▁ ⓑ 2
▁ ⓢ e u re s t on ▁ ⓕ so l an g e ▁ ⓑ 1 0
▁ ⓢ t e r ont u s s ie u x ▁ ⓕ j e an ▁ ⓑ 2
▁ ⓢ p re s s on e t ▁ ⓕ m a r ie ▁ ⓑ 1 2""",
),
(
False,
"worker_version_id",
"""▁ la u l ont ▁ f r an c oi s ▁ 8
▁ c i re t ▁ an t oi ne ▁ 2 7
▁ c i re t ▁ m a r ie ▁ 2 8
▁ c i re t ▁ m a r ie ▁ 2
▁ e u res t on ▁ so l an g e ▁ 1 0
▁ t e r ont u ss ie u x ▁ j e an ▁ 2
▁ p res so ne t ▁ m a r ie ▁ 1 2""",
),
(
True,
False,
"""▁ ⓢ L a u l o n t ▁ ⓕ F r a n c o i s ▁ ⓑ 8
▁ ⓢ C i r e t ▁ ⓕ A n t o i n e ▁ ⓑ 2 7
▁ ⓢ C i r e t ▁ ⓕ M a r ie ▁ ⓑ 2 8
▁ ⓢ C i r e t ▁ ⓕ M a r ie ▁ ⓑ 2
▁ ⓢ E u r e s t o n ▁ ⓕ S o l a n g e ▁ ⓑ 1 0
▁ ⓢ T e r o n t u s s ie u x ▁ ⓕ J e a n ▁ ⓑ 2
▁ ⓢ P r e s s o n e t ▁ ⓕ M a r ie ▁ ⓑ 1 2""",
),
(
False,
False,
"""▁ L a u l ont ▁ F r an c oi s ▁ 8
▁ C i re t ▁ A n t oi n e ▁ 2 7
▁ C i re t ▁ M a r ie ▁ 2 8
▁ C i re t ▁ M a r ie ▁ 2
▁ E u re s t on ▁ S o l an g e ▁ 1 0
▁ T e r ont u s s ie u x ▁ J e an ▁ 2
▁ P re s s on e t ▁ M a r ie ▁ 1 2""",
),
),
)
@pytest.mark.parametrize("keep_spaces", [True, False])
def test_language_model(
load_entities,
keep_spaces,
transcription_entities_worker_version,
expected_subword_language_corpus,
split_content,
tmp_path,
):
output = tmp_path / "build"
(output / "language_model").mkdir(parents=True, exist_ok=True)
# Mock tokens
tokens_path = EXTRACTION_DATA_PATH / "tokens.yml"
tokens = [
token
for entity_type in parse_tokens(tokens_path).values()
for token in [entity_type.start, entity_type.end]
if token
]
# Mock "labels.json"
_, labels_content = change_split_content(
load_entities,
transcription_entities_worker_version,
keep_spaces,
split_content,
tokens,
{
"test": {
"images/test/dataset_id/test-page_1-line_1.jpg": "ⓢLeunaut ⓕClau⁇e ⓑ⁇⁇",
"images/test/dataset_id/test-page_1-line_2.jpg": "ⓢ⁇aurac⁇o ⓕClau⁇ine ⓑ⁇⁇",
"images/test/dataset_id/test-page_1-line_3.jpg": "ⓢLaurent ⓕJac⁇use ⓑ21",
"images/test/dataset_id/test-page_2-line_1.jpg": "ⓢ⁇alette ⓕElisa⁇et⁇ ⓑ7⁇",
"images/test/dataset_id/test-page_2-line_2.jpg": "ⓢTan⁇ol ⓕJean ⓑ7⁇",
"images/test/dataset_id/test-page_2-line_3.jpg": "ⓢ⁇auret ⓕJean ⓑ⁇⁇",
},
"train": {
"images/train/dataset_id/train-page_1-line_1.jpg": "ⓢLaulont ⓕFrancois ⓑ8",
"images/train/dataset_id/train-page_1-line_2.jpg": "ⓢCiret ⓕAntoine ⓑ27",
"images/train/dataset_id/train-page_1-line_3.jpg": "ⓢCiret ⓕMarie ⓑ28",
"images/train/dataset_id/train-page_1-line_4.jpg": "ⓢCiret ⓕMarie ⓑ2",
"images/train/dataset_id/train-page_2-line_1.jpg": "ⓢEureston ⓕSolange ⓑ10",
"images/train/dataset_id/train-page_2-line_2.jpg": "ⓢTerontussieux ⓕJean ⓑ2",
"images/train/dataset_id/train-page_2-line_3.jpg": "ⓢPressonet ⓕMarie ⓑ12",
},
"val": {
"images/val/dataset_id/val-page_1-line_1.jpg": "ⓢCirau⁇ ⓕAntoine ⓑ⁇⁇",
"images/val/dataset_id/val-page_1-line_2.jpg": "ⓢCirau⁇ ⓕPriser ⓑ⁇⁇",
"images/val/dataset_id/val-page_1-line_3.jpg": "ⓢCirau⁇ ⓕElisa⁇et⁇ ⓑ⁇⁇",
},
},
)
(output / "labels.json").write_text(json.dumps(labels_content))
# Mock "charset.pkl"
expected_charset = {""}
for value in labels_content["train"].values():
expected_charset.update(set(value))
if load_entities:
expected_charset.update(tokens)
(output / "charset.pkl").write_bytes(pickle.dumps(sorted(list(expected_charset))))
extractor = LanguageModelBuilder(
output=output,
tokens=tokens_path if load_entities else None,
subword_vocab_size=40,
)
extractor.run()
# Check files
expected_paths = [
# Previous files
output / "charset.pkl",
output / "labels.json",
# Language resources
output / "language_model" / "corpus_characters.txt",
output / "language_model" / "corpus_subwords.txt",
output / "language_model" / "corpus_words.txt",
output / "language_model" / "lexicon_characters.txt",
output / "language_model" / "lexicon_subwords.txt",
output / "language_model" / "lexicon_words.txt",
output / "language_model" / "subword_tokenizer.model",
output / "language_model" / "subword_tokenizer.vocab",
output / "language_model" / "tokens.txt",
]
assert sorted(filter(methodcaller("is_file"), output.rglob("*"))) == expected_paths
# Check "language_corpus.txt"
expected_char_language_corpus = """ⓢ L a u l o n t ▁ ▁ ⓕ F r a n c o i s ▁ ▁ ⓑ 8
ⓢ C i r e t ▁ ▁ ⓕ A n t o i n e ▁ ▁ ⓑ 2 7
ⓢ C i r e t ▁ ▁ ⓕ M a r i e ▁ ▁ ⓑ 2 8
ⓢ C i r e t ▁ ▁ ⓕ M a r i e ▁ ▁ ⓑ 2
ⓢ E u r e s t o n ▁ ▁ ⓕ S o l a n g e ▁ ▁ ⓑ 1 0
ⓢ T e r o n t u s s i e u x ▁ ▁ ⓕ J e a n ▁ ▁ ⓑ 2
ⓢ P r e s s o n e t ▁ ▁ ⓕ M a r i e ▁ ▁ ⓑ 1 2"""
expected_word_language_corpus = """ⓢ Laulont ▁ ⓕ Francois ▁ ⓑ 8
ⓢ Ciret ▁ ⓕ Antoine ▁ ⓑ 27
ⓢ Ciret ▁ ⓕ Marie ▁ ⓑ 28
ⓢ Ciret ▁ ⓕ Marie ▁ ⓑ 2
ⓢ Eureston ▁ ⓕ Solange ▁ ⓑ 10
ⓢ Terontussieux ▁ ⓕ Jean ▁ ⓑ 2
ⓢ Pressonet ▁ ⓕ Marie ▁ ⓑ 12"""
# Transcriptions with worker version are in lowercase
if transcription_entities_worker_version:
expected_char_language_corpus = expected_char_language_corpus.lower()
expected_word_language_corpus = expected_word_language_corpus.lower()
expected_subword_language_corpus = expected_subword_language_corpus.lower()
# If we do not load entities, remove tokens
if not load_entities:
expected_char_language_corpus = ENTITY_TOKEN_SPACE.sub(
"", expected_char_language_corpus
)
expected_word_language_corpus = ENTITY_TOKEN_SPACE.sub(
"", expected_word_language_corpus
)
expected_subword_language_corpus = ENTITY_TOKEN_SPACE.sub(
"", expected_subword_language_corpus
)
# Replace double spaces with regular space
if not keep_spaces:
expected_char_language_corpus = TWO_SPACES_LM_REGEX.sub(
"", expected_char_language_corpus
)
expected_word_language_corpus = TWO_SPACES_LM_REGEX.sub(
"", expected_word_language_corpus
)
expected_subword_language_corpus = TWO_SPACES_LM_REGEX.sub(
"", expected_subword_language_corpus
)
assert (
output / "language_model" / "corpus_characters.txt"
).read_text() == expected_char_language_corpus
assert (
output / "language_model" / "corpus_words.txt"
).read_text() == expected_word_language_corpus
assert (
output / "language_model" / "corpus_subwords.txt"
).read_text() == expected_subword_language_corpus
# Check "language_tokens.txt"
expected_language_tokens = [
"" if t.isspace() else t for t in sorted(list(expected_charset))
]
expected_language_tokens.append("")
assert (output / "language_model" / "tokens.txt").read_text() == "\n".join(
expected_language_tokens
)
# Check "language_lexicon.txt"
expected_language_char_lexicon = [f"{t} {t}" for t in expected_language_tokens]
assert (
output / "language_model" / "lexicon_characters.txt"
).read_text() == "\n".join(expected_language_char_lexicon)
word_vocab = set([word for word in expected_word_language_corpus.split()])
expected_language_word_lexicon = [
f"{word} {' '.join(word)}" for word in sorted(word_vocab)
]
assert (output / "language_model" / "lexicon_words.txt").read_text() == "\n".join(
expected_language_word_lexicon
)
subword_vocab = set(
[subword for subword in expected_subword_language_corpus.split()]
)
expected_language_subword_lexicon = [
f"{subword} {' '.join(subword)}" for subword in sorted(subword_vocab)
]
assert (
output / "language_model" / "lexicon_subwords.txt"
).read_text() == "\n".join(expected_language_subword_lexicon)
@pytest.mark.parametrize(
"expected_subword_language_corpus,subword_vocab_size",
(
(
"""▁ ⓢ L a u l o n t ▁ ⓕ F r a n c o i s ▁ ⓑ 8
▁ ⓢ C i r e t ▁ ⓕ A n t o i n e ▁ ⓑ 2 7
▁ ⓢ C i r e t ▁ ⓕ M a r ie ▁ ⓑ 2 8
▁ ⓢ C i r e t ▁ ⓕ M a r ie ▁ ⓑ 2
▁ ⓢ E u r e s t o n ▁ ⓕ S o l a n g e ▁ ⓑ 1 0
▁ ⓢ T e r o n t u s s ie u x ▁ ⓕ J e a n ▁ ⓑ 2
▁ ⓢ P r e s s o n e t ▁ ⓕ M a r ie ▁ ⓑ 1 2""",
40,
),
(
"""▁ ⓢ L a u l ont ▁ ⓕ F r an c oi s ▁ ⓑ 8
▁ ⓢ C i re t ▁ ⓕ A n t oi n e ▁ ⓑ 2 7
▁ ⓢ C i re t ▁ ⓕ M a r ie ▁ ⓑ 2 8
▁ ⓢ C i re t ▁ ⓕ M a r ie ▁ ⓑ 2
▁ ⓢ E u re s t on ▁ ⓕ S o l an g e ▁ ⓑ 1 0
▁ ⓢ T e r ont u s s ie u x ▁ ⓕ J e an ▁ ⓑ 2
▁ ⓢ P re s s on e t ▁ ⓕ M a r ie ▁ ⓑ 1 2""",
45,
),
),
)
@pytest.mark.parametrize("keep_spaces", [True, False])
def test_language_model_subword_vocab_size(
keep_spaces,
expected_subword_language_corpus,
subword_vocab_size,
split_content,
tmp_path,
):
output = tmp_path / "build"
(output / "language_model").mkdir(parents=True, exist_ok=True)
# Mock tokens
tokens_path = EXTRACTION_DATA_PATH / "tokens.yml"
tokens = [
token
for entity_type in parse_tokens(tokens_path).values()
for token in [entity_type.start, entity_type.end]
if token
]
# Mock "labels.json"
_, labels_content = change_split_content(
True,
False,
keep_spaces,
split_content,
tokens,
{
"test": {
"images/test/dataset_id/test-page_1-line_1.jpg": "ⓢLeunaut ⓕClau⁇e ⓑ⁇⁇",
"images/test/dataset_id/test-page_1-line_2.jpg": "ⓢ⁇aurac⁇o ⓕClau⁇ine ⓑ⁇⁇",
"images/test/dataset_id/test-page_1-line_3.jpg": "ⓢLaurent ⓕJac⁇use ⓑ21",
"images/test/dataset_id/test-page_2-line_1.jpg": "ⓢ⁇alette ⓕElisa⁇et⁇ ⓑ7⁇",
"images/test/dataset_id/test-page_2-line_2.jpg": "ⓢTan⁇ol ⓕJean ⓑ7⁇",
"images/test/dataset_id/test-page_2-line_3.jpg": "ⓢ⁇auret ⓕJean ⓑ⁇⁇",
},
"train": {
"images/train/dataset_id/train-page_1-line_1.jpg": "ⓢLaulont ⓕFrancois ⓑ8",
"images/train/dataset_id/train-page_1-line_2.jpg": "ⓢCiret ⓕAntoine ⓑ27",
"images/train/dataset_id/train-page_1-line_3.jpg": "ⓢCiret ⓕMarie ⓑ28",
"images/train/dataset_id/train-page_1-line_4.jpg": "ⓢCiret ⓕMarie ⓑ2",
"images/train/dataset_id/train-page_2-line_1.jpg": "ⓢEureston ⓕSolange ⓑ10",
"images/train/dataset_id/train-page_2-line_2.jpg": "ⓢTerontussieux ⓕJean ⓑ2",
"images/train/dataset_id/train-page_2-line_3.jpg": "ⓢPressonet ⓕMarie ⓑ12",
},
"val": {
"images/val/dataset_id/val-page_1-line_1.jpg": "ⓢCirau⁇ ⓕAntoine ⓑ⁇⁇",
"images/val/dataset_id/val-page_1-line_2.jpg": "ⓢCirau⁇ ⓕPriser ⓑ⁇⁇",
"images/val/dataset_id/val-page_1-line_3.jpg": "ⓢCirau⁇ ⓕElisa⁇et⁇ ⓑ⁇⁇",
},
},
)
(output / "labels.json").write_text(json.dumps(labels_content))
# Mock "charset.pkl"
expected_charset = {""}
for value in labels_content["train"].values():
expected_charset.update(set(value))
expected_charset.update(tokens)
(output / "charset.pkl").write_bytes(pickle.dumps(sorted(list(expected_charset))))
extractor = LanguageModelBuilder(
output=output,
tokens=tokens_path,
subword_vocab_size=subword_vocab_size,
)
extractor.run()
# Check files
expected_paths = [
# Previous files
output / "charset.pkl",
output / "labels.json",
# Language resources
output / "language_model" / "corpus_characters.txt",
output / "language_model" / "corpus_subwords.txt",
output / "language_model" / "corpus_words.txt",
output / "language_model" / "lexicon_characters.txt",
output / "language_model" / "lexicon_subwords.txt",
output / "language_model" / "lexicon_words.txt",
output / "language_model" / "subword_tokenizer.model",
output / "language_model" / "subword_tokenizer.vocab",
output / "language_model" / "tokens.txt",
]
assert sorted(filter(methodcaller("is_file"), output.rglob("*"))) == expected_paths
# Check "language_corpus.txt"
expected_char_language_corpus = """ⓢ L a u l o n t ▁ ▁ ⓕ F r a n c o i s ▁ ▁ ⓑ 8
ⓢ C i r e t ▁ ▁ ⓕ A n t o i n e ▁ ▁ ⓑ 2 7
ⓢ C i r e t ▁ ▁ ⓕ M a r i e ▁ ▁ ⓑ 2 8
ⓢ C i r e t ▁ ▁ ⓕ M a r i e ▁ ▁ ⓑ 2
ⓢ E u r e s t o n ▁ ▁ ⓕ S o l a n g e ▁ ▁ ⓑ 1 0
ⓢ T e r o n t u s s i e u x ▁ ▁ ⓕ J e a n ▁ ▁ ⓑ 2
ⓢ P r e s s o n e t ▁ ▁ ⓕ M a r i e ▁ ▁ ⓑ 1 2"""
expected_word_language_corpus = """ⓢ Laulont ▁ ⓕ Francois ▁ ⓑ 8
ⓢ Ciret ▁ ⓕ Antoine ▁ ⓑ 27
ⓢ Ciret ▁ ⓕ Marie ▁ ⓑ 28
ⓢ Ciret ▁ ⓕ Marie ▁ ⓑ 2
ⓢ Eureston ▁ ⓕ Solange ▁ ⓑ 10
ⓢ Terontussieux ▁ ⓕ Jean ▁ ⓑ 2
ⓢ Pressonet ▁ ⓕ Marie ▁ ⓑ 12"""
# Replace double spaces with regular space
if not keep_spaces:
expected_char_language_corpus = TWO_SPACES_LM_REGEX.sub(
"", expected_char_language_corpus
)
expected_word_language_corpus = TWO_SPACES_LM_REGEX.sub(
"", expected_word_language_corpus
)
expected_subword_language_corpus = TWO_SPACES_LM_REGEX.sub(
"", expected_subword_language_corpus
)
assert (
output / "language_model" / "corpus_characters.txt"
).read_text() == expected_char_language_corpus
assert (
output / "language_model" / "corpus_words.txt"
).read_text() == expected_word_language_corpus
assert (
output / "language_model" / "corpus_subwords.txt"
).read_text() == expected_subword_language_corpus
# Check "language_tokens.txt"
expected_language_tokens = [
"" if t.isspace() else t for t in sorted(list(expected_charset))
]
expected_language_tokens.append("")
assert (output / "language_model" / "tokens.txt").read_text() == "\n".join(
expected_language_tokens
)
# Check "language_lexicon.txt"
expected_language_char_lexicon = [f"{t} {t}" for t in expected_language_tokens]
assert (
output / "language_model" / "lexicon_characters.txt"
).read_text() == "\n".join(expected_language_char_lexicon)
word_vocab = set([word for word in expected_word_language_corpus.split()])
expected_language_word_lexicon = [
f"{word} {' '.join(word)}" for word in sorted(word_vocab)
]
assert (output / "language_model" / "lexicon_words.txt").read_text() == "\n".join(
expected_language_word_lexicon
)
subword_vocab = set(
[subword for subword in expected_subword_language_corpus.split()]
)
expected_language_subword_lexicon = [
f"{subword} {' '.join(subword)}" for subword in sorted(subword_vocab)
]
assert (
output / "language_model" / "lexicon_subwords.txt"
).read_text() == "\n".join(expected_language_subword_lexicon)