Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: atr/dan
Commits on Source (6)
Showing 437 additions and 104 deletions
*.gif filter=lfs diff=lfs merge=lfs -text
**/*.pt filter=lfs diff=lfs merge=lfs -text
tests/data/prediction/language_model.arpa filter=lfs diff=lfs merge=lfs -text
@@ -56,7 +56,7 @@ To apply DAN to an image, one needs to first add a few imports and to load an im
```python
import cv2
from dan.ocr.predict.prediction import DAN
from dan.ocr.predict.inference import DAN
image = cv2.cvtColor(cv2.imread(IMAGE_PATH), cv2.COLOR_BGR2RGB)
```
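Once the image is loaded, the model can be loaded and applied. The sketch below is illustrative only: `MODEL_PATH` is a placeholder, and the `load`/`predict` call signatures are assumptions rather than the documented API, so refer to the prediction documentation for the exact calls.

```python
# Hedged sketch only: the load()/predict() signatures below are assumptions.
model = DAN("cpu")                   # target device
model.load(MODEL_PATH, mode="eval")  # hypothetical loading call
text, confidences = model.predict(image, confidences=True)  # hypothetical prediction call
print(text)
```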
@@ -84,16 +84,16 @@ This package provides three subcommands. To get more information about any subco
### Get started
See the [dedicated section](https://atr.pages.teklia.com/dan/get_started/training/) on the official DAN documentation.
See the [dedicated page](https://atr.pages.teklia.com/dan/get_started/training/) on the official DAN documentation.
### Data extraction from Arkindex
See the [dedicated section](https://atr.pages.teklia.com/dan/usage/datasets/extract/) on the official DAN documentation.
See the [dedicated page](https://atr.pages.teklia.com/dan/usage/datasets/extract/) on the official DAN documentation.
### Model training
See the [dedicated section](https://atr.pages.teklia.com/dan/usage/train/) on the official DAN documentation.
See the [dedicated page](https://atr.pages.teklia.com/dan/usage/train/) on the official DAN documentation.
### Model prediction
See the [dedicated section](https://atr.pages.teklia.com/dan/usage/predict/) on the official DAN documentation.
See the [dedicated page](https://atr.pages.teklia.com/dan/usage/predict/) on the official DAN documentation.
@@ -4,7 +4,9 @@ Preprocess datasets for training.
"""
from dan.datasets.analyze import add_analyze_parser
from dan.datasets.entities import add_entities_parser
from dan.datasets.extract import add_extract_parser
from dan.datasets.tokens import add_tokens_parser
def add_dataset_parser(subcommands) -> None:
@@ -17,3 +19,5 @@ def add_dataset_parser(subcommands) -> None:
add_extract_parser(subcommands)
add_analyze_parser(subcommands)
add_entities_parser(subcommands)
add_tokens_parser(subcommands)
# -*- coding: utf-8 -*-
"""
Extract entities from Arkindex using a corpus export.
"""
from pathlib import Path
from dan.datasets.entities.extract import run
def add_entities_parser(subcommands) -> None:
parser = subcommands.add_parser(
"entities",
description=__doc__,
help=__doc__,
)
parser.add_argument(
"database",
type=Path,
help="Path where the data were exported from Arkindex.",
)
parser.add_argument(
"--output-file",
type=Path,
default=Path("entities.yml"),
required=False,
help="Path to a YAML file to save the extracted entities.",
)
parser.set_defaults(func=run)
# -*- coding: utf-8 -*-
from operator import itemgetter
from pathlib import Path
import yaml
from arkindex_export import EntityType, open_database
def run(database: Path, output_file: Path) -> None:
# Load SQLite database
open_database(database)
# Extract and save entities to YAML
entities = list(
map(itemgetter(0), EntityType.select(EntityType.name).distinct().tuples())
)
output_file.write_text(
yaml.safe_dump({"entities": entities}, explicit_start=True, allow_unicode=True)
)
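For illustration, the YAML written by this command can be read back as a flat list of entity type names (the file name below is the default `--output-file` value; the printed values are made up):

```python
# Read back the entities written by the command above (default output name).
import yaml
from pathlib import Path

entities = yaml.safe_load(Path("entities.yml").read_text())["entities"]
print(entities)  # e.g. ['firstname', 'surname'] -- depends on the Arkindex export
```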
# -*- coding: utf-8 -*-
"""
Extract dataset from Arkindex using API.
Extract dataset from Arkindex using a corpus export.
"""
import argparse
import pathlib
from uuid import UUID
from dan.datasets.extract.extract import run
from dan.datasets.extract.arkindex import run
MANUAL_SOURCE = "manual"
@@ -144,7 +144,14 @@ def add_extract_parser(subcommands) -> None:
parser.add_argument(
"--max-height",
type=int,
help="Images larger than this height will be resized to this width.",
help="Images larger than this height will be resized to this height.",
)
parser.add_argument(
"--subword-vocab-size",
type=int,
default=1000,
help="Size of the vocabulary to train the sentencepiece subword tokenizer needed for language model.",
)
# Formatting arguments
......
@@ -30,8 +30,10 @@ from dan.datasets.extract.exceptions import (
UnknownTokenInText,
)
from dan.datasets.extract.utils import (
Tokenizer,
download_image,
get_bbox,
get_vocabulary,
insert_token,
normalize_linebreaks,
normalize_spaces,
@@ -77,6 +79,7 @@ class ArkindexExtractor:
keep_spaces: bool = False,
image_extension: str = "",
allow_empty: bool = False,
subword_vocab_size: int = 1000,
) -> None:
self.folders = folders
self.element_type = element_type
@@ -92,14 +95,14 @@ class ArkindexExtractor:
self.image_extension = image_extension
self.allow_empty = allow_empty
self.mapping = LMTokenMapping()
self.keep_spaces = keep_spaces
self.subword_vocab_size = subword_vocab_size
self.data: Dict = defaultdict(dict)
self.charset = set()
self.language_corpus = []
self.language_corpus = defaultdict(list)
self.language_tokens = []
self.language_lexicon = []
self.language_lexicon = defaultdict(list)
# Image download tasks to process
self.tasks: List[Dict[str, str]] = []
@@ -275,12 +278,6 @@ class ArkindexExtractor:
)
return text.strip()
def format_text_language_model(self, text: str):
"""
Format text for the language model. Return the text tokenized at character-level.
"""
return " ".join(map(self.mapping.encode_token, list(text.strip())))
def process_element(
self,
element: Element,
@@ -319,10 +316,6 @@ class ArkindexExtractor:
self.data[split][str(image_path)] = text
self.charset = self.charset.union(set(text))
# Language model should be built using only text from the training set
if split == "train":
self.language_corpus.append(self.format_text_language_model(text))
def process_parent(
self,
pbar,
@@ -361,6 +354,11 @@ class ArkindexExtractor:
"""
Convert charset to a LM-compatible charset. Ensure that special LM tokens do not appear in the charset.
"""
logger.info("Preparing language resources")
# Add unknown token to charset
self.charset.add(self.unknown_token)
# Build LM tokens
for token in sorted(list(self.charset)):
assert (
token not in self.mapping.encode.values()
@@ -368,15 +366,40 @@
self.language_tokens.append(
self.mapping.encode[token]
) if token in self.mapping.encode else self.language_tokens.append(token)
# Add the special blank token
self.language_tokens.append(self.mapping.ctc.encoded)
# Build lexicon
assert all(
[len(token) == 1 for token in self.language_lexicon]
), "Tokens should be single characters."
self.language_lexicon = [f"{token} {token}" for token in self.language_tokens]
# Build LM corpus
train_corpus = [
text.replace(self.mapping.linebreak.display, self.mapping.space.display)
for text in self.data["train"].values()
]
tokenizer = Tokenizer(
training_corpus=train_corpus,
charset=self.language_tokens,
unknown_token=self.unknown_token,
outdir=self.output / "language_model",
mapping=self.mapping,
tokens=self.tokens,
subword_vocab_size=self.subword_vocab_size,
)
for level, tokenize in (
("characters", tokenizer.char_tokenize),
("words", tokenizer.word_tokenize),
("subwords", tokenizer.subword_tokenize),
):
self.language_corpus[level] = list(map(tokenize, train_corpus))
# Build LM lexicon
self.language_lexicon["characters"] = [
f"{token} {token}" for token in self.language_tokens
]
for level in ["words", "subwords"]:
self.language_lexicon[level] = [
f"{token} {tokenizer.char_tokenize(token)}"
for token in get_vocabulary(self.language_corpus[level])
]
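As an aside, each word- and subword-level lexicon line pairs a vocabulary entry with its character tokenization. A standalone sketch of that format, using a simplified stand-in for `tokenizer.char_tokenize` and made-up vocabulary:

```python
# Standalone illustration of the lexicon format built above (made-up vocabulary).
vocabulary = ["Paris", "1889"]
char_tokenize = lambda word: " ".join(word)  # simplified stand-in for tokenizer.char_tokenize
lexicon = [f"{word} {char_tokenize(word)}" for word in vocabulary]
print(lexicon)  # ['Paris P a r i s', '1889 1 8 8 9']
```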
def export(self):
(self.output / "labels.json").write_text(
......@@ -386,15 +409,16 @@ class ArkindexExtractor:
indent=4,
)
)
(self.output / "language_model" / "corpus.txt").write_text(
"\n".join(self.language_corpus)
)
for level in ["characters", "words", "subwords"]:
(self.output / "language_model" / f"corpus_{level}.txt").write_text(
"\n".join(self.language_corpus[level])
)
(self.output / "language_model" / f"lexicon_{level}.txt").write_text(
"\n".join(self.language_lexicon[level])
)
(self.output / "language_model" / "tokens.txt").write_text(
"\n".join(self.language_tokens)
)
(self.output / "language_model" / "lexicon.txt").write_text(
"\n".join(self.language_lexicon)
)
(self.output / "charset.pkl").write_bytes(
pickle.dumps(sorted(list(self.charset)))
)
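Since the charset is pickled as a sorted list of characters, it can be loaded back as follows (the `output` directory name is illustrative and corresponds to whatever output folder was passed to the extraction command):

```python
# Read back the pickled charset written by export(); "output" is an assumed directory name.
import pickle
from pathlib import Path

charset = pickle.loads((Path("output") / "charset.pkl").read_bytes())
print(len(charset), charset[:10])
```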
@@ -477,6 +501,7 @@ def run(
image_format: str,
keep_spaces: bool,
allow_empty: bool,
subword_vocab_size: int,
):
assert database.exists(), f"No file found @ {database}"
open_database(path=database)
@@ -503,4 +528,5 @@ def run(
keep_spaces=keep_spaces,
image_extension=image_format,
allow_empty=allow_empty,
subword_vocab_size=subword_vocab_size,
).run()
# -*- coding: utf-8 -*-
import itertools
import logging
import operator
import re
from dataclasses import dataclass, field
from io import BytesIO
from typing import List
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import Iterator, List, Optional, Union
import requests
import sentencepiece as spm
from nltk import wordpunct_tokenize
from PIL import Image, ImageOps
from tenacity import (
retry,
@@ -13,7 +20,7 @@ from tenacity import (
wait_exponential,
)
from dan.utils import EntityType
from dan.utils import EntityType, LMTokenMapping
logger = logging.getLogger(__name__)
@@ -117,3 +124,117 @@ def get_bbox(polygon: List[List[int]]) -> str:
x, y = min(all_x), min(all_y)
width, height = max(all_x) - x, max(all_y) - y
return ",".join(list(map(str, [int(x), int(y), int(width), int(height)])))
def get_vocabulary(tokenized_text: List[str]) -> List[str]:
"""
Compute the sorted vocabulary of a tokenized text corpus.
:param tokenized_text: List of tokenized documents (space-separated tokens).
"""
return sorted(set([token for doc in tokenized_text for token in doc.split()]))
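A quick illustration of `get_vocabulary` on a toy tokenized corpus:

```python
from dan.datasets.extract.utils import get_vocabulary

# Each document is a space-separated token string; the result is the sorted token set.
print(get_vocabulary(["le chat", "le chien"]))  # ['chat', 'chien', 'le']
```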
@dataclass
class Tokenizer:
"""
A multi-level tokenizer (char, subword, word), where the subword tokenizer is trained using sentencepiece.
:param training_corpus: List of training text.
:param charset: List of tokens (characters) allowed in the language model.
:param unknown_token: Token used to replace characters that are not part of the charset.
:param outdir: Path to save the subword tokenizer.
:param mapping: Mapping between displayed and encoded versions of special characters.
:param tokens: Start and end tokens used to represent named entities.
:param subword_vocab_size: Size of the vocabulary used to train the subword tokenizer.
"""
training_corpus: List[str]
charset: List[str]
unknown_token: str
outdir: Path
mapping: LMTokenMapping
tokens: Optional[EntityType] = None
subword_vocab_size: int = 1000
sentencepiece_model: spm.SentencePieceProcessor = field(init=False)
@property
def prefix(self):
return self.outdir / "subword_tokenizer"
@property
def ner_tokens(self) -> Union[List[str], Iterator[str]]:
if self.tokens is None:
return []
return itertools.chain(
map(operator.attrgetter("start"), self.tokens.values()),
filter(
operator.truth, map(operator.attrgetter("end"), self.tokens.values())
),
)
@property
def mapping_tokens(self) -> List[str]:
return [token.encoded for token in self.mapping]
@property
def special_tokens(self) -> List[str]:
return list(set(itertools.chain(self.mapping_tokens, self.ner_tokens)))
def __post_init__(self) -> None:
"""
Train a sentencepiece model on the training corpus.
"""
# Write the corpus in a text file
logger.info("Training a sentencepiece model for subword tokenization")
with NamedTemporaryFile(dir=self.outdir, suffix=".txt", mode="w") as tmp:
tmp.write("\n".join(self.training_corpus))
tmp.flush()
spm.SentencePieceTrainer.train(
input=tmp.name,
vocab_size=self.subword_vocab_size,
model_prefix=self.prefix,
user_defined_symbols=self.special_tokens,
)
# Load the model
self.sentencepiece_model = spm.SentencePieceProcessor(
model_file=str(self.prefix.with_suffix(".model"))
)
def subword_tokenize(self, text: str) -> str:
"""
Tokenize into subwords. Sampling is disabled to ensure reproducibility.
"""
tokens = self.sentencepiece_model.encode(text, out_type=str)
return " ".join(map("".join, map(self.encode, tokens)))
def word_tokenize(self, text: str) -> str:
"""
Tokenize text into a string of space-separated words. Spaces (⎵) and NER tokens are considered as words.
:param text: Text to be tokenized.
"""
words = list(map("".join, map(self.encode, wordpunct_tokenize(text))))
return " ".join(
[
word + f" {self.mapping.space.encoded}"
if (i != len(words) - 1 and word not in self.ner_tokens)
else word
for i, word in enumerate(words)
]
)
def char_tokenize(self, text: str) -> str:
"""
Tokenize text into a string of space-separated characters.
:param text: Text to be tokenized.
"""
return " ".join(
[
char if char in self.charset else self.unknown_token
for char in self.encode(text)
]
)
def encode(self, text: List[str]) -> Iterator[str]:
"""
Encode special tokens.
:param text: Text to be encoded.
"""
return map(self.mapping.encode_token, text)
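The word level builds on NLTK's `wordpunct_tokenize`, which splits words and punctuation apart before `word_tokenize` re-inserts the encoded space token; for instance:

```python
from nltk import wordpunct_tokenize

# Punctuation is split off as separate tokens before spaces are re-encoded.
print(wordpunct_tokenize("Jean-Pierre, 1889."))  # ['Jean', '-', 'Pierre', ',', '1889', '.']
```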
# -*- coding: utf-8 -*-
"""
Generate the YAML file containing entities and their token(s) to train a DAN model
"""
from pathlib import Path
from dan.datasets.tokens.generate import run
def add_tokens_parser(subcommands) -> None:
parser = subcommands.add_parser(
"tokens",
description=__doc__,
help=__doc__,
)
parser.add_argument(
"entities",
type=Path,
help="Path to a YAML file containing the extracted entities.",
)
parser.add_argument(
"--end-tokens",
action="store_true",
help="Whether to generate end tokens along with starting tokens.",
)
parser.add_argument(
"--output-file",
type=Path,
default=Path("tokens.yml"),
required=False,
help="Path to a YAML file to save the entities and their token(s).",
)
parser.set_defaults(func=run)
# -*- coding: utf-8 -*-
from pathlib import Path
from typing import Iterable
import yaml
OFFSET = 86
LIMIT = 160
STARTING_TOKEN = "\u2460"
def get_token() -> Iterable[str]:
offset = OFFSET
while offset < LIMIT:
yield chr(ord(STARTING_TOKEN) + offset % LIMIT)
offset += 1
raise Exception(f"More than {LIMIT} tokens asked")
def run(entities: Path, end_tokens: bool, output_file: Path) -> None:
# Load extracted entities
entities = yaml.safe_load(entities.read_text())
# Generate associated starting/ending token
token_generator = get_token()
tokens = {}
for entity in entities.get("entities", []):
tokens[entity] = {
"start": next(token_generator),
"end": next(token_generator) if end_tokens else "",
}
# Save entities & tokens to YAML
output_file.write_text(
yaml.safe_dump(tokens, explicit_start=True, allow_unicode=True, sort_keys=False)
)
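To see which symbols are generated: with `OFFSET = 86`, the first yielded code point is `chr(0x2460 + 86)`, i.e. the circled capital letters. So for two entities extracted with `--end-tokens`, the first entity would map to start `Ⓐ` / end `Ⓑ` and the second to start `Ⓒ` / end `Ⓓ`.

```python
# The generator starts at chr(0x2460 + 86) == "Ⓐ" and walks through the circled letters.
from dan.datasets.tokens.generate import get_token

generator = get_token()
print(next(generator), next(generator), next(generator), next(generator))  # Ⓐ Ⓑ Ⓒ Ⓓ
```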
@@ -6,7 +6,7 @@ Predict on an image using a trained DAN model.
import pathlib
from dan.ocr.predict.attention import Level
from dan.ocr.predict.prediction import run
from dan.ocr.predict.inference import run
from dan.utils import parse_tokens
@@ -70,7 +70,7 @@ def add_predict_parser(subcommands) -> None:
"--temperature",
type=float,
default=1.0,
help="Temperature scaling scalar parameter",
help="Temperature scaling scalar parameter.",
required=False,
)
parser.add_argument(
@@ -104,7 +104,7 @@ def add_predict_parser(subcommands) -> None:
"--attention-map-scale",
type=float,
default=0.5,
help="Image scaling factor before creating the GIF",
help="Image scaling factor before creating the GIF.",
required=False,
)
parser.add_argument(
......
@@ -356,62 +356,62 @@ def process_batch(
logger.info("Prediction parsing...")
for idx, image_path in enumerate(image_batch):
predicted_text = prediction["text"][idx]
result = {"text": predicted_text}
# Return LM results
if use_language_model:
result["language_model"] = {
"text": prediction["language_model"]["text"][idx],
"confidence": prediction["language_model"]["confidence"][idx],
}
# Return extracted objects (coordinates, text, confidence)
if predict_objects:
result["objects"] = prediction["objects"][idx]
# Return mean confidence score
if confidence_score:
result["confidences"] = {}
char_confidences = prediction["confidences"][idx]
result["confidences"]["total"] = np.around(np.mean(char_confidences), 2)
for level in confidence_score_levels:
result["confidences"][level.value] = []
texts, confidences, _ = split_text_and_confidences(
predicted_text,
char_confidences,
level,
word_separators,
line_separators,
tokens,
)
for text, conf in zip(texts, confidences):
result["confidences"][level.value].append(
{"text": text, "confidence": conf}
result = {"text": predicted_text, "confidences": {}, "language_model": {}}
if predicted_text:
# Return LM results
if use_language_model:
result["language_model"] = {
"text": prediction["language_model"]["text"][idx],
"confidence": prediction["language_model"]["confidence"][idx],
}
# Return extracted objects (coordinates, text, confidence)
if predict_objects:
result["objects"] = prediction["objects"][idx]
# Return mean confidence score
if confidence_score:
char_confidences = prediction["confidences"][idx]
result["confidences"]["total"] = np.around(np.mean(char_confidences), 2)
for level in confidence_score_levels:
result["confidences"][level.value] = []
texts, confidences, _ = split_text_and_confidences(
predicted_text,
char_confidences,
level,
word_separators,
line_separators,
tokens,
)
# Save gif with attention map
if attention_map:
attentions = prediction["attentions"][idx]
gif_filename = f"{output}/{image_path.stem}_{attention_map_level}.gif"
logger.info(f"Creating attention GIF in {gif_filename}")
plot_attention(
image=visu_tensor[idx],
text=predicted_text,
weights=attentions,
level=attention_map_level,
scale=attention_map_scale,
word_separators=word_separators,
line_separators=line_separators,
tokens=tokens,
display_polygons=predict_objects,
threshold_method=threshold_method,
threshold_value=threshold_value,
max_object_height=max_object_height,
outname=gif_filename,
)
result["attention_gif"] = gif_filename
for text, conf in zip(texts, confidences):
result["confidences"][level.value].append(
{"text": text, "confidence": conf}
)
# Save gif with attention map
if attention_map:
attentions = prediction["attentions"][idx]
gif_filename = f"{output}/{image_path.stem}_{attention_map_level}.gif"
logger.info(f"Creating attention GIF in {gif_filename}")
plot_attention(
image=visu_tensor[idx],
text=predicted_text,
weights=attentions,
level=attention_map_level,
scale=attention_map_scale,
word_separators=word_separators,
line_separators=line_separators,
tokens=tokens,
display_polygons=predict_objects,
threshold_method=threshold_method,
threshold_value=threshold_value,
max_object_height=max_object_height,
outname=gif_filename,
)
result["attention_gif"] = gif_filename
json_filename = Path(output, image_path.stem).with_suffix(".json")
logger.info(f"Saving JSON prediction in {json_filename}")
......
@@ -28,12 +28,20 @@ from torchvision.transforms.functional import resize as resize_tensor
class Preprocessing(str, Enum):
# If the image is bigger than the given size, resize it while keeping the original ratio
MaxResize = "max_resize"
# Resize the height to a fixed value while keeping the original ratio
"""
If the image is bigger than the given size, resize it while keeping the original ratio
"""
FixedHeightResize = "fixed_height_resize"
# Resize the width to a fixed value while keeping the original ratio
"""
Resize the height to a fixed value while keeping the original ratio
"""
FixedWidthResize = "fixed_width_resize"
"""
Resize the width to a fixed value while keeping the original ratio
"""
class FixedHeightResize:
......
@@ -22,7 +22,7 @@ class Token(NamedTuple):
class LMTokenMapping(NamedTuple):
space: Token = Token("▁", " ")
space: Token = Token("⎵", " ")
linebreak: Token = Token("↵", "\n")
ctc: Token = Token("◌", "<ctc>")
@@ -139,7 +139,9 @@ def parse_tokens(filename: str) -> Dict[str, EntityType]:
def read_yaml(yaml_path: str) -> Dict:
"""
Read YAML tokens file
Read YAML tokens file.
:param yaml_path: Path of the YAML file to read.
:return: The content of the read file.
"""
filename = Path(yaml_path)
assert filename.exists(), f"{yaml_path} does not resolve."
@@ -152,6 +154,8 @@ def read_yaml(yaml_path: str) -> Dict:
def read_json(json_path: str) -> Dict:
"""
Read labels JSON file
:param json_path: Path of the JSON file to read.
:return: The content of the read file.
"""
filename = Path(json_path)
assert filename.exists(), f"{json_path} does not resolve."
......
# Get started
To use DAN in your own environment, you need to first clone via:
## Installation
To use DAN in your own environment, you need to install it as a dependency or manually.
### As a dependency
To install DAN as a dependency, you need to first add the following line to your `requirements.txt` file:
```shell
teklia-dan @ git+ssh://git@gitlab.teklia.com/atr/dan.git
```
Then you can install it via pip:
```shell
pip install -r requirements.txt
```
### Manually
To install DAN manually, you need to first clone via:
```shell
git clone git@gitlab.teklia.com:atr/dan.git
@@ -9,9 +29,11 @@ git clone git@gitlab.teklia.com:atr/dan.git
Then you can install it via pip:
```shell
pip install -e .
pip install .
```
---
To learn more about the newly installed `teklia-dan` command, make sure to run:
```shell
......
@@ -4,13 +4,15 @@ There are several steps to follow when training a DAN model.
## 1. Extract data
The data must be extracted and formatted for training. To extract the data, DAN uses an Arkindex export database in SQLite format. You will need to:
To extract the data, DAN uses an Arkindex export database in SQLite format. You will need to:
1. Structure the data into folders (`train` / `val` / `test`) in [Arkindex](https://demo.arkindex.org/).
1. [Export the project](https://doc.arkindex.org/howto/export/) in SQLite format.
1. Extract the data with the [extract command](../usage/datasets/extract.md).
At the end, you should have a tree structure like this:
This command will extract and format the images and labels needed to train DAN. It will also tokenize the training corpus at character, subword, and word levels, allowing you to combine DAN with an explicit statistical language model to improve performance.
At the end, you should get the following tree structure:
```
output/
@@ -21,8 +23,14 @@ output/
│ ├── val
│ └── test
├── language_model
│ ├── corpus.txt
│ ├── lexicon.txt
│ ├── corpus_characters.txt
│ ├── lexicon_characters.txt
│ ├── corpus_subwords.txt
│ ├── lexicon_subwords.txt
│ ├── corpus_words.txt
│ ├── lexicon_words.txt
│ ├── subword_tokenizer.model
│ ├── subword_tokenizer.vocab
│ └── tokens.txt
```
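To give a sense of what the corpora contain, here is an illustrative (hypothetical) first line of each file for the training text `le chat`, where `⎵` stands for the encoded space token; the actual subword pieces depend on the trained sentencepiece model:

```python
# Hypothetical corpus lines for the training text "le chat".
examples = {
    "corpus_characters.txt": "l e ⎵ c h a t",
    "corpus_words.txt": "le ⎵ chat",
    "corpus_subwords.txt": "▁le ▁chat",  # pieces depend on the trained subword tokenizer
}
for filename, line in examples.items():
    print(f"{filename}: {line}")
```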
......
# Command Line Interface
::: dan.cli
# Analysis
::: dan.datasets.analyze
# Extract
::: dan.datasets.entities.extract
# Entities
::: dan.datasets.entities