
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (3)
Showing 191 additions and 21 deletions
@@ -56,7 +56,7 @@ To apply DAN to an image, one needs to first add a few imports and to load an image
```python
import cv2
from dan.ocr.predict.prediction import DAN
from dan.ocr.predict.inference import DAN
image = cv2.cvtColor(cv2.imread(IMAGE_PATH), cv2.COLOR_BGR2RGB)
```
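Continuing from the snippet above, a minimal prediction sketch could look as follows; the `load` and `predict` calls, their arguments, and the placeholder paths are assumptions for illustration rather than an API documented in this change.

```python
from pathlib import Path

from dan.ocr.predict.inference import DAN

# Placeholder paths to a trained model (adjust to your own files)
MODEL_PATH = Path("models/model.pt")
PARAMS_PATH = Path("models/parameters.yml")
CHARSET_PATH = Path("models/charset.pkl")

# Hypothetical usage: method names and arguments are assumptions,
# not taken from this merge request.
model = DAN("cpu")
model.load(MODEL_PATH, PARAMS_PATH, CHARSET_PATH, mode="eval")
text, confidence = model.predict(image, confidences=True)
print(text, confidence)
```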
@@ -84,16 +84,16 @@ This package provides three subcommands. To get more information about any subcommand
### Get started
See the [dedicated section](https://atr.pages.teklia.com/dan/get_started/training/) on the official DAN documentation.
See the [dedicated page](https://atr.pages.teklia.com/dan/get_started/training/) on the official DAN documentation.
### Data extraction from Arkindex
See the [dedicated section](https://atr.pages.teklia.com/dan/usage/datasets/extract/) on the official DAN documentation.
See the [dedicated page](https://atr.pages.teklia.com/dan/usage/datasets/extract/) on the official DAN documentation.
### Model training
See the [dedicated section](https://atr.pages.teklia.com/dan/usage/train/) on the official DAN documentation.
See the [dedicated page](https://atr.pages.teklia.com/dan/usage/train/) on the official DAN documentation.
### Model prediction
See the [dedicated section](https://atr.pages.teklia.com/dan/usage/predict/) on the official DAN documentation.
See the [dedicated page](https://atr.pages.teklia.com/dan/usage/predict/) on the official DAN documentation.
@@ -4,7 +4,9 @@ Preprocess datasets for training.
"""
from dan.datasets.analyze import add_analyze_parser
from dan.datasets.entities import add_entities_parser
from dan.datasets.extract import add_extract_parser
from dan.datasets.tokens import add_tokens_parser
def add_dataset_parser(subcommands) -> None:
@@ -17,3 +19,5 @@ def add_dataset_parser(subcommands) -> None:
add_extract_parser(subcommands)
add_analyze_parser(subcommands)
add_entities_parser(subcommands)
add_tokens_parser(subcommands)
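For context on how these `add_*_parser` helpers plug into the CLI, here is a self-contained sketch of the argparse pattern they follow: each subcommand registers its arguments on a sub-parser and binds its `run` function through `set_defaults(func=...)`, which the top-level parser later dispatches. The `demo` parser and the inline arguments below are illustrative, not the actual `teklia-dan` entry point.

```python
import argparse


def run(database, output_file):
    # Placeholder for the real command implementation
    print(f"Extracting entities from {database} into {output_file}")


def add_entities_parser(subcommands) -> None:
    parser = subcommands.add_parser("entities", help="Extract entity types.")
    parser.add_argument("database")
    parser.add_argument("--output-file", default="entities.yml")
    parser.set_defaults(func=run)


parser = argparse.ArgumentParser(prog="demo")
subcommands = parser.add_subparsers(dest="command", required=True)
add_entities_parser(subcommands)

# Parse a sample invocation, pop the dispatch target, then call it
# with the remaining CLI arguments as keyword arguments.
args = vars(parser.parse_args(["entities", "export.sqlite"]))
func = args.pop("func")
args.pop("command")
func(**args)
```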
# -*- coding: utf-8 -*-
"""
Extract entities from Arkindex using a corpus export.
"""
from pathlib import Path
from dan.datasets.entities.extract import run
def add_entities_parser(subcommands) -> None:
parser = subcommands.add_parser(
"entities",
description=__doc__,
help=__doc__,
)
parser.add_argument(
"database",
type=Path,
help="Path where the data were exported from Arkindex.",
)
parser.add_argument(
"--output-file",
type=Path,
default=Path("entities.yml"),
required=False,
help="Path to a YAML file to save the extracted entities.",
)
parser.set_defaults(func=run)
# -*- coding: utf-8 -*-
from operator import itemgetter
from pathlib import Path
import yaml
from arkindex_export import EntityType, open_database
def run(database: Path, output_file: Path) -> None:
# Load SQLite database
open_database(database)
# Extract and save entities to YAML
entities = list(
map(itemgetter(0), EntityType.select(EntityType.name).distinct().tuples())
)
output_file.write_text(
yaml.safe_dump({"entities": entities}, explicit_start=True, allow_unicode=True)
)
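Assuming an Arkindex SQLite export is available at a hypothetical `export.sqlite`, the `run` function above can be driven directly from Python; the resulting `entities.yml` is a flat list of the distinct entity type names found in the export.

```python
from pathlib import Path

from dan.datasets.entities.extract import run

# `export.sqlite` is a hypothetical Arkindex corpus export path.
run(database=Path("export.sqlite"), output_file=Path("entities.yml"))

# entities.yml then looks like this (the values depend on the export):
# ---
# entities:
# - birthdate
# - firstname
# - surname
print(Path("entities.yml").read_text())
```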
# -*- coding: utf-8 -*-
"""
Extract dataset from Arkindex using API.
Extract dataset from Arkindex using a corpus export.
"""
import argparse
import pathlib
from uuid import UUID
from dan.datasets.extract.extract import run
from dan.datasets.extract.arkindex import run
MANUAL_SOURCE = "manual"
@@ -144,7 +144,7 @@ def add_extract_parser(subcommands) -> None:
parser.add_argument(
"--max-height",
type=int,
help="Images larger than this height will be resized to this width.",
help="Images larger than this height will be resized to this height.",
)
# Formatting arguments
......
# -*- coding: utf-8 -*-
"""
Generate the YAML file containing entities and their token(s) to train a DAN model
"""
from pathlib import Path
from dan.datasets.tokens.generate import run
def add_tokens_parser(subcommands) -> None:
parser = subcommands.add_parser(
"tokens",
description=__doc__,
help=__doc__,
)
parser.add_argument(
"entities",
type=Path,
help="Path to a YAML file containing the extracted entities.",
)
parser.add_argument(
"--end-tokens",
action="store_true",
help="Whether to generate end tokens along with starting tokens.",
)
parser.add_argument(
"--output-file",
type=Path,
default=Path("tokens.yml"),
required=False,
help="Path to a YAML file to save the entities and their token(s).",
)
parser.set_defaults(func=run)
# -*- coding: utf-8 -*-
from pathlib import Path
from typing import Iterable
import yaml
OFFSET = 86
LIMIT = 160
STARTING_TOKEN = "\u2460"
def get_token() -> Iterable[str]:
offset = OFFSET
while offset < LIMIT:
yield chr(ord(STARTING_TOKEN) + offset % LIMIT)
offset += 1
raise Exception(f"More than {LIMIT} tokens asked")
def run(entities: Path, end_tokens: bool, output_file: Path) -> None:
# Load extracted entities
entities = yaml.safe_load(entities.read_text())
# Generate associated starting/ending token
token_generator = get_token()
tokens = {}
for entity in entities.get("entities", []):
tokens[entity] = {
"start": next(token_generator),
"end": next(token_generator) if end_tokens else "",
}
# Save entities & tokens to YAML
output_file.write_text(
yaml.safe_dump(tokens, explicit_start=True, allow_unicode=True, sort_keys=False)
)
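Since `OFFSET` is 86 and `STARTING_TOKEN` is ① (U+2460), the first token produced by `get_token` is the circled letter Ⓐ (U+24B6), followed by Ⓑ, Ⓒ, and so on. A short sketch of the resulting `tokens.yml`, assuming a hypothetical `entities.yml` with two entity types:

```python
from pathlib import Path

import yaml

from dan.datasets.tokens.generate import run

# Hypothetical input: an entities.yml listing two entity types.
entities_file = Path("entities.yml")
entities_file.write_text(yaml.safe_dump({"entities": ["firstname", "surname"]}))

run(entities=entities_file, end_tokens=True, output_file=Path("tokens.yml"))

# tokens.yml maps each entity to circled-letter tokens, starting at Ⓐ (U+24B6):
# ---
# firstname:
#   start: Ⓐ
#   end: Ⓑ
# surname:
#   start: Ⓒ
#   end: Ⓓ
print(Path("tokens.yml").read_text())
```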
@@ -6,7 +6,7 @@ Predict on an image using a trained DAN model.
import pathlib
from dan.ocr.predict.attention import Level
from dan.ocr.predict.prediction import run
from dan.ocr.predict.inference import run
from dan.utils import parse_tokens
@@ -70,7 +70,7 @@ def add_predict_parser(subcommands) -> None:
"--temperature",
type=float,
default=1.0,
help="Temperature scaling scalar parameter",
help="Temperature scaling scalar parameter.",
required=False,
)
parser.add_argument(
@@ -104,7 +104,7 @@ def add_predict_parser(subcommands) -> None:
"--attention-map-scale",
type=float,
default=0.5,
help="Image scaling factor before creating the GIF",
help="Image scaling factor before creating the GIF.",
required=False,
)
parser.add_argument(
......
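As background on the `--temperature` option above: temperature scaling conventionally divides the logits by a scalar before the softmax, so values above 1 flatten the character distribution and values below 1 sharpen it. A minimal sketch of that idea, not the project's actual decoding code:

```python
import numpy as np


def softmax_with_temperature(logits: np.ndarray, temperature: float = 1.0) -> np.ndarray:
    # Divide the logits by the temperature, then apply a numerically stable softmax
    scaled = logits / temperature
    scaled -= scaled.max()
    exp = np.exp(scaled)
    return exp / exp.sum()


logits = np.array([2.0, 1.0, 0.1])
print(softmax_with_temperature(logits, temperature=1.0))  # sharper distribution
print(softmax_with_temperature(logits, temperature=2.0))  # flatter distribution
```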
@@ -28,12 +28,20 @@ from torchvision.transforms.functional import resize as resize_tensor
class Preprocessing(str, Enum):
# If the image is bigger than the given size, resize it while keeping the original ratio
MaxResize = "max_resize"
# Resize the height to a fixed value while keeping the original ratio
"""
If the image is bigger than the given size, resize it while keeping the original ratio
"""
FixedHeightResize = "fixed_height_resize"
# Resize the width to a fixed value while keeping the original ratio
"""
Resize the height to a fixed value while keeping the original ratio
"""
FixedWidthResize = "fixed_width_resize"
"""
Resize the width to a fixed value while keeping the original ratio
"""
class FixedHeightResize:
......
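All three preprocessing modes above preserve the aspect ratio; for a fixed-height resize, the width is scaled by the same factor as the height. A minimal sketch of that computation, separate from the project's `FixedHeightResize` class:

```python
def fixed_height_size(width: int, height: int, target_height: int) -> tuple[int, int]:
    # Scale both dimensions by the same factor so the aspect ratio is preserved
    scale = target_height / height
    return round(width * scale), target_height


# A 2000 x 3000 page resized to a fixed height of 1500 keeps its 2:3 ratio
print(fixed_height_size(2000, 3000, target_height=1500))  # (1000, 1500)
```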
# Get started
To use DAN in your own environment, you need to first clone via:
## Installation
To use DAN in your own environment, you need to install it as a dependency or manually.
### As a dependency
To install DAN as a dependency, you need to first add the following line to your `requirements.txt` file:
```shell
teklia-dan @ git+ssh://git@gitlab.teklia.com/atr/dan.git
```
Then you can install it via pip:
```shell
pip install -r requirements.txt
```
### Manually
To install DAN manually, you need to first clone via:
```shell
git clone git@gitlab.teklia.com:atr/dan.git
@@ -9,9 +29,11 @@ git clone git@gitlab.teklia.com:atr/dan.git
Then you can install it via pip:
```shell
pip install -e .
pip install .
```
---
To learn more about the newly installed `teklia-dan` command, make sure to run:
```shell
......
@@ -20,10 +20,10 @@ output/
│ ├── train
│ ├── val
│ └── test
── language_model
├── corpus.txt
├── lexicon.txt
└── tokens.txt
── language_model
├── corpus.txt
├── lexicon.txt
└── tokens.txt
```
## 2. Train
......
# Command Line Interface
::: dan.cli
# Analysis
::: dan.datasets.analyze
# Extract
::: dan.datasets.entities.extract
# Entities
::: dan.datasets.entities
# Arkindex
::: dan.datasets.extract.extract
::: dan.datasets.extract.arkindex
# Extraction
::: dan.datasets.extract
# Datasets
::: dan.datasets