
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (3)
Showing 191 additions and 21 deletions
@@ -56,7 +56,7 @@ To apply DAN to an image, one needs to first add a few imports and to load an image
```python
import cv2
from dan.ocr.predict.prediction import DAN
from dan.ocr.predict.inference import DAN
image = cv2.cvtColor(cv2.imread(IMAGE_PATH), cv2.COLOR_BGR2RGB)
```
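Continuing from the snippet above, a minimal prediction sketch could look as follows; the `load` and `predict` calls, their arguments, and the placeholder paths are assumptions for illustration rather than an API documented in this change.

```python
from pathlib import Path

from dan.ocr.predict.inference import DAN

# Placeholder paths to a trained model (adjust to your own files)
MODEL_PATH = Path("models/model.pt")
PARAMS_PATH = Path("models/parameters.yml")
CHARSET_PATH = Path("models/charset.pkl")

# Hypothetical usage: method names and arguments are assumptions,
# not taken from this merge request.
model = DAN("cpu")
model.load(MODEL_PATH, PARAMS_PATH, CHARSET_PATH, mode="eval")
text, confidence = model.predict(image, confidences=True)
print(text, confidence)
```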
@@ -84,16 +84,16 @@ This package provides three subcommands. To get more information about any subcommand
### Get started
See the [dedicated section](https://atr.pages.teklia.com/dan/get_started/training/) on the official DAN documentation.
See the [dedicated page](https://atr.pages.teklia.com/dan/get_started/training/) on the official DAN documentation.
### Data extraction from Arkindex
See the [dedicated section](https://atr.pages.teklia.com/dan/usage/datasets/extract/) on the official DAN documentation.
See the [dedicated page](https://atr.pages.teklia.com/dan/usage/datasets/extract/) on the official DAN documentation.
### Model training
See the [dedicated section](https://atr.pages.teklia.com/dan/usage/train/) on the official DAN documentation.
See the [dedicated page](https://atr.pages.teklia.com/dan/usage/train/) on the official DAN documentation.
### Model prediction
See the [dedicated section](https://atr.pages.teklia.com/dan/usage/predict/) on the official DAN documentation.
See the [dedicated page](https://atr.pages.teklia.com/dan/usage/predict/) on the official DAN documentation.
@@ -4,7 +4,9 @@ Preprocess datasets for training.
"""
from dan.datasets.analyze import add_analyze_parser
from dan.datasets.entities import add_entities_parser
from dan.datasets.extract import add_extract_parser
from dan.datasets.tokens import add_tokens_parser
def add_dataset_parser(subcommands) -> None:
@@ -17,3 +19,5 @@ def add_dataset_parser(subcommands) -> None:
add_extract_parser(subcommands)
add_analyze_parser(subcommands)
add_entities_parser(subcommands)
add_tokens_parser(subcommands)
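For context on how these `add_*_parser` helpers plug into the CLI, here is a self-contained sketch of the argparse pattern they follow: each subcommand registers its arguments on a sub-parser and binds its `run` function through `set_defaults(func=...)`, which the top-level parser later dispatches. The `demo` parser and the inline arguments below are illustrative, not the actual `teklia-dan` entry point.

```python
import argparse


def run(database, output_file):
    # Placeholder for the real command implementation
    print(f"Extracting entities from {database} into {output_file}")


def add_entities_parser(subcommands) -> None:
    parser = subcommands.add_parser("entities", help="Extract entity types.")
    parser.add_argument("database")
    parser.add_argument("--output-file", default="entities.yml")
    parser.set_defaults(func=run)


parser = argparse.ArgumentParser(prog="demo")
subcommands = parser.add_subparsers(dest="command", required=True)
add_entities_parser(subcommands)

# Parse a sample invocation, pop the dispatch target, then call it
# with the remaining CLI arguments as keyword arguments.
args = vars(parser.parse_args(["entities", "export.sqlite"]))
func = args.pop("func")
args.pop("command")
func(**args)
```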
# -*- coding: utf-8 -*-
"""
Extract entities from Arkindex using a corpus export.
"""
from pathlib import Path
from dan.datasets.entities.extract import run
def add_entities_parser(subcommands) -> None:
parser = subcommands.add_parser(
"entities",
description=__doc__,
help=__doc__,
)
parser.add_argument(
"database",
type=Path,
help="Path where the data were exported from Arkindex.",
)
parser.add_argument(
"--output-file",
type=Path,
default=Path("entities.yml"),
required=False,
help="Path to a YAML file to save the extracted entities.",
)
parser.set_defaults(func=run)
# -*- coding: utf-8 -*-
from operator import itemgetter
from pathlib import Path
import yaml
from arkindex_export import EntityType, open_database
def run(database: Path, output_file: Path) -> None:
# Load SQLite database
open_database(database)
# Extract and save entities to YAML
entities = list(
map(itemgetter(0), EntityType.select(EntityType.name).distinct().tuples())
)
output_file.write_text(
yaml.safe_dump({"entities": entities}, explicit_start=True, allow_unicode=True)
)
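Assuming an Arkindex SQLite export is available at a hypothetical `export.sqlite`, the `run` function above can be driven directly from Python; the resulting `entities.yml` is a flat list of the distinct entity type names found in the export.

```python
from pathlib import Path

from dan.datasets.entities.extract import run

# `export.sqlite` is a hypothetical Arkindex corpus export path.
run(database=Path("export.sqlite"), output_file=Path("entities.yml"))

# entities.yml then looks like this (the values depend on the export):
# ---
# entities:
# - birthdate
# - firstname
# - surname
print(Path("entities.yml").read_text())
```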
# -*- coding: utf-8 -*-
"""
Extract dataset from Arkindex using API.
Extract dataset from Arkindex using a corpus export.
"""
import argparse
import pathlib
from uuid import UUID
from dan.datasets.extract.extract import run
from dan.datasets.extract.arkindex import run
MANUAL_SOURCE = "manual"
@@ -144,7 +144,7 @@ def add_extract_parser(subcommands) -> None:
parser.add_argument(
"--max-height",
type=int,
help="Images larger than this height will be resized to this width.",
help="Images larger than this height will be resized to this height.",
)
# Formatting arguments
......
# -*- coding: utf-8 -*-
"""
Generate the YAML file containing entities and their token(s) to train a DAN model
"""
from pathlib import Path
from dan.datasets.tokens.generate import run
def add_tokens_parser(subcommands) -> None:
parser = subcommands.add_parser(
"tokens",
description=__doc__,
help=__doc__,
)
parser.add_argument(
"entities",
type=Path,
help="Path to a YAML file containing the extracted entities.",
)
parser.add_argument(
"--end-tokens",
action="store_true",
help="Whether to generate end tokens along with starting tokens.",
)
parser.add_argument(
"--output-file",
type=Path,
default=Path("tokens.yml"),
required=False,
help="Path to a YAML file to save the entities and their token(s).",
)
parser.set_defaults(func=run)
# -*- coding: utf-8 -*-
from pathlib import Path
from typing import Iterable
import yaml
OFFSET = 86
LIMIT = 160
STARTING_TOKEN = "\u2460"
def get_token() -> Iterable[str]:
offset = OFFSET
while offset < LIMIT:
yield chr(ord(STARTING_TOKEN) + offset % LIMIT)
offset += 1
raise Exception(f"More than {LIMIT} tokens asked")
def run(entities: Path, end_tokens: bool, output_file: Path) -> None:
# Load extracted entities
entities = yaml.safe_load(entities.read_text())
# Generate associated starting/ending token
token_generator = get_token()
tokens = {}
for entity in entities.get("entities", []):
tokens[entity] = {
"start": next(token_generator),
"end": next(token_generator) if end_tokens else "",
}
# Save entities & tokens to YAML
output_file.write_text(
yaml.safe_dump(tokens, explicit_start=True, allow_unicode=True, sort_keys=False)
)
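Since `OFFSET` is 86 and `STARTING_TOKEN` is ① (U+2460), the first token produced by `get_token` is the circled letter Ⓐ (U+24B6), followed by Ⓑ, Ⓒ, and so on. A short sketch of the resulting `tokens.yml`, assuming a hypothetical `entities.yml` with two entity types:

```python
from pathlib import Path

import yaml

from dan.datasets.tokens.generate import run

# Hypothetical input: an entities.yml listing two entity types.
entities_file = Path("entities.yml")
entities_file.write_text(yaml.safe_dump({"entities": ["firstname", "surname"]}))

run(entities=entities_file, end_tokens=True, output_file=Path("tokens.yml"))

# tokens.yml maps each entity to circled-letter tokens, starting at Ⓐ (U+24B6):
# ---
# firstname:
#   start: Ⓐ
#   end: Ⓑ
# surname:
#   start: Ⓒ
#   end: Ⓓ
print(Path("tokens.yml").read_text())
```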
@@ -6,7 +6,7 @@ Predict on an image using a trained DAN model.
import pathlib
from dan.ocr.predict.attention import Level
from dan.ocr.predict.prediction import run
from dan.ocr.predict.inference import run
from dan.utils import parse_tokens
@@ -70,7 +70,7 @@ def add_predict_parser(subcommands) -> None:
"--temperature",
type=float,
default=1.0,
help="Temperature scaling scalar parameter",
help="Temperature scaling scalar parameter.",
required=False,
)
parser.add_argument(
@@ -104,7 +104,7 @@ def add_predict_parser(subcommands) -> None:
"--attention-map-scale",
type=float,
default=0.5,
help="Image scaling factor before creating the GIF",
help="Image scaling factor before creating the GIF.",
required=False,
)
parser.add_argument(
......
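As background on the `--temperature` option above: temperature scaling conventionally divides the logits by a scalar before the softmax, so values above 1 flatten the character distribution and values below 1 sharpen it. A minimal sketch of that idea, not the project's actual decoding code:

```python
import numpy as np


def softmax_with_temperature(logits: np.ndarray, temperature: float = 1.0) -> np.ndarray:
    # Divide the logits by the temperature, then apply a numerically stable softmax
    scaled = logits / temperature
    scaled -= scaled.max()
    exp = np.exp(scaled)
    return exp / exp.sum()


logits = np.array([2.0, 1.0, 0.1])
print(softmax_with_temperature(logits, temperature=1.0))  # sharper distribution
print(softmax_with_temperature(logits, temperature=2.0))  # flatter distribution
```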
@@ -28,12 +28,20 @@ from torchvision.transforms.functional import resize as resize_tensor
class Preprocessing(str, Enum):
# If the image is bigger than the given size, resize it while keeping the original ratio
MaxResize = "max_resize"
# Resize the height to a fixed value while keeping the original ratio
"""
If the image is bigger than the given size, resize it while keeping the original ratio
"""
FixedHeightResize = "fixed_height_resize"
# Resize the width to a fixed value while keeping the original ratio
"""
Resize the height to a fixed value while keeping the original ratio
"""
FixedWidthResize = "fixed_width_resize"
"""
Resize the width to a fixed value while keeping the original ratio
"""
class FixedHeightResize:
......
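All three preprocessing modes above preserve the aspect ratio; for a fixed-height resize, the width is scaled by the same factor as the height. A minimal sketch of that computation, separate from the project's `FixedHeightResize` class:

```python
def fixed_height_size(width: int, height: int, target_height: int) -> tuple[int, int]:
    # Scale both dimensions by the same factor so the aspect ratio is preserved
    scale = target_height / height
    return round(width * scale), target_height


# A 2000 x 3000 page resized to a fixed height of 1500 keeps its 2:3 ratio
print(fixed_height_size(2000, 3000, target_height=1500))  # (1000, 1500)
```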
# Get started
To use DAN in your own environment, you need to first clone via:
## Installation
To use DAN in your own environment, you need to install it as a dependency or manually.
### As a dependency
To install DAN as a dependency, you need to first add the following line to your `requirements.txt` file:
```shell
teklia-dan @ git+ssh://git@gitlab.teklia.com/atr/dan.git
```
Then you can install it via pip:
```shell
pip install -r requirements.txt
```
### Manually
To install DAN manually, you need to first clone via:
```shell
git clone git@gitlab.teklia.com:atr/dan.git
@@ -9,9 +29,11 @@ git clone git@gitlab.teklia.com:atr/dan.git
Then you can install it via pip:
```shell
pip install -e .
pip install .
```
---
To learn more about the newly installed `teklia-dan` command, make sure to run:
```shell
......
@@ -20,10 +20,10 @@ output/
│ ├── train
│ ├── val
│ └── test
── language_model
├── corpus.txt
├── lexicon.txt
└── tokens.txt
── language_model
├── corpus.txt
├── lexicon.txt
└── tokens.txt
```
## 2. Train
......
# Command Line Interface
::: dan.cli
# Analysis
::: dan.datasets.analyze
# Extract
::: dan.datasets.entities.extract
# Entities
::: dan.datasets.entities
# Arkindex
::: dan.datasets.extract.extract
::: dan.datasets.extract.arkindex
# Extraction
::: dan.datasets.extract
# Datasets
::: dan.datasets