Compare revisions

08a96bf6 · 08a96bf6 · 08a96bf6 · 08a96bf6 · 08a96bf6 · 08a96bf6
--- a/docs/usage/datasets/index.md
+++ b/docs/usage/datasets/index.md
@@ -7,3 +7,6 @@ Two operations are available through subcommands:

 `teklia-dan dataset format`
 : To format datasets for training. More details in [the dedicated section](./format.md).
+
+`teklia-dan dataset analyze`
+: To analyze datasets and display statistics. More details in [the dedicated section](./analyze.md).
--- a/docs/usage/train/index.md
+++ b/docs/usage/train/index.md
 # Train

-Use the `teklia-dan train document` command to train a new DAN model. It is able to train a DAN model at line or document-level and evaluate it.
+Use the `teklia-dan train` command to train a new DAN model. It can train a DAN model at line level or document level and evaluate it.

 ## Examples

@@ -8,15 +8,15 @@ Use the `teklia-dan train document` command to train a new DAN model. It is able

 To train DAN on documents:

-1. Set your training configuration in `dan/ocr/document/train.py`. Refer to the [dedicated section](parameters.md) for a description of parameters.
-1. Run `teklia-dan train document`.
+1. Set your training configuration in `dan/ocr/train.py`. Refer to the [dedicated section](parameters.md) for a description of parameters.
+1. Run `teklia-dan train`.
 1. Look into evaluation results in the `output` folder:
    - `checkpoints` contains model weights for the last trained epoch and for the epoch giving the best valid CER.
    - `results` contains the tensorboard log file, the parameters file, and the evaluation results for the best epoch.

 ### Line

-To train DAN on lines, run `teklia-dan train document` with a line dataset.
+To train DAN on lines, run `teklia-dan train` with a line dataset.

 ## Additional pages


--- a/docs/usage/train/jeanzay.md
+++ b/docs/usage/train/jeanzay.md
@@ -38,7 +38,7 @@ conda activate /gpfswork/rech/rxm/ubz97wr/.conda/envs/dan/
 set -x

 # execution
-teklia-dan train document
+teklia-dan train
 ```

 ## Train on multiple GPUs

--- a/docs/usage/train/parameters.md
+++ b/docs/usage/train/parameters.md
@@ -89,7 +89,7 @@ Usage:
 Augmentation transformations are applied on-the-fly during training to artificially increase data variability.

 DAN takes advantage of transforms from [albumentations](https://albumentations.ai/).
-The following configuration is used by default when using the `teklia-dan train document` command. Data augmentation is applied with a probability of 0.9. In this case, two transformations are randomly selected to be applied.
+The following configuration is used by default when using the `teklia-dan train` command. Data augmentation is applied with a probability of 0.9. In this case, two transformations are randomly selected to be applied.

 ```py
 transforms = A.Compose(

--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -83,25 +83,23 @@ nav:
      - Formatting:
        - ref/datasets/format/index.md
        - Automatic Text Recognition: ref/datasets/format/atr.md
-    - Managers:
-      - ref/managers/index.md
-      - Dataset managers: ref/managers/dataset.md
-      - Metrics managers: ref/managers/metrics.md
-      - OCR managers: ref/managers/ocr.md
-      - Training managers: ref/managers/training.md
    - OCR:
      - ref/ocr/index.md
-      - Document:
-        - ref/ocr/document/index.md
-        - Training: ref/ocr/document/train.md
-    - Prediction:
-      - Inference: ref/predict/prediction.md
-      - Attention: ref/predict/attention.md
-    - Decoders: ref/decoder.md
-    - Models: ref/encoder.md
-    - MLflow: ref/mlflow.md
-    - Schedulers: ref/schedulers.md
-    - Transformations: ref/transforms.md
+      - Managers:
+        - ref/ocr/managers/index.md
+        - Dataset managers: ref/ocr/managers/dataset.md
+        - Metrics managers: ref/ocr/managers/metrics.md
+        - OCR managers: ref/ocr/managers/ocr.md
+        - Training managers: ref/ocr/managers/training.md
+      - Training: ref/ocr/train.md
+      - Prediction:
+        - Inference: ref/ocr/predict/prediction.md
+        - Attention: ref/ocr/predict/attention.md
+      - Decoder: ref/ocr/decoder.md
+      - Encoder: ref/ocr/encoder.md
+      - MLflow: ref/ocr/mlflow.md
+      - Schedulers: ref/ocr/schedulers.md
+      - Transformations: ref/ocr/transforms.md
    - Utils: ref/utils.md

 markdown_extensions:

--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,10 @@ arkindex-export==0.1.3
 boto3==1.26.124
 editdistance==0.6.2
 imageio==2.26.1
+imagesize==1.4.1
+mdutils==1.6.0
 numpy==1.24.3
+prettytable==3.8.0
 PyYAML==6.0
 scipy==1.10.1
 tenacity==8.2.2

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -5,9 +5,9 @@ import pytest
 from torch.optim import Adam

 from arkindex_export import open_database
-from dan.decoder import GlobalHTADecoder
-from dan.encoder import FCN_Encoder
-from dan.transforms import Preprocessing
+from dan.ocr.decoder import GlobalHTADecoder
+from dan.ocr.encoder import FCN_Encoder
+from dan.ocr.transforms import Preprocessing

 FIXTURES = Path(__file__).resolve().parent / "data"


--- a/tests/data/analyze/images.md
+++ b/tests/data/analyze/images.md
+
+Statistics
+==========
+
+## Images statistics
+
+| Metric | Width | Height |
+|:-------|:-----:|:------:|
+| Count  |   6   |   6    |
+| Min    |  537  |  768   |
+| Max    |  537  |  768   |
+| Mean   | 537.0 | 768.0  |
+| Median | 537.0 | 768.0  |
--- a/tests/data/analyze/labels.md
+++ b/tests/data/analyze/labels.md
+
+Statistics
+==========
+
+## Labels statistics
+
+| Metric | Chars | Words | Lines |
+|:-------|:-----:|:-----:|:-----:|
+| Min    |  109  |   16  |   2   |
+| Max    |  151  |   19  |   3   |
+| Mean   | 124.5 |  17.5 |  2.75 |
+| Median | 119.0 |  17.5 |  3.0  |
+| Total  |  498  |   70  |   11  |
+
+### Characters statistics
+
+| Character | Occurrence |
+|:----------|-----------:|
+|           |         59 |
+| e         |         49 |
+| t         |         44 |
+| s         |         37 |
+| o         |         37 |
+| n         |         37 |
+| i         |         35 |
+| a         |         30 |
+| d         |         22 |
+| r         |         15 |
+| c         |         15 |
+| l         |         13 |
+| u         |         13 |
+| m         |         11 |
+| h         |         10 |
+| g         |          9 |
+| p         |          8 |
+|           |          7 |
+|           |            |
+| y         |          6 |
+| ,         |          5 |
+| x         |          4 |
+| v         |          4 |
+| .         |          4 |
+| w         |          4 |
+| b         |          4 |
+| O         |          3 |
+| f         |          3 |
+| -         |          3 |
+| T         |          1 |
+| k         |          1 |
+| ’         |          1 |
+| C         |          1 |
+| R         |          1 |
+| j         |          1 |
+| W         |          1 |
--- a/tests/data/analyze/ner.md
+++ b/tests/data/analyze/ner.md
+
+Statistics
+==========
+
+### NER tokens statistics
+
+| Metric | surname | firstname | age |
+|:-------|:-------:|:---------:|:---:|
+| Min    |    3    |     3     |  3  |
+| Max    |    3    |     3     |  3  |
+| Mean   |   3.0   |    3.0    | 3.0 |
+| Median |   3.0   |    3.0    | 3.0 |
+| Total  |    6    |     6     |  6  |
--- a/tests/data/analyze/stats.md
+++ b/tests/data/analyze/stats.md
+
+Statistics
+==========
+
+# Train
+
+## Images statistics
+
+| Metric | Width | Height |
+|:-------|:-----:|:------:|
+| Count  |   2   |   2    |
+| Min    |  537  |  768   |
+| Max    |  537  |  768   |
+| Mean   | 537.0 | 768.0  |
+| Median | 537.0 | 768.0  |
+
+## Labels statistics
+
+| Metric | Chars | Words | Lines |
+|:-------|:-----:|:-----:|:-----:|
+| Min    |   19  |   4   |   1   |
+| Max    |   24  |   5   |   1   |
+| Mean   |  21.5 |  4.5  |  1.0  |
+| Median |  21.5 |  4.5  |  1.0  |
+| Total  |   43  |   9   |   2   |
+
+### Characters statistics
+
+| Character | Occurrence |
+|:----------|-----------:|
+|           |          7 |
+| e         |          5 |
+| t         |          5 |
+| n         |          4 |
+| r         |          3 |
+| o         |          3 |
+| g         |          3 |
+| h         |          2 |
+| a         |          2 |
+| d         |          2 |
+| i         |          2 |
+| T         |          1 |
+| l         |          1 |
+| A         |          1 |
+| b         |          1 |
+| f         |          1 |
+
+### NER tokens statistics
+
+| Metric | surname | firstname | age |
+|:-------|:-------:|:---------:|:---:|
+| Min    |    0    |     0     |  0  |
+| Max    |    0    |     0     |  0  |
+| Mean   |   0.0   |    0.0    | 0.0 |
+| Median |   0.0   |    0.0    | 0.0 |
+| Total  |    0    |     0     |  0  |
+
+# Val
+
+## Images statistics
+
+| Metric | Width | Height |
+|:-------|:-----:|:------:|
+| Count  |   2   |   2    |
+| Min    |  537  |  768   |
+| Max    |  537  |  768   |
+| Mean   | 537.0 | 768.0  |
+| Median | 537.0 | 768.0  |
+
+## Labels statistics
+
+| Metric | Chars | Words | Lines |
+|:-------|:-----:|:-----:|:-----:|
+| Min    |   20  |   4   |   1   |
+| Max    |   21  |   5   |   1   |
+| Mean   |  20.5 |  4.5  |  1.0  |
+| Median |  20.5 |  4.5  |  1.0  |
+| Total  |   41  |   9   |   2   |
+
+### Characters statistics
+
+| Character | Occurrence |
+|:----------|-----------:|
+|           |          7 |
+| e         |          6 |
+| n         |          3 |
+| a         |          3 |
+| r         |          3 |
+| m         |          3 |
+| w         |          2 |
+| h         |          2 |
+| o         |          2 |
+| O         |          1 |
+| c         |          1 |
+| b         |          1 |
+| i         |          1 |
+| t         |          1 |
+| T         |          1 |
+| p         |          1 |
+| l         |          1 |
+| y         |          1 |
+| s         |          1 |
+
+### NER tokens statistics
+
+| Metric | surname | firstname | age |
+|:-------|:-------:|:---------:|:---:|
+| Min    |    0    |     0     |  0  |
+| Max    |    0    |     0     |  0  |
+| Mean   |   0.0   |    0.0    | 0.0 |
+| Median |   0.0   |    0.0    | 0.0 |
+| Total  |    0    |     0     |  0  |
+
+# Test
+
+## Images statistics
+
+| Metric | Width | Height |
+|:-------|:-----:|:------:|
+| Count  |   2   |   2    |
+| Min    |  537  |  768   |
+| Max    |  537  |  768   |
+| Mean   | 537.0 | 768.0  |
+| Median | 537.0 | 768.0  |
+
+## Labels statistics
+
+| Metric | Chars | Words | Lines |
+|:-------|:-----:|:-----:|:-----:|
+| Min    |   20  |   4   |   1   |
+| Max    |   29  |   5   |   1   |
+| Mean   |  24.5 |  4.5  |  1.0  |
+| Median |  24.5 |  4.5  |  1.0  |
+| Total  |   49  |   9   |   2   |
+
+### Characters statistics
+
+| Character | Occurrence |
+|:----------|-----------:|
+|           |          7 |
+| r         |          6 |
+| o         |          5 |
+| e         |          4 |
+| a         |          4 |
+| t         |          3 |
+| s         |          3 |
+| h         |          2 |
+| y         |          2 |
+| B         |          1 |
+| w         |          1 |
+| i         |          1 |
+| b         |          1 |
+| S         |          1 |
+| O         |          1 |
+| M         |          1 |
+| E         |          1 |
+| g         |          1 |
+| c         |          1 |
+| n         |          1 |
+| m         |          1 |
+| p         |          1 |
+
+### NER tokens statistics
+
+| Metric | surname | firstname | age |
+|:-------|:-------:|:---------:|:---:|
+| Min    |    0    |     0     |  0  |
+| Max    |    0    |     0     |  0  |
+| Mean   |   0.0   |    0.0    | 0.0 |
+| Median |   0.0   |    0.0    | 0.0 |
+| Total  |    0    |     0     |  0  |
--- a/tests/data/training/training_dataset/labels.json
+++ b/tests/data/training/training_dataset/labels.json
 {
-    "test": {
-        "tests/data/training/training_dataset/images/fb3edb59-3678-49f8-8e16-8e32e3b0f051.png": {
-            "text": "Both her wrists bore"
-        },
-        "tests/data/training/training_dataset/images/fe498de2-ece4-4fbe-8b53-edfce1b820f0.png": {
-            "text": "SOME years ago a contemporary"
-        }
-    },
    "train": {
        "tests/data/training/training_dataset/images/0a34e13a-4ab0-4a91-8d7c-b1d8fee32628.png": {
            "text": "The latter do not regard"
@@ -22,5 +14,13 @@
        "tests/data/training/training_dataset/images/0b2457c8-81f1-4600-84d9-f8bf2822a991.png": {
            "text": "The play was no more"
        }
+    },
+    "test": {
+        "tests/data/training/training_dataset/images/fb3edb59-3678-49f8-8e16-8e32e3b0f051.png": {
+            "text": "Both her wrists bore"
+        },
+        "tests/data/training/training_dataset/images/fe498de2-ece4-4fbe-8b53-edfce1b820f0.png": {
+            "text": "SOME years ago a contemporary"
+        }
    }
 }
--- a/tests/data/training/training_dataset/tokens.yaml
+++ b/tests/data/training/training_dataset/tokens.yaml
+---
+surname:
+  start: "Ⓢ"
+  end: ""
+firstname:
+  start: "Ⓕ"
+  end: ""
+age:
+  start: "Ⓐ"
+  end: ""
--- a/tests/test_analyze.py
+++ b/tests/test_analyze.py
+# -*- coding: utf-8 -*-
+
+import pytest
+from mdutils.mdutils import MdUtils
+
+from dan.datasets.analyze import read_json, read_yaml
+from dan.datasets.analyze.statistics import Statistics
+from tests.conftest import FIXTURES
+
+
+@pytest.fixture
+def image_statistics():
+    """Return the expected image statistics Markdown, read from the `analyze/images` fixture."""
+    # NOTE(review): the path is given without a `.md` suffix; presumably
+    # `read_md_file` appends it (the fixture is `analyze/images.md`) - confirm.
+    return MdUtils(file_name="").read_md_file(str(FIXTURES / "analyze" / "images"))
+
+
+@pytest.fixture
+def labels_statistics():
+    """Return the expected label statistics Markdown, read from the `analyze/labels` fixture."""
+    return MdUtils(file_name="").read_md_file(str(FIXTURES / "analyze" / "labels"))
+
+
+@pytest.fixture
+def ner_statistics():
+    """Return the expected NER token statistics Markdown, read from the `analyze/ner` fixture."""
+    return MdUtils(file_name="").read_md_file(str(FIXTURES / "analyze" / "ner"))
+
+
+@pytest.fixture
+def full_statistics():
+    """Return the expected full report Markdown, read from the `analyze/stats` fixture."""
+    return MdUtils(file_name="").read_md_file(str(FIXTURES / "analyze" / "stats"))
+
+
+@pytest.mark.parametrize(
+    "im_paths, expected_summary",
+    (
+        (
+            [
+                "tests/data/training/training_dataset/images/0a34e13a-4ab0-4a91-8d7c-b1d8fee32628.png",
+                "tests/data/training/training_dataset/images/0a70e14f-feda-4607-989c-36cf581ddff5.png",
+                "tests/data/training/training_dataset/images/0a576062-303c-4893-a729-c09c92865d31.png",
+                "tests/data/training/training_dataset/images/0b2457c8-81f1-4600-84d9-f8bf2822a991.png",
+                "tests/data/training/training_dataset/images/fb3edb59-3678-49f8-8e16-8e32e3b0f051.png",
+                "tests/data/training/training_dataset/images/fe498de2-ece4-4fbe-8b53-edfce1b820f0.png",
+            ],
+            pytest.lazy_fixture("image_statistics"),
+        ),
+    ),
+)
+def test_display_image_statistics(im_paths, expected_summary, tmp_path):
+    """Check that the image statistics built from `im_paths` match the expected Markdown.
+
+    The comparison is made on the in-memory document text (`get_md_text`),
+    not on a file written to disk.
+    """
+    # NOTE(review): `tmp_path` is a directory `Path`, whereas
+    # `test_display_label_statistics` passes a `str` file path - confirm that
+    # `Statistics` accepts both.
+    stats = Statistics(filename=tmp_path)
+    stats.create_image_statistics(images=im_paths)
+    assert stats.document.get_md_text() == expected_summary
+
+
+@pytest.mark.parametrize(
+    "texts, expected_summary",
+    (
+        (
+            [
+                "Teklia’s expertise is to develop document analysis\nand processing solutions using, among other things,\nOCR technology.",
+                "Our software combines image analysis, printed and\nhandwritten text recognition, text segmentation with\na document classification and indexation system.",
+                "Our objective is to deliver to our clients an automated\ndocument processing tool easy-to-use and adapted\nto their needs.",
+                "With the same state of mind, we developed additional solutions to\nenhance both security and business-process.",
+            ],
+            pytest.lazy_fixture("labels_statistics"),
+        ),
+    ),
+)
+def test_display_label_statistics(texts, expected_summary, tmp_path):
+    """Check that the label statistics built from `texts` match the expected Markdown.
+
+    The comparison is made on the in-memory document text (`get_md_text`),
+    not on a file written to disk.
+    """
+    filename = tmp_path / "labels.md"
+    stats = Statistics(filename=str(filename))
+    stats.create_label_statistics(labels=texts)
+    assert stats.document.get_md_text() == expected_summary
+
+
+@pytest.mark.parametrize(
+    "texts, expected_summary",
+    (
+        (
+            [
+                "ⓈDayon ⒻFernand Ⓐ6\nⓈDayen ⒻMaurice Ⓐ2\nⓈTottelier ⒻJean Baptiste Ⓐ59",
+                "ⓈPeryro ⒻEtienne Ⓐ33\nⓈJeannot ⒻCaroline Ⓐ24\nⓈMouline ⒻPierre Ⓐ32",
+            ],
+            pytest.lazy_fixture("ner_statistics"),
+        ),
+    ),
+)
+def test_display_ner_statistics(texts, expected_summary, tmp_path):
+    """Check that the NER token statistics built from `texts` match the expected Markdown.
+
+    The token definitions are loaded from the training dataset `tokens.yaml`
+    fixture; the sample texts contain the start tokens declared there.
+    """
+    tokens = read_yaml(FIXTURES / "training" / "training_dataset" / "tokens.yaml")
+    # NOTE(review): `tmp_path` is a directory `Path`, whereas
+    # `test_display_label_statistics` passes a `str` file path - confirm that
+    # `Statistics` accepts both.
+    stats = Statistics(filename=tmp_path)
+    stats.create_ner_statistics(labels=texts, ner_tokens=tokens)
+    assert stats.document.get_md_text() == expected_summary
+
+
+@pytest.mark.parametrize(
+    "labels, tokens, expected_summary",
+    (
+        (
+            FIXTURES / "training" / "training_dataset" / "labels.json",
+            FIXTURES / "training" / "training_dataset" / "tokens.yaml",
+            pytest.lazy_fixture("full_statistics"),
+        ),
+    ),
+)
+def test_run(labels, tokens, expected_summary, tmp_path):
+    """Check the full report produced by `Statistics.run` on the training dataset fixtures.
+
+    Unlike the other tests in this module, the assertion reads the output
+    file back from disk, so `run` is expected to write it to `filename`.
+    """
+    output_file = tmp_path / "stats.md"
+    stats = Statistics(filename=str(output_file))
+    stats.run(labels=read_json(labels), tokens=read_yaml(tokens))
+    assert output_file.read_text() == expected_summary
--- a/tests/test_prediction.py
+++ b/tests/test_prediction.py
@@ -5,8 +5,8 @@ import shutil

 import pytest

-from dan.predict.prediction import DAN
-from dan.predict.prediction import run as run_prediction
+from dan.ocr.predict.prediction import DAN
+from dan.ocr.predict.prediction import run as run_prediction


 @pytest.mark.parametrize(

--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -4,7 +4,7 @@ import pytest
 import torch
 import yaml

-from dan.ocr.document.train import train_and_test
+from dan.ocr.train import train_and_test
 from tests.conftest import FIXTURES



--- a/tox.ini
+++ b/tox.ini
@@ -9,6 +9,7 @@ package = wheel
 wheel_build_env = .pkg
 deps =
    pytest>=6
+    pytest-lazy-fixture
    -rrequirements.txt
 commands =
    pytest {tty:--color=yes} {posargs}
No results found