Commit 08a96bf6 authored by Solene Tarride, committed by Mélodie Boillet

Compute dataset statistics after extraction/formatting

parent 4de3a84d
1 merge request: !225 Compute dataset statistics after extraction/formatting
Showing changed files with 654 additions and 8 deletions
@@ -44,6 +44,7 @@ repos:
    rev: 0.7.16
    hooks:
      - id: mdformat
        exclude: tests/data/analyze
        # Optionally add plugins
        additional_dependencies:
          - mdformat-mkdocs[recommended]
@@ -3,6 +3,7 @@
Preprocess datasets for training.
"""
from dan.datasets.analyze import add_analyze_parser
from dan.datasets.extract import add_extract_parser
from dan.datasets.format import add_format_parser
@@ -17,3 +18,4 @@ def add_dataset_parser(subcommands) -> None:
    add_extract_parser(subcommands)
    add_format_parser(subcommands)
    add_analyze_parser(subcommands)
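The registration above is all the CLI needs; the new subcommand then hangs off the existing `teklia-dan` entry point. A minimal sketch of that wiring (the top-level parser construction is assumed, not part of this diff):

```python
from argparse import ArgumentParser

# Hypothetical top-level wiring: the teklia-dan entry point builds a
# parser and delegates the "dataset" command group to add_dataset_parser.
parser = ArgumentParser(prog="teklia-dan")
subcommands = parser.add_subparsers()
add_dataset_parser(subcommands)  # now registers extract, format and analyze
```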
# -*- coding: utf-8 -*-
"""
Analyze dataset and display statistics in markdown format.
"""
import json
from pathlib import Path
from typing import Dict

import yaml

from dan.datasets.analyze.statistics import run


def read_yaml(yaml_path: str) -> Dict:
    """
    Read YAML tokens file.
    """
    filename = Path(yaml_path)
    assert filename.exists()
    return yaml.safe_load(filename.read_text())


def read_json(json_path: str) -> Dict:
    """
    Read labels JSON file.
    """
    filename = Path(json_path)
    assert filename.exists()
    return json.loads(filename.read_text())
def add_analyze_parser(subcommands) -> None:
    parser = subcommands.add_parser(
        "analyze",
        description=__doc__,
        help=__doc__,
    )
    parser.add_argument(
        "--labels",
        type=read_json,
        help="Path to the formatted labels in JSON format.",
        required=True,
    )
    parser.add_argument(
        "--tokens",
        type=read_yaml,
        help="Path to the tokens YAML file.",
        required=False,
    )
    parser.add_argument(
        "--output-file",
        dest="output",
        type=Path,
        help="The statistics will be saved to this file in Markdown format.",
        required=True,
    )
    parser.set_defaults(func=run)
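Note that `--labels` and `--tokens` use `read_json` and `read_yaml` as argparse `type` callbacks, so the values reaching `run` are already parsed dictionaries, not path strings. A minimal sketch of that behaviour (the labels path is assumed to exist):

```python
from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument("--labels", type=read_json, required=True)

# argparse calls read_json("path/to/labels.json") while parsing,
# so args.labels is a dict keyed by split name ("train", "val", "test").
args = parser.parse_args(["--labels", "path/to/labels.json"])
```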
# -*- coding: utf-8 -*-
from collections import Counter, defaultdict
from operator import itemgetter
from pathlib import Path
from typing import Dict, List, Optional

import imagesize
import numpy as np
from mdutils.mdutils import MdUtils
from prettytable import MARKDOWN, PrettyTable

from dan import logger

METRIC_COLUMN = "Metric"

def create_table(
    data: Dict,
    count: bool = False,
    total: bool = True,
) -> PrettyTable:
    """
    Each key will be made into a column.
    We compute min, max, mean, median and total by default.
    Total can be disabled. Count (length) computation can be enabled.
    """
    statistics = PrettyTable(field_names=[METRIC_COLUMN, *data.keys()])
    statistics.align.update({METRIC_COLUMN: "l"})
    statistics.set_style(MARKDOWN)

    operations = []
    if count:
        operations.append(("Count", len))
    operations.extend(
        [
            ("Min", np.min),
            ("Max", np.max),
            ("Mean", np.mean),
            ("Median", np.median),
        ]
    )
    if total:
        operations.append(("Total", np.sum))

    statistics.add_rows(
        [
            [col_name, *list(map(operator, data.values()))]
            for col_name, operator in operations
        ]
    )
    return statistics
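A sketch of what `create_table` renders for two columns, with made-up values; the exact markdown shape is the one visible in the test fixtures further down:

```python
table = create_table(
    data={"Width": [537, 640], "Height": [768, 480]},
    count=True,
    total=False,
)
print(table)
# | Metric | Width | Height |
# |:-------|:-----:|:------:|
# | Count  |   2   |   2    |
# | Min    |  537  |  480   |
# | Max    |  640  |  768   |
# | Mean   | 588.5 | 624.0  |
# | Median | 588.5 | 624.0  |
```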
class Statistics:
    HEADERS = {
        "Images": "Images statistics",
        "Labels": "Labels statistics",
        "Chars": "Characters statistics",
        "Tokens": "NER tokens statistics",
    }

    def __init__(self, filename: str) -> None:
        self.document = MdUtils(file_name=filename, title="Statistics")

    def _write_section(self, table: PrettyTable, title: str, level: int = 2):
        """
        Write a new section to the file:

        <title with appropriate level>

        <table>
        """
        self.document.new_header(level=level, title=title, add_table_of_contents="n")
        self.document.write("\n")
        logger.info(f"{title}\n\n{table}\n")
        self.document.write(table.get_string())
        self.document.write("\n")
    def create_image_statistics(self, images: List[str]):
        """
        Compute statistics on image sizes and write them to file.
        """
        shapes = list(map(imagesize.get, images))
        widths, heights = zip(*shapes)

        self._write_section(
            table=create_table(
                data={"Width": widths, "Height": heights}, count=True, total=False
            ),
            title=Statistics.HEADERS["Images"],
        )
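`imagesize.get` only reads the image header, so gathering shapes stays cheap even on large datasets. Its return value is a `(width, height)` tuple:

```python
import imagesize

# Reads dimensions from the file header without decoding pixel data
# (the path below is purely illustrative).
width, height = imagesize.get("path/to/page.png")
```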
    def create_label_statistics(self, labels: List[str]):
        """
        Compute statistics on text labels and write them to file.
        """
        char_counter = Counter()
        data = defaultdict(list)

        for text in labels:
            char_counter.update(text)
            data["Chars"].append(len(text))
            data["Words"].append(len(text.split()))
            data["Lines"].append(len(text.split("\n")))

        self._write_section(
            table=create_table(data=data),
            title=Statistics.HEADERS["Labels"],
        )

        self.create_character_occurrences_statistics(char_counter)
    def create_character_occurrences_statistics(self, char_counter: Counter):
        """
        Compute statistics on the character distribution and write them to file.
        """
        char_occurrences = PrettyTable(
            field_names=["Character", "Occurrence"],
        )
        char_occurrences.align.update({"Character": "l", "Occurrence": "r"})
        char_occurrences.set_style(MARKDOWN)
        char_occurrences.add_rows(list(char_counter.most_common()))

        self._write_section(
            table=char_occurrences, title=Statistics.HEADERS["Chars"], level=3
        )
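`Counter.most_common()` already yields `(character, count)` pairs sorted by decreasing frequency, which is exactly the row shape `add_rows` expects:

```python
from collections import Counter

char_counter = Counter("hello world")
print(char_counter.most_common(3))
# [('l', 3), ('o', 2), ('h', 1)]
```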
    def create_ner_statistics(self, labels: List[str], ner_tokens: Dict) -> None:
        """
        Compute statistics on NER token presence.
        """
        entity_counter = defaultdict(list)
        for text in labels:
            for ner_label, token in ner_tokens.items():
                entity_counter[ner_label].append(text.count(token["start"]))

        self._write_section(
            table=create_table(data=entity_counter),
            title=Statistics.HEADERS["Tokens"],
            level=3,
        )
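Each entity's presence is measured by counting its start token in the text. A sketch using the token definitions from the YAML fixture further down:

```python
tokens = {
    "surname": {"start": "Ⓢ", "end": ""},
    "firstname": {"start": "Ⓕ", "end": ""},
    "age": {"start": "Ⓐ", "end": ""},
}
text = "ⓈDayon ⒻFernand Ⓐ6\nⓈDayen ⒻMaurice Ⓐ2"
counts = {name: token_def["start"] for name, token_def in tokens.items()}
counts = {name: text.count(start) for name, start in counts.items()}
# counts == {"surname": 2, "firstname": 2, "age": 2}
```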
    def run(self, labels: Dict, tokens: Optional[Dict]):
        # Iterate over each split
        for split_name, split_data in labels.items():
            self.document.new_header(level=1, title=split_name.capitalize())

            # Image statistics
            # Image paths are the keys of the dict
            self.create_image_statistics(images=split_data.keys())

            # The text is under the "text" key of each value
            texts = list(map(itemgetter("text"), split_data.values()))

            # Text statistics
            self.create_label_statistics(labels=texts)

            if tokens is not None:
                self.create_ner_statistics(labels=texts, ner_tokens=tokens)

        self.document.create_md_file()
def run(labels: Dict, tokens: Optional[Dict], output: Path) -> None:
    """
    Compute and save dataset statistics.
    """
    Statistics(filename=str(output)).run(labels=labels, tokens=tokens)
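The same entry point can be driven without the CLI, e.g. from a notebook; a sketch with assumed paths:

```python
from pathlib import Path

from dan.datasets.analyze import read_json, read_yaml
from dan.datasets.analyze.statistics import run

run(
    labels=read_json("path/to/labels.json"),
    tokens=read_yaml("path/to/tokens.yaml"),  # pass None for a plain HTR dataset
    output=Path("statistics.md"),
)
```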
# Analysis
# Statistics
::: dan.datasets.analyze.statistics
# Dataset analysis
## Description
Use the `teklia-dan dataset analyze` command to analyze a dataset. This will display statistics in Markdown format.
The available arguments are:
| Parameter | Description | Type | Default |
| --------------- | -------------------------------- | ----- | ------- |
| `--labels` | Path to the `labels.json` file. | `str` | |
| `--tokens` | Path to the `tokens.yaml` file. | `str` | `None` |
| `--output-file` | Where the summary will be saved. | `str` | |
## Examples
### Display statistics for an HTR dataset
```shell
teklia-dan dataset analyze \
--labels path/to/dataset/labels.json \
--output-file statistics.md
```
### Display statistics for an HTR-NER dataset
```shell
teklia-dan dataset analyze \
--labels path/to/dataset/labels.json \
--tokens path/to/tokens.yaml \
--output-file statistics.md
```
@@ -7,3 +7,6 @@ Two operations are available through subcommands:
`teklia-dan dataset format`
: To format datasets for training. More details in [the dedicated section](./format.md).
`teklia-dan dataset analyze`
: To analyze datasets and display statistics. More details in [the dedicated section](./analyze.md).
@@ -4,7 +4,10 @@ arkindex-export==0.1.3
boto3==1.26.124
editdistance==0.6.2
imageio==2.26.1
imagesize==1.4.1
mdutils==1.6.0
numpy==1.24.3
prettytable==3.8.0
PyYAML==6.0
scipy==1.10.1
tenacity==8.2.2
Statistics
==========
## Images statistics
| Metric | Width | Height |
|:-------|:-----:|:------:|
| Count | 6 | 6 |
| Min | 537 | 768 |
| Max | 537 | 768 |
| Mean | 537.0 | 768.0 |
| Median | 537.0 | 768.0 |
Statistics
==========
## Labels statistics
| Metric | Chars | Words | Lines |
|:-------|:-----:|:-----:|:-----:|
| Min | 109 | 16 | 2 |
| Max | 151 | 19 | 3 |
| Mean | 124.5 | 17.5 | 2.75 |
| Median | 119.0 | 17.5 | 3.0 |
| Total | 498 | 70 | 11 |
### Characters statistics
| Character | Occurrence |
|:----------|-----------:|
| | 59 |
| e | 49 |
| t | 44 |
| s | 37 |
| o | 37 |
| n | 37 |
| i | 35 |
| a | 30 |
| d | 22 |
| r | 15 |
| c | 15 |
| l | 13 |
| u | 13 |
| m | 11 |
| h | 10 |
| g | 9 |
| p | 8 |
| | 7 |
| y | 6 |
| , | 5 |
| x | 4 |
| v | 4 |
| . | 4 |
| w | 4 |
| b | 4 |
| O | 3 |
| f | 3 |
| - | 3 |
| T | 1 |
| k | 1 |
| ’ | 1 |
| C | 1 |
| R | 1 |
| j | 1 |
| W | 1 |
Statistics
==========
### NER tokens statistics
| Metric | surname | firstname | age |
|:-------|:-------:|:---------:|:---:|
| Min | 3 | 3 | 3 |
| Max | 3 | 3 | 3 |
| Mean | 3.0 | 3.0 | 3.0 |
| Median | 3.0 | 3.0 | 3.0 |
| Total | 6 | 6 | 6 |
Statistics
==========
# Train
## Images statistics
| Metric | Width | Height |
|:-------|:-----:|:------:|
| Count | 2 | 2 |
| Min | 537 | 768 |
| Max | 537 | 768 |
| Mean | 537.0 | 768.0 |
| Median | 537.0 | 768.0 |
## Labels statistics
| Metric | Chars | Words | Lines |
|:-------|:-----:|:-----:|:-----:|
| Min | 19 | 4 | 1 |
| Max | 24 | 5 | 1 |
| Mean | 21.5 | 4.5 | 1.0 |
| Median | 21.5 | 4.5 | 1.0 |
| Total | 43 | 9 | 2 |
### Characters statistics
| Character | Occurrence |
|:----------|-----------:|
| | 7 |
| e | 5 |
| t | 5 |
| n | 4 |
| r | 3 |
| o | 3 |
| g | 3 |
| h | 2 |
| a | 2 |
| d | 2 |
| i | 2 |
| T | 1 |
| l | 1 |
| A | 1 |
| b | 1 |
| f | 1 |
### NER tokens statistics
| Metric | surname | firstname | age |
|:-------|:-------:|:---------:|:---:|
| Min | 0 | 0 | 0 |
| Max | 0 | 0 | 0 |
| Mean | 0.0 | 0.0 | 0.0 |
| Median | 0.0 | 0.0 | 0.0 |
| Total | 0 | 0 | 0 |
# Val
## Images statistics
| Metric | Width | Height |
|:-------|:-----:|:------:|
| Count | 2 | 2 |
| Min | 537 | 768 |
| Max | 537 | 768 |
| Mean | 537.0 | 768.0 |
| Median | 537.0 | 768.0 |
## Labels statistics
| Metric | Chars | Words | Lines |
|:-------|:-----:|:-----:|:-----:|
| Min | 20 | 4 | 1 |
| Max | 21 | 5 | 1 |
| Mean | 20.5 | 4.5 | 1.0 |
| Median | 20.5 | 4.5 | 1.0 |
| Total | 41 | 9 | 2 |
### Characters statistics
| Character | Occurrence |
|:----------|-----------:|
| | 7 |
| e | 6 |
| n | 3 |
| a | 3 |
| r | 3 |
| m | 3 |
| w | 2 |
| h | 2 |
| o | 2 |
| O | 1 |
| c | 1 |
| b | 1 |
| i | 1 |
| t | 1 |
| T | 1 |
| p | 1 |
| l | 1 |
| y | 1 |
| s | 1 |
### NER tokens statistics
| Metric | surname | firstname | age |
|:-------|:-------:|:---------:|:---:|
| Min | 0 | 0 | 0 |
| Max | 0 | 0 | 0 |
| Mean | 0.0 | 0.0 | 0.0 |
| Median | 0.0 | 0.0 | 0.0 |
| Total | 0 | 0 | 0 |
# Test
## Images statistics
| Metric | Width | Height |
|:-------|:-----:|:------:|
| Count | 2 | 2 |
| Min | 537 | 768 |
| Max | 537 | 768 |
| Mean | 537.0 | 768.0 |
| Median | 537.0 | 768.0 |
## Labels statistics
| Metric | Chars | Words | Lines |
|:-------|:-----:|:-----:|:-----:|
| Min | 20 | 4 | 1 |
| Max | 29 | 5 | 1 |
| Mean | 24.5 | 4.5 | 1.0 |
| Median | 24.5 | 4.5 | 1.0 |
| Total | 49 | 9 | 2 |
### Characters statistics
| Character | Occurrence |
|:----------|-----------:|
| | 7 |
| r | 6 |
| o | 5 |
| e | 4 |
| a | 4 |
| t | 3 |
| s | 3 |
| h | 2 |
| y | 2 |
| B | 1 |
| w | 1 |
| i | 1 |
| b | 1 |
| S | 1 |
| O | 1 |
| M | 1 |
| E | 1 |
| g | 1 |
| c | 1 |
| n | 1 |
| m | 1 |
| p | 1 |
### NER tokens statistics
| Metric | surname | firstname | age |
|:-------|:-------:|:---------:|:---:|
| Min | 0 | 0 | 0 |
| Max | 0 | 0 | 0 |
| Mean | 0.0 | 0.0 | 0.0 |
| Median | 0.0 | 0.0 | 0.0 |
| Total | 0 | 0 | 0 |
{
  "test": {
    "tests/data/training/training_dataset/images/fb3edb59-3678-49f8-8e16-8e32e3b0f051.png": {
      "text": "Both her wrists bore"
    },
    "tests/data/training/training_dataset/images/fe498de2-ece4-4fbe-8b53-edfce1b820f0.png": {
      "text": "SOME years ago a contemporary"
    }
  },
  "train": {
    "tests/data/training/training_dataset/images/0a34e13a-4ab0-4a91-8d7c-b1d8fee32628.png": {
      "text": "The latter do not regard"
@@ -22,5 +14,13 @@
"tests/data/training/training_dataset/images/0b2457c8-81f1-4600-84d9-f8bf2822a991.png": {
"text": "The play was no more"
}
},
"test": {
"tests/data/training/training_dataset/images/fb3edb59-3678-49f8-8e16-8e32e3b0f051.png": {
"text": "Both her wrists bore"
},
"tests/data/training/training_dataset/images/fe498de2-ece4-4fbe-8b53-edfce1b820f0.png": {
"text": "SOME years ago a contemporary"
}
}
}
---
surname:
  start: "Ⓢ"
  end: ""
firstname:
  start: "Ⓕ"
  end: ""
age:
  start: "Ⓐ"
  end: ""
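Loaded through `read_yaml`, this fixture becomes the nested dict that `create_ner_statistics` consumes (sketch; the path mirrors the one used in the tests below):

```python
from dan.datasets.analyze import read_yaml

tokens = read_yaml("tests/data/training/training_dataset/tokens.yaml")
# {'surname': {'start': 'Ⓢ', 'end': ''},
#  'firstname': {'start': 'Ⓕ', 'end': ''},
#  'age': {'start': 'Ⓐ', 'end': ''}}
```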
# -*- coding: utf-8 -*-
import pytest
from mdutils.mdutils import MdUtils

from dan.datasets.analyze import read_json, read_yaml
from dan.datasets.analyze.statistics import Statistics
from tests.conftest import FIXTURES


@pytest.fixture
def image_statistics():
    return MdUtils(file_name="").read_md_file(str(FIXTURES / "analyze" / "images"))


@pytest.fixture
def labels_statistics():
    return MdUtils(file_name="").read_md_file(str(FIXTURES / "analyze" / "labels"))


@pytest.fixture
def ner_statistics():
    return MdUtils(file_name="").read_md_file(str(FIXTURES / "analyze" / "ner"))


@pytest.fixture
def full_statistics():
    return MdUtils(file_name="").read_md_file(str(FIXTURES / "analyze" / "stats"))
@pytest.mark.parametrize(
    "im_paths, expected_summary",
    (
        (
            [
                "tests/data/training/training_dataset/images/0a34e13a-4ab0-4a91-8d7c-b1d8fee32628.png",
                "tests/data/training/training_dataset/images/0a70e14f-feda-4607-989c-36cf581ddff5.png",
                "tests/data/training/training_dataset/images/0a576062-303c-4893-a729-c09c92865d31.png",
                "tests/data/training/training_dataset/images/0b2457c8-81f1-4600-84d9-f8bf2822a991.png",
                "tests/data/training/training_dataset/images/fb3edb59-3678-49f8-8e16-8e32e3b0f051.png",
                "tests/data/training/training_dataset/images/fe498de2-ece4-4fbe-8b53-edfce1b820f0.png",
            ],
            pytest.lazy_fixture("image_statistics"),
        ),
    ),
)
def test_display_image_statistics(im_paths, expected_summary, tmp_path):
    stats = Statistics(filename=tmp_path)
    stats.create_image_statistics(images=im_paths)

    assert stats.document.get_md_text() == expected_summary
@pytest.mark.parametrize(
    "texts, expected_summary",
    (
        (
            [
                "Teklia’s expertise is to develop document analysis\nand processing solutions using, among other things,\nOCR technology.",
                "Our software combines image analysis, printed and\nhandwritten text recognition, text segmentation with\na document classification and indexation system.",
                "Our objective is to deliver to our clients an automated\ndocument processing tool easy-to-use and adapted\nto their needs.",
                "With the same state of mind, we developed additional solutions to\nenhance both security and business-process.",
            ],
            pytest.lazy_fixture("labels_statistics"),
        ),
    ),
)
def test_display_label_statistics(texts, expected_summary, tmp_path):
    filename = tmp_path / "labels.md"
    stats = Statistics(filename=str(filename))
    stats.create_label_statistics(labels=texts)

    assert stats.document.get_md_text() == expected_summary
@pytest.mark.parametrize(
    "texts, expected_summary",
    (
        (
            [
                "ⓈDayon ⒻFernand Ⓐ6\nⓈDayen ⒻMaurice Ⓐ2\nⓈTottelier ⒻJean Baptiste Ⓐ59",
                "ⓈPeryro ⒻEtienne Ⓐ33\nⓈJeannot ⒻCaroline Ⓐ24\nⓈMouline ⒻPierre Ⓐ32",
            ],
            pytest.lazy_fixture("ner_statistics"),
        ),
    ),
)
def test_display_ner_statistics(texts, expected_summary, tmp_path):
    tokens = read_yaml(FIXTURES / "training" / "training_dataset" / "tokens.yaml")
    stats = Statistics(filename=tmp_path)
    stats.create_ner_statistics(labels=texts, ner_tokens=tokens)

    assert stats.document.get_md_text() == expected_summary
@pytest.mark.parametrize(
    "labels, tokens, expected_summary",
    (
        (
            FIXTURES / "training" / "training_dataset" / "labels.json",
            FIXTURES / "training" / "training_dataset" / "tokens.yaml",
            pytest.lazy_fixture("full_statistics"),
        ),
    ),
)
def test_run(labels, tokens, expected_summary, tmp_path):
    output_file = tmp_path / "stats.md"
    stats = Statistics(filename=str(output_file))
    stats.run(labels=read_json(labels), tokens=read_yaml(tokens))

    assert output_file.read_text() == expected_summary
@@ -9,6 +9,7 @@ package = wheel
wheel_build_env = .pkg
deps =
    pytest>=6
    pytest-lazy-fixture
    -rrequirements.txt
commands =
    pytest {tty:--color=yes} {posargs}