Commit 08a96bf6 authored by Solene Tarride, committed by Mélodie Boillet

Compute dataset statistics after extraction/formatting

parent 4de3a84d
1 merge request: !225 Compute dataset statistics after extraction/formatting
Showing changed files with 654 additions and 8 deletions
@@ -44,6 +44,7 @@ repos:
    rev: 0.7.16
    hooks:
      - id: mdformat
        exclude: tests/data/analyze
        # Optionally add plugins
        additional_dependencies:
          - mdformat-mkdocs[recommended]
@@ -3,6 +3,7 @@
Preprocess datasets for training.
"""
from dan.datasets.analyze import add_analyze_parser
from dan.datasets.extract import add_extract_parser
from dan.datasets.format import add_format_parser
@@ -17,3 +18,4 @@ def add_dataset_parser(subcommands) -> None:
    add_extract_parser(subcommands)
    add_format_parser(subcommands)
    add_analyze_parser(subcommands)
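The registration above is all the CLI needs; the new subcommand then hangs off the existing `teklia-dan` entry point. A minimal sketch of that wiring (the top-level parser construction is assumed, not part of this diff):

```python
from argparse import ArgumentParser

# Hypothetical top-level wiring: the teklia-dan entry point builds a
# parser and delegates the "dataset" command group to add_dataset_parser.
parser = ArgumentParser(prog="teklia-dan")
subcommands = parser.add_subparsers()
add_dataset_parser(subcommands)  # now registers extract, format and analyze
```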
# -*- coding: utf-8 -*-
"""
Analyze dataset and display statistics in markdown format.
"""
import json
from pathlib import Path
from typing import Dict

import yaml

from dan.datasets.analyze.statistics import run


def read_yaml(yaml_path: str) -> Dict:
    """
    Read YAML tokens file.
    """
    filename = Path(yaml_path)
    assert filename.exists()
    return yaml.safe_load(filename.read_text())


def read_json(json_path: str) -> Dict:
    """
    Read labels JSON file.
    """
    filename = Path(json_path)
    assert filename.exists()
    return json.loads(filename.read_text())
def add_analyze_parser(subcommands) -> None:
    parser = subcommands.add_parser(
        "analyze",
        description=__doc__,
        help=__doc__,
    )
    parser.add_argument(
        "--labels",
        type=read_json,
        help="Path to the formatted labels in JSON format.",
        required=True,
    )
    parser.add_argument(
        "--tokens",
        type=read_yaml,
        help="Path to the tokens YAML file.",
        required=False,
    )
    parser.add_argument(
        "--output-file",
        dest="output",
        type=Path,
        help="The statistics will be saved to this file in Markdown format.",
        required=True,
    )
    parser.set_defaults(func=run)
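Note that `--labels` and `--tokens` use `read_json` and `read_yaml` as argparse `type` callbacks, so the values reaching `run` are already parsed dictionaries, not path strings. A minimal sketch of that behaviour (the labels path is assumed to exist):

```python
from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument("--labels", type=read_json, required=True)

# argparse calls read_json("path/to/labels.json") while parsing,
# so args.labels is a dict keyed by split name ("train", "val", "test").
args = parser.parse_args(["--labels", "path/to/labels.json"])
```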
# -*- coding: utf-8 -*-
from collections import Counter, defaultdict
from operator import itemgetter
from pathlib import Path
from typing import Dict, List, Optional

import imagesize
import numpy as np
from mdutils.mdutils import MdUtils
from prettytable import MARKDOWN, PrettyTable

from dan import logger

METRIC_COLUMN = "Metric"

def create_table(
    data: Dict,
    count: bool = False,
    total: bool = True,
) -> PrettyTable:
    """
    Each key will be made into a column.
    We compute min, max, mean, median and total by default.
    Total can be disabled. Count (length) computation can be enabled.
    """
    statistics = PrettyTable(field_names=[METRIC_COLUMN, *data.keys()])
    statistics.align.update({METRIC_COLUMN: "l"})
    statistics.set_style(MARKDOWN)

    operations = []
    if count:
        operations.append(("Count", len))
    operations.extend(
        [
            ("Min", np.min),
            ("Max", np.max),
            ("Mean", np.mean),
            ("Median", np.median),
        ]
    )
    if total:
        operations.append(("Total", np.sum))

    statistics.add_rows(
        [
            [col_name, *list(map(operator, data.values()))]
            for col_name, operator in operations
        ]
    )
    return statistics
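A sketch of what `create_table` renders for two columns, with made-up values; the exact markdown shape is the one visible in the test fixtures further down:

```python
table = create_table(
    data={"Width": [537, 640], "Height": [768, 480]},
    count=True,
    total=False,
)
print(table)
# | Metric | Width | Height |
# |:-------|:-----:|:------:|
# | Count  |   2   |   2    |
# | Min    |  537  |  480   |
# | Max    |  640  |  768   |
# | Mean   | 588.5 | 624.0  |
# | Median | 588.5 | 624.0  |
```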
class Statistics:
    HEADERS = {
        "Images": "Images statistics",
        "Labels": "Labels statistics",
        "Chars": "Characters statistics",
        "Tokens": "NER tokens statistics",
    }

    def __init__(self, filename: str) -> None:
        self.document = MdUtils(file_name=filename, title="Statistics")

    def _write_section(self, table: PrettyTable, title: str, level: int = 2):
        """
        Write a new section to the file:

        <title with appropriate level>

        <table>
        """
        self.document.new_header(level=level, title=title, add_table_of_contents="n")
        self.document.write("\n")
        logger.info(f"{title}\n\n{table}\n")
        self.document.write(table.get_string())
        self.document.write("\n")
    def create_image_statistics(self, images: List[str]):
        """
        Compute statistics on image sizes and write them to file.
        """
        shapes = list(map(imagesize.get, images))
        widths, heights = zip(*shapes)

        self._write_section(
            table=create_table(
                data={"Width": widths, "Height": heights}, count=True, total=False
            ),
            title=Statistics.HEADERS["Images"],
        )
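`imagesize.get` only reads the image header, so gathering shapes stays cheap even on large datasets. Its return value is a `(width, height)` tuple:

```python
import imagesize

# Reads dimensions from the file header without decoding pixel data
# (the path below is purely illustrative).
width, height = imagesize.get("path/to/page.png")
```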
    def create_label_statistics(self, labels: List[str]):
        """
        Compute statistics on text labels and write them to file.
        """
        char_counter = Counter()
        data = defaultdict(list)

        for text in labels:
            char_counter.update(text)
            data["Chars"].append(len(text))
            data["Words"].append(len(text.split()))
            data["Lines"].append(len(text.split("\n")))

        self._write_section(
            table=create_table(data=data),
            title=Statistics.HEADERS["Labels"],
        )

        self.create_character_occurrences_statistics(char_counter)
    def create_character_occurrences_statistics(self, char_counter: Counter):
        """
        Compute statistics on the character distribution and write them to file.
        """
        char_occurrences = PrettyTable(
            field_names=["Character", "Occurrence"],
        )
        char_occurrences.align.update({"Character": "l", "Occurrence": "r"})
        char_occurrences.set_style(MARKDOWN)
        char_occurrences.add_rows(list(char_counter.most_common()))

        self._write_section(
            table=char_occurrences, title=Statistics.HEADERS["Chars"], level=3
        )
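`Counter.most_common()` already yields `(character, count)` pairs sorted by decreasing frequency, which is exactly the row shape `add_rows` expects:

```python
from collections import Counter

char_counter = Counter("hello world")
print(char_counter.most_common(3))
# [('l', 3), ('o', 2), ('h', 1)]
```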
    def create_ner_statistics(self, labels: List[str], ner_tokens: Dict) -> None:
        """
        Compute statistics on NER token presence.
        """
        entity_counter = defaultdict(list)
        for text in labels:
            for ner_label, token in ner_tokens.items():
                entity_counter[ner_label].append(text.count(token["start"]))

        self._write_section(
            table=create_table(data=entity_counter),
            title=Statistics.HEADERS["Tokens"],
            level=3,
        )
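Each entity's presence is measured by counting its start token in the text. A sketch using the token definitions from the YAML fixture further down:

```python
tokens = {
    "surname": {"start": "Ⓢ", "end": ""},
    "firstname": {"start": "Ⓕ", "end": ""},
    "age": {"start": "Ⓐ", "end": ""},
}
text = "ⓈDayon ⒻFernand Ⓐ6\nⓈDayen ⒻMaurice Ⓐ2"
counts = {name: token_def["start"] for name, token_def in tokens.items()}
counts = {name: text.count(start) for name, start in counts.items()}
# counts == {"surname": 2, "firstname": 2, "age": 2}
```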
    def run(self, labels: Dict, tokens: Optional[Dict]):
        # Iterate over each split
        for split_name, split_data in labels.items():
            self.document.new_header(level=1, title=split_name.capitalize())

            # Image statistics
            # Image paths are the keys of the dict
            self.create_image_statistics(images=split_data.keys())

            # The text is under the "text" key of each value
            texts = list(map(itemgetter("text"), split_data.values()))

            # Text statistics
            self.create_label_statistics(labels=texts)

            if tokens is not None:
                self.create_ner_statistics(labels=texts, ner_tokens=tokens)

        self.document.create_md_file()
def run(labels: Dict, tokens: Optional[Dict], output: Path) -> None:
    """
    Compute and save dataset statistics.
    """
    Statistics(filename=str(output)).run(labels=labels, tokens=tokens)
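The same entry point can be driven without the CLI, e.g. from a notebook; a sketch with assumed paths:

```python
from pathlib import Path

from dan.datasets.analyze import read_json, read_yaml
from dan.datasets.analyze.statistics import run

run(
    labels=read_json("path/to/labels.json"),
    tokens=read_yaml("path/to/tokens.yaml"),  # pass None for a plain HTR dataset
    output=Path("statistics.md"),
)
```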
# Analysis
# Statistics
::: dan.datasets.analyze.statistics
# Dataset analysis
## Description
Use the `teklia-dan dataset analyze` command to analyze a dataset. This will display statistics in Markdown format.
The available arguments are:
| Parameter | Description | Type | Default |
| --------------- | -------------------------------- | ----- | ------- |
| `--labels` | Path to the `labels.json` file. | `str` | |
| `--tokens` | Path to the `tokens.yaml` file. | `str` | `None` |
| `--output-file` | Where the summary will be saved. | `str` | |
## Examples
### Display statistics for an HTR dataset
```shell
teklia-dan dataset analyze \
--labels path/to/dataset/labels.json \
--output-file statistics.md
```
### Display statistics for an HTR-NER dataset
```shell
teklia-dan dataset analyze \
--labels path/to/dataset/labels.json \
--tokens path/to/tokens.yaml \
--output-file statistics.md
```
@@ -7,3 +7,6 @@ Two operations are available through subcommands:
`teklia-dan dataset format`
: To format datasets for training. More details in [the dedicated section](./format.md).
`teklia-dan dataset analyze`
: To analyze datasets and display statistics. More details in [the dedicated section](./analyze.md).
@@ -4,7 +4,10 @@ arkindex-export==0.1.3
boto3==1.26.124
editdistance==0.6.2
imageio==2.26.1
imagesize==1.4.1
mdutils==1.6.0
numpy==1.24.3
prettytable==3.8.0
PyYAML==6.0
scipy==1.10.1
tenacity==8.2.2
Statistics
==========
## Images statistics
| Metric | Width | Height |
|:-------|:-----:|:------:|
| Count | 6 | 6 |
| Min | 537 | 768 |
| Max | 537 | 768 |
| Mean | 537.0 | 768.0 |
| Median | 537.0 | 768.0 |
Statistics
==========
## Labels statistics
| Metric | Chars | Words | Lines |
|:-------|:-----:|:-----:|:-----:|
| Min | 109 | 16 | 2 |
| Max | 151 | 19 | 3 |
| Mean | 124.5 | 17.5 | 2.75 |
| Median | 119.0 | 17.5 | 3.0 |
| Total | 498 | 70 | 11 |
### Characters statistics
| Character | Occurrence |
|:----------|-----------:|
| | 59 |
| e | 49 |
| t | 44 |
| s | 37 |
| o | 37 |
| n | 37 |
| i | 35 |
| a | 30 |
| d | 22 |
| r | 15 |
| c | 15 |
| l | 13 |
| u | 13 |
| m | 11 |
| h | 10 |
| g | 9 |
| p | 8 |
| | 7 |
| y | 6 |
| , | 5 |
| x | 4 |
| v | 4 |
| . | 4 |
| w | 4 |
| b | 4 |
| O | 3 |
| f | 3 |
| - | 3 |
| T | 1 |
| k | 1 |
| ’ | 1 |
| C | 1 |
| R | 1 |
| j | 1 |
| W | 1 |
Statistics
==========
### NER tokens statistics
| Metric | surname | firstname | age |
|:-------|:-------:|:---------:|:---:|
| Min | 3 | 3 | 3 |
| Max | 3 | 3 | 3 |
| Mean | 3.0 | 3.0 | 3.0 |
| Median | 3.0 | 3.0 | 3.0 |
| Total | 6 | 6 | 6 |
Statistics
==========
# Train
## Images statistics
| Metric | Width | Height |
|:-------|:-----:|:------:|
| Count | 2 | 2 |
| Min | 537 | 768 |
| Max | 537 | 768 |
| Mean | 537.0 | 768.0 |
| Median | 537.0 | 768.0 |
## Labels statistics
| Metric | Chars | Words | Lines |
|:-------|:-----:|:-----:|:-----:|
| Min | 19 | 4 | 1 |
| Max | 24 | 5 | 1 |
| Mean | 21.5 | 4.5 | 1.0 |
| Median | 21.5 | 4.5 | 1.0 |
| Total | 43 | 9 | 2 |
### Characters statistics
| Character | Occurrence |
|:----------|-----------:|
| | 7 |
| e | 5 |
| t | 5 |
| n | 4 |
| r | 3 |
| o | 3 |
| g | 3 |
| h | 2 |
| a | 2 |
| d | 2 |
| i | 2 |
| T | 1 |
| l | 1 |
| A | 1 |
| b | 1 |
| f | 1 |
### NER tokens statistics
| Metric | surname | firstname | age |
|:-------|:-------:|:---------:|:---:|
| Min | 0 | 0 | 0 |
| Max | 0 | 0 | 0 |
| Mean | 0.0 | 0.0 | 0.0 |
| Median | 0.0 | 0.0 | 0.0 |
| Total | 0 | 0 | 0 |
# Val
## Images statistics
| Metric | Width | Height |
|:-------|:-----:|:------:|
| Count | 2 | 2 |
| Min | 537 | 768 |
| Max | 537 | 768 |
| Mean | 537.0 | 768.0 |
| Median | 537.0 | 768.0 |
## Labels statistics
| Metric | Chars | Words | Lines |
|:-------|:-----:|:-----:|:-----:|
| Min | 20 | 4 | 1 |
| Max | 21 | 5 | 1 |
| Mean | 20.5 | 4.5 | 1.0 |
| Median | 20.5 | 4.5 | 1.0 |
| Total | 41 | 9 | 2 |
### Characters statistics
| Character | Occurrence |
|:----------|-----------:|
| | 7 |
| e | 6 |
| n | 3 |
| a | 3 |
| r | 3 |
| m | 3 |
| w | 2 |
| h | 2 |
| o | 2 |
| O | 1 |
| c | 1 |
| b | 1 |
| i | 1 |
| t | 1 |
| T | 1 |
| p | 1 |
| l | 1 |
| y | 1 |
| s | 1 |
### NER tokens statistics
| Metric | surname | firstname | age |
|:-------|:-------:|:---------:|:---:|
| Min | 0 | 0 | 0 |
| Max | 0 | 0 | 0 |
| Mean | 0.0 | 0.0 | 0.0 |
| Median | 0.0 | 0.0 | 0.0 |
| Total | 0 | 0 | 0 |
# Test
## Images statistics
| Metric | Width | Height |
|:-------|:-----:|:------:|
| Count | 2 | 2 |
| Min | 537 | 768 |
| Max | 537 | 768 |
| Mean | 537.0 | 768.0 |
| Median | 537.0 | 768.0 |
## Labels statistics
| Metric | Chars | Words | Lines |
|:-------|:-----:|:-----:|:-----:|
| Min | 20 | 4 | 1 |
| Max | 29 | 5 | 1 |
| Mean | 24.5 | 4.5 | 1.0 |
| Median | 24.5 | 4.5 | 1.0 |
| Total | 49 | 9 | 2 |
### Characters statistics
| Character | Occurrence |
|:----------|-----------:|
| | 7 |
| r | 6 |
| o | 5 |
| e | 4 |
| a | 4 |
| t | 3 |
| s | 3 |
| h | 2 |
| y | 2 |
| B | 1 |
| w | 1 |
| i | 1 |
| b | 1 |
| S | 1 |
| O | 1 |
| M | 1 |
| E | 1 |
| g | 1 |
| c | 1 |
| n | 1 |
| m | 1 |
| p | 1 |
### NER tokens statistics
| Metric | surname | firstname | age |
|:-------|:-------:|:---------:|:---:|
| Min | 0 | 0 | 0 |
| Max | 0 | 0 | 0 |
| Mean | 0.0 | 0.0 | 0.0 |
| Median | 0.0 | 0.0 | 0.0 |
| Total | 0 | 0 | 0 |
{
  "test": {
    "tests/data/training/training_dataset/images/fb3edb59-3678-49f8-8e16-8e32e3b0f051.png": {
      "text": "Both her wrists bore"
    },
    "tests/data/training/training_dataset/images/fe498de2-ece4-4fbe-8b53-edfce1b820f0.png": {
      "text": "SOME years ago a contemporary"
    }
  },
  "train": {
    "tests/data/training/training_dataset/images/0a34e13a-4ab0-4a91-8d7c-b1d8fee32628.png": {
      "text": "The latter do not regard"
@@ -22,5 +14,13 @@
"tests/data/training/training_dataset/images/0b2457c8-81f1-4600-84d9-f8bf2822a991.png": {
"text": "The play was no more"
}
},
"test": {
"tests/data/training/training_dataset/images/fb3edb59-3678-49f8-8e16-8e32e3b0f051.png": {
"text": "Both her wrists bore"
},
"tests/data/training/training_dataset/images/fe498de2-ece4-4fbe-8b53-edfce1b820f0.png": {
"text": "SOME years ago a contemporary"
}
}
}
---
surname:
  start: "Ⓢ"
  end: ""
firstname:
  start: "Ⓕ"
  end: ""
age:
  start: "Ⓐ"
  end: ""
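Loaded through `read_yaml`, this fixture becomes the nested dict that `create_ner_statistics` consumes (sketch; the path mirrors the one used in the tests below):

```python
from dan.datasets.analyze import read_yaml

tokens = read_yaml("tests/data/training/training_dataset/tokens.yaml")
# {'surname': {'start': 'Ⓢ', 'end': ''},
#  'firstname': {'start': 'Ⓕ', 'end': ''},
#  'age': {'start': 'Ⓐ', 'end': ''}}
```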
# -*- coding: utf-8 -*-
import pytest
from mdutils.mdutils import MdUtils

from dan.datasets.analyze import read_json, read_yaml
from dan.datasets.analyze.statistics import Statistics
from tests.conftest import FIXTURES


@pytest.fixture
def image_statistics():
    return MdUtils(file_name="").read_md_file(str(FIXTURES / "analyze" / "images"))


@pytest.fixture
def labels_statistics():
    return MdUtils(file_name="").read_md_file(str(FIXTURES / "analyze" / "labels"))


@pytest.fixture
def ner_statistics():
    return MdUtils(file_name="").read_md_file(str(FIXTURES / "analyze" / "ner"))


@pytest.fixture
def full_statistics():
    return MdUtils(file_name="").read_md_file(str(FIXTURES / "analyze" / "stats"))
@pytest.mark.parametrize(
    "im_paths, expected_summary",
    (
        (
            [
                "tests/data/training/training_dataset/images/0a34e13a-4ab0-4a91-8d7c-b1d8fee32628.png",
                "tests/data/training/training_dataset/images/0a70e14f-feda-4607-989c-36cf581ddff5.png",
                "tests/data/training/training_dataset/images/0a576062-303c-4893-a729-c09c92865d31.png",
                "tests/data/training/training_dataset/images/0b2457c8-81f1-4600-84d9-f8bf2822a991.png",
                "tests/data/training/training_dataset/images/fb3edb59-3678-49f8-8e16-8e32e3b0f051.png",
                "tests/data/training/training_dataset/images/fe498de2-ece4-4fbe-8b53-edfce1b820f0.png",
            ],
            pytest.lazy_fixture("image_statistics"),
        ),
    ),
)
def test_display_image_statistics(im_paths, expected_summary, tmp_path):
    stats = Statistics(filename=tmp_path)
    stats.create_image_statistics(images=im_paths)

    assert stats.document.get_md_text() == expected_summary
@pytest.mark.parametrize(
    "texts, expected_summary",
    (
        (
            [
                "Teklia’s expertise is to develop document analysis\nand processing solutions using, among other things,\nOCR technology.",
                "Our software combines image analysis, printed and\nhandwritten text recognition, text segmentation with\na document classification and indexation system.",
                "Our objective is to deliver to our clients an automated\ndocument processing tool easy-to-use and adapted\nto their needs.",
                "With the same state of mind, we developed additional solutions to\nenhance both security and business-process.",
            ],
            pytest.lazy_fixture("labels_statistics"),
        ),
    ),
)
def test_display_label_statistics(texts, expected_summary, tmp_path):
    filename = tmp_path / "labels.md"
    stats = Statistics(filename=str(filename))
    stats.create_label_statistics(labels=texts)

    assert stats.document.get_md_text() == expected_summary
@pytest.mark.parametrize(
    "texts, expected_summary",
    (
        (
            [
                "ⓈDayon ⒻFernand Ⓐ6\nⓈDayen ⒻMaurice Ⓐ2\nⓈTottelier ⒻJean Baptiste Ⓐ59",
                "ⓈPeryro ⒻEtienne Ⓐ33\nⓈJeannot ⒻCaroline Ⓐ24\nⓈMouline ⒻPierre Ⓐ32",
            ],
            pytest.lazy_fixture("ner_statistics"),
        ),
    ),
)
def test_display_ner_statistics(texts, expected_summary, tmp_path):
    tokens = read_yaml(FIXTURES / "training" / "training_dataset" / "tokens.yaml")
    stats = Statistics(filename=tmp_path)
    stats.create_ner_statistics(labels=texts, ner_tokens=tokens)

    assert stats.document.get_md_text() == expected_summary
@pytest.mark.parametrize(
    "labels, tokens, expected_summary",
    (
        (
            FIXTURES / "training" / "training_dataset" / "labels.json",
            FIXTURES / "training" / "training_dataset" / "tokens.yaml",
            pytest.lazy_fixture("full_statistics"),
        ),
    ),
)
def test_run(labels, tokens, expected_summary, tmp_path):
    output_file = tmp_path / "stats.md"
    stats = Statistics(filename=str(output_file))
    stats.run(labels=read_json(labels), tokens=read_yaml(tokens))

    assert output_file.read_text() == expected_summary
@@ -9,6 +9,7 @@ package = wheel
wheel_build_env = .pkg
deps =
    pytest>=6
    pytest-lazy-fixture
    -rrequirements.txt
commands =
    pytest {tty:--color=yes} {posargs}