test_analyze.py

# -*- coding: utf-8 -*-

import pytest
from mdutils.mdutils import MdUtils

from dan.datasets.analyze import read_json, read_yaml
from dan.datasets.analyze.statistics import Statistics
from tests.conftest import FIXTURES


@pytest.fixture
def image_statistics():
    """Return the expected image statistics, read from ``FIXTURES/analyze/images``."""
    reader = MdUtils(file_name="")
    expected_path = FIXTURES / "analyze" / "images"
    # read_md_file is given a plain string rather than a Path object.
    return reader.read_md_file(str(expected_path))


@pytest.fixture
def labels_statistics():
    """Return the expected label statistics, read from ``FIXTURES/analyze/labels``."""
    reader = MdUtils(file_name="")
    expected_path = FIXTURES / "analyze" / "labels"
    # read_md_file is given a plain string rather than a Path object.
    return reader.read_md_file(str(expected_path))


@pytest.fixture
def ner_statistics():
    """Return the expected NER statistics, read from ``FIXTURES/analyze/ner``."""
    reader = MdUtils(file_name="")
    expected_path = FIXTURES / "analyze" / "ner"
    # read_md_file is given a plain string rather than a Path object.
    return reader.read_md_file(str(expected_path))


@pytest.fixture
def full_statistics():
    """Return the expected full statistics report, read from ``FIXTURES/analyze/stats``."""
    reader = MdUtils(file_name="")
    expected_path = FIXTURES / "analyze" / "stats"
    # read_md_file is given a plain string rather than a Path object.
    return reader.read_md_file(str(expected_path))


def test_display_image_statistics(image_statistics, tmp_path):
    """Check the Markdown produced by ``create_image_statistics`` against the fixture.

    The image paths below are relative, so they are presumably resolved
    against the current working directory (the repository root) -- verify
    before running the test from another location.
    """
    # Hand Statistics a real file name (as a string) inside the temporary
    # directory, as test_display_label_statistics does, rather than the
    # temporary directory itself.
    filename = tmp_path / "images.md"
    stats = Statistics(filename=str(filename))
    stats.create_image_statistics(
        images=[
            "tests/data/training/training_dataset/images/0a34e13a-4ab0-4a91-8d7c-b1d8fee32628.png",
            "tests/data/training/training_dataset/images/0a70e14f-feda-4607-989c-36cf581ddff5.png",
            "tests/data/training/training_dataset/images/0a576062-303c-4893-a729-c09c92865d31.png",
            "tests/data/training/training_dataset/images/0b2457c8-81f1-4600-84d9-f8bf2822a991.png",
            "tests/data/training/training_dataset/images/fb3edb59-3678-49f8-8e16-8e32e3b0f051.png",
            "tests/data/training/training_dataset/images/fe498de2-ece4-4fbe-8b53-edfce1b820f0.png",
        ]
    )
    # The in-memory document text is compared; no file is read back here.
    assert stats.document.get_md_text() == image_statistics


def test_display_label_statistics(labels_statistics, tmp_path):
    """Check the Markdown produced by ``create_label_statistics`` against the fixture."""
    # Multi-line transcriptions used as input for the label statistics.
    transcriptions = [
        "Teklia’s expertise is to develop document analysis\nand processing solutions using, among other things,\nOCR technology.",
        "Our software combines image analysis, printed and\nhandwritten text recognition, text segmentation with\na document classification and indexation system.",
        "Our objective is to deliver to our clients an automated\ndocument processing tool easy-to-use and adapted\nto their needs.",
        "With the same state of mind, we developed additional solutions to\nenhance both security and business-process.",
    ]
    report = Statistics(filename=str(tmp_path / "labels.md"))
    report.create_label_statistics(labels=transcriptions)

    # The in-memory document text is compared; no file is read back here.
    generated = report.document.get_md_text()
    assert generated == labels_statistics


def test_display_ner_statistics(ner_statistics, tmp_path):
    """Check the Markdown produced by ``create_ner_statistics`` against the fixture.

    The NER tokens are loaded from the training dataset's ``tokens.yaml``
    fixture and passed along with two labelled transcriptions.
    """
    tokens = read_yaml(FIXTURES / "training" / "training_dataset" / "tokens.yaml")
    # Hand Statistics a real file name (as a string) inside the temporary
    # directory, as test_display_label_statistics does, rather than the
    # temporary directory itself.
    filename = tmp_path / "ner.md"
    stats = Statistics(filename=str(filename))
    stats.create_ner_statistics(
        labels=[
            "ⓈDayon ⒻFernand Ⓐ6\nⓈDayen ⒻMaurice Ⓐ2\nⓈTottelier ⒻJean Baptiste Ⓐ59",
            "ⓈPeryro ⒻEtienne Ⓐ33\nⓈJeannot ⒻCaroline Ⓐ24\nⓈMouline ⒻPierre Ⓐ32",
        ],
        ner_tokens=tokens,
    )
    # The in-memory document text is compared; no file is read back here.
    assert stats.document.get_md_text() == ner_statistics


def test_run(full_statistics, tmp_path):
    """Run the whole statistics generation and compare the written file with the fixture.

    Labels and tokens come from the training dataset fixtures; the report is
    read back from ``stats.md`` in the temporary directory after ``run``.
    """
    output_file = tmp_path / "stats.md"
    stats = Statistics(filename=str(output_file))
    stats.run(
        labels=read_json(FIXTURES / "training" / "training_dataset" / "labels.json"),
        tokens=read_yaml(FIXTURES / "training" / "training_dataset" / "tokens.yaml"),
    )
    # Decode explicitly as UTF-8 instead of relying on the locale's default
    # encoding, so the comparison does not depend on the platform.
    # NOTE(review): assumes the report is written as UTF-8 -- confirm against mdutils.
    assert output_file.read_text(encoding="utf-8") == full_statistics