Skip to content
Snippets Groups Projects
Commit 9cf17c29 authored and committed by Solene Tarride
Browse files

Update tests for data extraction

parent bcfaa598
No related branches found
No related tags found
No related merge requests found
......@@ -10,7 +10,12 @@ import pytest
from dan.datasets.extract.exceptions import NoEndTokenError
from dan.datasets.extract.extract import ArkindexExtractor
from dan.datasets.extract.utils import EntityType, insert_token, remove_spaces
from dan.datasets.extract.utils import (
EntityType,
insert_token,
normalize_linebreaks,
normalize_spaces,
)
from dan.utils import parse_tokens
from tests import FIXTURES
......@@ -135,8 +140,25 @@ def test_reconstruct_text(entity_separators, tokens, expected, text_before, text
(" \ttab", "tab"),
),
)
def test_remove_spaces(text, trimmed):
assert remove_spaces(text) == trimmed
def test_normalize_spaces(text, trimmed):
assert normalize_spaces(text) == trimmed
@pytest.mark.parametrize(
    "text,trimmed",
    (
        # Input without any line break is returned unchanged
        ("no_linebreaks", "no_linebreaks"),
        # Line breaks at a single edge, then at both edges
        ("\nbeginning", "beginning"),
        ("ending\n", "ending"),
        ("\nboth\n", "both"),
        # Several line breaks in a row at the start
        ("\n\n\nconsecutive", "consecutive"),
        # Carriage returns, alone and mixed with "\n"
        ("\rcarriage_return", "carriage_return"),
        ("\r\ncarriage_return+linebreak", "carriage_return+linebreak"),
        ("\n\r\r\n\ncarriage_return+linebreak", "carriage_return+linebreak"),
    ),
)
def test_normalize_linebreaks(text, trimmed):
    """Check that `normalize_linebreaks(text)` yields `trimmed`.

    The parametrized cases only place "\\n" / "\\r" characters at the edges
    of the input string; each expected value is the input without them.
    """
    normalized = normalize_linebreaks(text)
    assert normalized == trimmed
@pytest.mark.parametrize(
......@@ -306,6 +328,9 @@ def test_extract(
VAL_DIR / "text_line_val-page_1-line_2.jpg",
VAL_DIR / "text_line_val-page_1-line_3.jpg",
output / "labels.json",
output / "language_corpus.txt",
output / "language_lexicon.txt",
output / "language_tokens.txt",
]
assert sorted(filter(methodcaller("is_file"), output.rglob("*"))) == expected_paths
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment