Skip to content
Snippets Groups Projects

Add Language Model Decoder

Merged Solene Tarride requested to merge lm-decoder into main
All threads resolved!
1 file
+ 28
3
Compare changes
  • Side-by-side
  • Inline
+ 28
3
@@ -10,7 +10,12 @@ import pytest
from dan.datasets.extract.exceptions import NoEndTokenError
from dan.datasets.extract.extract import ArkindexExtractor
from dan.datasets.extract.utils import EntityType, insert_token, remove_spaces
from dan.datasets.extract.utils import (
EntityType,
insert_token,
normalize_linebreaks,
normalize_spaces,
)
from dan.utils import parse_tokens
from tests import FIXTURES
@@ -135,8 +140,25 @@ def test_reconstruct_text(entity_separators, tokens, expected, text_before, text
(" \ttab", "tab"),
),
)
def test_remove_spaces(text, trimmed):
    # `remove_spaces` should reduce the raw text to its expected trimmed form.
    cleaned = remove_spaces(text)
    assert cleaned == trimmed
def test_normalize_spaces(text, trimmed):
    # `normalize_spaces` should collapse/strip whitespace to the expected form.
    result = normalize_spaces(text)
    assert result == trimmed
@pytest.mark.parametrize(
    "text,trimmed",
    [
        ("no_linebreaks", "no_linebreaks"),
        ("\nbeginning", "beginning"),
        ("ending\n", "ending"),
        ("\nboth\n", "both"),
        ("\n\n\nconsecutive", "consecutive"),
        ("\rcarriage_return", "carriage_return"),
        ("\r\ncarriage_return+linebreak", "carriage_return+linebreak"),
        ("\n\r\r\n\ncarriage_return+linebreak", "carriage_return+linebreak"),
    ],
)
def test_normalize_linebreaks(text, trimmed):
    # Every linebreak variant (\n, \r, \r\n), alone or repeated, should be
    # normalized away so only the trimmed payload remains.
    result = normalize_linebreaks(text)
    assert result == trimmed
@pytest.mark.parametrize(
@@ -306,6 +328,9 @@ def test_extract(
VAL_DIR / "text_line_val-page_1-line_2.jpg",
VAL_DIR / "text_line_val-page_1-line_3.jpg",
output / "labels.json",
output / "language_corpus.txt",
output / "language_lexicon.txt",
output / "language_tokens.txt",
]
assert sorted(filter(methodcaller("is_file"), output.rglob("*"))) == expected_paths
Loading