Skip to content
Snippets Groups Projects

Support subword and word language models

Merged Solene Tarride requested to merge subword-and-word-lm into main
All threads resolved!
1 file
+ 3
- 5
Compare changes
  • Side-by-side
  • Inline
+ 3
- 5
@@ -186,7 +186,6 @@ def test_normalize_spaces(text, trimmed):
("\rcarriage_return", "carriage_return"),
("\r\ncarriage_return+linebreak", "carriage_return+linebreak"),
("\n\r\r\n\ncarriage_return+linebreak", "carriage_return+linebreak"),
("no|linebreaks", "no|linebreaks"),
),
)
def test_normalize_linebreaks(text, trimmed):
@@ -397,10 +396,9 @@ def test_extract(
VAL_DIR / "val-page_1-line_2.jpg",
VAL_DIR / "val-page_1-line_3.jpg",
output / "labels.json",
# Language resources
output / "language_model" / "corpus.txt",
output / "language_model" / "lexicon.txt",
output / "language_model" / "tokens.txt",
output / "language_corpus.txt",
output / "language_lexicon.txt",
output / "language_tokens.txt",
]
assert sorted(filter(methodcaller("is_file"), output.rglob("*"))) == expected_paths
Loading