From 9870344eea3fa189b2b07abf59a8168f56196898 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sol=C3=A8ne=20Tarride?= <starride@teklia.com> Date: Wed, 20 Sep 2023 13:35:37 +0200 Subject: [PATCH] Write tests for data extraction --- tests/test_extract.py | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tests/test_extract.py b/tests/test_extract.py index aa6d8aeb..d3370473 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -31,6 +31,8 @@ from tests import FIXTURES EXTRACTION_DATA_PATH = FIXTURES / "extraction" TWO_SPACES_REGEX = re.compile(r" {2}") +ENTITY_TOKEN_SPACE = re.compile(r"[ⓢ|ⓕ|ⓑ] ") +TWO_SPACES_LM_REGEX = re.compile(r"⎵ ⎵") # NamedTuple to mock actual database result Entity = NamedTuple("Entity", offset=int, length=int, type=str, value=str) @@ -459,6 +461,47 @@ def test_extract( expected_charset.add("⁇") assert set(pickle.loads((output / "charset.pkl").read_bytes())) == expected_charset + # Check "language_corpus.txt" + expected_language_corpus = """ⓢ C a i l l e t ⎵ ⎵ ⓕ M a u r i c e ⎵ ⎵ ⓑ 2 8 . 9 . 0 6 +ⓢ R e b o u l ⎵ ⎵ ⓕ J e a n ⎵ ⎵ ⓑ 3 0 . 9 . 0 2 +ⓢ B a r e y r e ⎵ ⎵ ⓕ J e a n ⎵ ⎵ ⓑ 2 8 . 3 . 1 1 +ⓢ R o u s s y ⎵ ⎵ ⓕ J e a n ⎵ ⎵ ⓑ 4 . 1 1 . 1 4 +ⓢ M a r i n ⎵ ⎵ ⓕ M a r c e l ⎵ ⎵ ⓑ 1 0 . 8 . 0 6 +ⓢ R o q u e s ⎵ ⎵ ⓕ E l o i ⎵ ⎵ ⓑ 1 1 . 1 0 . 0 4 +ⓢ G i r o s ⎵ ⎵ ⓕ P a u l ⎵ ⎵ ⓑ 3 0 . 1 0 . 
1 0""" + + # Transcriptions with worker version are in lowercase + if transcription_entities_worker_version: + expected_language_corpus = expected_language_corpus.lower() + + # If we do not load entities, remove tokens + if not load_entities: + token_translations = {f"{token} ": "" for token in tokens} + expected_language_corpus = ENTITY_TOKEN_SPACE.sub("", expected_language_corpus) + + # Replace double spaces with regular space + if not keep_spaces: + expected_language_corpus = TWO_SPACES_LM_REGEX.sub( + "⎵", expected_language_corpus + ) + + assert (output / "language_corpus.txt").read_text() == expected_language_corpus + + # Check "language_tokens.txt" + expected_language_tokens = [ + t if t != " " else "⎵" for t in sorted(list(expected_charset)) + ] + expected_language_tokens.append("◌") + assert (output / "language_tokens.txt").read_text() == "\n".join( + expected_language_tokens + ) + + # Check "language_lexicon.txt" + expected_language_lexicon = [f"{t} {t}" for t in expected_language_tokens] + assert (output / "language_lexicon.txt").read_text() == "\n".join( + expected_language_lexicon + ) + + # Check cropped images for expected_path in expected_paths: if expected_path.suffix != ".jpg": -- GitLab