From 9870344eea3fa189b2b07abf59a8168f56196898 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sol=C3=A8ne=20Tarride?= <starride@teklia.com>
Date: Wed, 20 Sep 2023 13:35:37 +0200
Subject: [PATCH] Write tests for data extraction

---
 tests/test_extract.py | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/tests/test_extract.py b/tests/test_extract.py
index aa6d8aeb..d3370473 100644
--- a/tests/test_extract.py
+++ b/tests/test_extract.py
@@ -31,6 +31,8 @@ from tests import FIXTURES
 EXTRACTION_DATA_PATH = FIXTURES / "extraction"
 
 TWO_SPACES_REGEX = re.compile(r" {2}")
+ENTITY_TOKEN_SPACE = re.compile(r"[ⓢⓕⓑ] ")
+TWO_SPACES_LM_REGEX = re.compile(r"⎵ ⎵")
 
 # NamedTuple to mock actual database result
 Entity = NamedTuple("Entity", offset=int, length=int, type=str, value=str)
@@ -459,6 +461,47 @@ def test_extract(
     expected_charset.add("⁇")
     assert set(pickle.loads((output / "charset.pkl").read_bytes())) == expected_charset
 
+    # Check "language_corpus.txt"
+    expected_language_corpus = """ⓢ C a i l l e t ⎵ ⎵ ⓕ M a u r i c e ⎵ ⎵ ⓑ 2 8 . 9 . 0 6
+ⓢ R e b o u l ⎵ ⎵ ⓕ J e a n ⎵ ⎵ ⓑ 3 0 . 9 . 0 2
+ⓢ B a r e y r e ⎵ ⎵ ⓕ J e a n ⎵ ⎵ ⓑ 2 8 . 3 . 1 1
+ⓢ R o u s s y ⎵ ⎵ ⓕ J e a n ⎵ ⎵ ⓑ 4 . 1 1 . 1 4
+ⓢ M a r i n ⎵ ⎵ ⓕ M a r c e l ⎵ ⎵ ⓑ 1 0 . 8 . 0 6
+ⓢ R o q u e s ⎵ ⎵ ⓕ E l o i ⎵ ⎵ ⓑ 1 1 . 1 0 . 0 4
+ⓢ G i r o s ⎵ ⎵ ⓕ P a u l ⎵ ⎵ ⓑ 3 0 . 1 0 . 1 0"""
+
+    # Transcriptions with worker version are in lowercase
+    if transcription_entities_worker_version:
+        expected_language_corpus = expected_language_corpus.lower()
+
+    # If we do not load entities, remove tokens
+    if not load_entities:
+        # Strip each entity token (ⓢ/ⓕ/ⓑ) together with its trailing space
+        expected_language_corpus = ENTITY_TOKEN_SPACE.sub("", expected_language_corpus)
+
+    # Replace double spaces with regular space
+    if not keep_spaces:
+        expected_language_corpus = TWO_SPACES_LM_REGEX.sub(
+            "⎵", expected_language_corpus
+        )
+
+    assert (output / "language_corpus.txt").read_text() == expected_language_corpus
+
+    # Check "language_tokens.txt"
+    expected_language_tokens = [
+        t if t != " " else "⎵" for t in sorted(expected_charset)
+    ]
+    expected_language_tokens.append("◌")
+    assert (output / "language_tokens.txt").read_text() == "\n".join(
+        expected_language_tokens
+    )
+
+    # Check "language_lexicon.txt"
+    expected_language_lexicon = [f"{t} {t}" for t in expected_language_tokens]
+    assert (output / "language_lexicon.txt").read_text() == "\n".join(
+        expected_language_lexicon
+    )
+
     # Check cropped images
     for expected_path in expected_paths:
         if expected_path.suffix != ".jpg":
-- 
GitLab