Support subword and word language models

Merged Solene Tarride requested to merge subword-and-word-lm into main
1 file  +199 -25
@@ -33,7 +33,7 @@ EXTRACTION_DATA_PATH = FIXTURES / "extraction"
TWO_SPACES_REGEX = re.compile(r" {2}")
ENTITY_TOKEN_SPACE = re.compile(r"[ⓢ|ⓕ|ⓑ] ")
-TWO_SPACES_LM_REGEX = re.compile(r"⎵ ⎵")
TWO_SPACES_LM_REGEX = re.compile(r"▁ ▁")
# NamedTuple to mock actual database result
Entity = NamedTuple("Entity", offset=int, length=int, type=str, value=str)
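
Note on the regex update above: ▁ is U+2581, the whitespace marker SentencePiece uses, so the expected corpora now follow the tokenizer's own convention instead of the previous ⎵. A minimal sketch of what the regex handles, assuming each original space in the space-separated LM corpus is encoded as a standalone ▁ token (the example line is illustrative, not a fixture):

import re

TWO_SPACES_LM_REGEX = re.compile(r"▁ ▁")

# A double space in the transcription surfaces as two consecutive "▁" tokens,
# which the keep_spaces=False path collapses back to a single one.
line = "ⓢ C a i l l e t ▁ ▁ ⓕ M a u r i c e"
assert TWO_SPACES_LM_REGEX.sub("▁", line) == "ⓢ C a i l l e t ▁ ⓕ M a u r i c e"
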
@@ -319,11 +319,127 @@ def test_process_element_unknown_token_in_text_error(mock_database, tmp_path):
arkindex_extractor.process_element(element, "val")
-@pytest.mark.parametrize("load_entities", (True, False))
-@pytest.mark.parametrize("keep_spaces", (True, False))
# Transcription and entities have the same worker version
@pytest.mark.parametrize(
"transcription_entities_worker_version", ("worker_version_id", False)
"load_entities,keep_spaces,transcription_entities_worker_version,expected_subword_language_corpus,subword_vocab_size",
(
(
True,
True,
"worker_version_id",
"""▁ ⓢ c a i l l e t ▁ ⓕ m a u r i c e ▁ ⓑ 28. 9.0 6
▁ ⓢ re b ou l ▁ ⓕ j e a n ▁ ⓑ 30. 9.0 2
▁ ⓢ b a re y re ▁ ⓕ j e a n ▁ ⓑ 28. 3 . 1 1
▁ ⓢ r ou s s y ▁ ⓕ j e a n ▁ ⓑ 4 . 1 1 . 1 4
▁ ⓢ m a r i n ▁ ⓕ m a r c e l ▁ ⓑ 1 0 . 8 . 0 6
▁ ⓢ a m i c a l ▁ ⓕ e l o i ▁ ⓑ 1 1 . 1 0 . 0 4
▁ ⓢ b i r o s ▁ ⓕ m a e l ▁ ⓑ 30. 1 0 . 1 0""",
40,
),
(
True,
False,
"worker_version_id",
"""▁ ⓢ c a i l l e t ▁ ⓕ m a u r i c e ▁ ⓑ 28. 9.0 6
▁ ⓢ re b ou l ▁ ⓕ j e a n ▁ ⓑ 30. 9.0 2
▁ ⓢ b a re y re ▁ ⓕ j e a n ▁ ⓑ 28. 3 . 1 1
▁ ⓢ r ou s s y ▁ ⓕ j e a n ▁ ⓑ 4 . 1 1 . 1 4
▁ ⓢ m a r i n ▁ ⓕ m a r c e l ▁ ⓑ 1 0 . 8 . 0 6
▁ ⓢ a m i c a l ▁ ⓕ e l o i ▁ ⓑ 1 1 . 1 0 . 0 4
▁ ⓢ b i r o s ▁ ⓕ m a e l ▁ ⓑ 30. 1 0 . 1 0""",
40,
),
(
False,
True,
"worker_version_id",
"""▁ ca i l l e t ▁ ma u r i ce ▁ 28. 9.0 6
▁ re b o u l ▁ j e a n ▁ 30. 9.0 2
▁ b a re y re ▁ j e a n ▁ 28. 3 . 1 1
▁ r o u s s y ▁ j e a n ▁ 4 . 11.1 4
▁ ma r i n ▁ ma r ce l ▁ 10. 8 . 0 6
▁ a m i ca l ▁ el o i ▁ 11.1 0 . 0 4
▁ b i r o s ▁ ma el ▁ 30. 10. 1 0""",
40,
),
(
False,
False,
"worker_version_id",
"""▁ ca i l l e t ▁ ma u r i ce ▁ 28. 9.0 6
▁ re b o u l ▁ j e a n ▁ 30. 9.0 2
▁ b a re y re ▁ j e a n ▁ 28. 3 . 1 1
▁ r o u s s y ▁ j e a n ▁ 4 . 11.1 4
▁ ma r i n ▁ ma r ce l ▁ 10. 8 . 0 6
▁ a m i ca l ▁ el o i ▁ 11.1 0 . 0 4
▁ b i r o s ▁ ma el ▁ 30. 10. 1 0""",
40,
),
(
True,
True,
False,
"""▁ ⓢ C a i l l e t ▁ ⓕ M a u r i c e ▁ ⓑ 2 8 . 9 . 0 6
▁ ⓢ R e b o u l ▁ ⓕ J e a n ▁ ⓑ 3 0 . 9 . 0 2
▁ ⓢ B a r e y r e ▁ ⓕ J e a n ▁ ⓑ 2 8 . 3 . 1 1
▁ ⓢ R o u s s y ▁ ⓕ J e a n ▁ ⓑ 4 . 1 1 . 1 4
▁ ⓢ M a r i n ▁ ⓕ M a r c e l ▁ ⓑ 1 0 . 8 . 0 6
▁ ⓢ A m i c a l ▁ ⓕ E l o i ▁ ⓑ 1 1 . 1 0 . 0 4
▁ ⓢ B i r o s ▁ ⓕ M a e l ▁ ⓑ 3 0 . 1 0 . 1 0""",
40,
),
(
True,
True,
False,
"""▁ ⓢ C a i l l e t ▁ ⓕ M a u ri ce ▁ ⓑ 28. 9.0 6
▁ ⓢ R e b ou l ▁ ⓕ J e a n ▁ ⓑ 30. 9.0 2
▁ ⓢ B a re y re ▁ ⓕ J e a n ▁ ⓑ 28. 3 . 1 1
▁ ⓢ R ou s s y ▁ ⓕ J e a n ▁ ⓑ 4 . 11.1 4
▁ ⓢ Mar i n ▁ ⓕ Mar ce l ▁ ⓑ 10. 8 . 0 6
▁ ⓢ A m ic a l ▁ ⓕ E l o i ▁ ⓑ 11.1 0 . 0 4
▁ ⓢ B i r o s ▁ ⓕ M a e l ▁ ⓑ 30. 10. 10""",
55,
),
(
True,
False,
False,
"""▁ ⓢ C a i l l e t ▁ ⓕ M a u r i c e ▁ ⓑ 2 8 . 9 . 0 6
▁ ⓢ R e b o u l ▁ ⓕ J e a n ▁ ⓑ 3 0 . 9 . 0 2
▁ ⓢ B a r e y r e ▁ ⓕ J e a n ▁ ⓑ 2 8 . 3 . 1 1
▁ ⓢ R o u s s y ▁ ⓕ J e a n ▁ ⓑ 4 . 1 1 . 1 4
▁ ⓢ M a r i n ▁ ⓕ M a r c e l ▁ ⓑ 1 0 . 8 . 0 6
▁ ⓢ A m i c a l ▁ ⓕ E l o i ▁ ⓑ 1 1 . 1 0 . 0 4
▁ ⓢ B i r o s ▁ ⓕ M a e l ▁ ⓑ 3 0 . 1 0 . 1 0""",
40,
),
(
False,
True,
False,
"""▁ C a i l l e t ▁ Ma u r i c e ▁ 28. 9.0 6
▁ R e b o u l ▁ J e a n ▁ 30. 9.0 2
▁ B a r e y r e ▁ J e a n ▁ 28. 3 . 1 1
▁ R o u s s y ▁ J e a n ▁ 4 . 1 1 . 1 4
▁ Ma r i n ▁ Ma r c e l ▁ 1 0 . 8 . 0 6
▁ A m i c a l ▁ E l o i ▁ 1 1 . 1 0 . 0 4
▁ B i r o s ▁ Ma e l ▁ 30. 1 0 . 1 0""",
40,
),
(
False,
False,
False,
"""▁ C a i l l e t ▁ Ma u r i c e ▁ 28. 9.0 6
▁ R e b o u l ▁ J e a n ▁ 30. 9.0 2
▁ B a r e y r e ▁ J e a n ▁ 28. 3 . 1 1
▁ R o u s s y ▁ J e a n ▁ 4 . 1 1 . 1 4
▁ Ma r i n ▁ Ma r c e l ▁ 1 0 . 8 . 0 6
▁ A m i c a l ▁ E l o i ▁ 1 1 . 1 0 . 0 4
▁ B i r o s ▁ Ma e l ▁ 30. 1 0 . 1 0""",
40,
),
),
)
@patch("dan.datasets.extract.arkindex.download_image")
def test_extract(
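
The expected subword corpora above, together with the subword_tokenizer.model and subword_tokenizer.vocab files asserted further down, point to a SentencePiece-style tokenizer; the two (True, True, False) cases differ only in subword_vocab_size (40 vs 55), which changes the learned merges. A hedged sketch of how such a corpus could be produced; the training call, paths and defaults here are assumptions for illustration, not the extractor's actual code:

import sentencepiece as spm

# Hypothetical input path; only the "▁"-marked output format mirrors the fixtures.
spm.SentencePieceTrainer.train(
    input="language_model/corpus_words.txt",
    model_prefix="subword_tokenizer",
    vocab_size=40,  # the parametrized subword_vocab_size
)

sp = spm.SentencePieceProcessor(model_file="subword_tokenizer.model")
# With vocab_size=55 the model keeps more merges (e.g. "Mar" and "ri" in the
# fixtures above), so the same text segments into longer units.
print(" ".join(sp.encode("ⓢ Caillet ⓕ Maurice ⓑ 28.9.06", out_type=str)))
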
@@ -332,6 +448,8 @@ def test_extract(
keep_spaces,
transcription_entities_worker_version,
mock_database,
expected_subword_language_corpus,
subword_vocab_size,
tmp_path,
):
output = tmp_path / "extraction"
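
The two names added to the signature here must match the parametrize string above, since pytest injects each parametrized value by argument name. A toy, hypothetical test showing the mechanism with trimmed values:

import pytest

@pytest.mark.parametrize(
    "expected_subword_language_corpus,subword_vocab_size",
    (("▁ ⓢ c a i l l e t", 40),),
)
def test_signature_matches(expected_subword_language_corpus, subword_vocab_size):
    # Each tuple element arrives as the argument of the same name.
    assert subword_vocab_size == 40
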
@@ -362,6 +480,7 @@ def test_extract(
else None,
keep_spaces=keep_spaces,
image_extension=".jpg",
subword_vocab_size=subword_vocab_size,
)
# Mock build_image_url to simply return the path to the image
extractor.build_iiif_url = mock_build_image_url
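
The line above stubs by plain attribute assignment: binding a function onto the instance shadows the class method, so the test never builds a real IIIF URL. A self-contained sketch of the pattern (the Extractor class here is a stand-in, not the real one):

class Extractor:
    def build_iiif_url(self, image_path):
        raise RuntimeError("would hit the IIIF server")

def mock_build_image_url(image_path):
    # Simply return the path to the image, as in the test above.
    return image_path

extractor = Extractor()
# Instance attributes shadow class methods; the stub receives no `self`.
extractor.build_iiif_url = mock_build_image_url
assert extractor.build_iiif_url("val-page_1-line_1.jpg") == "val-page_1-line_1.jpg"
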
@@ -398,8 +517,14 @@ def test_extract(
VAL_DIR / "val-page_1-line_3.jpg",
output / "labels.json",
# Language resources
-output / "language_model" / "corpus.txt",
-output / "language_model" / "lexicon.txt",
output / "language_model" / "corpus_characters.txt",
output / "language_model" / "corpus_subwords.txt",
output / "language_model" / "corpus_words.txt",
output / "language_model" / "lexicon_characters.txt",
output / "language_model" / "lexicon_subwords.txt",
output / "language_model" / "lexicon_words.txt",
output / "language_model" / "subword_tokenizer.model",
output / "language_model" / "subword_tokenizer.vocab",
output / "language_model" / "tokens.txt",
]
assert sorted(filter(methodcaller("is_file"), output.rglob("*"))) == expected_paths
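
The language_model directory now ships corpus and lexicon files at three granularities (characters, subwords, words) plus the trained tokenizer. As a rough illustration of the character granularity, with spaces encoded as ▁; this sketch ignores the extractor's entity-token and space-folding rules, so it only approximates the fixtures:

transcription = "Caillet  Maurice  28.9.06"
# One space-separated symbol per character; real spaces become "▁" tokens.
chars = " ".join("▁" if c.isspace() else c for c in transcription)
print(chars)
# C a i l l e t ▁ ▁ M a u r i c e ▁ ▁ 2 8 . 9 . 0 6
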
@@ -466,36 +591,67 @@ def test_extract(
assert set(pickle.loads((output / "charset.pkl").read_bytes())) == expected_charset
# Check "language_corpus.txt"
-expected_language_corpus = """ⓢ C a i l l e t ⎵ ⎵ ⓕ M a u r i c e ⎵ ⎵ ⓑ 2 8 . 9 . 0 6
-ⓢ R e b o u l ⎵ ⎵ ⓕ J e a n ⎵ ⎵ ⓑ 3 0 . 9 . 0 2
-ⓢ B a r e y r e ⎵ ⎵ ⓕ J e a n ⎵ ⎵ ⓑ 2 8 . 3 . 1 1
-ⓢ R o u s s y ⎵ ⎵ ⓕ J e a n ⎵ ⎵ ⓑ 4 . 1 1 . 1 4
-ⓢ M a r i n ⎵ ⎵ ⓕ M a r c e l ⎵ ⎵ ⓑ 1 0 . 8 . 0 6
-ⓢ A m i c a l ⎵ ⎵ ⓕ E l o i ⎵ ⎵ ⓑ 1 1 . 1 0 . 0 4
-ⓢ B i r o s ⎵ ⎵ ⓕ M a e l ⎵ ⎵ ⓑ 3 0 . 1 0 . 1 0"""
expected_char_language_corpus = """ⓢ C a i l l e t ▁ ▁ ⓕ M a u r i c e ▁ ▁ ⓑ 2 8 . 9 . 0 6
ⓢ R e b o u l ▁ ▁ ⓕ J e a n ▁ ▁ ⓑ 3 0 . 9 . 0 2
ⓢ B a r e y r e ▁ ▁ ⓕ J e a n ▁ ▁ ⓑ 2 8 . 3 . 1 1
ⓢ R o u s s y ▁ ▁ ⓕ J e a n ▁ ▁ ⓑ 4 . 1 1 . 1 4
ⓢ M a r i n ▁ ▁ ⓕ M a r c e l ▁ ▁ ⓑ 1 0 . 8 . 0 6
ⓢ A m i c a l ▁ ▁ ⓕ E l o i ▁ ▁ ⓑ 1 1 . 1 0 . 0 4
ⓢ B i r o s ▁ ▁ ⓕ M a e l ▁ ▁ ⓑ 3 0 . 1 0 . 1 0"""
expected_word_language_corpus = """ⓢ Caillet ▁ ⓕ Maurice ▁ ⓑ 28 ▁ . ▁ 9 ▁ . ▁ 06
ⓢ Reboul ▁ ⓕ Jean ▁ ⓑ 30 ▁ . ▁ 9 ▁ . ▁ 02
ⓢ Bareyre ▁ ⓕ Jean ▁ ⓑ 28 ▁ . ▁ 3 ▁ . ▁ 11
ⓢ Roussy ▁ ⓕ Jean ▁ ⓑ 4 ▁ . ▁ 11 ▁ . ▁ 14
ⓢ Marin ▁ ⓕ Marcel ▁ ⓑ 10 ▁ . ▁ 8 ▁ . ▁ 06
ⓢ Amical ▁ ⓕ Eloi ▁ ⓑ 11 ▁ . ▁ 10 ▁ . ▁ 04
ⓢ Biros ▁ ⓕ Mael ▁ ⓑ 30 ▁ . ▁ 10 ▁ . ▁ 10"""
# Transcriptions with worker version are in lowercase
if transcription_entities_worker_version:
-expected_language_corpus = expected_language_corpus.lower()
expected_char_language_corpus = expected_char_language_corpus.lower()
expected_word_language_corpus = expected_word_language_corpus.lower()
expected_subword_language_corpus = expected_subword_language_corpus.lower()
# If we do not load entities, remove tokens
if not load_entities:
token_translations = {f"{token} ": "" for token in tokens}
-expected_language_corpus = ENTITY_TOKEN_SPACE.sub("", expected_language_corpus)
expected_char_language_corpus = ENTITY_TOKEN_SPACE.sub(
"", expected_char_language_corpus
)
expected_word_language_corpus = ENTITY_TOKEN_SPACE.sub(
"", expected_word_language_corpus
)
expected_subword_language_corpus = ENTITY_TOKEN_SPACE.sub(
"", expected_subword_language_corpus
)
# Replace double spaces with regular space
if not keep_spaces:
-expected_language_corpus = TWO_SPACES_LM_REGEX.sub(
-"⎵", expected_language_corpus
expected_char_language_corpus = TWO_SPACES_LM_REGEX.sub(
"▁", expected_char_language_corpus
)
expected_word_language_corpus = TWO_SPACES_LM_REGEX.sub(
"▁", expected_word_language_corpus
)
expected_subword_language_corpus = TWO_SPACES_LM_REGEX.sub(
"▁", expected_subword_language_corpus
)
assert (
output / "language_model" / "corpus_characters.txt"
).read_text() == expected_char_language_corpus
assert (
output / "language_model" / "corpus_words.txt"
).read_text() == expected_word_language_corpus
assert (
-output / "language_model" / "corpus.txt"
-).read_text() == expected_language_corpus
output / "language_model" / "corpus_subwords.txt"
).read_text() == expected_subword_language_corpus
# Check "language_tokens.txt"
expected_language_tokens = [
-t if t != " " else "⎵" for t in sorted(list(expected_charset))
"▁" if t.isspace() else t for t in sorted(list(expected_charset))
]
expected_language_tokens.append("◌")
assert (output / "language_model" / "tokens.txt").read_text() == "\n".join(
@@ -503,11 +659,29 @@ def test_extract(
)
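
tokens.txt therefore lists one symbol per line: the sorted charset with whitespace normalized to ▁, plus a trailing blank, mirroring the check above. A sketch with a made-up charset (the final ◌ matches the blank appended in the test):

expected_charset = {" ", ".", "C", "a", "ⓢ"}
expected_language_tokens = ["▁" if t.isspace() else t for t in sorted(expected_charset)]
expected_language_tokens.append("◌")  # blank symbol, one per line at the end
print("\n".join(expected_language_tokens))
# ▁
# .
# C
# a
# ⓢ
# ◌
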
# Check "language_lexicon.txt"
-expected_language_lexicon = [f"{t} {t}" for t in expected_language_tokens]
-assert (output / "language_model" / "lexicon.txt").read_text() == "\n".join(
-expected_language_lexicon
expected_language_char_lexicon = [f"{t} {t}" for t in expected_language_tokens]
assert (
output / "language_model" / "lexicon_characters.txt"
).read_text() == "\n".join(expected_language_char_lexicon)
word_vocab = set([word for word in expected_word_language_corpus.split()])
expected_language_word_lexicon = [
f"{word} {' '.join(word)}" for word in sorted(word_vocab)
]
assert (output / "language_model" / "lexicon_words.txt").read_text() == "\n".join(
expected_language_word_lexicon
)
subword_vocab = set(
[subword for subword in expected_subword_language_corpus.split()]
)
expected_language_subword_lexicon = [
f"{subword} {' '.join(subword)}" for subword in sorted(subword_vocab)
]
assert (
output / "language_model" / "lexicon_subwords.txt"
).read_text() == "\n".join(expected_language_subword_lexicon)
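
The word and subword lexicons asserted above share one format: each line maps a decoding unit to its character spelling, which is what a lexicon-constrained decoder consumes. A minimal illustration of the f-string used in the test:

word = "Caillet"
# "Caillet C a i l l e t": the unit, a space, then its characters.
print(f"{word} {' '.join(word)}")
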
# Check cropped images
for expected_path in expected_paths:
if expected_path.suffix != ".jpg":