diff --git a/tests/test_extract.py b/tests/test_extract.py index 19f59cd1ca99fb9092d523e755b8ab53919c5d06..3502f496d8fad2e63ce38ee1732b95af0f34c284 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -319,11 +319,127 @@ def test_process_element_unknown_token_in_text_error(mock_database, tmp_path): arkindex_extractor.process_element(element, "val") -@pytest.mark.parametrize("load_entities", (True, False)) -@pytest.mark.parametrize("keep_spaces", (True, False)) -# Transcription and entities have the same worker version @pytest.mark.parametrize( - "transcription_entities_worker_version", ("worker_version_id", False) + "load_entities,keep_spaces,transcription_entities_worker_version,expected_subword_language_corpus,subword_vocab_size", + ( + ( + True, + True, + "worker_version_id", + """â– â“¢ c a i l l e t â– â“• m a u r i c e â– â“‘ 28. 9.0 6 +â– â“¢ re b ou l â– â“• j e a n â– â“‘ 30. 9.0 2 +â– â“¢ b a re y re â– â“• j e a n â– â“‘ 28. 3 . 1 1 +â– â“¢ r ou s s y â– â“• j e a n â– â“‘ 4 . 1 1 . 1 4 +â– â“¢ m a r i n â– â“• m a r c e l â– â“‘ 1 0 . 8 . 0 6 +â– â“¢ a m i c a l â– â“• e l o i â– â“‘ 1 1 . 1 0 . 0 4 +â– â“¢ b i r o s â– â“• m a e l â– â“‘ 30. 1 0 . 1 0""", + 40, + ), + ( + True, + False, + "worker_version_id", + """â– â“¢ c a i l l e t â– â“• m a u r i c e â– â“‘ 28. 9.0 6 +â– â“¢ re b ou l â– â“• j e a n â– â“‘ 30. 9.0 2 +â– â“¢ b a re y re â– â“• j e a n â– â“‘ 28. 3 . 1 1 +â– â“¢ r ou s s y â– â“• j e a n â– â“‘ 4 . 1 1 . 1 4 +â– â“¢ m a r i n â– â“• m a r c e l â– â“‘ 1 0 . 8 . 0 6 +â– â“¢ a m i c a l â– â“• e l o i â– â“‘ 1 1 . 1 0 . 0 4 +â– â“¢ b i r o s â– â“• m a e l â– â“‘ 30. 1 0 . 1 0""", + 40, + ), + ( + False, + True, + "worker_version_id", + """â– ca i l l e t â– ma u r i ce â– 28. 9.0 6 +â– re b o u l â– j e a n â– 30. 9.0 2 +â– b a re y re â– j e a n â– 28. 3 . 1 1 +â– r o u s s y â– j e a n â– 4 . 11.1 4 +â– ma r i n â– ma r ce l â– 10. 8 . 0 6 +â– a m i ca l â– el o i â– 11.1 0 . 0 4 +â– b i r o s â– ma el â– 30. 10. 1 0""", + 40, + ), + ( + False, + False, + "worker_version_id", + """â– ca i l l e t â– ma u r i ce â– 28. 9.0 6 +â– re b o u l â– j e a n â– 30. 9.0 2 +â– b a re y re â– j e a n â– 28. 3 . 1 1 +â– r o u s s y â– j e a n â– 4 . 11.1 4 +â– ma r i n â– ma r ce l â– 10. 8 . 0 6 +â– a m i ca l â– el o i â– 11.1 0 . 0 4 +â– b i r o s â– ma el â– 30. 10. 1 0""", + 40, + ), + ( + True, + True, + False, + """â– â“¢ C a i l l e t â– â“• M a u r i c e â– â“‘ 2 8 . 9 . 0 6 +â– â“¢ R e b o u l â– â“• J e a n â– â“‘ 3 0 . 9 . 0 2 +â– â“¢ B a r e y r e â– â“• J e a n â– â“‘ 2 8 . 3 . 1 1 +â– â“¢ R o u s s y â– â“• J e a n â– â“‘ 4 . 1 1 . 1 4 +â– â“¢ M a r i n â– â“• M a r c e l â– â“‘ 1 0 . 8 . 0 6 +â– â“¢ A m i c a l â– â“• E l o i â– â“‘ 1 1 . 1 0 . 0 4 +â– â“¢ B i r o s â– â“• M a e l â– â“‘ 3 0 . 1 0 . 1 0""", + 40, + ), + ( + True, + True, + False, + """â– â“¢ C a i l l e t â– â“• M a u ri ce â– â“‘ 28. 9.0 6 +â– â“¢ R e b ou l â– â“• J e a n â– â“‘ 30. 9.0 2 +â– â“¢ B a re y re â– â“• J e a n â– â“‘ 28. 3 . 1 1 +â– â“¢ R ou s s y â– â“• J e a n â– â“‘ 4 . 11.1 4 +â– â“¢ Mar i n â– â“• Mar ce l â– â“‘ 10. 8 . 0 6 +â– â“¢ A m ic a l â– â“• E l o i â– â“‘ 11.1 0 . 0 4 +â– â“¢ B i r o s â– â“• M a e l â– â“‘ 30. 10. 10""", + 55, + ), + ( + True, + False, + False, + """â– â“¢ C a i l l e t â– â“• M a u r i c e â– â“‘ 2 8 . 9 . 0 6 +â– â“¢ R e b o u l â– â“• J e a n â– â“‘ 3 0 . 9 . 0 2 +â– â“¢ B a r e y r e â– â“• J e a n â– â“‘ 2 8 . 3 . 1 1 +â– â“¢ R o u s s y â– â“• J e a n â– â“‘ 4 . 1 1 . 1 4 +â– â“¢ M a r i n â– â“• M a r c e l â– â“‘ 1 0 . 8 . 0 6 +â– â“¢ A m i c a l â– â“• E l o i â– â“‘ 1 1 . 1 0 . 0 4 +â– â“¢ B i r o s â– â“• M a e l â– â“‘ 3 0 . 1 0 . 1 0""", + 40, + ), + ( + False, + True, + False, + """â– C a i l l e t â– Ma u r i c e â– 28. 9.0 6 +â– R e b o u l â– J e a n â– 30. 9.0 2 +â– B a r e y r e â– J e a n â– 28. 3 . 1 1 +â– R o u s s y â– J e a n â– 4 . 1 1 . 1 4 +â– Ma r i n â– Ma r c e l â– 1 0 . 8 . 0 6 +â– A m i c a l â– E l o i â– 1 1 . 1 0 . 0 4 +â– B i r o s â– Ma e l â– 30. 1 0 . 1 0""", + 40, + ), + ( + False, + False, + False, + """â– C a i l l e t â– Ma u r i c e â– 28. 9.0 6 +â– R e b o u l â– J e a n â– 30. 9.0 2 +â– B a r e y r e â– J e a n â– 28. 3 . 1 1 +â– R o u s s y â– J e a n â– 4 . 1 1 . 1 4 +â– Ma r i n â– Ma r c e l â– 1 0 . 8 . 0 6 +â– A m i c a l â– E l o i â– 1 1 . 1 0 . 0 4 +â– B i r o s â– Ma e l â– 30. 1 0 . 1 0""", + 40, + ), + ), ) @patch("dan.datasets.extract.arkindex.download_image") def test_extract( @@ -332,6 +448,8 @@ def test_extract( keep_spaces, transcription_entities_worker_version, mock_database, + expected_subword_language_corpus, + subword_vocab_size, tmp_path, ): output = tmp_path / "extraction" @@ -362,6 +480,7 @@ def test_extract( else None, keep_spaces=keep_spaces, image_extension=".jpg", + subword_vocab_size=subword_vocab_size, ) # Mock build_image_url to simply return the path to the image extractor.build_iiif_url = mock_build_image_url @@ -488,17 +607,11 @@ def test_extract( â“¢ Amical â– â“• Eloi â– â“‘ 11 â– . â– 10 â– . â– 04 â“¢ Biros â– â“• Mael â– â“‘ 30 â– . â– 10 â– . â– 10""" - expected_subword_language_corpus = """â– â“¢ C a i l l e t â– â“• M a u ri ce â– â“‘ 28. 9.0 6 -â– â“¢ R e b ou l â– â“• J e a n â– â“‘ 30. 9.0 2 -â– â“¢ B a re y re â– â“• J e a n â– â“‘ 28. 3 .11 -â– â“¢ R ou s s y â– â“• J e a n â– â“‘ 4 . 11.1 4 -â– â“¢ Mar i n â– â“• Mar ce l â– â“‘ 10. 8. 0 6 -â– â“¢ A m ic a l â– â“• E l o i â– â“‘ 11.1 0 . 0 4 -â– â“¢ B i r o s â– â“• M a e l â– â“‘ 30. 10. 10""" - # Transcriptions with worker version are in lowercase if transcription_entities_worker_version: expected_char_language_corpus = expected_char_language_corpus.lower() + expected_word_language_corpus = expected_word_language_corpus.lower() + expected_subword_language_corpus = expected_subword_language_corpus.lower() # If we do not load entities, remove tokens if not load_entities: @@ -506,12 +619,23 @@ def test_extract( expected_char_language_corpus = ENTITY_TOKEN_SPACE.sub( "", expected_char_language_corpus ) - + expected_word_language_corpus = ENTITY_TOKEN_SPACE.sub( + "", expected_word_language_corpus + ) + expected_subword_language_corpus = ENTITY_TOKEN_SPACE.sub( + "", expected_subword_language_corpus + ) # Replace double spaces with regular space if not keep_spaces: expected_char_language_corpus = TWO_SPACES_LM_REGEX.sub( "â–", expected_char_language_corpus ) + expected_word_language_corpus = TWO_SPACES_LM_REGEX.sub( + "â–", expected_word_language_corpus + ) + expected_subword_language_corpus = TWO_SPACES_LM_REGEX.sub( + "â–", expected_subword_language_corpus + ) assert ( output / "language_model" / "corpus_characters.txt" @@ -520,7 +644,8 @@ def test_extract( assert ( output / "language_model" / "corpus_words.txt" ).read_text() == expected_word_language_corpus - + print((output / "language_model" / "corpus_subwords.txt").read_text()) + print(expected_subword_language_corpus) assert ( output / "language_model" / "corpus_subwords.txt" ).read_text() == expected_subword_language_corpus