diff --git a/dan/datasets/extract/arkindex.py b/dan/datasets/extract/arkindex.py
index 2b15cea2ef2b4aa85fd44a1768504a2eb38007f9..ba40bf90e7dd19b06529fe539c455b1f99fa3d69 100644
--- a/dan/datasets/extract/arkindex.py
+++ b/dan/datasets/extract/arkindex.py
@@ -82,8 +82,24 @@ class ArkindexExtractor:
         self.keep_spaces = keep_spaces
         self.subword_vocab_size = subword_vocab_size
 
+        # Load files from a previous extraction
+        data_path = self.output / "split.json"
+        charset_path = self.output / "charset.pkl"
+
+        is_data_file = data_path.exists()
+        is_charset_file = charset_path.exists()
+
         self.data: Dict = defaultdict(dict)
         self.charset = set()
+
+        if is_data_file and is_charset_file:
+            self.data.update(json.loads(data_path.read_bytes()))
+            self.charset.update(sorted(pickle.loads(charset_path.read_bytes())))
+        elif is_data_file ^ is_charset_file:
+            raise FileNotFoundError(
+                f"The file '{data_path.name}' or '{charset_path.name}' is missing at location {self.output.as_posix()}"
+            )
+
         self.language_corpus = defaultdict(list)
         self.language_tokens = []
         self.language_lexicon = defaultdict(list)
diff --git a/tests/test_extract.py b/tests/test_extract.py
index 308a34c3a54c469ffe2a8161da509f1be5ae9407..368d99984dace2f1de58708b8fc06c340e6151c6 100644
--- a/tests/test_extract.py
+++ b/tests/test_extract.py
@@ -229,6 +229,7 @@ def test_process_element_unknown_token_in_text_error(mock_database, tmp_path):
         ),
     ),
 )
+@pytest.mark.parametrize("existing", (True, False))
 def test_extract(
     load_entities,
     keep_spaces,
@@ -238,6 +239,7 @@ def test_extract(
     expected_subword_language_corpus,
     subword_vocab_size,
     tmp_path,
+    existing,
 ):
     output = tmp_path / "extraction"
     output.mkdir(parents=True, exist_ok=True)
@@ -250,6 +252,48 @@ def test_extract(
         if token
     ]
 
+    # Add a character to fake a previous extraction file in the folder
+    previous_character = "%"
+
+    if existing:
+        charset_path = output / "charset.pkl"
+        data_path = output / "split.json"
+
+        dataset_type = "train"
+
+        data_id = "train-page_1-line_5"
+
+        data = {
+            "dataset-id": "dataset-id",
+            "image": {
+                "iiif_url": f"{FIXTURES}/extraction/images/text_line/test-page_1-line_1.jpg",
+                "polygon": [
+                    [37, 191],
+                    [37, 339],
+                    [767, 339],
+                    [767, 191],
+                    [37, 191],
+                ],
+            },
+            "text": previous_character,
+        }
+
+        charset_path.write_bytes(pickle.dumps([previous_character]))
+        data_path.write_text(
+            json.dumps(
+                {dataset_type: {data_id: data}},
+            )
+        )
+
+        split_content[dataset_type][data_id] = data
+
+        keys = list(split_content["train"].keys())
+        keys.sort()
+        split_content["train"] = {i: split_content["train"][i] for i in keys}
+
+        # Add 1 to subword_vocab_size because we have one more subword, which is {previous_character}
+        subword_vocab_size += 1
+
     extractor = ArkindexExtractor(
         dataset_ids=["dataset_id"],
         element_type=["text_line"],
@@ -264,6 +308,7 @@ def test_extract(
         keep_spaces=keep_spaces,
         subword_vocab_size=subword_vocab_size,
     )
+
     extractor.run()
 
     expected_paths = [
@@ -337,6 +382,17 @@ def test_extract(
 ⓢ Amical ▁ ⓕ Eloi ▁ ⓑ 11 ▁ . ▁ 10 ▁ . ▁ 04
 ⓢ Biros ▁ ⓕ Mael ▁ ⓑ 30 ▁ . ▁ 10 ▁ . ▁ 10"""
 
+    if existing:
+        expected_char_language_corpus = (
+            f"{previous_character}\n" + expected_char_language_corpus
+        )
+        expected_word_language_corpus = (
+            f"{previous_character}\n" + expected_word_language_corpus
+        )
+        expected_subword_language_corpus = (
+            f"▁{previous_character}\n" + expected_subword_language_corpus
+        )
+
     # Transcriptions with worker version are in lowercase
     if transcription_entities_worker_version:
         expected_char_language_corpus = expected_char_language_corpus.lower()