Commit ea9f766a authored by Yoann Schneider

Merge branch 'merge-datasets-in-extraction' into 'main'

Merge datasets

Closes #239

See merge request !428
parents 20161125 fe52dafb
@@ -82,8 +82,24 @@ class ArkindexExtractor:
        self.keep_spaces = keep_spaces
        self.subword_vocab_size = subword_vocab_size

        # Load files from a previous extraction, if any
        data_path = self.output / "split.json"
        charset_path = self.output / "charset.pkl"
        is_data_file = data_path.exists()
        is_charset_file = charset_path.exists()

        self.data: Dict = defaultdict(dict)
        self.charset = set()
        if is_data_file and is_charset_file:
            self.data.update(json.loads(data_path.read_bytes()))
            self.charset.update(sorted(pickle.loads(charset_path.read_bytes())))
        elif is_data_file ^ is_charset_file:
            raise FileNotFoundError(
                f"The file '{data_path.name}' or '{charset_path.name}' is missing at location {self.output.as_posix()}"
            )

        self.language_corpus = defaultdict(list)
        self.language_tokens = []
        self.language_lexicon = defaultdict(list)
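For reference, the resume logic added above can be read in isolation as the following sketch. The function name load_previous_extraction and its signature are hypothetical illustrations, not part of this commit; only the load-or-raise behaviour mirrors the code in the hunk.

import json
import pickle
from collections import defaultdict
from pathlib import Path
from typing import Dict, Set, Tuple

def load_previous_extraction(output: Path) -> Tuple[Dict, Set[str]]:
    # Hypothetical helper: both artifacts of a previous run must exist
    # together, or not at all.
    data_path = output / "split.json"
    charset_path = output / "charset.pkl"
    data: Dict = defaultdict(dict)
    charset: Set[str] = set()
    if data_path.exists() and charset_path.exists():
        # Reload the previous split and charset so the new run extends them.
        data.update(json.loads(data_path.read_bytes()))
        charset.update(pickle.loads(charset_path.read_bytes()))
    elif data_path.exists() ^ charset_path.exists():
        # Exactly one file is present: the previous extraction is incomplete.
        raise FileNotFoundError(
            f"The file '{data_path.name}' or '{charset_path.name}' is missing "
            f"at location {output.as_posix()}"
        )
    return data, charset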
@@ -229,6 +229,7 @@ def test_process_element_unknown_token_in_text_error(mock_database, tmp_path):
        ),
    ),
)
@pytest.mark.parametrize("existing", (True, False))
def test_extract(
    load_entities,
    keep_spaces,
@@ -238,6 +239,7 @@ def test_extract(
    expected_subword_language_corpus,
    subword_vocab_size,
    tmp_path,
    existing,
):
    output = tmp_path / "extraction"
    output.mkdir(parents=True, exist_ok=True)
@@ -250,6 +252,48 @@ def test_extract(
        if token
    ]
    # Add a character to fake a previous extraction file in the folder
    previous_character = "%"

    if existing:
        charset_path = output / "charset.pkl"
        data_path = output / "split.json"

        dataset_type = "train"
        data_id = "train-page_1-line_5"
        data = {
            "dataset-id": "dataset-id",
            "image": {
                "iiif_url": f"{FIXTURES}/extraction/images/text_line/test-page_1-line_1.jpg",
                "polygon": [
                    [37, 191],
                    [37, 339],
                    [767, 339],
                    [767, 191],
                    [37, 191],
                ],
            },
            "text": previous_character,
        }

        charset_path.write_bytes(pickle.dumps([previous_character]))
        data_path.write_text(
            json.dumps(
                {dataset_type: {data_id: data}},
            )
        )

        split_content[dataset_type][data_id] = data
        keys = list(split_content["train"].keys())
        keys.sort()
        split_content["train"] = {i: split_content["train"][i] for i in keys}

        # Add 1 to subword_vocab_size because we have one more subword, which is {previous_character}
        subword_vocab_size += 1
    extractor = ArkindexExtractor(
        dataset_ids=["dataset_id"],
        element_type=["text_line"],
@@ -264,6 +308,7 @@ def test_extract(
        keep_spaces=keep_spaces,
        subword_vocab_size=subword_vocab_size,
    )
    extractor.run()
    expected_paths = [
@@ -337,6 +382,17 @@ def test_extract(
ⓢ Amical ▁ ⓕ Eloi ▁ ⓑ 11 ▁ . ▁ 10 ▁ . ▁ 04
ⓢ Biros ▁ ⓕ Mael ▁ ⓑ 30 ▁ . ▁ 10 ▁ . ▁ 10"""
    if existing:
        expected_char_language_corpus = (
            f"{previous_character}\n" + expected_char_language_corpus
        )
        expected_word_language_corpus = (
            f"{previous_character}\n" + expected_word_language_corpus
        )
        expected_subword_language_corpus = (
            f"{previous_character}\n" + expected_subword_language_corpus
        )

    # Transcriptions with worker version are in lowercase
    if transcription_entities_worker_version:
        expected_char_language_corpus = expected_char_language_corpus.lower()
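As a side note, the file-seeding steps in the `existing` branch of test_extract above could be captured in a small helper. The sketch below is hypothetical (the name fake_previous_extraction is not in the codebase); it only mirrors the pickle and JSON writes shown in the diff.

import json
import pickle
from pathlib import Path

def fake_previous_extraction(output: Path, character: str, split: dict) -> None:
    # Write the two artifacts the extractor checks for when resuming:
    # a pickled charset and a JSON split mapping dataset types to entries.
    (output / "charset.pkl").write_bytes(pickle.dumps([character]))
    (output / "split.json").write_text(json.dumps(split))

In the test above this would be called as fake_previous_extraction(output, previous_character, {dataset_type: {data_id: data}}).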