Commit ea9f766a authored by Yoann Schneider

Merge branch 'merge-datasets-in-extraction' into 'main'

Merge datasets

Closes #239

See merge request !428
parents 20161125 fe52dafb
@@ -82,8 +82,24 @@ class ArkindexExtractor:
        self.keep_spaces = keep_spaces
        self.subword_vocab_size = subword_vocab_size

        # Load files from a previous extraction, if any
        data_path = self.output / "split.json"
        charset_path = self.output / "charset.pkl"
        is_data_file = data_path.exists()
        is_charset_file = charset_path.exists()

        self.data: Dict = defaultdict(dict)
        self.charset = set()
        if is_data_file and is_charset_file:
            self.data.update(json.loads(data_path.read_bytes()))
            self.charset.update(sorted(pickle.loads(charset_path.read_bytes())))
        elif is_data_file ^ is_charset_file:
            raise FileNotFoundError(
                f"The file '{data_path.name}' or '{charset_path.name}' is missing at location {self.output.as_posix()}"
            )

        self.language_corpus = defaultdict(list)
        self.language_tokens = []
        self.language_lexicon = defaultdict(list)
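For reference, the resume logic added above can be read in isolation as the following sketch. The function name load_previous_extraction and its signature are hypothetical illustrations, not part of this commit; only the load-or-raise behaviour mirrors the code in the hunk.

import json
import pickle
from collections import defaultdict
from pathlib import Path
from typing import Dict, Set, Tuple

def load_previous_extraction(output: Path) -> Tuple[Dict, Set[str]]:
    # Hypothetical helper: both artifacts of a previous run must exist
    # together, or not at all.
    data_path = output / "split.json"
    charset_path = output / "charset.pkl"
    data: Dict = defaultdict(dict)
    charset: Set[str] = set()
    if data_path.exists() and charset_path.exists():
        # Reload the previous split and charset so the new run extends them.
        data.update(json.loads(data_path.read_bytes()))
        charset.update(pickle.loads(charset_path.read_bytes()))
    elif data_path.exists() ^ charset_path.exists():
        # Exactly one file is present: the previous extraction is incomplete.
        raise FileNotFoundError(
            f"The file '{data_path.name}' or '{charset_path.name}' is missing "
            f"at location {output.as_posix()}"
        )
    return data, charset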
@@ -229,6 +229,7 @@ def test_process_element_unknown_token_in_text_error(mock_database, tmp_path):
        ),
    ),
)
@pytest.mark.parametrize("existing", (True, False))
def test_extract(
    load_entities,
    keep_spaces,
@@ -238,6 +239,7 @@ def test_extract(
    expected_subword_language_corpus,
    subword_vocab_size,
    tmp_path,
    existing,
):
    output = tmp_path / "extraction"
    output.mkdir(parents=True, exist_ok=True)
@@ -250,6 +252,48 @@ def test_extract(
        if token
    ]
    # Add a character to fake a previous extraction file in the folder
    previous_character = "%"

    if existing:
        charset_path = output / "charset.pkl"
        data_path = output / "split.json"

        dataset_type = "train"
        data_id = "train-page_1-line_5"
        data = {
            "dataset-id": "dataset-id",
            "image": {
                "iiif_url": f"{FIXTURES}/extraction/images/text_line/test-page_1-line_1.jpg",
                "polygon": [
                    [37, 191],
                    [37, 339],
                    [767, 339],
                    [767, 191],
                    [37, 191],
                ],
            },
            "text": previous_character,
        }

        charset_path.write_bytes(pickle.dumps([previous_character]))
        data_path.write_text(
            json.dumps(
                {dataset_type: {data_id: data}},
            )
        )

        split_content[dataset_type][data_id] = data
        keys = list(split_content["train"].keys())
        keys.sort()
        split_content["train"] = {i: split_content["train"][i] for i in keys}

        # Add 1 to subword_vocab_size because we have one more subword, which is {previous_character}
        subword_vocab_size += 1
    extractor = ArkindexExtractor(
        dataset_ids=["dataset_id"],
        element_type=["text_line"],
@@ -264,6 +308,7 @@ def test_extract(
        keep_spaces=keep_spaces,
        subword_vocab_size=subword_vocab_size,
    )
    extractor.run()
    expected_paths = [
@@ -337,6 +382,17 @@ def test_extract(
ⓢ Amical ▁ ⓕ Eloi ▁ ⓑ 11 ▁ . ▁ 10 ▁ . ▁ 04
ⓢ Biros ▁ ⓕ Mael ▁ ⓑ 30 ▁ . ▁ 10 ▁ . ▁ 10"""
    if existing:
        expected_char_language_corpus = (
            f"{previous_character}\n" + expected_char_language_corpus
        )
        expected_word_language_corpus = (
            f"{previous_character}\n" + expected_word_language_corpus
        )
        expected_subword_language_corpus = (
            f"{previous_character}\n" + expected_subword_language_corpus
        )

    # Transcriptions with worker version are in lowercase
    if transcription_entities_worker_version:
        expected_char_language_corpus = expected_char_language_corpus.lower()
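As a side note, the file-seeding steps in the `existing` branch of test_extract above could be captured in a small helper. The sketch below is hypothetical (the name fake_previous_extraction is not in the codebase); it only mirrors the pickle and JSON writes shown in the diff.

import json
import pickle
from pathlib import Path

def fake_previous_extraction(output: Path, character: str, split: dict) -> None:
    # Write the two artifacts the extractor checks for when resuming:
    # a pickled charset and a JSON split mapping dataset types to entries.
    (output / "charset.pkl").write_bytes(pickle.dumps([character]))
    (output / "split.json").write_text(json.dumps(split))

In the test above this would be called as fake_previous_extraction(output, previous_character, {dataset_type: {data_id: data}}).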