Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
G
Generic Training Dataset
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Arkindex
Workers
Generic Training Dataset
Commits
316f067b
Commit
316f067b
authored
1 year ago
by
Eva Bardou
Browse files
Options
Downloads
Patches
Plain Diff
Save the cache in the archive too
parent
2ce6237a
No related branches found
No related tags found
1 merge request
!8
New DatasetExtractor using a DatasetWorker
Pipeline
#138669
passed
1 year ago
Stage: test
Stage: build
Stage: release
Changes
2
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
tests/test_worker.py
+11
-6
11 additions, 6 deletions
tests/test_worker.py
worker_generic_training_dataset/worker.py
+14
-8
14 additions, 8 deletions
worker_generic_training_dataset/worker.py
with
25 additions
and
14 deletions
tests/test_worker.py
+
11
−
6
View file @
316f067b
...
...
@@ -19,11 +19,13 @@ def test_process_split(tmp_path, downloaded_images):
worker
=
DatasetExtractor
()
# Parse some arguments
worker
.
args
=
Namespace
(
database
=
None
)
worker
.
data_folder
=
tmp_path
worker
.
configure_cache
()
worker
.
cached_images
=
dict
()
# Where to save the downloaded images
worker
.
image_folder
=
tmp_path
worker
.
images_folder
=
tmp_path
/
"
images
"
worker
.
images_folder
.
mkdir
(
parents
=
True
)
first_page_id
=
UUID
(
"
e26e6803-18da-4768-be30-a0a68132107c
"
)
second_page_id
=
UUID
(
"
c673bd94-96b1-4a2e-8662-a4d806940b5f
"
)
...
...
@@ -80,11 +82,6 @@ def test_process_split(tmp_path, downloaded_images):
==
f
"
https://europe-gamma.iiif.teklia.com/iiif/2/public%2Fiam%2F
{
page_name
}
.png
"
)
assert
sorted
(
tmp_path
.
rglob
(
"
*
"
))
==
[
tmp_path
/
f
"
{
first_image_id
}
.jpg
"
,
tmp_path
/
f
"
{
second_image_id
}
.jpg
"
,
]
# Should have created 17 transcriptions
assert
CachedTranscription
.
select
().
count
()
==
17
# Check transcription of first line on first page
...
...
@@ -127,3 +124,11 @@ def test_process_split(tmp_path, downloaded_images):
assert
tr_entity
.
length
==
23
assert
tr_entity
.
confidence
==
1.0
assert
tr_entity
.
worker_run_id
is
None
# Full structure of the archive
assert
sorted
(
tmp_path
.
rglob
(
"
*
"
))
==
[
tmp_path
/
"
db.sqlite
"
,
tmp_path
/
"
images
"
,
tmp_path
/
"
images
"
/
f
"
{
first_image_id
}
.jpg
"
,
tmp_path
/
"
images
"
/
f
"
{
second_image_id
}
.jpg
"
,
]
This diff is collapsed.
Click to expand it.
worker_generic_training_dataset/worker.py
+
14
−
8
View file @
316f067b
...
...
@@ -61,6 +61,9 @@ class DatasetExtractor(DatasetWorker):
# Download corpus
self
.
download_latest_export
()
def
configure_storage
(
self
)
->
None
:
self
.
data_folder
=
Path
(
tempfile
.
mkdtemp
(
suffix
=
"
-arkindex-data
"
))
# Initialize db that will be written
self
.
configure_cache
()
...
...
@@ -68,17 +71,17 @@ class DatasetExtractor(DatasetWorker):
self
.
cached_images
=
dict
()
# Where to save the downloaded images
self
.
image_folder
=
Path
(
tempfile
.
mkdtemp
(
suffix
=
"
-arkindex-data
"
))
logger
.
info
(
f
"
Images will be saved at `
{
self
.
image_folder
}
`.
"
)
self
.
images_folder
=
self
.
data_folder
/
"
images
"
self
.
images_folder
.
mkdir
(
parents
=
True
)
logger
.
info
(
f
"
Images will be saved at `
{
self
.
images_folder
}
`.
"
)
def
configure_cache
(
self
)
->
None
:
"""
Create an SQLite database compatible with base-worker cache and initialize it.
"""
self
.
use_cache
=
True
self
.
cache_path
:
Path
=
self
.
args
.
database
or
self
.
work_dir
/
"
db.sqlite
"
# Remove previous execution result if present
self
.
cache_path
.
unlink
(
missing_ok
=
True
)
self
.
cache_path
:
Path
=
self
.
data_folder
/
"
db.sqlite
"
logger
.
info
(
f
"
Cached database will be saved at `
{
self
.
data_folder
}
`.
"
)
init_cache_db
(
self
.
cache_path
)
...
...
@@ -242,7 +245,7 @@ class DatasetExtractor(DatasetWorker):
# Download image
logger
.
info
(
"
Downloading image
"
)
download_image
(
url
=
build_image_url
(
element
)).
save
(
self
.
image_folder
/
f
"
{
element
.
image
.
id
}
.jpg
"
self
.
image
s
_folder
/
f
"
{
element
.
image
.
id
}
.jpg
"
)
# Insert image
logger
.
info
(
"
Inserting image
"
)
...
...
@@ -304,15 +307,18 @@ class DatasetExtractor(DatasetWorker):
self
.
insert_element
(
child
,
parent_id
=
element
.
id
)
def
process_dataset
(
self
,
dataset
:
Dataset
):
# Configure temporary storage for the dataset data (cache + images)
self
.
configure_storage
()
# Iterate over given splits
for
split_name
,
elements
in
self
.
list_dataset_elements_per_split
(
dataset
):
casted_elements
=
list
(
map
(
_format_element
,
elements
))
self
.
process_split
(
split_name
,
casted_elements
)
# TAR + ZSTD
I
mage folder and store as task artifact
# TAR + ZSTD
the cache and the i
mage
s
folder
,
and store as task artifact
zstd_archive_path
:
Path
=
self
.
work_dir
/
f
"
{
dataset
.
id
}
.zstd
"
logger
.
info
(
f
"
Compressing the images to
{
zstd_archive_path
}
"
)
create_tar_zst_archive
(
source
=
self
.
image
_folder
,
destination
=
zstd_archive_path
)
create_tar_zst_archive
(
source
=
self
.
data
_folder
,
destination
=
zstd_archive_path
)
def
main
():
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment