Arkindex / Workers / Generic Training Dataset / Commits
Commit b1bbda3e, authored 1 year ago by Eva Bardou

Yoann's suggestions

Parent: a520d362
No related branches or tags found.
1 merge request: !8 New DatasetExtractor using a DatasetWorker
Changes: 1 changed file, worker_generic_training_dataset/worker.py (+26 additions, -25 deletions)
@@ -27,6 +27,7 @@ from arkindex_worker.cache import db as cache_database
 from arkindex_worker.cache import init_cache_db
 from arkindex_worker.image import download_image
 from arkindex_worker.models import Dataset
+from arkindex_worker.models import Element as WorkerElement
 from arkindex_worker.utils import create_tar_zst_archive
 from arkindex_worker.worker.base import BaseWorker
 from arkindex_worker.worker.dataset import DatasetMixin, DatasetState
@@ -61,27 +62,27 @@ class DatasetWorker(BaseWorker, DatasetMixin):
         self.generator = generator
 
-    def list_dataset_elements_per_set(
+    def list_dataset_elements_per_split(
         self, dataset: Dataset
-    ) -> Iterator[Tuple[str, Element]]:
+    ) -> Iterator[Tuple[str, List[Element]]]:
         """
         Calls `list_dataset_elements` but returns results grouped by Set
         """
 
-        def format_element(element):
+        def format_element(element: Tuple[str, WorkerElement]) -> Element:
             return Element.get(Element.id == element[1].id)
 
-        def format_set(set):
-            return (set[0], list(map(format_element, list(set[1]))))
-
-        return list(
-            map(
-                format_set,
-                groupby(
-                    sorted(self.list_dataset_elements(dataset), key=itemgetter(0)),
-                    key=itemgetter(0),
-                ),
-            ),
-        )
+        def format_split(
+            split: Tuple[str, Iterator[Tuple[str, WorkerElement]]]
+        ) -> Tuple[str, List[Element]]:
+            return (split[0], list(map(format_element, list(split[1]))))
+
+        return map(
+            format_split,
+            groupby(
+                sorted(
+                    self.list_dataset_elements(dataset), key=itemgetter(0)
+                ),
+                key=itemgetter(0),
+            ),
+        )
 
     def process_dataset(self, dataset: Dataset):
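A side note on the grouping kept by this hunk: itertools.groupby only merges consecutive items that share a key, which is why the pairs returned by list_dataset_elements are sorted on itemgetter(0) (the split name) before grouping, and the method now hands back a lazy map object instead of a fully built list. A minimal standalone sketch of that pattern, using made-up (split_name, element_id) pairs rather than real Arkindex objects:

from itertools import groupby
from operator import itemgetter

# Hypothetical stand-ins for the (split_name, element) pairs yielded by list_dataset_elements().
pairs = [("train", "e1"), ("validation", "e3"), ("train", "e2")]

# groupby() only groups *consecutive* equal keys, hence the sort on the split name first.
grouped = map(
    lambda split: (split[0], [element for _, element in split[1]]),
    groupby(sorted(pairs, key=itemgetter(0)), key=itemgetter(0)),
)

for split_name, elements in grouped:
    print(split_name, elements)
# train ['e1', 'e2']
# validation ['e3']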
@@ -91,20 +92,20 @@ class DatasetWorker(BaseWorker, DatasetMixin):
         :param dataset: The dataset to process.
         """
 
-    def list_datasets(self) -> List[Dataset] | List[str]:
+    def list_datasets(self) -> Iterator[Dataset] | Iterator[str]:
         """
         Calls `list_process_datasets` if not is_read_only,
         else simply give the list of IDs provided via CLI
         """
         if self.is_read_only:
-            return list(map(str, self.args.dataset))
+            return map(str, self.args.dataset)
         return self.list_process_datasets()
 
     def run(self):
         self.configure()
 
-        datasets: List[Dataset] | List[str] = self.list_datasets()
+        datasets: Iterator[Dataset] | Iterator[str] = list(self.list_datasets())
         if not datasets:
             logger.warning("No datasets to process, stopping.")
             sys.exit(1)
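Why run() now wraps list_datasets() in list(): in read-only mode the method returns a bare map object, and any iterator object is truthy even when it would yield nothing, so the `if not datasets` emptiness check only works on a materialised sequence. A small illustration, independent of the worker code:

# A map object is always truthy, even over an empty input...
datasets = map(str, [])
print(bool(datasets))            # True -- "if not datasets" would never trigger here

# ...whereas materialising it first gives a list whose truthiness reflects its contents.
print(bool(list(map(str, []))))  # False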
@@ -125,11 +126,11 @@ class DatasetWorker(BaseWorker, DatasetMixin):
             if self.generator:
                 assert (
                     dataset.state == DatasetState.Open.value
-                ), "When generating a new dataset, its state should be Open"
+                ), "When generating a new dataset, its state should be Open."
             else:
                 assert (
                     dataset.state == DatasetState.Complete.value
-                ), "When processing an existing dataset, its state should be Complete"
+                ), "When processing an existing dataset, its state should be Complete."
 
             if self.generator:
                 # Update the dataset state to Building
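The asserts touched here keep the failure message outside the parenthesised condition, which is the safe form in Python: parenthesising the condition together with the message would build a two-element tuple, and a non-empty tuple is always truthy, so such an assert could never fail. A standalone contrast of the two forms (the state value below is made up):

state = "complete"  # hypothetical value; the real code compares against DatasetState members

# Form used in the diff: only the condition is parenthesised, the message follows the comma.
assert (
    state == "complete"
), "When processing an existing dataset, its state should be Complete."

# Pitfall (not present in the diff): asserting a (condition, message) tuple always passes.
# assert (state == "open", "this would never fail, even though state is not 'open'")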
@@ -414,15 +415,15 @@ class DatasetExtractor(DatasetWorker):
         # Insert entities
         self.insert_entities(transcriptions)
 
-    def process_set(self, set_name: str, elements: List[Element]) -> None:
+    def process_split(self, split_name: str, elements: List[Element]) -> None:
         logger.info(
-            f"Filling the cache with information from elements in the set {set_name}"
+            f"Filling the cache with information from elements in the split {split_name}"
         )
 
         # First list all pages
         nb_elements: int = len(elements)
         for idx, element in enumerate(elements, start=1):
-            logger.info(f"Processing `{set_name}` element ({idx}/{nb_elements})")
+            logger.info(f"Processing `{split_name}` element ({idx}/{nb_elements})")
 
             # Insert page
             self.insert_element(element)
@@ -436,9 +437,9 @@ class DatasetExtractor(DatasetWorker):
                 self.insert_element(child, parent_id=element.id)
 
     def process_dataset(self, dataset: Dataset):
-        # Iterate over given sets
-        for set_name, elements in self.list_dataset_elements_per_set(dataset):
-            self.process_set(set_name, elements)
+        # Iterate over given splits
+        for split_name, elements in self.list_dataset_elements_per_split(dataset):
+            self.process_split(split_name, elements)
 
         # TAR + ZSTD Image folder and store as task artifact
         zstd_archive_path: Path = self.work_dir / f"{dataset.id}.zstd"
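The tail of the last hunk builds the artifact path with pathlib's / operator before the image folder is packed into a TAR+ZSTD archive (presumably with the create_tar_zst_archive helper imported at the top of the file; that call is outside this diff). A small sketch of just the path construction, with an invented work directory and dataset id:

from pathlib import Path

work_dir = Path("/tmp/worker")  # assumption: the real work_dir comes from BaseWorker
dataset_id = "0b26c790-1111-2222-3333-444444444444"  # hypothetical dataset UUID

zstd_archive_path: Path = work_dir / f"{dataset_id}.zstd"
print(zstd_archive_path)  # /tmp/worker/0b26c790-1111-2222-3333-444444444444.zstd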