Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
Backend
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Deploy
Releases
Container Registry
Analyze
Contributor analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Arkindex
Backend
Commits
88583ad2
Commit
88583ad2
authored
6 years ago
by
Erwan Rouchet
Browse files
Options
Downloads
Patches
Plain Diff
Image index importer
parent
06aee500
No related branches found
Branches containing commit
No related tags found
Tags containing commit
1 merge request
!22
Add score to transcriptions
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
arkindex/images/importer.py
+38
-3
38 additions, 3 deletions
arkindex/images/importer.py
with
38 additions
and
3 deletions
arkindex/images/importer.py
+
38
−
3
View file @
88583ad2
from
arkindex.images.models
import
Zone
,
Image
from
arkindex.documents.models
import
Transcription
,
ElementLink
from
arkindex.documents.models
import
Transcription
,
ElementLink
,
ElementType
,
Page
from
arkindex.project.tools
import
BoundingBox
import
os
import
re
import
gzip
import
logging
import
fnmatch
# Matches one line of an index file: an optional "line_<n> " prefix, the
# word, one integer field that is not captured, the score, then four integer
# fields (presumably the x1 y1 x2 y2 coordinates named in the
# import_indexes docstring -- confirm against the parser).
# Written as a raw bytes literal so that \d, \- and \. reach the regex
# engine verbatim instead of being treated as (invalid) string escapes.
REGEX_INDEX = re.compile(
    br'^(?:line_(\d+) )?(.+) \d+ ([\de\-\.]+) (\d+) (\d+) (\d+) (\d+)'
)
...
...
@@ -12,13 +13,14 @@ REGEX_INDEX = re.compile(
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
def
import_indexes
(
image
,
index_path
,
extension
=
'
jpg
'
):
def
import_indexes
(
image
,
page
,
index_path
,
extension
=
'
jpg
'
):
"""
Import indexes from file
One gzipped index file per image
Format: #line word n/a score-[0,1] x1 y1 x2 y2
"""
assert
isinstance
(
image
,
Image
)
assert
isinstance
(
page
,
Page
)
assert
os
.
path
.
exists
(
index_path
),
\
'
Missing index {}
'
.
format
(
index_path
)
assert
index_path
.
endswith
(
'
.idx.gz
'
)
...
...
@@ -54,7 +56,7 @@ def import_indexes(image, index_path, extension='jpg'):
logger
.
info
(
'
Created {} zones
'
.
format
(
len
(
new_zones
)))
# Create transcriptions
new_transcriptions
=
bulk_transcriptions
(
image
,
lines
)
new_transcriptions
=
bulk_transcriptions
(
image
,
page
,
lines
)
logger
.
info
(
'
Created {} transcriptions
'
.
format
(
len
(
new_transcriptions
)))
# Index all transcriptions into ES
...
...
@@ -135,3 +137,36 @@ def bulk_transcriptions(image, page, items):
)
return
all_ts
class IndexImporter(object):
    """
    Import index files (.idx.gz) as transcriptions.

    The importer is given either a single index file or a directory; a
    directory is searched recursively for ``*.idx.gz`` files. For every
    index file, the matching image and its page are looked up and handed
    to ``import_indexes``.
    """

    def __init__(self, path):
        """
        Remember where to import from.

        :param path: Path to an index file or to a directory of index files.
        :raises AssertionError: When ``path`` does not exist.
        """
        assert os.path.exists(path), 'Missing path {}'.format(path)
        self.path = path

    def get_index_paths(self):
        """
        Yield the path of every index file to import.

        For a directory, yields each ``*.idx.gz`` file found while walking
        it, in sorted order so that runs are reproducible. For anything
        else, yields the resolved (``realpath``) form of ``self.path``.
        """
        # Support single file & directories
        if os.path.isdir(self.path):
            for root, dirnames, filenames in os.walk(self.path):
                # Sorting in place makes os.walk descend in a stable order
                dirnames.sort()
                for filename in sorted(fnmatch.filter(filenames, "*.idx.gz")):
                    yield os.path.join(root, filename)
        else:
            yield os.path.realpath(self.path)

    @staticmethod
    def _image_id(path):
        """
        Return the base name of ``path`` without its last two
        dot-separated parts (``page.001.idx.gz`` gives ``page.001``).
        """
        return '.'.join(os.path.basename(path).split('.')[:-2])

    def get_image(self, path):
        """
        Find the image whose path contains the index file's identifier.

        :param path: Path to an index file.
        :returns: The result of ``Image.objects.get(path__contains=...)``.
        :raises ValueError: When no identifier can be derived from ``path``.
        """
        # Remove path and .idx.gz extension
        image_id = IndexImporter._image_id(path)
        if not image_id:
            # An empty identifier would make path__contains match every
            # image, so refuse it instead of picking an unrelated one.
            raise ValueError(
                'Cannot find an image ID in index path {}'.format(path)
            )
        return Image.objects.get(path__contains=image_id)

    def get_page(self, image):
        """
        Return the page attached to the first Page-typed element of ``image``.

        :param image: An ``Image`` instance.
        :raises AssertionError: When ``image`` is not an ``Image``.
        :raises ValueError: When the image has no element of type Page.
        """
        assert isinstance(image, Image)
        element = image.elements.filter(type=ElementType.Page).first()
        # NOTE(review): assumes .first() returns None on an empty result,
        # as Django querysets do -- confirm that image.elements is one.
        if element is None:
            raise ValueError(
                'No page element linked to image {}'.format(image)
            )
        return element.page

    def run(self):
        """
        Import every index file found under ``self.path``.

        No exception is caught here: the first failure stops the run.
        """
        for index_path in self.get_index_paths():
            logger.info("Parsing index file {}".format(index_path))
            image = self.get_image(index_path)
            page = self.get_page(image)
            import_indexes(image, page, index_path)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment