Commit 022f6f64, authored 1 year ago by Solene Tarride

Fix linting

Parent: d413e1ad
Merge request: !287 Support subword and word language models

Showing 2 changed files with 106 additions and 42 deletions:
dan/datasets/extract/arkindex.py  (44 additions, 12 deletions)
dan/datasets/extract/utils.py     (62 additions, 30 deletions)
dan/datasets/extract/arkindex.py  (+44 −12)
@@ -14,7 +14,6 @@ import cv2
 import numpy as np
 from PIL import Image
 from tqdm import tqdm
-from nltk.tokenize import wordpunct_tokenize
 from arkindex_export import open_database
 from dan.datasets.extract.db import (
@@ -31,12 +30,12 @@ from dan.datasets.extract.exceptions import (
     UnknownTokenInText,
 )
 from dan.datasets.extract.utils import (
+    Tokenizer,
     download_image,
     get_bbox,
     insert_token,
     normalize_linebreaks,
     normalize_spaces,
-    Tokenizer,
 )
 from dan.utils import EntityType, LMTokenMapping, parse_tokens
 from line_image_extractor.extractor import extract
@@ -371,20 +370,53 @@ class ArkindexExtractor:
         # Build LM corpus
         train_corpus = [text for text in self.data["train"].values()]
-        tokenizer = Tokenizer(train_corpus, outdir=self.output / "language_model", mapping=self.mapping, tokens=self.tokens)
-        tokenizer.train_subword_tokenizer()
-        self.language_corpus["characters"] = [tokenizer.char_tokenize(doc) for doc in train_corpus]
-        self.language_corpus["words"] = [tokenizer.word_tokenize(doc) for doc in train_corpus]
-        self.language_corpus["subwords"] = [tokenizer.subword_tokenize(doc) for doc in train_corpus]
+        tokenizer = Tokenizer(
+            train_corpus,
+            outdir=self.output / "language_model",
+            mapping=self.mapping,
+            tokens=self.tokens,
+        )
+        self.language_corpus["characters"] = [
+            tokenizer.char_tokenize(doc) for doc in train_corpus
+        ]
+        self.language_corpus["words"] = [
+            tokenizer.word_tokenize(doc) for doc in train_corpus
+        ]
+        self.language_corpus["subwords"] = [
+            tokenizer.subword_tokenize(doc) for doc in train_corpus
+        ]

         # Build vocabulary
-        word_vocabulary = set([word for doc in self.language_corpus["words"] for word in doc.split(" ")])
-        subword_vocabulary = set([subword for doc in self.language_corpus["subwords"] for subword in doc.split(" ")])
+        word_vocabulary = set(
+            [
+                word
+                for doc in self.language_corpus["words"]
+                for word in doc.split()
+                if word != ""
+            ]
+        )
+        subword_vocabulary = set(
+            [
+                subword
+                for doc in self.language_corpus["subwords"]
+                for subword in doc.split()
+                if subword != ""
+            ]
+        )

         # Build LM lexicon
-        self.language_lexicon["chars"] = [f"{token} {tokenizer.char_tokenize(token)}" for token in self.language_tokens]
-        self.language_lexicon["words"] = [f"{word} {tokenizer.char_tokenize(word)}" for word in word_vocabulary]
-        self.language_lexicon["subwords"] = [f"{subword} {tokenizer.char_tokenize(subword)}" for subword in subword_vocabulary]
+        self.language_lexicon["characters"] = [
+            f"{token} {token}" for token in self.language_tokens
+        ]
+        self.language_lexicon["words"] = [
+            f"{word} {tokenizer.char_tokenize(word)}"
+            for word in sorted(word_vocabulary)
+            if word != ""
+        ]
+        self.language_lexicon["subwords"] = [
+            f"{subword} {tokenizer.char_tokenize(subword)}"
+            for subword in sorted(subword_vocabulary)
+        ]

     def export(self):
         (self.output / "labels.json").write_text(
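Read on its own, the new vocabulary and lexicon logic in this hunk boils down to the standalone sketch below. The toy corpus and the names language_corpus and language_tokens are illustrative stand-ins for the extractor's attributes, and ' '.join(word) stands in for tokenizer.char_tokenize; it is not the project's code, only a sketch of the same idea.

    # Hypothetical toy data; the real values come from the extracted training set.
    language_corpus = {"words": ["the ⎵ cat ⎵ sat", "a ⎵ cat"]}
    language_tokens = ["a", "c", "e", "h", "s", "t", "⎵"]

    # split() (instead of split(" ")) never yields empty strings for runs of
    # spaces; the explicit filter is kept anyway, as in the diff.
    word_vocabulary = set(
        word for doc in language_corpus["words"] for word in doc.split() if word != ""
    )

    # One lexicon line per entry: the entry followed by its space-separated characters;
    # sorting makes the output deterministic.
    lexicon_words = [f"{word} {' '.join(word)}" for word in sorted(word_vocabulary)]
    lexicon_characters = [f"{token} {token}" for token in language_tokens]

    print(lexicon_words)
    # ['a a', 'cat c a t', 'sat s a t', 'the t h e', '⎵ ⎵']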
dan/datasets/extract/utils.py  (+62 −30)
@@ -2,9 +2,12 @@
 import logging
 import re
 from io import BytesIO
+from pathlib import Path
 from typing import List

 import requests
+import sentencepiece as spm
+from nltk import wordpunct_tokenize
 from PIL import Image, ImageOps
 from tenacity import (
     retry,
@@ -12,10 +15,8 @@ from tenacity import (
     stop_after_attempt,
     wait_exponential,
 )
-from pathlib import Path
-from dan.utils import EntityType
-import sentencepiece as spm
-from nltk import wordpunct_tokenize
+from dan.utils import EntityType, LMTokenMapping

 logger = logging.getLogger(__name__)
@@ -119,68 +120,97 @@ def get_bbox(polygon: List[List[int]]) -> str:
     return ",".join(list(map(str, [int(x), int(y), int(width), int(height)])))


-class Tokenizer():
+class Tokenizer:
     """
     A multi-level tokenizer (char, subword, word)
     Subword tokenizer is trained using sentencepiece.
     """

-    def __init__(self, training_corpus, outdir, mapping, tokens=[]) -> None:
+    def __init__(
+        self,
+        training_corpus: List[str],
+        outdir: Path,
+        mapping: LMTokenMapping,
+        tokens: EntityType = None,
+        subword_vocab_size: int = 1000,
+    ) -> None:
         self.corpus = training_corpus
         self.outdir = outdir
         self.prefix = f"{self.outdir}/subword_tokenizer"
-        self.sentencepiece_model = None
-        self.mapping = mapping
         self.tokens = tokens
+        self.mapping = mapping
+        # Train the subword tokenizer
+        self.user_subword_vocab_size = subword_vocab_size
+        self.sentencepiece_model = self.train_subword_tokenizer()

     @property
-    def ner_tokens(self):
-        return [entity.start for entity in self.tokens.values()] + [entity.end for entity in self.tokens.values() if entity.end != ""]
+    def ner_tokens(self) -> List[str]:
+        if self.tokens is None:
+            return []
+        return [entity.start for entity in self.tokens.values()] + [
+            entity.end for entity in self.tokens.values() if entity.end != ""
+        ]

     @property
-    def mapping_tokens(self):
+    def mapping_tokens(self) -> List[str]:
         return [token.encoded for token in self.mapping]

     @property
-    def special_tokens(self):
+    def special_tokens(self) -> List[str]:
         return list(set(self.ner_tokens + self.mapping_tokens))

+    @property
+    def subword_vocab_size(self):
+        n_words = len(set([word for doc in self.corpus for word in doc.split()]))
+        return min(self.user_subword_vocab_size, 3 * n_words)
+
     def train_subword_tokenizer(self):
         """
         Train a sentencepiece model on the training corpus.
         """
         # Write the corpus in a text file
-        corpus_file = Path(self.outdir / f"tmp_training_corpus.txt")
+        corpus_file = Path(self.outdir / "tmp.txt")
         corpus_file.write_text("\n".join(self.corpus))

         # Train the tokenizer and load it
         logger.info("Training sentencepiece model for subword tokenization")
-        spm.SentencePieceTrainer.train(input=str(corpus_file), vocab_size=1000, model_prefix=self.prefix, user_defined_symbols=self.special_tokens)
+        spm.SentencePieceTrainer.train(
+            input=str(corpus_file),
+            vocab_size=self.subword_vocab_size,
+            model_prefix=self.prefix,
+            user_defined_symbols=self.special_tokens,
+        )

         # Delete the corpus file
         corpus_file.unlink()

-        # Load the corpus
-        self.sentencepiece_model = spm.SentencePieceProcessor(model_file=f"{self.prefix}.model")
+        # Load the model and return it
+        return spm.SentencePieceProcessor(model_file=f"{self.prefix}.model")

-    def subword_tokenize(self, text: str, enable_sampling=True, alpha=0.1, nbest_size=-1) -> List[str]:
+    def subword_tokenize(self, text: str) -> List[str]:
         """
-        Tokenize into subwords. As sampling is enabled, a text can be tokenized in different ways.
+        Tokenize into subwords. Sampling is disabled to ensure reproducibility.
         """
-        tokens = self.sentencepiece_model.encode(text, out_type=str, enable_sampling=enable_sampling, alpha=alpha, nbest_size=nbest_size)
-        # Replace special sentencepiece space token
-        tokens = [t.replace("▁", "⎵") for t in tokens]
+        tokens = self.sentencepiece_model.encode(text, out_type=str)
         # Return encoded tokenized text
         return " ".join(["".join(self.encode(subword)) for subword in tokens])

     def word_tokenize(self, text: str) -> List[str]:
         """
         Tokenize text into words
-        Spaces (⎵) and NER tokens are considered as distinct words.
+        Spaces (⎵) and NER tokens are considered as words.
         """
         words = ["".join(self.encode(word)) for word in wordpunct_tokenize(text)]
-        words = " ".join([word + " ⎵" if (i != len(words) - 1 and word not in self.ner_tokens) else word for i, word in enumerate(words)])
+        words = " ".join(
+            [
+                word + f" {self.mapping.space.encoded}"
+                if (i != len(words) - 1 and word not in self.ner_tokens)
+                else word
+                for i, word in enumerate(words)
+            ]
+        )
         return words

     def char_tokenize(self, text: str) -> List[str]:
         """
         Tokenize text into characters
...
@@ -188,5 +218,7 @@ class Tokenizer():
         return " ".join(self.encode(list(text)))

     def encode(self, text: List[str]) -> List[str]:
         """
         Encode special tokens
         """
         return map(self.mapping.encode_token, text)
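For reference, the word-level tokenization used above can be reproduced in isolation. This is only a sketch: space and ner_tokens stand in for self.mapping.space.encoded and self.ner_tokens, and the self.encode step is omitted.

    from nltk import wordpunct_tokenize

    space = "⎵"      # stand-in for self.mapping.space.encoded
    ner_tokens = []  # stand-in for self.ner_tokens

    text = "Hello, world"
    words = wordpunct_tokenize(text)  # ['Hello', ',', 'world']

    # Every word except the last (and except NER tokens) is followed by the space
    # symbol, so spaces survive as explicit tokens in the word-level corpus.
    tokenized = " ".join(
        word if (i == len(words) - 1 or word in ner_tokens) else f"{word} {space}"
        for i, word in enumerate(words)
    )
    print(tokenized)  # Hello ⎵ , ⎵ world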
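The other notable addition is the subword_vocab_size property, which caps the requested sentencepiece vocabulary at three times the number of distinct words in the corpus, presumably so that small training sets are not asked to produce more pieces than they can support. A minimal worked example with a toy corpus (not part of the commit):

    corpus = ["a cat sat", "a dog sat", "the cat ran"]
    user_subword_vocab_size = 1000  # the constructor default

    # Same computation as the subword_vocab_size property.
    n_words = len(set(word for doc in corpus for word in doc.split()))  # 6 distinct words
    print(min(user_subword_vocab_size, 3 * n_words))  # 18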