Automatic Text Recognition / DAN · Merge request !287

Support subword and word language models

Merged · Solene Tarride requested to merge subword-and-word-lm into main · 1 year ago
47 commits · 8 changed files

Closes #199 (closed)
Ref: https://redmine.teklia.com/issues/4941
Merge request reports · Viewing commit c9dedfa1: "Fix rebase errors" (Solene Tarride, 1 year ago)
8 files changed: +37 −246
dan/datasets/extract/utils.py (+123 −2)
 # -*- coding: utf-8 -*-
+import itertools
 import logging
+import operator
 import re
+from dataclasses import dataclass, field
 from io import BytesIO
-from typing import List
+from pathlib import Path
+from tempfile import NamedTemporaryFile
+from typing import Iterator, List, Optional, Union

 import requests
+import sentencepiece as spm
+from nltk import wordpunct_tokenize
 from PIL import Image, ImageOps
 from tenacity import (
     retry,
@@ -13,7 +20,7 @@ from tenacity import (
     wait_exponential,
 )

-from dan.utils import EntityType
+from dan.utils import EntityType, LMTokenMapping

 logger = logging.getLogger(__name__)
@@ -117,3 +124,117 @@ def get_bbox(polygon: List[List[int]]) -> str:
     x, y = min(all_x), min(all_y)
     width, height = max(all_x) - x, max(all_y) - y
     return ",".join(list(map(str, [int(x), int(y), int(width), int(height)])))
+
+
+def get_vocabulary(tokenized_text: List[str]) -> set[str]:
+    """
+    Compute the set of vocabulary from the tokenized text.
+    :param tokenized_text: List of tokenized text.
+    """
+    return sorted(set([token for doc in tokenized_text for token in doc.split()]))
+
+
+@dataclass
+class Tokenizer:
+    """
+    A multi-level tokenizer (char, subword, word), where the subword tokenizer is trained using sentencepiece.
+    :param training_corpus: List of training text.
+    :param charset: List of valid characters; characters outside it are replaced by the unknown token.
+    :param unknown_token: Token used to replace out-of-charset characters.
+    :param outdir: Path to save the subword tokenizer.
+    :param mapping: Mapping between displayed and encoded versions of special characters.
+    :param tokens: Start and end tokens used to represent named entities.
+    :param subword_vocab_size: Size of the vocabulary to use to train the subword tokenizer.
+    """
+
+    training_corpus: List[str]
+    charset: List[str]
+    unknown_token: str
+    outdir: Path
+    mapping: LMTokenMapping
+    tokens: Optional[EntityType] = None
+    subword_vocab_size: int = 1000
+    sentencepiece_model: spm.SentencePieceProcessor = field(init=False)
+
+    @property
+    def prefix(self):
+        return self.outdir / "subword_tokenizer"
+
+    @property
+    def ner_tokens(self) -> Union[List[str], Iterator[str]]:
+        if self.tokens is None:
+            return []
+        return itertools.chain(
+            map(operator.attrgetter("start"), self.tokens.values()),
+            filter(
+                operator.truth,
+                map(operator.attrgetter("end"), self.tokens.values()),
+            ),
+        )
+
+    @property
+    def mapping_tokens(self) -> List[str]:
+        return [token.encoded for token in self.mapping]
+
+    @property
+    def special_tokens(self) -> List[str]:
+        return list(set(itertools.chain(self.mapping_tokens, self.ner_tokens)))
+
+    def __post_init__(self) -> None:
+        """
+        Train a sentencepiece model on the training corpus.
+        """
+        # Write the corpus in a text file
+        logger.info("Training a sentencepiece model for subword tokenization")
+        with NamedTemporaryFile(dir=self.outdir, suffix=".txt", mode="w") as tmp:
+            tmp.write("\n".join(self.training_corpus))
+            tmp.flush()
+            spm.SentencePieceTrainer.train(
+                input=tmp.name,
+                vocab_size=self.subword_vocab_size,
+                model_prefix=self.prefix,
+                user_defined_symbols=self.special_tokens,
+            )
+
+        # Load the model
+        self.sentencepiece_model = spm.SentencePieceProcessor(
+            model_file=str(self.prefix.with_suffix(".model"))
+        )
+
+    def subword_tokenize(self, text: str) -> str:
+        """
+        Tokenize into subwords. Sampling is disabled to ensure reproducibility.
+        """
+        tokens = self.sentencepiece_model.encode(text, out_type=str)
+        return " ".join(map("".join, map(self.encode, tokens)))
+
+    def word_tokenize(self, text: str) -> str:
+        """
+        Tokenize text into a string of space-separated words. Spaces (⎵) and NER tokens are considered as words.
+        :param text: Text to be tokenized.
+        """
+        words = list(map("".join, map(self.encode, wordpunct_tokenize(text))))
+        return " ".join(
+            [
+                word + f" {self.mapping.space.encoded}"
+                if (i != len(words) - 1 and word not in self.ner_tokens)
+                else word
+                for i, word in enumerate(words)
+            ]
+        )
+
+    def char_tokenize(self, text: str) -> str:
+        """
+        Tokenize text into a string of space-separated characters.
+        :param text: Text to be tokenized.
+        """
+        return " ".join(
+            [
+                char if char in self.charset else self.unknown_token
+                for char in self.encode(text)
+            ]
+        )
+
+    def encode(self, text: List[str]) -> List[str]:
+        """
+        Encode special tokens.
+        :param text: Text to be encoded.
+        """
+        return map(self.mapping.encode_token, text)
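
The diff above adds everything needed to build character-, subword- and word-level corpora for language modelling. As a quick illustration of how the new Tokenizer could be exercised on its own, here is a minimal sketch. It assumes the class is importable from dan/datasets/extract/utils.py as added in this MR; the corpus, charset, unknown token and vocabulary size are made-up toy values, and SimpleToken/SimpleMapping are simplified stand-ins for dan.utils.LMTokenMapping that only provide the members the Tokenizer actually touches (iteration, .space.encoded and .encode_token), not the real API.

# Hypothetical usage sketch: toy corpus and a simplified stand-in mapping.
from dataclasses import dataclass
from pathlib import Path
from tempfile import TemporaryDirectory

from dan.datasets.extract.utils import Tokenizer  # class added by this MR


@dataclass
class SimpleToken:
    encoded: str  # character as written in the language-model corpus
    display: str  # character as displayed in the transcription


class SimpleMapping:
    # Stand-in for LMTokenMapping: only iteration, `.space.encoded` and
    # `.encode_token` are needed by the Tokenizer.
    def __init__(self):
        self.space = SimpleToken("⎵", " ")
        self._by_display = {" ": self.space}

    def __iter__(self):
        return iter(self._by_display.values())

    def encode_token(self, char: str) -> str:
        # Replace a displayed character by its encoded version, if any.
        return self._by_display[char].encoded if char in self._by_display else char


corpus = [
    "the quick brown fox jumps over the lazy dog",
    "pack my box with five dozen liquor jugs",
] * 10
charset = sorted(set("".join(corpus)) - {" "}) + ["⎵"]

with TemporaryDirectory() as tmpdir:
    tokenizer = Tokenizer(
        training_corpus=corpus,
        charset=charset,
        unknown_token="⁇",
        outdir=Path(tmpdir),
        mapping=SimpleMapping(),
        subword_vocab_size=45,  # must stay small for such a tiny corpus
    )
    print(tokenizer.char_tokenize("the quick fox"))     # t h e ⎵ q u i c k ⎵ f o x
    print(tokenizer.word_tokenize("the quick fox"))     # the ⎵ quick ⎵ fox
    print(tokenizer.subword_tokenize("the quick fox"))  # e.g. ▁the ▁quick ▁fox

Instantiating the dataclass is the expensive step, since __post_init__ immediately trains a sentencepiece model on the corpus; the three *_tokenize methods then return space-separated token strings that can be fed to character, subword and word language models.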