Project: Automatic Text Recognition / DAN

Commit 23cd8204 ("Improve code")
Authored 1 year ago by Solene Tarride, committed by Solene Tarride 1 year ago
Parent: 8269a94d

Showing 3 changed files with 56 additions and 61 deletions:

  dan/datasets/extract/extract.py   +13 −19
  dan/datasets/extract/utils.py     +41 −39
  tests/test_extract.py              +2 −3
dan/datasets/extract/extract.py  (+13 −19)

@@ -372,35 +372,29 @@ class ArkindexExtractor:
             for text in self.data["train"].values()
         ]
         tokenizer = Tokenizer(
-            train_corpus,
+            training_corpus=train_corpus,
             outdir=self.output / "language_model",
             mapping=self.mapping,
             tokens=self.tokens,
             subword_vocab_size=self.subword_vocab_size,
         )
-        self.language_corpus["characters"] = [
-            tokenizer.char_tokenize(doc) for doc in train_corpus
-        ]
-        self.language_corpus["words"] = [
-            tokenizer.word_tokenize(doc) for doc in train_corpus
-        ]
-        self.language_corpus["subwords"] = [
-            tokenizer.subword_tokenize(doc) for doc in train_corpus
-        ]
+        for level, tokenize in (
+            ("characters", tokenizer.char_tokenize),
+            ("words", tokenizer.word_tokenize),
+            ("subwords", tokenizer.subword_tokenize),
+        ):
+            self.language_corpus[level] = list(map(tokenize, train_corpus))

         # Build LM lexicon
         self.language_lexicon["characters"] = [
             f"{token} {token}" for token in self.language_tokens
         ]
-        self.language_lexicon["words"] = [
-            f"{word} {tokenizer.char_tokenize(word)}"
-            for word in get_vocabulary(self.language_corpus["words"])
-            if word != ""
-        ]
-        self.language_lexicon["subwords"] = [
-            f"{subword} {tokenizer.char_tokenize(subword)}"
-            for subword in get_vocabulary(self.language_corpus["subwords"])
-        ]
+        for level in ["words", "subwords"]:
+            self.language_lexicon[level] = [
+                f"{token} {tokenizer.char_tokenize(token)}"
+                for token in get_vocabulary(self.language_corpus[level])
+            ]

     def export(self):
         (self.output / "labels.json").write_text(
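Note on the hunk above: the three corpus levels are now built with a single loop over (level, tokenize) pairs instead of three near-identical list comprehensions, and the word/subword lexicons share one loop as well. A minimal standalone sketch of the pattern, with throwaway lambdas standing in for the project's Tokenizer.char_tokenize / word_tokenize / subword_tokenize:

# Sketch only: the lambdas are stand-ins, not the real Tokenizer methods.
train_corpus = ["the cat sat", "on the mat"]

char_tokenize = lambda text: " ".join(text.replace(" ", "▁"))
word_tokenize = lambda text: " ".join(text.split())
subword_tokenize = lambda text: " ".join("▁" + w for w in text.split())

language_corpus = {}
for level, tokenize in (
    ("characters", char_tokenize),
    ("words", word_tokenize),
    ("subwords", subword_tokenize),
):
    language_corpus[level] = list(map(tokenize, train_corpus))

print(language_corpus["characters"][0])  # t h e ▁ c a t ▁ s a t
print(language_corpus["subwords"][1])    # ▁on ▁the ▁mat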
dan/datasets/extract/utils.py  (+41 −39)

 # -*- coding: utf-8 -*-
+import itertools
 import logging
+import operator
 import re
+from dataclasses import dataclass, field
 from io import BytesIO
 from pathlib import Path
-from typing import List
+from tempfile import NamedTemporaryFile
+from typing import Iterator, List, Optional, Union

 import requests
 import sentencepiece as spm
@@ -132,6 +136,7 @@ def get_vocabulary(tokenized_text: List[str]) -> set[str]:
     )


+@dataclass
 class Tokenizer:
     """
     A multi-level tokenizer (char, subword, word), where the subword tokenizer is trained using sentencepiece.
@@ -142,30 +147,27 @@ class Tokenizer:
     :param subword_vocab_size: Size of the vocabulary size to use to train the subword tokenizer.
     """

-    def __init__(
-        self,
-        training_corpus: List[str],
-        outdir: Path,
-        mapping: LMTokenMapping,
-        tokens: EntityType = None,
-        subword_vocab_size: int = 1000,
-    ) -> None:
-        self.corpus = training_corpus
-        self.outdir = outdir
-        self.prefix = f"{self.outdir}/subword_tokenizer"
-        self.tokens = tokens
-        self.mapping = mapping
-        # Train the subword tokenizer
-        self.subword_vocab_size = subword_vocab_size
-        self.sentencepiece_model = self.train_subword_tokenizer()
+    training_corpus: List[str]
+    outdir: Path
+    mapping: LMTokenMapping
+    tokens: Optional[EntityType] = None
+    subword_vocab_size: int = 1000
+    sentencepiece_model: spm.SentencePieceProcessor = field(init=False)

     @property
-    def ner_tokens(self) -> List[str]:
+    def prefix(self):
+        return self.outdir / "subword_tokenizer"
+
+    @property
+    def ner_tokens(self) -> Union[List[str], Iterator[str]]:
         if self.tokens is None:
             return []
-        return [entity.start for entity in self.tokens.values()] + [
-            entity.end for entity in self.tokens.values() if entity.end != ""
-        ]
+        return itertools.chain(
+            map(operator.attrgetter("start"), self.tokens.values()),
+            filter(
+                operator.truth, map(operator.attrgetter("end"), self.tokens.values())
+            ),
+        )

     @property
     def mapping_tokens(self) -> List[str]:
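The ner_tokens rewrite above trades two list comprehensions for a lazy itertools.chain over attrgetter/truth-filtered maps. A self-contained sketch of the equivalence, using a namedtuple as a stand-in for the project's EntityType:

import itertools
import operator
from collections import namedtuple

# Stand-in for the project's EntityType: an entity with start/end markers.
EntityType = namedtuple("EntityType", ["start", "end"])

tokens = {
    "person": EntityType(start="Ⓟ", end="Ⓞ"),
    "date": EntityType(start="Ⓓ", end=""),  # empty end marker gets filtered out
}

# Old style: eager list concatenation.
old = [e.start for e in tokens.values()] + [
    e.end for e in tokens.values() if e.end != ""
]

# New style: lazy chain of start markers, then non-empty end markers.
new = itertools.chain(
    map(operator.attrgetter("start"), tokens.values()),
    filter(operator.truth, map(operator.attrgetter("end"), tokens.values())),
)

assert list(new) == old  # ["Ⓟ", "Ⓓ", "Ⓞ"]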
@@ -173,42 +175,42 @@ class Tokenizer:
     @property
     def special_tokens(self) -> List[str]:
-        return list(set(self.ner_tokens + self.mapping_tokens))
+        return list(set(itertools.chain(self.mapping_tokens, self.ner_tokens)))

-    def train_subword_tokenizer(self):
+    def __post_init__(self) -> None:
         """
         Train a sentencepiece model on the training corpus.
         """
-        # Write the corpus in a text file
-        corpus_file = Path(self.outdir / "tmp.txt")
-        corpus_file.write_text("\n".join(self.corpus))
-
-        # Train the tokenizer
-        logger.info("Training sentencepiece model for subword tokenization")
-        spm.SentencePieceTrainer.train(
-            input=str(corpus_file),
-            vocab_size=self.subword_vocab_size,
-            model_prefix=self.prefix,
-            user_defined_symbols=self.special_tokens,
-        )
-
-        # Delete the corpus file and load the model
-        corpus_file.unlink()
-        return spm.SentencePieceProcessor(model_file=f"{self.prefix}.model")
+        logger.info("Training a sentencepiece model for subword tokenization")
+        with NamedTemporaryFile(dir=self.outdir, suffix=".txt", mode="w") as tmp:
+            tmp.write("\n".join(self.training_corpus))
+            tmp.flush()
+            spm.SentencePieceTrainer.train(
+                input=tmp.name,
+                vocab_size=self.subword_vocab_size,
+                model_prefix=self.prefix,
+                user_defined_symbols=self.special_tokens,
+            )
+
+        # Load the model
+        self.sentencepiece_model = spm.SentencePieceProcessor(
+            model_file=str(self.prefix.with_suffix(".model"))
+        )

     def subword_tokenize(self, text: str) -> str:
         """
         Tokenize into subwords. Sampling is disabled to ensure reproducibility.
         """
         tokens = self.sentencepiece_model.encode(text, out_type=str)
-        return " ".join(["".join(self.encode(subword)) for subword in tokens])
+        return " ".join(map("".join, map(self.encode, tokens)))

     def word_tokenize(self, text: str) -> str:
         """
         Tokenize text into a string of space-separated words. Spaces (⎵) and NER tokens are considered as words.

         :param text: Text to be tokenized.
         """
-        words = ["".join(self.encode(word)) for word in wordpunct_tokenize(text)]
+        words = list(map("".join, map(self.encode, wordpunct_tokenize(text))))
         return " ".join(
             [
                 word + f" {self.mapping.space.encoded} "
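For reference, the new __post_init__ writes the corpus to a NamedTemporaryFile, trains a sentencepiece model on it, and loads the resulting .model file once the temporary file is gone. A rough standalone sketch of that flow (the corpus, output directory and vocabulary size are invented; the real class additionally passes its NER and mapping tokens through user_defined_symbols):

from pathlib import Path
from tempfile import NamedTemporaryFile

import sentencepiece as spm

outdir = Path("language_model")  # invented output directory
outdir.mkdir(exist_ok=True)
prefix = outdir / "subword_tokenizer"

corpus = [
    "the cat sat on the mat",
    "a dog barked at the cat",
    "the quick brown fox jumps over the lazy dog",
] * 20  # toy corpus, repeated to give the trainer some data

# The temporary corpus file is removed automatically when the context manager exits.
with NamedTemporaryFile(dir=outdir, suffix=".txt", mode="w") as tmp:
    tmp.write("\n".join(corpus))
    tmp.flush()  # make sure the trainer sees the whole file
    spm.SentencePieceTrainer.train(
        input=tmp.name,
        vocab_size=60,  # the class defaults to 1000; kept small for the toy corpus
        model_prefix=str(prefix),
    )

# Load the trained model; sampling stays disabled, so encoding is deterministic.
model = spm.SentencePieceProcessor(model_file=str(prefix.with_suffix(".model")))
print(model.encode("the cat sat on the mat", out_type=str))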
tests/test_extract.py  (+2 −3)

@@ -644,15 +644,14 @@ def test_extract(
     assert (
         output / "language_model" / "corpus_words.txt"
     ).read_text() == expected_word_language_corpus

-    print((output / "language_model" / "corpus_subwords.txt").read_text())
-    print(expected_subword_language_corpus)
     assert (
         output / "language_model" / "corpus_subwords.txt"
     ).read_text() == expected_subword_language_corpus

     # Check "language_tokens.txt"
     expected_language_tokens = [
-        t if t != " " else "▁" for t in sorted(list(expected_charset))
+        "▁" if t.isspace() else t
+        for t in sorted(list(expected_charset))
     ]
     expected_language_tokens.append("◌")
     assert (output / "language_model" / "tokens.txt").read_text() == "\n".join(
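The expected_language_tokens change replaces the equality test against a literal space with str.isspace(), so tabs, newlines and non-breaking spaces are also mapped to the ▁ placeholder. A quick illustration with an invented charset:

charset = ["a", " ", "\t", "\u00a0", "b"]  # invented example charset

old = [t if t != " " else "▁" for t in sorted(charset)]
new = ["▁" if t.isspace() else t for t in sorted(charset)]

print(old)  # ['\t', '▁', 'a', 'b', '\xa0']  -- tab and no-break space slip through
print(new)  # ['▁', '▁', 'a', 'b', '▁']      -- every whitespace character is normalized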