Automatic Text Recognition / DAN / Commits / 3ca6578c

Commit 3ca6578c, authored 1 year ago by Solene Tarride

Simplify code

Parent: 12928ec0
No related branches found. No related tags found.
1 merge request: !287 "Support subword and word language models"

Showing 2 changed files with 18 additions and 26 deletions:

  dan/datasets/extract/arkindex.py   7 additions, 25 deletions
  dan/datasets/extract/utils.py      11 additions, 1 deletion

dan/datasets/extract/arkindex.py  (+7, -25)

@@ -33,6 +33,7 @@ from dan.datasets.extract.utils import (
     Tokenizer,
     download_image,
     get_bbox,
+    get_vocabulary,
     insert_token,
     normalize_linebreaks,
     normalize_spaces,

@@ -363,14 +364,13 @@ class ArkindexExtractor:
             self.language_tokens.append(
                 self.mapping.encode[token]
             ) if token in self.mapping.encode else self.language_tokens.append(token)
         self.language_tokens.append(self.mapping.ctc.encoded)
         assert all(
             [len(token) == 1 for token in self.language_lexicon]
         ), "Tokens should be single characters."

         # Build LM corpus
-        train_corpus = [text.replace("\n", " ") for text in self.data["train"].values()]
+        train_corpus = [
+            text.replace(self.mapping.linebreak.display, self.mapping.space.display)
+            for text in self.data["train"].values()
+        ]

         tokenizer = Tokenizer(
             train_corpus,
             outdir=self.output / "language_model",
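
The corpus-normalization change above swaps the hard-coded "\n" replacement for the mapping's display symbols. A minimal sketch of the effect on a single document, using made-up display characters (the actual mapping object in DAN may define different symbols):

from types import SimpleNamespace

# Hypothetical display characters for the special tokens; the real mapping
# used by DAN may use different symbols.
mapping = SimpleNamespace(
    linebreak=SimpleNamespace(display="⏎"),
    space=SimpleNamespace(display="▁"),
)

# A transcription in which line breaks are already rendered with their display symbol.
page_text = "le chat⏎dort"

# Same replacement as in the new list comprehension: line-break display symbols
# become space display symbols in the language-model training corpus.
normalized = page_text.replace(mapping.linebreak.display, mapping.space.display)
print(normalized)  # le chat▁dort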

@@ -388,36 +388,18 @@ class ArkindexExtractor:
             tokenizer.subword_tokenize(doc) for doc in train_corpus
         ]

-        # Build vocabulary
-        word_vocabulary = set(
-            [
-                word
-                for doc in self.language_corpus["words"]
-                for word in doc.split()
-                if word != ""
-            ]
-        )
-        subword_vocabulary = set(
-            [
-                subword
-                for doc in self.language_corpus["subwords"]
-                for subword in doc.split()
-                if subword != ""
-            ]
-        )
-
         # Build LM lexicon
         self.language_lexicon["characters"] = [
             f"{token} {token}" for token in self.language_tokens
         ]
         self.language_lexicon["words"] = [
             f"{word} {tokenizer.char_tokenize(word)}"
-            for word in sorted(word_vocabulary)
+            for word in get_vocabulary(self.language_corpus["words"])
             if word != ""
         ]
         self.language_lexicon["subwords"] = [
             f"{subword} {tokenizer.char_tokenize(subword)}"
-            for subword in sorted(subword_vocabulary)
+            for subword in get_vocabulary(self.language_corpus["subwords"])
         ]

     def export(self):
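
Each lexicon entry built above pairs a word, subword, or character with its space-separated character decomposition, the kind of lexicon file a CTC beam-search decoder typically consumes. A rough sketch of the expected entries, with a stand-in char_tokenize that simply joins characters with spaces (the real Tokenizer.char_tokenize in dan/datasets/extract/utils.py may handle special tokens differently):

def char_tokenize(text: str) -> str:
    # Stand-in for Tokenizer.char_tokenize: one space between every character.
    return " ".join(list(text))

word_vocabulary = ["chat", "chien", "dort"]
lexicon = [f"{word} {char_tokenize(word)}" for word in word_vocabulary]
print(lexicon)
# ['chat c h a t', 'chien c h i e n', 'dort d o r t']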

dan/datasets/extract/utils.py  (+11, -1)

@@ -122,12 +122,22 @@ def get_bbox(polygon: List[List[int]]) -> str:
     return ",".join(list(map(str, [int(x), int(y), int(width), int(height)])))


+def get_vocabulary(tokenized_text: List[str]) -> set[str]:
+    """
+    Compute set of vocabulary from tokenzied text.
+    :param tokenized_text: List of tokenized text.
+    """
+    return sorted(
+        set([token for doc in tokenized_text for token in doc.split() if token != ""])
+    )
+
+
 class Tokenizer:
     """
     A multi-level tokenizer (char, subword, word), where the subword tokenizer is trained using sentencepiece.
     :param training_corpus: List of training text.
     :param outdir: Path to save the subword tokenizer.
-    :param mapping: Mapping between displayed and encoded versions of special characters
+    :param mapping: Mapping between displayed and encoded versions of special characters.
     :param tokens: Start and end tokens used to represent named entities.
     :param subword_vocab_size: Size of the vocabulary size to use to train the subword tokenizer.
     """
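
The new get_vocabulary helper takes over from the two inline set comprehensions removed from arkindex.py. A quick sketch of how it behaves on made-up tokenized documents; the logic mirrors the function added in the diff above:

from typing import List

def get_vocabulary(tokenized_text: List[str]) -> set[str]:
    # Split every tokenized document on whitespace, drop empty tokens,
    # deduplicate, and return the tokens in sorted order.
    return sorted(
        set([token for doc in tokenized_text for token in doc.split() if token != ""])
    )

corpus = ["le chat dort", "le chien dort", ""]
print(get_vocabulary(corpus))
# ['chat', 'chien', 'dort', 'le']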