Skip to content
Snippets Groups Projects
Commit 00d54e0c authored by Solene Tarride
Browse files

Deal with unknown token separately

parent c0967ee5
No related branches found
No related tags found
No related merge requests found
This commit is part of merge request !287. Comments created here will be created in the context of that merge request.
......@@ -36,13 +36,13 @@ from dan.datasets.extract.utils import (
normalize_linebreaks,
normalize_spaces,
)
from dan.utils import EntityType, LMTokenMapping, parse_tokens
from line_image_extractor.extractor import extract
from line_image_extractor.image_utils import (
BoundingBox,
Extraction,
polygon_to_bbox,
)
from dan.utils import EntityType, LMTokenMapping, parse_tokens
from line_image_extractor.extractor import extract
IMAGES_DIR = "images" # Subpath to the images directory.
LANGUAGE_DIR = "language_model" # Subpath to the language model directory.
......
......@@ -81,6 +81,7 @@ def insert_token(text: str, entity_type: EntityType, offset: int, length: int) -
+ (entity_type.end if entity_type else "")
)
def normalize_linebreaks(text: str) -> str:
"""
Remove begin/ending linebreaks
......@@ -106,4 +107,3 @@ def get_bbox(polygon: List[List[int]]) -> str:
x, y = min(all_x), min(all_y)
width, height = max(all_x) - x, max(all_y) - y
return ",".join(list(map(str, [int(x), int(y), int(width), int(height)])))
......@@ -505,8 +505,8 @@ class CTCLanguageDecoder:
tokens=tokens_path,
lm_weight=self.language_model_weight,
blank_token=self.mapping.ctc.encoded,
unk_word=self.mapping.unknown.encoded,
sil_token=self.mapping.space.encoded,
unk_word="",
nbest=1,
)
# No GPU support
......
......@@ -25,7 +25,6 @@ class LMTokenMapping(NamedTuple):
space: Token = Token("", " ")
linebreak: Token = Token("", "\n")
ctc: Token = Token("", "<ctc>")
unknown: Token = Token("", "<unk>")
@property
def display(self):
......
......@@ -470,8 +470,8 @@ def test_extract(
ⓢ B a r e y r e ⎵ ⎵ ⓕ J e a n ⎵ ⎵ ⓑ 2 8 . 3 . 1 1
ⓢ R o u s s y ⎵ ⎵ ⓕ J e a n ⎵ ⎵ ⓑ 4 . 1 1 . 1 4
ⓢ M a r i n ⎵ ⎵ ⓕ M a r c e l ⎵ ⎵ ⓑ 1 0 . 8 . 0 6
R o q u e s ⎵ ⎵ ⓕ E l o i ⎵ ⎵ ⓑ 1 1 . 1 0 . 0 4
G i r o s ⎵ ⎵ ⓕ P a u l ⎵ ⎵ ⓑ 3 0 . 1 0 . 1 0"""
A m i c a l ⎵ ⎵ ⓕ E l o i ⎵ ⎵ ⓑ 1 1 . 1 0 . 0 4
B i r o s ⎵ ⎵ ⓕ M a e l ⎵ ⎵ ⓑ 3 0 . 1 0 . 1 0"""
# Transcriptions with worker version are in lowercase
if transcription_entities_worker_version:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment