diff --git a/dan/datasets/extract/extract.py b/dan/datasets/extract/extract.py index 0d7002b58e26d705b1578678764414d0d3176402..a7fa8d32e32c70ac8cef0c7b95523141e19bc26f 100644 --- a/dan/datasets/extract/extract.py +++ b/dan/datasets/extract/extract.py @@ -36,13 +36,13 @@ from dan.datasets.extract.utils import ( normalize_linebreaks, normalize_spaces, ) +from dan.utils import EntityType, LMTokenMapping, parse_tokens +from line_image_extractor.extractor import extract from line_image_extractor.image_utils import ( BoundingBox, Extraction, polygon_to_bbox, ) -from dan.utils import EntityType, LMTokenMapping, parse_tokens -from line_image_extractor.extractor import extract IMAGES_DIR = "images" # Subpath to the images directory. LANGUAGE_DIR = "language_model" # Subpath to the language model directory. diff --git a/dan/datasets/extract/utils.py b/dan/datasets/extract/utils.py index 5e867a39e4c9bde0b94d7989da7e2439deba889f..a2184f0777c13d5aca2b15d4005b9b2c10f0a804 100644 --- a/dan/datasets/extract/utils.py +++ b/dan/datasets/extract/utils.py @@ -81,6 +81,7 @@ def insert_token(text: str, entity_type: EntityType, offset: int, length: int) - + (entity_type.end if entity_type else "") ) + def normalize_linebreaks(text: str) -> str: """ Remove begin/ending linebreaks @@ -106,4 +107,3 @@ def get_bbox(polygon: List[List[int]]) -> str: x, y = min(all_x), min(all_y) width, height = max(all_x) - x, max(all_y) - y return ",".join(list(map(str, [int(x), int(y), int(width), int(height)]))) - diff --git a/dan/ocr/decoder.py b/dan/ocr/decoder.py index bd858c16a56e551157d4d497cfe8ba22a85fea91..b4da94d0464c37d6543b8b7310faa5fbed06776b 100644 --- a/dan/ocr/decoder.py +++ b/dan/ocr/decoder.py @@ -505,8 +505,8 @@ class CTCLanguageDecoder: tokens=tokens_path, lm_weight=self.language_model_weight, blank_token=self.mapping.ctc.encoded, - unk_word=self.mapping.unknown.encoded, sil_token=self.mapping.space.encoded, + unk_word="⁇", nbest=1, ) # No GPU support diff --git a/dan/utils.py 
b/dan/utils.py index c65df263f1789e2799e1cc9c8cadf1b63dfea9f7..69e7d82ac2610d5fbdfedf336df69177b3fd8471 100644 --- a/dan/utils.py +++ b/dan/utils.py @@ -25,7 +25,6 @@ class LMTokenMapping(NamedTuple): space: Token = Token("⎵", " ") linebreak: Token = Token("↵", "\n") ctc: Token = Token("◌", "<ctc>") - unknown: Token = Token("⁇", "<unk>") @property def display(self): diff --git a/tests/test_extract.py b/tests/test_extract.py index 0a5e09559e6b81b12d5972c142642e67aa629ee6..cfd78846d19351d206e4fc689b6ac742e9c759ab 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -470,8 +470,8 @@ def test_extract( ⓢ B a r e y r e ⎵ ⎵ ⓕ J e a n ⎵ ⎵ ⓑ 2 8 . 3 . 1 1 ⓢ R o u s s y ⎵ ⎵ ⓕ J e a n ⎵ ⎵ ⓑ 4 . 1 1 . 1 4 ⓢ M a r i n ⎵ ⎵ ⓕ M a r c e l ⎵ ⎵ ⓑ 1 0 . 8 . 0 6 -ⓢ R o q u e s ⎵ ⎵ ⓕ E l o i ⎵ ⎵ ⓑ 1 1 . 1 0 . 0 4 -ⓢ G i r o s ⎵ ⎵ ⓕ P a u l ⎵ ⎵ ⓑ 3 0 . 1 0 . 1 0""" +ⓢ A m i c a l ⎵ ⎵ ⓕ E l o i ⎵ ⎵ ⓑ 1 1 . 1 0 . 0 4 +ⓢ B i r o s ⎵ ⎵ ⓕ M a e l ⎵ ⎵ ⓑ 3 0 . 1 0 . 1 0""" # Transcriptions with worker version are in lowercase if transcription_entities_worker_version: