From e14367f7a7fa63c19956d5e9cd3108d533ab2712 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sol=C3=A8ne=20Tarride?= <starride@teklia.com> Date: Thu, 5 Jan 2023 14:09:51 +0000 Subject: [PATCH] Fix dataset extraction offset --- dan/datasets/extract/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dan/datasets/extract/utils.py b/dan/datasets/extract/utils.py index 4a582228..269755f8 100644 --- a/dan/datasets/extract/utils.py +++ b/dan/datasets/extract/utils.py @@ -22,6 +22,7 @@ def save_json(path, dict): def insert_token(text, count, start_token, end_token, offset, length): """ Insert the given tokens at the right position in the text + start_token or end_token can be empty strings """ text = ( # Text before entity @@ -35,7 +36,9 @@ def insert_token(text, count, start_token, end_token, offset, length): # Text after entity + text[count + 1 + offset + length :] ) - return text, count + 2 + + token_offset = len(start_token) + len(end_token) + return text, count + token_offset def parse_tokens(filename): -- GitLab