From e14367f7a7fa63c19956d5e9cd3108d533ab2712 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sol=C3=A8ne=20Tarride?= <starride@teklia.com>
Date: Thu, 5 Jan 2023 14:09:51 +0000
Subject: [PATCH] Fix dataset extraction offset

---
 dan/datasets/extract/utils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/dan/datasets/extract/utils.py b/dan/datasets/extract/utils.py
index 4a582228..269755f8 100644
--- a/dan/datasets/extract/utils.py
+++ b/dan/datasets/extract/utils.py
@@ -22,6 +22,7 @@ def save_json(path, dict):
 def insert_token(text, count, start_token, end_token, offset, length):
     """
     Insert the given tokens at the right position in the text
+    start_token and end_token may each be an empty string.
     """
     text = (
         # Text before entity
@@ -35,7 +36,9 @@ def insert_token(text, count, start_token, end_token, offset, length):
         # Text after entity
         + text[count + 1 + offset + length :]
     )
-    return text, count + 2
+
+    token_offset = len(start_token) + len(end_token)
+    return text, count + token_offset
 
 
 def parse_tokens(filename):
-- 
GitLab