diff --git a/dan/datasets/extract/utils.py b/dan/datasets/extract/utils.py index 4a5822289d72091999253a6587d85fd00c9aa25b..269755f882eab5f7eb9e87095c09581225bece8a 100644 --- a/dan/datasets/extract/utils.py +++ b/dan/datasets/extract/utils.py @@ -22,6 +22,7 @@ def save_json(path, dict): def insert_token(text, count, start_token, end_token, offset, length): """ Insert the given tokens at the right position in the text + start_token or end_token can be empty strings """ text = ( # Text before entity @@ -35,7 +36,9 @@ def insert_token(text, count, start_token, end_token, offset, length): # Text after entity + text[count + 1 + offset + length :] ) - return text, count + 2 + + token_offset = len(start_token) + len(end_token) + return text, count + token_offset def parse_tokens(filename):