
Parse a tokens yaml instead of hardcoding the tokens

Merged Thibault Lavigne requested to merge parse-tokens into main
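
For review context: the point of this MR is to read the NER tokens from a YAML file instead of keeping them hardcoded. A minimal sketch of what such a file and its parsing could look like (the layout, entity names, and token characters below are hypothetical, not taken from this MR):

    import yaml  # PyYAML

    # Hypothetical tokens file content; entity names and token
    # characters are made up for illustration.
    TOKENS_YAML = """
    surname:
      start: "Ⓢ"
      end: "Ⓐ"
    firstname:
      start: "Ⓕ"
      end: ""
    """

    tokens = yaml.safe_load(TOKENS_YAML)
    start_tokens = [entry["start"] for entry in tokens.values()]
    end_tokens = [entry["end"] for entry in tokens.values() if entry["end"]]
    print(start_tokens, end_tokens)  # ['Ⓢ', 'Ⓕ'] ['Ⓐ']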
@@ -248,21 +248,13 @@ class DAN:
         return out


-def add_results(text, char_confidences, index, end_tokens_exists=False):
-    if end_tokens_exists:
-        return [
-            {
-                "text": f"{text[start_tokens: end_tokens]}".replace("\n", " "),
-                "confidence_ner": f"{np.around(np.mean(char_confidences[start_tokens : end_tokens]), 2)}",
-            }
-            for start_tokens, end_tokens in index
-        ]
+def parse_ner_predictions(text, char_confidences, predictions):
     return [
         {
             "text": f"{text[current: next_token]}".replace("\n", " "),
             "confidence_ner": f"{np.around(np.mean(char_confidences[current : next_token]), 2)}",
         }
-        for current, next_token in pairwise(index + [None])
+        for current, next_token in predictions
     ]
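
Note on the new helper: parse_ner_predictions no longer decides how the slice boundaries are built; it just consumes (start, end) pairs, so both caller branches can share it. A self-contained sketch of how it behaves in the start-tokens-only mode (the text, tokens, and confidence values are made up):

    from itertools import pairwise  # Python 3.10+; older code may ship its own pairwise

    import numpy as np

    def parse_ner_predictions(text, char_confidences, predictions):
        # Same shape as the helper above: slice the text and the per-character
        # confidences between consecutive boundary positions.
        return [
            {
                "text": f"{text[current:next_token]}".replace("\n", " "),
                "confidence_ner": f"{np.around(np.mean(char_confidences[current:next_token]), 2)}",
            }
            for current, next_token in predictions
        ]

    text = "ⓈSmith ⓕJohn"
    char_confidences = np.full(len(text), 0.9)
    starts = [pos for pos, char in enumerate(text) if char in "Ⓢⓕ"]
    # Pad with None so the last entity's slice runs to the end of the string.
    print(parse_ner_predictions(text, char_confidences, pairwise(starts + [None])))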
@@ -365,22 +357,16 @@ def process_batch(
         assert len(start_tokens) == len(
             end_tokens
         ), "You don't have the same number of starting tokens and ending tokens"
-        index = [
+        indices = [
             [pos_start, pos_end]
             for pos_start, pos_end in zip(start_tokens, end_tokens)
         ]
-        result["confidences"]["by ner token"] = add_results(
-            text, char_confidences, index, end_tokens_exists=True
-        )
     else:
-        index = [pos for pos, char in enumerate(text) if char in start_tokens]
+        indices = pairwise([pos for pos, char in enumerate(text) if char in start_tokens] + [None])
     # calculates scores by token
-    result["confidences"]["by ner token"] = add_results(
-        text, char_confidences, index
-    )
+    result["confidences"]["by ner token"] = parse_ner_predictions(
+        text, char_confidences, indices
+    )
     result["confidences"]["total"] = np.around(np.mean(char_confidences), 2)