
Parse a tokens yaml instead of hardcoding the tokens

Merged Thibault Lavigne requested to merge parse-tokens into main
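
For review context: the point of this MR is to read the NER tokens from a YAML file instead of keeping them hardcoded. A minimal sketch of what such a file and its parsing could look like (the layout, entity names, and token characters below are hypothetical, not taken from this MR):

    import yaml  # PyYAML

    # Hypothetical tokens file content; entity names and token
    # characters are made up for illustration.
    TOKENS_YAML = """
    surname:
      start: "Ⓢ"
      end: "Ⓐ"
    firstname:
      start: "Ⓕ"
      end: ""
    """

    tokens = yaml.safe_load(TOKENS_YAML)
    start_tokens = [entry["start"] for entry in tokens.values()]
    end_tokens = [entry["end"] for entry in tokens.values() if entry["end"]]
    print(start_tokens, end_tokens)  # ['Ⓢ', 'Ⓕ'] ['Ⓐ']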
@@ -248,21 +248,13 @@ class DAN:
         return out


-def add_results(text, char_confidences, index, end_tokens_exists=False):
-    if end_tokens_exists:
-        return [
-            {
-                "text": f"{text[start_tokens: end_tokens]}".replace("\n", " "),
-                "confidence_ner": f"{np.around(np.mean(char_confidences[start_tokens : end_tokens]), 2)}",
-            }
-            for start_tokens, end_tokens in index
-        ]
+def parse_ner_predictions(text, char_confidences, predictions):
     return [
         {
             "text": f"{text[current: next_token]}".replace("\n", " "),
             "confidence_ner": f"{np.around(np.mean(char_confidences[current : next_token]), 2)}",
         }
-        for current, next_token in pairwise(index + [None])
+        for current, next_token in predictions
     ]
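
Note on the new helper: parse_ner_predictions no longer decides how the slice boundaries are built; it just consumes (start, end) pairs, so both caller branches can share it. A self-contained sketch of how it behaves in the start-tokens-only mode (the text, tokens, and confidence values are made up):

    from itertools import pairwise  # Python 3.10+; older code may ship its own pairwise

    import numpy as np

    def parse_ner_predictions(text, char_confidences, predictions):
        # Same shape as the helper above: slice the text and the per-character
        # confidences between consecutive boundary positions.
        return [
            {
                "text": f"{text[current:next_token]}".replace("\n", " "),
                "confidence_ner": f"{np.around(np.mean(char_confidences[current:next_token]), 2)}",
            }
            for current, next_token in predictions
        ]

    text = "ⓈSmith ⓕJohn"
    char_confidences = np.full(len(text), 0.9)
    starts = [pos for pos, char in enumerate(text) if char in "Ⓢⓕ"]
    # Pad with None so the last entity's slice runs to the end of the string.
    print(parse_ner_predictions(text, char_confidences, pairwise(starts + [None])))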
@@ -365,22 +357,16 @@ def process_batch(
         assert len(start_tokens) == len(
             end_tokens
         ), "You don't have the same number of starting tokens and ending tokens"
-        index = [
+        indices = [
             [pos_start, pos_end]
             for pos_start, pos_end in zip(start_tokens, end_tokens)
         ]
-        result["confidences"]["by ner token"] = add_results(
-            text, char_confidences, index, end_tokens_exists=True
-        )
     else:
-        index = [pos for pos, char in enumerate(text) if char in start_tokens]
+        indices = pairwise([pos for pos, char in enumerate(text) if char in start_tokens] + [None])
     # calculates scores by token
-    result["confidences"]["by ner token"] = add_results(
-        text, char_confidences, index
-    )
+    result["confidences"]["by ner token"] = parse_ner_predictions(
+        text, char_confidences, indices
+    )
     result["confidences"]["total"] = np.around(np.mean(char_confidences), 2)