From 9c71269bf43261bc5a72f6d39a9deda643655e8d Mon Sep 17 00:00:00 2001
From: Blanche Miret <bmiret@teklia.com>
Date: Mon, 31 May 2021 13:31:30 +0000
Subject: [PATCH] Precise entities delimitation

---
 README.md                                     |  44 ++-
 nerval/evaluate.py                            | 332 +++++++++++++-----
 setup.py                                      |   4 +-
 tests/test_compute_matches.py                 | 160 +++++++--
 ..._aligned.py => test_get_labels_aligned.py} |  60 ++--
 tests/test_nested.bio                         |  13 +
 tests/test_parse_bio.py                       |  32 +-
 tests/test_run.py                             |  28 +-
 8 files changed, 489 insertions(+), 184 deletions(-)
 rename tests/{test_get_tags_aligned.py => test_get_labels_aligned.py} (57%)
 create mode 100644 tests/test_nested.bio

diff --git a/README.md b/README.md
index 50dfed6..9be2709 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,9 @@
 > 
 > -- <cite>Gérard de Nerval</cite>
 
-Nerval is an evaluation library written in python implementing a metric for named-entity recognition evaluation on noisy text, typically to measure NER performances on OCR or Handwritten text recognition predictions.
+Nerval is an evaluation library written in Python implementing a metric for named-entity recognition evaluation on noisy text, typically to measure NER performance on OCR or handwritten text recognition predictions.
+
+Expected inputs are ground truth and prediction files in BIOES/BILOU format, without any '§' character, as this character has a special meaning during evaluation.
 
 ## Usage
 
@@ -38,15 +40,31 @@ We also provide two annotation and prediction toy files, which are identical for
 $ nerval -a demo/toy_test_annot.bio -p demo/toy_test_predict.bio
 ```
 
-
-
 ## Metric
 
 This metric uses string alignment at character level.
 
 The automatic transcription is first aligned with the ground truth at character level, by minimising the Levenshtein distance between them. Each entity in the ground truth is then matched with a corresponding entity in the aligned transcription, with the same entity label, or an empty character string if no match is found. If the edit distance between the two entities is less than 30% of the ground truth entity length, the predicted entity is considered as recognised. For the purpose of matching detected entities to existing databases, we estimated that a 70% match between the entity texts was a fair threshold.
 
-#### Details :
+**Nested entities -** Nerval evaluates nested entities approximately: containing entities and nested entities are evaluated separately. Note that in the BIOES/BILOU format, a nested entity at the end of a containing entity cannot be distinguished from the end of the containing entity followed by a new entity, hence the approximation. In the following example, the detected and evaluated entities are therefore "Louis par la grâce de Dieu roy de France et de" (PER), "France" (LOC) and "Navarre" (LOC).
+
+```
+Louis B-PER
+par I-PER
+la I-PER
+grâce I-PER
+de I-PER
+Dieu I-PER
+roy I-PER
+de I-PER
+France B-LOC
+et I-PER
+de I-PER
+Navarre B-LOC
+. O
+```
+
+#### Details
 
 - From the bio files in input, retrieval of the text content and extension of a word-level tagging to a character-level tagging
   - spaces added between each word
 
@@ -65,13 +83,13 @@ writer B-OCC
 
 produces the following list of tags, one per character plus spaces:
 
 ```
-['PER','PER','PER','PER','PER','PER','PER',
+['B-PER','I-PER','I-PER','I-PER','I-PER','I-PER','I-PER',
 'O',
 'O', 'O', 'O',
 'O',
-'OCC','OCC','OCC','OCC','OCC','OCC',
+'B-OCC','I-OCC','I-OCC','I-OCC','I-OCC','I-OCC',
 'O',
 'O']
 ```
 
@@ -88,11 +106,11 @@ writear B-OCC
 
 producing:
 
 ```
-['PER','PER','PER','PER','PER','PER','PER','PER','PER',
+['B-PER','I-PER','I-PER','I-PER','I-PER','I-PER','I-PER','I-PER','I-PER',
 'O',
 'O', 'O',
 'O',
-'OCC','OCC','OCC','OCC','OCC','OCC','OCC',
+'B-OCC','I-OCC','I-OCC','I-OCC','I-OCC','I-OCC','I-OCC',
 'O',
 'O','O']
 ```
 
@@ -123,10 +141,10 @@ prediction : Tolkieene xas --writear ,.
              PPPPPPPPPOOOOOOOCCCCCCCOOO
 ```
 - Search for matching entity for each entity in the annotation
-  - Inspecting the annotation character by character, when a new entity tag (not 'O') is encountered, the character is considered as the beginning of an entity to be matched.
+  - Inspecting the annotation character by character, when a new "B-" label is encountered, the character marks the beginning of an entity to be matched.
   - Considering the opposite character in the prediction string, if the entity tags match on these two characters, tags are back-tracked in the prediction string to detect the beginning of the entity; that is, the first occurrence of said entity tag.
   - Else, if the entity tags don't match on the first character, beginning of matching entity in prediction is looked for until the end of the entity in the annotation.
-  - Both for the annotation and the prediction, detected entities end with the last occurrence of the tag of the first character.
+  - Both for the annotation and the prediction, detected entities end with the last occurrence of the tag of their first character. At this point, the rest of the annotation and prediction is inspected to check for nested entities and to collect the end of a potential containing entity.
 
 Here are examples of several situations with the delimitation of the matched entities in each case.
 
@@ -145,10 +163,10 @@ prediction : OOOO|PPPPPPPPPPP|OOOOPPPPOOOOOOO
 
 annotation : OOOOOOO|PPPPPPPPPPPPPPPPP|OOOOOO
 prediction : OOOOOOO|P|OPPPPPPPPPPPPPPOOOOOOO
 
-annotation : OOOOOOO|PPPPPPPPPPPPPPPPP|OOOOOO
-prediction : OOOOOOOOOOOOOOOOOOOOOOOOOOPPPPOO
+annotation : OOOOOOO|PPPPPP|LLL|PPPPPP|OOOOOO
+prediction : OOOOOOO|PPPPPP|LLL|PPPPPP|OOOOOO
 
-For this last example, no match is found in the prediction.
+For this last example, "PPPPPPLLLPPPPPP" and "LLL" are evaluated separately.
 ```
 - Get a score on the two matched strings :
   - Compute the Levenshtein distance between the two strings, ignoring the "-" characters
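The word-to-character expansion illustrated by the two lists above can be sketched in a few lines of Python. This is an illustrative sketch, not Nerval's implementation — `expand_to_characters` is a hypothetical name, and the nested-entity handling added by this patch is deliberately omitted:

```python
# Sketch of the README's character-level expansion rule: one label per
# character, plus one label for each inter-word space, which stays inside
# an entity only when that entity simply continues across the space.
def expand_to_characters(pairs):
    """pairs: list of (word, label) tuples read from a BIO file."""
    labels, last_tag = [], None
    for i, (word, label) in enumerate(pairs):
        tag = "O" if label == "O" else label.split("-", 1)[1]
        if i > 0:
            continues = tag == last_tag != "O" and not label.startswith("B-")
            labels.append(f"I-{tag}" if continues else "O")
        if label.startswith("B-"):
            labels += [f"B-{tag}"] + [f"I-{tag}"] * (len(word) - 1)
        else:
            labels += [label] * len(word)
        last_tag = tag
    return labels

# Reproduces the first list above for "Tolkien was a writer .":
print(expand_to_characters(
    [("Tolkien", "B-PER"), ("was", "O"), ("a", "O"), ("writer", "B-OCC"), (".", "O")]
))
```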
-""" import argparse import logging @@ -16,76 +13,183 @@ THRESHOLD = 0.30 NOT_ENTITY_TAG = "O" +def get_type_label(label: str) -> str: + """Return the type (tag) of a label + + Input format: "[BIELUS]-type" + """ + try: + tag = ( + NOT_ENTITY_TAG + if label == NOT_ENTITY_TAG + else re.match(r"[BIELUS]-(.{3,4})", label)[1] + ) + except TypeError: + raise (Exception(f"The label {label} is not valid in BIOES/BILOU format.")) + + return tag + + def parse_bio(path: str) -> dict: - """Parse a BIO file to get text content, character-level NE tags and entity types count. + """Parse a BIO file to get text content, character-level NE labels and entity types count. Input : path to a valid BIO file - Output format : { "words": str, "tags": list; "entity_count" : { tag : int} } + Output format : { "words": str, "labels": list; "entity_count" : { tag : int } } """ - assert os.path.exists(path), f"Error: Input file {path} does not exist" words = [] - tags = [] + labels = [] entity_count = {"All": 0} last_tag = None with open(path, "r") as fd: + lines = list(filter(lambda x: x != "\n", fd.readlines())) + + if "§" in " ".join(lines): + raise ( + Exception( + f"§ found in input file {path}. Since this character is used in a specific way during evaluation, prease remove it from files." + ) + ) + + # Track nested entities infos + in_nested_entity = False + containing_tag = None - for line in list(filter(lambda x: x != "\n", fd.readlines())): + for index, line in enumerate(lines): + try: word, label = line.split() + except ValueError: + raise (Exception(f"The file {path} given in input is not in BIO format.")) - # Preservation of '-' characters and avoid confusion with the dashes added later during the alignment - word = word.replace("-", "§") - words.append(word) + # Preserve hyphens to avoid confusion with the hyphens added later during alignment + word = word.replace("-", "§") + words.append(word) - try: - tag = ( - NOT_ENTITY_TAG - if label == NOT_ENTITY_TAG - else re.match(r"[BIES]-(.{3})", label)[1] - ) - except TypeError: - raise ( - Exception(f"The file {path} given in input is not in BIO format.") - ) + tag = get_type_label(label) + + # Spaces will be added between words and have to get a label + if index != 0: - # Spaces will be added between words and have to get a tag - # If previous word has the same tag as current, the space also gets the tag - if last_tag is not None: - if last_tag == tag: - tags.append(tag) + # If new word has same tag as previous, not new entity and in entity, continue entity + if last_tag == tag and "B" not in label and tag != NOT_ENTITY_TAG: + labels.append(f"I-{last_tag}") + + # If new word begins a new entity of different type, check for nested entity to correctly tag the space + elif ( + last_tag != tag + and "B" in label + and tag != NOT_ENTITY_TAG + and last_tag != NOT_ENTITY_TAG + ): + + # Advance to next word with different label as current + future_label = label + while ( + index < len(lines) + and future_label != NOT_ENTITY_TAG + and get_type_label(future_label) != last_tag + ): + index += 1 + future_label = lines[index].split()[1] + + # Check for continuation of the original entity + if ( + index < len(lines) + and "B" not in future_label + and get_type_label(future_label) == last_tag + ): + labels.append(f"I-{last_tag}") + in_nested_entity = True + containing_tag = last_tag else: - tags.append(NOT_ENTITY_TAG) + labels.append(NOT_ENTITY_TAG) + in_nested_entity = False - # Add a tag for each letter in the word - tags += [ - tag, - ] * len(word) + elif 
in_nested_entity: + labels.append(f"I-{containing_tag}") + + else: + labels.append(NOT_ENTITY_TAG) + in_nested_entity = False + + # Add a tag for each letter in the word + if "B" in label: + labels += [f"B-{tag}"] + [f"I-{tag}"] * (len(word) - 1) + else: + labels += [label] * len(word) + + # Count nb entity for each type + if "B" in label: + entity_count[tag] = entity_count.get(tag, 0) + 1 + entity_count["All"] += 1 + + last_tag = tag - # Count nb entity for each type - if (not label == "O") and (not last_tag == tag): - entity_count[tag] = entity_count.get(tag, 0) + 1 - entity_count["All"] += 1 + result = None - last_tag = tag + if words: - result = None + result = dict() + result["words"] = " ".join(words) + result["labels"] = labels + result["entity_count"] = entity_count - if words: - # Make string out of words - result = dict() - result["words"] = " ".join(words) - result["tags"] = tags - result["entity_count"] = entity_count - assert len(result["words"]) == len(result["tags"]) + assert len(result["words"]) == len(result["labels"]) + for tag in result["entity_count"]: + if tag != "All": + assert result["labels"].count(f"B-{tag}") == result["entity_count"][tag] return result +def look_for_further_entity_part(index, tag, characters, labels): + """Get further entities parts for long entities with nested entities. + + Input: + index: the starting index to look for rest of entity (one after last character included) + tag: the type of the entity investigated + characters: the string of the annotation or prediction + the labels associated with characters + Output : + complete string of the rest of the entity found + visited: indexes of the characters used for this last entity part OF THE DESIGNATED TAG. Do not process again later + """ + original_index = index + last_loop_index = index + research = True + visited = [] + while research: + while ( + index < len(characters) + and labels[index] != NOT_ENTITY_TAG + and get_type_label(labels[index]) != tag + ): + index += 1 + while ( + index < len(characters) + and "B" not in labels[index] + and get_type_label(labels[index]) == tag + ): + visited.append(index) + index += 1 + + research = index != last_loop_index and get_type_label(labels[index - 1]) == tag + last_loop_index = index + + characters_to_add = ( + characters[original_index:index] + if get_type_label(labels[index - 1]) == tag + else [] + ) + + return characters_to_add, visited + + def compute_matches( - annotation: str, prediction: str, tags_annot: list, tags_predict: list + annotation: str, prediction: str, labels_annot: list, labels_predict: list ) -> dict: """Compute prediction score from annotation string to prediction string. @@ -95,12 +199,12 @@ def compute_matches( This is done in looking for a sub-string roughly at the same position in the prediction, and with the right entity-tag. Here is an example to illustrate the method used : - *-------* *----* - tags_annot : PPPPPPPPPOOOOOOOCCCCCCOO - annotation : Tolkie-n- was a writer . - prediction : Tolkieene xas --writer . - tags_predict : PPPPPPPPPOCCCCCCCCCCCCCC - *-------* <-----*----*-> + *-------* *----* + labels_annot : PPPPPPPPPOOOOOOOCCCCCCOO + annotation : Tolkie-n- was a writer . + prediction : Tolkieene xas --writer . + labels_predict : PPPPPPPPPOCCCCCCCCCCCCCC + *-------* <-----*----*-> Each entity in the annotation string gets a prediction score based on the number of characters well predicted and labeled in the prediction string. 
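Before the matching logic itself, the scoring rule stated in this docstring can be sketched in isolation. `edlib` is the alignment library Nerval already uses; `is_recognized` is a hypothetical helper, not part of the module:

```python
# Sketch of the entity-level decision described above: a matched pair counts
# as recognized when the edit distance between the reference entity and its
# predicted counterpart is at most 30% of the reference entity's length.
import edlib

THRESHOLD = 0.30

def is_recognized(entity_ref: str, entity_compar: str) -> bool:
    distance = edlib.align(entity_compar, entity_ref)["editDistance"]
    return distance / len(entity_ref) <= THRESHOLD

print(is_recognized("writer", "writear"))  # True  (1 edit / 6 characters)
print(is_recognized("Paris", "Pxrxs"))     # False (2 edits / 5 characters)
```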
@@ -110,33 +214,42 @@ def compute_matches( Inputs : annotation : str, example : "Tolkie-n- was a writer- -." prediction : str, example : "Tolkieene xas --writear ,." - tags_annot : list of strings, example : ['P','P','P','P','P','P','P','P','P','O', ...] - tags_predict : list of string , example : ['P','P','P','P','P','P','P','P','P','O', ...] + labels_annot : list of strings, example : ['B-P','I-P','I-P','I-P','I-P','I-P','I-P','I-P','I-P','O', ...] + labels_predict : list of string , example : ['B-P','I-P','I-P','I-P','I-P','I-P','I-P','I-P','I-P','O', ...] Output : {TAG1 : nb_entity_matched, ...}, example : {'All': 1, 'OCC': 0, 'PER': 1} """ - assert annotation assert prediction - assert tags_annot - assert tags_predict + assert labels_annot + assert labels_predict entity_count = {"All": 0} last_tag = NOT_ENTITY_TAG - # Inspecting reference string + # Track indexes of characters found for continuation of nested entities + visited_annot = [] + visited_predict = [] + + # Iterating on reference string for i, char_annot in enumerate(annotation): - tag_ref = tags_annot[i] + + if i in visited_annot: + continue + + label_ref = labels_annot[i] + tag_ref = get_type_label(label_ref) + label_predict = labels_predict[i] + tag_predict = get_type_label(label_predict) # If character not in entity if tag_ref == NOT_ENTITY_TAG: last_tag = NOT_ENTITY_TAG - # Else, in entity else: # If beginning new entity - if not tag_ref == last_tag: + if "B" in label_ref: current_ref, current_compar = [], [] last_tag = tag_ref found_aligned_beginning = False @@ -145,12 +258,20 @@ def compute_matches( current_ref.append(char_annot) # Searching character string corresponding with tag - if not found_aligned_end and tags_predict[i] == tag_ref: + if not found_aligned_end and tag_predict == tag_ref: + + if i in visited_predict: + continue # If just beginning new entity, backtrack tags on prediction string - if len(current_ref) == 1: + if len(current_ref) == 1 and "B" not in labels_predict[i]: j = i - 1 - while j >= 0 and tags_predict[j] == tag_ref: + while ( + j >= 0 + and get_type_label(labels_predict[j]) == tag_ref + and "B" not in labels_predict[j] + and j not in visited_predict + ): j -= 1 current_compar += prediction[j + 1 : i] @@ -161,20 +282,26 @@ def compute_matches( elif found_aligned_beginning: found_aligned_end = True - # Detect end of entity in annotation + # If detect end of (1st part) entity in annotation: check for nested entity and compare if (i + 1 == len(annotation)) or ( - i + 1 < len(annotation) and not tags_annot[i + 1] == last_tag + i + 1 < len(annotation) + and get_type_label(labels_annot[i + 1]) != last_tag ): - # Aligned entity may end further in prediction, so get the rest of the characters if not found_aligned_end: - j = i + 1 - while j < len(tags_predict) and tags_predict[j] == tag_ref: - j += 1 - for k in range(i + 1, j): - current_compar.append(prediction[k]) + rest_predict, visited = look_for_further_entity_part( + i + 1, tag_ref, prediction, labels_predict + ) + current_compar += rest_predict + visited_predict += visited + + rest_annot, visited = look_for_further_entity_part( + i + 1, tag_ref, annotation, labels_annot + ) + current_ref += rest_annot + visited_annot += visited - # Normalize found character strings + # Normalize collected strings entity_ref = "".join(current_ref) entity_ref = entity_ref.replace("-", "") len_entity = len(entity_ref) @@ -182,7 +309,7 @@ def compute_matches( entity_compar = entity_compar.replace("-", "") # One entity is counted as recognized (score of 1) if the 
Levenshtein distance between the expected and predicted entities
-            # represents less than 30% (Threshold) of the length of the expected entity.
+            # represents less than 30% (THRESHOLD) of the length of the expected entity.
             # Precision and recall will be computed for each category in comparing the numbers of recognized entities and expected entities
             score = (
                 1
@@ -192,46 +319,57 @@
             )
             entity_count[last_tag] = entity_count.get(last_tag, 0) + score
             entity_count["All"] += score
+            current_ref = []
+            current_compar = []
 
     return entity_count
 
 
-def get_tags_aligned(original: str, aligned: str, tags_original: list) -> list:
-    """Takes original string, original string tags and aligned string given by edlib.align.
-    Returns a list of tags corresponding to the aligned string.
+def get_labels_aligned(original: str, aligned: str, labels_original: list) -> list:
+    """Takes the original string, the original string's labels and the aligned string given by edlib.align.
+    Returns a list of labels corresponding to the aligned string.
 
-    Output format : list of strings
+    Input formats:
+        original: str
+        aligned: str with hyphens
+        labels_original: list of labels ["O", "B-LOC", "I-LOC", ...]
+    Output format:
+        list of strings
     """
     assert original
     assert aligned
-    assert tags_original
+    assert labels_original
 
-    tags_aligned = []
+    labels_aligned = []
 
     index_original = 0
-    last_tag = NOT_ENTITY_TAG
+    last_label = NOT_ENTITY_TAG
 
     # Inspecting aligned string
     for i, char in enumerate(aligned):
-        new_tag = ""
 
-        # If original string has been fully processed, rest of tags are O ('-' characters at aligned end)
+        # If original string has been fully processed, rest of labels are "O" ('-' characters at aligned end)
         if index_original >= len(original):
-            new_tag = NOT_ENTITY_TAG
+            new_label = NOT_ENTITY_TAG
 
         # If current aligned char does not match current original char ('-' characters in aligned)
-        # Keep last_tag and don't increment index_original
+        # Keep last_label and don't increment index_original
         elif not char == original[index_original]:
-            new_tag = last_tag
+            new_label = (
+                last_label
+                if "B" not in last_label
+                else f"I-{get_type_label(last_label)}"
+            )
 
-        # Until matching of characters)
+        # Until the characters match
         else:
-            new_tag = tags_original[index_original]
-            last_tag = new_tag
+            new_label = labels_original[index_original]
+            last_label = new_label
             index_original += 1
 
-        tags_aligned.append(new_tag)
+        labels_aligned.append(new_label)
 
-    return tags_aligned
+    return labels_aligned
 
 
 def compute_scores(
@@ -309,10 +447,10 @@ def print_results(scores: dict):
 
 def run(annotation: str, prediction: str) -> dict:
     """Compute recall and precision for each entity type found in annotation and/or prediction.
 
-    Each measure is given at document level, global score is a micro-average over tag types.
+    Each measure is given at document level; the global score is a micro-average across entity types.
""" - # Get string and list of tags per character + # Get string and list of labels per character annot = parse_bio(annotation) predict = parse_bio(prediction) @@ -328,15 +466,17 @@ def run(annotation: str, prediction: str) -> dict: annot_aligned = nice_alignment["query_aligned"] predict_aligned = nice_alignment["target_aligned"] - # Align tags from string alignment - tags_annot_aligned = get_tags_aligned(annot["words"], annot_aligned, annot["tags"]) - tags_predict_aligned = get_tags_aligned( - predict["words"], predict_aligned, predict["tags"] + # Align labels from string alignment + labels_annot_aligned = get_labels_aligned( + annot["words"], annot_aligned, annot["labels"] + ) + labels_predict_aligned = get_labels_aligned( + predict["words"], predict_aligned, predict["labels"] ) # Get nb match matches = compute_matches( - annot_aligned, predict_aligned, tags_annot_aligned, tags_predict_aligned + annot_aligned, predict_aligned, labels_annot_aligned, labels_predict_aligned ) # Compute scores diff --git a/setup.py b/setup.py index b3f9432..4ecabee 100644 --- a/setup.py +++ b/setup.py @@ -14,10 +14,10 @@ install_requires = requirements("requirements.txt") setup( name="Nerval", - version=0.1, + version=0.3, description="Tool to evaluate NER on noisy text.", author="Teklia", - author_email="contact@teklia.com", + author_email="bmiret@teklia.com", packages=["nerval"], entry_points={"console_scripts": ["nerval=nerval.evaluate:main"]}, install_requires=install_requires, diff --git a/tests/test_compute_matches.py b/tests/test_compute_matches.py index cf596dd..8d5d28a 100644 --- a/tests/test_compute_matches.py +++ b/tests/test_compute_matches.py @@ -6,52 +6,142 @@ from nerval import evaluate fake_annot_aligned = "Gérard de -N-erval was bo-rn in Paris in 1808 -." fake_predict_aligned = "G*rard de *N*erval ----bo*rn in Paris in 1833 *." +fake_string_nested = "Louis par la grâce de Dieu roy de France et de Navarre." 
+ # fmt: off -fake_annot_tags_aligned = [ - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', - 'PER', - 'PER', 'PER', - 'PER', - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', - 'O', - 'O', 'O', 'O', - 'O', - 'O', 'O', 'O', 'O', 'O', - 'O', - 'O', 'O', - 'O', - 'LOC', 'LOC', 'LOC', 'LOC', 'LOC', - 'O', - 'O', 'O', +fake_tags_aligned_nested_perfect = [ + 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', + 'I-PER', + 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', + 'I-PER', + 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', + 'I-PER', + 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', - 'DAT', 'DAT', 'DAT', 'DAT', + 'O' +] + + +fake_tags_aligned_nested_false = [ + 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', + 'I-PER', + 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', - 'O', 'O' + 'O' ] fake_predict_tags_aligned = [ - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', - 'PER', - 'PER', 'PER', - 'PER', - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', + 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', - '***', '***', '***', '***', '***', + 'B-***', 'I-***', 'I-***', 'I-***', 'I-***', 'O', 'O', 'O', 'O', - 'DAT', 'DAT', 'DAT', 'DAT', + 'B-DAT', 'I-DAT', 'I-DAT', 'I-DAT', 'O', 'O', 'O' ] # fmt: on +fake_annot_tags_aligned = [ + "B-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "I-PER", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "B-LOC", + "I-LOC", + "I-LOC", + "I-LOC", + "I-LOC", + "O", + "O", + "O", + "O", + "B-DAT", + "I-DAT", + "I-DAT", + "I-DAT", + "O", + "O", + "O", +] + expected_matches = {"All": 1, "PER": 1, "LOC": 0, "DAT": 0} +expected_matches_nested_perfect = {"All": 3, "PER": 1, "LOC": 2} +expected_matches_nested_false = {"All": 2, "PER": 1, "LOC": 1} @pytest.mark.parametrize( @@ -65,7 +155,25 @@ expected_matches = {"All": 1, "PER": 1, "LOC": 0, "DAT": 0} fake_predict_tags_aligned, ), expected_matches, - ) + ), + ( + ( + fake_string_nested, + fake_string_nested, + fake_tags_aligned_nested_perfect, + fake_tags_aligned_nested_perfect, + ), + expected_matches_nested_perfect, + ), + ( + ( + fake_string_nested, + fake_string_nested, + fake_tags_aligned_nested_perfect, + fake_tags_aligned_nested_false, + ), + expected_matches_nested_false, + ), ], ) def test_compute_matches(test_input, expected): diff --git a/tests/test_get_tags_aligned.py b/tests/test_get_labels_aligned.py similarity index 57% rename from tests/test_get_tags_aligned.py rename to tests/test_get_labels_aligned.py index 
a107053..a2d45bb 100644 --- a/tests/test_get_tags_aligned.py +++ b/tests/test_get_labels_aligned.py @@ -11,11 +11,11 @@ fake_predict_aligned = "G*rard de *N*erval ----bo*rn in Paris in 1833 *." # fmt: off fake_annot_tags_original = [ - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', - 'PER', - 'PER', 'PER', - 'PER', - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', + 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', @@ -23,21 +23,21 @@ fake_annot_tags_original = [ 'O', 'O', 'O', 'O', - 'LOC', 'LOC', 'LOC', 'LOC', 'LOC', + 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', - 'DAT', 'DAT', 'DAT', 'DAT', + 'B-DAT', 'I-DAT', 'I-DAT', 'I-DAT', 'O', 'O' ] fake_predict_tags_original = [ - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', - 'PER', - 'PER', 'PER', - 'PER', - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', + 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', @@ -47,17 +47,17 @@ fake_predict_tags_original = [ 'O', 'O', 'O', 'O', - 'DAT', 'DAT', 'DAT', 'DAT', + 'B-DAT', 'I-DAT', 'I-DAT', 'I-DAT', 'O', 'O', 'O' ] expected_annot_tags_aligned = [ - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', - 'PER', - 'PER', 'PER', - 'PER', - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', + 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', @@ -65,21 +65,21 @@ expected_annot_tags_aligned = [ 'O', 'O', 'O', 'O', - 'LOC', 'LOC', 'LOC', 'LOC', 'LOC', + 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', - 'DAT', 'DAT', 'DAT', 'DAT', + 'B-DAT', 'I-DAT', 'I-DAT', 'I-DAT', 'O', 'O', 'O' ] expected_predict_tags_aligned = [ - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', - 'PER', - 'PER', 'PER', - 'PER', - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', + 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', @@ -90,7 +90,7 @@ expected_predict_tags_aligned = [ 'O', 'O', 'O', 'O', - 'DAT', 'DAT', 'DAT', 'DAT', + 'B-DAT', 'I-DAT', 'I-DAT', 'I-DAT', 'O', 'O', 'O' ] @@ -110,10 +110,10 @@ expected_predict_tags_aligned = [ ), ], ) -def test_get_tags_aligned(test_input, expected): - assert evaluate.get_tags_aligned(*test_input) == expected +def test_get_labels_aligned(test_input, expected): + assert evaluate.get_labels_aligned(*test_input) == expected -def test_get_tags_aligned_empty_entry(): +def test_get_labels_aligned_empty_entry(): with pytest.raises(AssertionError): - evaluate.get_tags_aligned(None, None, None) + evaluate.get_labels_aligned(None, None, None) diff --git a/tests/test_nested.bio b/tests/test_nested.bio new file mode 100644 index 0000000..6e74429 --- /dev/null +++ b/tests/test_nested.bio @@ -0,0 +1,13 @@ +Louis B-PER +par I-PER +la I-PER +grâce I-PER +de I-PER +Dieu I-PER +roy I-PER +de I-PER +France B-LOC +et I-PER +de I-PER +Navarre B-LOC +. 
O diff --git a/tests/test_parse_bio.py b/tests/test_parse_bio.py index 543ec20..2572dc8 100644 --- a/tests/test_parse_bio.py +++ b/tests/test_parse_bio.py @@ -12,12 +12,12 @@ FAKE_PREDICT_BIO = "tests/test_predict.bio" # fmt: off expected_parsed_annot = { 'entity_count': {'All': 3, 'DAT': 1, 'LOC': 1, 'PER': 1}, - 'tags': [ - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', - 'PER', - 'PER', 'PER', - 'PER', - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', + 'labels': [ + 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', @@ -25,11 +25,11 @@ expected_parsed_annot = { 'O', 'O', 'O', 'O', - 'LOC', 'LOC', 'LOC', 'LOC', 'LOC', + 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', - 'DAT', 'DAT', 'DAT', 'DAT', + 'B-DAT', 'I-DAT', 'I-DAT', 'I-DAT', 'O', 'O' ], @@ -38,22 +38,22 @@ expected_parsed_annot = { expected_parsed_predict = { 'entity_count': {'All': 3, 'DAT': 1, '***': 1, 'PER': 1}, - 'tags': [ - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', - 'PER', - 'PER', 'PER', - 'PER', - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', + 'labels': [ + 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', + 'I-PER', + 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', - '***', '***', '***', '***', '***', + 'B-***', 'I-***', 'I-***', 'I-***', 'I-***', 'O', 'O', 'O', 'O', - 'DAT', 'DAT', 'DAT', 'DAT', + 'B-DAT', 'I-DAT', 'I-DAT', 'I-DAT', 'O', 'O', 'O' ], diff --git a/tests/test_run.py b/tests/test_run.py index 8cf9c84..a9ca424 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -6,6 +6,28 @@ from nerval import evaluate FAKE_ANNOT_BIO = "tests/test_annot.bio" FAKE_PREDICT_BIO = "tests/test_predict.bio" EMPTY_BIO = "tests/test_empty.bio" +FAKE_BIO_NESTED = "tests/test_nested.bio" + +expected_scores_nested = { + "All": { + "P": 1.0, + "R": 1.0, + "F1": 1.0, + "predicted": 3, + "matched": 3, + "Support": 3, + }, + "PER": {"P": 1.0, "R": 1.0, "F1": 1.0, "predicted": 1, "matched": 1, "Support": 1}, + "LOC": { + "P": 1.0, + "R": 1.0, + "F1": 1.0, + "predicted": 2, + "matched": 2, + "Support": 2, + }, +} + expected_scores = { "***": { @@ -38,7 +60,11 @@ expected_scores = { @pytest.mark.parametrize( - "test_input, expected", [((FAKE_ANNOT_BIO, FAKE_PREDICT_BIO), expected_scores)] + "test_input, expected", + [ + ((FAKE_ANNOT_BIO, FAKE_PREDICT_BIO), expected_scores), + ((FAKE_BIO_NESTED, FAKE_BIO_NESTED), expected_scores_nested), + ], ) def test_run(test_input, expected): # print(evaluate.run(*test_input)) -- GitLab
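For completeness, the patched evaluation can be exercised end to end either through the CLI entry point declared in setup.py (`nerval -a demo/toy_test_annot.bio -p demo/toy_test_predict.bio`) or from Python. A minimal sketch — the demo paths come from the README, and `run()` returns the per-type score dictionary checked in test_run.py:

```python
# Python usage of the patched module: run() aligns the two BIO files,
# matches entities and returns precision/recall/F1 per entity type.
from nerval import evaluate

scores = evaluate.run("demo/toy_test_annot.bio", "demo/toy_test_predict.bio")
for tag, metrics in scores.items():
    print(tag, metrics["P"], metrics["R"], metrics["F1"])
```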