diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..21a2baed9dd04a0fdf033062db21300e707ec93a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+.env/
+*.egg-info/
+__pycache__
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000000000000000000000000000000000000..bb3ec5f0d4c70ef732aa4d399282a5ddb2a008fd
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1 @@
+include README.md
diff --git a/README.md b/README.md
index 2e769438256b2435405133a9bc11fa1d4116d977..7b8bd8ee4c8f2be8fddb32c1445fa598798c6801 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,186 @@
-D'après l'oeuvre de Gérard de Nerval.
+# Nerval
+
+> Je suis l'autre
+>
+>  -- <cite>Gérard de Nerval</cite>
+
+An NER evaluation metric for noisy text, typically used to measure NER performance on HTR predictions.
+
+## Usage
+
+After cloning the repository, install the package with:
+
+```
+$ cd nerval
+$ pip3 install .
+```
+
+To run the tests and check that everything is fine:
+```
+$ cd tests
+$ pytest
+```
+
+You can now use Nerval from the command line:
+```
+$ nerval -a/--annot <annot_file.bio> -p/--predict <predict_file.bio>
+```
+
+To run it on the demo files:
+
+```
+$ nerval -a demo/demo_annot.bio -p demo/demo_predict.bio
+```
+
+We also provide two annotation and prediction toy files, which are identical for now and produce perfect scores. Feel free to play with the text and entity tags in the prediction file to see the impact on the scores.
+
+```
+$ nerval -a demo/toy_test_annot.bio -p demo/toy_test_predict.bio
+```
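+
+Nerval can also be called from Python. A minimal sketch using the `run` function from `nerval/evaluate.py` on the demo files (assuming the package is installed and the script is run from the repository root):
+
+```
+from nerval import evaluate
+
+# run() returns a dict {tag: {"P": ..., "R": ..., "F1": ...}} and logs the detailed results
+scores = evaluate.run("demo/demo_annot.bio", "demo/demo_predict.bio")
+print(scores["All"])
+```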
+
+
+
+## Metric
+
+This metric uses string alignment at character level.
+
+The automatic transcription is first aligned with the ground truth at character level, by minimising the Levenshtein distance between them. Each entity in the ground truth is then matched with a corresponding entity bearing the same label in the aligned transcription, or with an empty character string if no match is found. If the edit distance between the two entities is less than 30% of the ground truth entity length, the predicted entity is considered recognised. For the purpose of matching detected entities to existing databases, we estimated that a 70% match between the entity texts was a fair threshold.
+
+#### Details
+
+- From the input BIO files, the text content is retrieved and the word-level tagging is extended to a character-level tagging (a sketch of this expansion follows the examples below)
+    - a space is added between each pair of words
+    - a space between two words with the same tag gets that tag, otherwise O
+    - the information about the beginning of an entity is dropped
+
+For instance, the following annotation file:
+
+```
+Tolkien B-PER
+was O
+a O
+writer B-OCC
+. O
+```
+produces the following list of tags, one per character, spaces included:
+
+```
+['PER','PER','PER','PER','PER','PER','PER',
+ 'O',
+ 'O', 'O', 'O',
+ 'O',
+ 'O',
+ 'O',
+ 'OCC','OCC','OCC','OCC','OCC','OCC',
+ 'O',
+ 'O'] 
+```
+
+And the prediction file could be:
+
+```
+Tolkieene B-PER
+xas O
+writear B-OCC
+,. O
+```
+
+producing:
+
+```
+['PER','PER','PER','PER','PER','PER','PER','PER','PER',
+ 'O',
+ 'O', 'O', 'O',
+ 'O',
+ 'OCC','OCC','OCC','OCC','OCC','OCC','OCC',
+ 'O',
+ 'O','O'] 
+```
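+
+A minimal sketch of this word-level to character-level expansion. The actual implementation is `parse_bio` in `nerval/evaluate.py`; the helper below is only illustrative:
+
+```
+def expand_tags(bio_lines):
+    """Expand word-level BIO labels to one tag per character, spaces included."""
+    tags, last_tag = [], None
+    for line in bio_lines:
+        word, label = line.split()
+        tag = "O" if label == "O" else label.split("-", 1)[1]
+        if last_tag is not None:
+            # The space inserted between two words gets the shared tag, else O
+            tags.append(tag if tag == last_tag else "O")
+        tags += [tag] * len(word)
+        last_tag = tag
+    return tags
+
+print(expand_tags(["Tolkien B-PER", "was O", "a O", "writer B-OCC", ". O"]))
+```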
+
+- Character-level alignment between annotation and prediction adds '-' characters to both strings so that they have the same length (see the edlib sketch after the example below)
+
+With the following input texts:
+
+```
+annotation : Tolkien was a writer .
+prediction : Tolkieene xas writear ,.
+```
+
+the alignment result is:
+
+```
+annotation : Tolkie-n- was a writer- -.
+prediction : Tolkieene xas --writear ,. 
+```
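+
+A sketch of this alignment step, using the same edlib calls as `nerval/evaluate.py`:
+
+```
+import edlib
+
+annotation = "Tolkien was a writer ."
+prediction = "Tolkieene xas writear ,."
+
+result = edlib.align(annotation, prediction, task="path")
+nice = edlib.getNiceAlignment(result, annotation, prediction)
+print(nice["query_aligned"])   # annotation padded with '-'
+print(nice["target_aligned"])  # prediction padded with '-'
+```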
+
+- Adapt the character-level tags to the aligned strings (see the sketch after the example below)
+  - '-' characters in the aligned strings get the same tag as the preceding actual character in the string
+
+```
+             PPPPPPPPPOOOOOOOCCCCCCCOOO
+annotation : Tolkie-n- was a writer- -.
+prediction : Tolkieene xas --writear ,. 
+             PPPPPPPPPOOOOOOOCCCCCCCOOO
+```
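+
+A usage sketch of `get_tags_aligned` from `nerval/evaluate.py`, which propagates tags to the padded strings:
+
+```
+from nerval import evaluate
+
+original = "writer ."
+aligned = "writer- -."
+tags_original = ["OCC"] * 6 + ["O", "O"]
+
+# Expected with these inputs: ['OCC'] * 7 + ['O', 'O', 'O'], one tag per aligned character
+print(evaluate.get_tags_aligned(original, aligned, tags_original))
+```
+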
+- Search for a matching entity in the prediction for each entity in the annotation (see the sketch after the delimitation examples below)
+  - Inspecting the annotation character by character, when a new entity tag (not 'O') is encountered, the character is considered as the beginning of an entity to be matched.
+  - If the entity tag of the prediction matches at the same position, tags are back-tracked in the prediction string to find the beginning of the predicted entity, i.e. the first occurrence of this entity tag.
+  - Otherwise, if the tags do not match on this first character, the beginning of a matching entity is searched for in the prediction until the end of the annotated entity.
+  - In both the annotation and the prediction, the detected entity ends with the last consecutive occurrence of the tag of its first character.
+
+Here are several example situations, with the delimitation of the matched entities shown in each case.
+
+```
+The delimitations of the matched entities are represented by | characters
+
+annotation : OOOOOOO|PPPPPPPPPPPPPPPPP|OOOOOO
+prediction : OOOO|PPPPPPPPPPP|OOOOOOOOOOOOOOO
+
+annotation : OOOOOOO|PPPPPPPPPPPPPPPPP|OOOOOO
+prediction : OOOOOOOOOOOOOO|PPPPPPPPPPPPPP|OO
+
+annotation : OOOOOOO|PPPPPPPPPPPPPPPPP|OOOOOO
+prediction : OOOO|PPPPPPPPPPP|OOOOPPPPOOOOOOO
+
+annotation : OOOOOOO|PPPPPPPPPPPPPPPPP|OOOOOO
+prediction : OOOOOOO|P|OPPPPPPPPPPPPPPOOOOOOO
+
+annotation : OOOOOOO|PPPPPPPPPPPPPPPPP|OOOOOO
+prediction : OOOOOOOOOOOOOOOOOOOOOOOOOOPPPPOO
+
+For this last example, no match is found in the prediction.
+```
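+
+A usage sketch of `compute_matches` from `nerval/evaluate.py` on the aligned Tolkien example above, using single-letter tags for brevity:
+
+```
+from nerval import evaluate
+
+annot = "Tolkie-n- was a writer- -."
+predict = "Tolkieene xas --writear ,."
+tags = list("PPPPPPPPPOOOOOOOCCCCCCCOOO")
+
+# Expected with these inputs: {'All': 2, 'P': 1, 'C': 1}
+print(evaluate.compute_matches(annot, predict, tags, tags))
+```
+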
+- Score the two matched strings (see the sketch after the example below):
+  - Compute the Levenshtein distance between the two strings, ignoring the '-' characters
+  - If edit_distance / length(annotation_entity) < 0.3, the entity is considered recognised
+
+```
+edit_distance("Tolkien", "Tolkieene") = 2
+len("Tolkien") = 7
+2/7 = 0.29 < 0.3
+OK
+
+edit_distance("writer", "writear") = 1
+len("writer") = 6
+1/6 = 0.17 < 0.3
+OK
+```
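+
+The same threshold check, written with the editdistance package used in `nerval/evaluate.py`:
+
+```
+import editdistance
+
+THRESHOLD = 0.30
+for ref, pred in [("Tolkien", "Tolkieene"), ("writer", "writear")]:
+    dist = editdistance.eval(ref, pred)
+    print(ref, pred, round(dist / len(ref), 2), dist / len(ref) < THRESHOLD)
+```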
+
+- The final scores, precision, recall and F1-score, are given for each entity type, at entity level. The total ("All") is a micro-average across entity types (see the sketch after the example below).
+
+```
+PER :
+P = 1/1
+R = 1/1
+F1 = 2*1*1/(1+1)
+
+OCC :
+P = 1/1
+R = 1/1
+F1 = 2*1*1/(1+1)
+
+All :
+P = 2/2
+R = 2/2
+F1 = 2*1*1/(1+1)
+```
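+
+A sketch of the final score computation with `compute_scores` from `nerval/evaluate.py`, fed with the entity counts from the example above:
+
+```
+from nerval import evaluate
+
+annot_counts = {"All": 2, "PER": 1, "OCC": 1}
+predict_counts = {"All": 2, "PER": 1, "OCC": 1}
+matches = {"All": 2, "PER": 1, "OCC": 1}
+
+# Each tag should get {'P': 1.0, 'R': 1.0, 'F1': 1.0} with these counts
+print(evaluate.compute_scores(annot_counts, predict_counts, matches))
+```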
diff --git a/demo/demo_annot.bio b/demo/demo_annot.bio
new file mode 100644
index 0000000000000000000000000000000000000000..cf16200fcd0436d943f71127f63894e428fafea7
--- /dev/null
+++ b/demo/demo_annot.bio
@@ -0,0 +1,82 @@
+Césaire B-PER
+Alphonse I-PER
+Garon I-PER
+marraine O
+Adeline B-PER
+Dionne I-PER
+, O
+soussignés O
+Lecture O
+faite O
+Adéline O
+Dionne O
+Arsène O
+Côté O
+Arpin O
+R O
+Le O
+onze B-DAT
+aout I-DAT
+mil I-DAT
+neuf I-DAT
+cent I-DAT
+un I-DAT
+nous O
+prêtre O
+soussigné O
+avons O
+baptisé O
+Marie B-PER
+Luce I-PER
+Louise I-PER
+, O
+née O
+la B-DAT
+veille I-DAT
+, O
+fille O
+légitime O
+de O
+Carmel B-PER
+Côté I-PER
+, O
+cordonnier B-OCC
+, O
+pré O
+- O
+sent O
+, O
+déclarant O
+ne O
+savoir O
+signer O
+, O
+et O
+de O
+Eugé B-PER
+nie I-PER
+Fréchette I-PER
+, O
+de O
+cette B-LOC
+paroisse I-LOC
+. O
+Parrain O
+Napoléon B-PER
+Fréchette I-PER
+, O
+marraine O
+Adeline B-PER
+Tremblay I-PER
+, O
+soussignés O
+, O
+de O
+Ste B-LOC
+Luce I-LOC
+, O
+Lec O
+- O
+ture O
+faite O
+. O
diff --git a/demo/demo_predict.bio b/demo/demo_predict.bio
new file mode 100644
index 0000000000000000000000000000000000000000..7e01c2d127aa47c95c49ab87a06e06d86f9af9b0
--- /dev/null
+++ b/demo/demo_predict.bio
@@ -0,0 +1,81 @@
+Césaire B-PER
+Alphonse O
+Garon B-PER
+marraine O
+Adeline B-PER
+Dionne I-PER
+, O
+soussignés O
+Lecture O
+faite O
+Adéline O
+Dionne O
+Arsène O
+Côté O
+Arpin O
+R O
+Le O
+onze B-DAT
+aout I-DAT
+mil I-DAT
+neuf I-DAT
+cent I-DAT
+un O
+nous O
+pretre O
+soussigné O
+avons O
+baptisé O
+Marie B-PER
+Luce I-PER
+Louise I-PER
+, O
+née O
+la B-DAT
+veille I-DAT
+, O
+fille O
+légitime O
+de O
+Carmel B-PER
+Côté I-PER
+, O
+cordonnier B-OCC
+, O
+pré O
+- O
+sent O
+, O
+déclarant O
+ne O
+savoir O
+signer O
+, O
+et O
+de O
+Eugé B-PER
+nie I-PER
+Fréchette I-PER
+, O
+de O
+cette B-LOC
+paroisse I-LOC
+. O
+Parrain O
+Napoléon B-PER
+Fréchette I-PER
+, O
+marraine O
+Adéline B-PER
+Tremblay I-PER
+, O
+sousignés O
+, O
+de O
+St B-LOC
+. I-LOC
+Luce O
+, O
+Lec O
+ture O
+faite O
diff --git a/demo/toy_test_annot.bio b/demo/toy_test_annot.bio
new file mode 100644
index 0000000000000000000000000000000000000000..5a941ee8ce361e67dbd213b2b2cd5d6a6a53e4ef
--- /dev/null
+++ b/demo/toy_test_annot.bio
@@ -0,0 +1,39 @@
+John B-PER
+Ronald I-PER
+Reuel I-PER
+Tolkien I-PER
+was O
+born O
+on O
+three B-DAT
+January I-DAT
+eighteen I-DAT
+ninety I-DAT
+- I-DAT
+two I-DAT
+in O
+Bloemfontein B-LOC
+in O
+the O
+Orange B-LOC
+Free I-LOC
+State I-LOC
+, O
+to O
+Arthur B-PER
+Reuel I-PER
+Tolkien I-PER
+, O
+an O
+English O
+bank B-OCC
+manager I-OCC
+, O
+and O
+his O
+wife O
+Mabel B-PER
+, O
+née O
+Suffield B-PER
+. O
diff --git a/demo/toy_test_predict.bio b/demo/toy_test_predict.bio
new file mode 100644
index 0000000000000000000000000000000000000000..5a941ee8ce361e67dbd213b2b2cd5d6a6a53e4ef
--- /dev/null
+++ b/demo/toy_test_predict.bio
@@ -0,0 +1,39 @@
+John B-PER
+Ronald I-PER
+Reuel I-PER
+Tolkien I-PER
+was O
+born O
+on O
+three B-DAT
+January I-DAT
+eighteen I-DAT
+ninety I-DAT
+- I-DAT
+two I-DAT
+in O
+Bloemfontein B-LOC
+in O
+the O
+Orange B-LOC
+Free I-LOC
+State I-LOC
+, O
+to O
+Arthur B-PER
+Reuel I-PER
+Tolkien I-PER
+, O
+an O
+English O
+bank B-OCC
+manager I-OCC
+, O
+and O
+his O
+wife O
+Mabel B-PER
+, O
+née O
+Suffield B-PER
+. O
diff --git a/nerval/__init__.py b/nerval/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/nerval/evaluate.py b/nerval/evaluate.py
new file mode 100644
index 0000000000000000000000000000000000000000..93b35c2d3b29ed40853bc86bd99f34025b85ef19
--- /dev/null
+++ b/nerval/evaluate.py
@@ -0,0 +1,325 @@
+import argparse
+import logging
+import os
+import re
+
+import edlib
+import editdistance
+
+'''
+This script takes two BIO files, one with annotations and one with predictions, and computes recall and precision for each NE label.
+'''
+
+THRESHOLD = 0.30
+NOT_ENTITY_TAG = "O"
+
+def parse_bio(path : str) -> dict :
+    ''' Parse a BIO file to get text content, character-level NE tags and entity types count.
+
+    Input : path to a valid BIO file
+    Output format : { "words": str, "tags": list, "entity_count" : { tag : int } }
+    '''
+
+    assert os.path.exists(path)
+
+    words = []
+    tags = []
+    entity_count = {'All':0}
+    last_tag = None
+
+    with open(path, 'r') as fd :
+
+        for line in list(filter(lambda x:x!='\n', fd.readlines())) :
+
+            word, label = line.split()
+
+            # Replace '-' characters with '§' so that original dashes are not confused with the dashes added later during the alignment
+            word = word.replace('-', '§')
+            words.append(word)
+
+            try :
+                tag = NOT_ENTITY_TAG if label == NOT_ENTITY_TAG else re.match(r"[BIES]-(.{3})", label)[1]
+            except TypeError as e:
+                raise Exception(f"The file {path} given in input is not in BIO format.") from e
+
+            # Spaces will be added between words and have to get a tag
+            # If previous word has the same tag as current, the space also gets the tag
+            if last_tag is not None :
+                if last_tag == tag :
+                    tags.append(tag)
+                else :
+                    tags.append(NOT_ENTITY_TAG)
+
+            # Add a tag for each letter in the word
+            tags += [tag,] * len(word)
+
+            # Count nb entity for each type
+            if (not label == 'O') and (not last_tag == tag) :
+                entity_count[tag] = entity_count.get(tag, 0) + 1
+                entity_count['All'] += 1
+
+            last_tag = tag
+
+        result = None
+
+        if words :
+            # Make string out of words
+            result = dict()
+            result["words"] = " ".join(words)
+            result["tags"] = tags
+            result["entity_count"] = entity_count
+            assert len(result["words"]) == len(result["tags"])
+
+    return result
+
+
+def compute_matches(annotation : str, prediction : str, tags_annot : list, tags_predict : list) -> dict :
+    '''Compute prediction score from annotation string to prediction string.
+
+    Annotation and prediction strings should be the same length.
+
+    For each entity in the annotation string, a match is found in the prediction.
+    This is done by looking for a sub-string at roughly the same position in the prediction, with the right entity tag.
+    Here is an example to illustrate the method used :
+
+                   *-------*       *----*
+    tags_annot   : PPPPPPPPPOOOOOOOCCCCCCOO
+    annotation   : Tolkie-n- was a writer .
+    prediction   : Tolkieene xas --writer .
+    tags_predict : PPPPPPPPPOCCCCCCCCCCCCCC
+                   *-------* <-----*----*->
+
+    Each entity in the annotation string is compared with its match in the prediction string
+    and counted as recognised if the edit distance between the two is below the threshold.
+    The output gives, for each label, the number of recognised entities.
+
+    Inputs :
+    annotation : str, example : "Tolkie-n- was a writer- -."
+    prediction : str, example : "Tolkieene xas --writear ,."
+    tags_annot : list of strings,   example : ['P','P','P','P','P','P','P','P','P','O', ...]
+    tags_predict : list of string , example : ['P','P','P','P','P','P','P','P','P','O', ...]
+
+    Output : {TAG1 : nb_entity_matched, ...}, example : {'All': 1, 'OCC': 0, 'PER': 1}
+    '''
+
+    assert annotation
+    assert prediction
+    assert tags_annot
+    assert tags_predict
+
+    entity_count = {"All":0}
+    last_tag = NOT_ENTITY_TAG
+
+    # Inspecting reference string
+    for i, char_annot in enumerate(annotation) :
+        tag_ref = tags_annot[i]
+
+        # If character not in entity
+        if tag_ref == NOT_ENTITY_TAG :
+            last_tag =  NOT_ENTITY_TAG
+
+        # Else, in entity
+        else :
+
+            # If beginning new entity
+            if not tag_ref == last_tag :
+                current_ref, current_compar = [], []
+                last_tag = tag_ref
+                found_aligned_beginning = False
+                found_aligned_end = False
+
+            current_ref.append(char_annot)
+
+            # Searching character string corresponding with tag
+            if not found_aligned_end and tags_predict[i] == tag_ref :
+
+                # If just beginning new entity, backtrack tags on prediction string
+                if len(current_ref) == 1:
+                    j = i-1
+                    while j >= 0 and tags_predict[j] == tag_ref :
+                        j -= 1
+                    current_compar += prediction[j+1:i]
+
+                found_aligned_beginning = True
+                current_compar.append(prediction[i])
+
+            # If tags don't match and beginning was found : end of predicted entity
+            elif found_aligned_beginning :
+                found_aligned_end = True
+
+            # Detect end of entity in annotation
+            if (i+1 == len(annotation)) or (i+1 < len(annotation) and not tags_annot[i+1] == last_tag) :
+
+                # Aligned entity may end further in prediction, so get the rest of the characters
+                if not found_aligned_end :
+                    j = i+1
+                    while j < len(tags_predict) and tags_predict[j] == tag_ref :
+                        j += 1
+                    for k in range(i+1, j) :
+                        current_compar.append(prediction[k])
+
+                # Normalize found character strings
+                entity_ref = "".join(current_ref)
+                entity_ref = entity_ref.replace("-", "")
+                len_entity = len(entity_ref)
+                entity_compar = "".join(current_compar)
+                entity_compar = entity_compar.replace("-", "")
+
+                # One entity is counted as recognised (score of 1) if the Levenshtein distance between the expected and predicted entities
+                # represents less than 30% (THRESHOLD) of the length of the expected entity.
+                # Precision and recall will be computed for each category by comparing the numbers of recognised entities and expected entities
+                score = 1 if editdistance.eval(entity_ref, entity_compar)/len_entity < THRESHOLD else 0
+                entity_count[last_tag] = entity_count.get(last_tag, 0) + score
+                entity_count["All"] += score
+
+    return entity_count
+
+def get_tags_aligned(original : str, aligned : str, tags_original : list) -> list:
+    ''' Takes the original string, its character-level tags and the aligned string given by edlib.getNiceAlignment.
+    Returns a list of tags corresponding to the aligned string.
+
+    Output format : list of strings
+    '''
+    assert original
+    assert aligned
+    assert tags_original
+
+    tags_aligned = []
+    index_original = 0
+    last_tag = NOT_ENTITY_TAG
+
+    # Inspecting aligned string
+    for i, char in enumerate(aligned) :
+        new_tag = ""
+
+        # If original string has been fully processed, rest of tags are O ('-' characters at aligned end)
+        if index_original >= len(original) :
+            new_tag = NOT_ENTITY_TAG
+
+        # If current aligned char does not match current original char ('-' characters in aligned)
+        # Keep last_tag and don't increment index_original
+        elif not char == original[index_original] :
+            new_tag = last_tag
+
+        # Otherwise the characters match: take the original tag and advance in the original string
+        else :
+            new_tag = tags_original[index_original]
+            last_tag = new_tag
+            index_original += 1
+
+        tags_aligned.append(new_tag)
+
+    return tags_aligned
+
+def compute_scores(annot_tags_count : dict, predict_tags_count : dict, matches : dict) -> dict :
+    ''' Compute Precision, Recall and F1 score for all entity types found in annotation and prediction.
+
+    Each measure is given at document level; the global score is a micro-average over tag types.
+
+    Inputs :
+    annot :   { TAG1(str) : nb_entity(int), ...}
+    predict : { TAG1(str) : nb_entity(int), ...}
+    matches : { TAG1(str) : nb_entity_matched(int), ...}
+
+    Output :
+    scores : { TAG1(str) : {"P" : float, "R" : float, "F1" : float}, ... }
+    '''
+
+    annot_tags = set(annot_tags_count.keys())
+    predict_tags = set(predict_tags_count.keys())
+    tags = annot_tags | predict_tags
+
+    scores = { tag : {"P" : None, "R" : None, "F1" : None} for tag in tags}
+
+    for tag in sorted(tags)[::-1] :
+        nb_predict = predict_tags_count.get(tag)
+        nb_annot = annot_tags_count.get(tag)
+        nb_match = matches.get(tag,0)
+        prec = None if not nb_predict else nb_match / nb_predict
+        rec = None if not nb_annot else nb_match / nb_annot
+        f1 = None if (prec is None) or (rec is None) else 0 if (prec+rec==0) else 2 * (prec * rec) / (prec + rec)
+
+        scores[tag]["P"] = prec
+        scores[tag]["R"] = rec
+        scores[tag]["F1"] = f1
+
+    return scores
+
+def print_results(scores : dict) :
+    ''' Display final results.
+
+    None values are kept to indicate the absence of a certain tag in either annotation or prediction.
+    '''
+    logging.info("-- Results --")
+
+    for tag in sorted(scores.keys())[::-1] :
+
+        prec = None if scores[tag]["P"] is None else round(scores[tag]["P"],2)
+        rec = None if scores[tag]["R"] is None else round(scores[tag]["R"],2)
+        f1 = None if scores[tag]["F1"] is None else round(scores[tag]["F1"],2)
+
+        logging.info(f"{tag} :")
+        logging.info(f"P = {prec}")
+        logging.info(f"R = {rec}")
+        logging.info(f"F1 = {f1}")
+
+
+def run(annotation : str, prediction : str) -> dict :
+    ''' Compute recall and precision for each entity type found in annotation and/or prediction.
+
+    Each measure is given at document level; the global score is a micro-average over tag types.
+    '''
+
+    # Get string and list of tags per character
+    annot = parse_bio(annotation)
+    predict = parse_bio(prediction)
+
+    if not annot or not predict :
+        raise Exception("No content found in annotation or prediction files.")
+
+    # Align annotation and prediction
+    align_result = edlib.align(annot["words"], predict["words"], task="path")
+    nice_alignment = edlib.getNiceAlignment(align_result, annot["words"], predict["words"])
+
+    annot_aligned = nice_alignment["query_aligned"]
+    predict_aligned = nice_alignment["target_aligned"]
+
+    # Align tags from string alignment
+    tags_annot_aligned = get_tags_aligned(annot["words"], annot_aligned, annot["tags"])
+    tags_predict_aligned = get_tags_aligned(predict["words"], predict_aligned, predict["tags"])
+
+    # Get nb match
+    matches = compute_matches(annot_aligned, predict_aligned, tags_annot_aligned, tags_predict_aligned)
+
+    # Compute scores
+    scores = compute_scores(annot["entity_count"], predict["entity_count"], matches)
+
+    # Print results
+    print_results(scores)
+
+    return scores
+
+def main() :
+    ''' Get arguments and run.
+    '''
+
+    logging.basicConfig(level=logging.INFO)
+
+    parser = argparse.ArgumentParser(description="Compute NER scores of a prediction against an annotation.")
+    parser.add_argument(
+        "-a", "--annot",
+        help="Annotation in BIO format.",
+        required=True
+    )
+    parser.add_argument(
+        "-p", "--predict",
+        help="Prediction in BIO format.",
+        required=True
+    )
+    args = parser.parse_args()
+
+    run(args.annot, args.predict)
+
+if __name__ == '__main__':
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f27173558a3f4d4903d543f87b35d6a559f86a84
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+edlib==1.3.8.post2
+pandas==1.2.3
+editdistance==0.5.3
+pytest==6.2.2
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..51f34d7dfdcd5c2d2023c65262600eaf8e8fef9a
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,20 @@
+import os.path
+from setuptools import setup
+
+def requirements(path):
+    assert os.path.exists(path), "Missing requirements file {}".format(path)
+    with open(path) as f:
+        return list(map(str.strip, f.read().splitlines()))
+
+install_requires = requirements("requirements.txt")
+
+setup(
+    name="Nerval",
+    version="0.1",
+    description="Tool to evaluate NER on noisy text.",
+    author="Teklia",
+    author_email="contact@teklia.com",
+    packages=['nerval'],
+    entry_points={"console_scripts": ["nerval=nerval.evaluate:main"]},
+    install_requires=install_requires
+)
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tests/test_align.py b/tests/test_align.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9fd966117e7b1415b785abef4c1cba732d88086
--- /dev/null
+++ b/tests/test_align.py
@@ -0,0 +1,18 @@
+import pytest
+import edlib
+from nerval import evaluate
+
+fake_annot_original = "Gérard de Nerval was born in Paris in 1808 ."
+fake_predict_original = "G*rard de *N*erval bo*rn in Paris in 1833 *."
+
+expected_alignment = {
+    'query_aligned': "Gérard de -N-erval was bo-rn in Paris in 1808 -.",
+    'matched_aligned': '|.||||||||-|-||||||----||-|||||||||||||||||..|-|',
+    'target_aligned': "G*rard de *N*erval ----bo*rn in Paris in 1833 *."}
+
+@pytest.mark.parametrize("test_input, expected",
+    [((fake_annot_original, fake_predict_original), expected_alignment)])
+def test_align(test_input, expected) :
+    a = edlib.align(*test_input, task="path")
+    result_alignment = edlib.getNiceAlignment(a, *test_input)
+    assert result_alignment == expected
diff --git a/tests/test_annot.bio b/tests/test_annot.bio
new file mode 100644
index 0000000000000000000000000000000000000000..9068e77fbbca822b75be60f187cd9cb6c4704ae1
--- /dev/null
+++ b/tests/test_annot.bio
@@ -0,0 +1,10 @@
+Gérard B-PER
+de I-PER
+Nerval I-PER
+was O
+born O
+in O
+Paris B-LOC
+in O
+1808 B-DAT
+. O
diff --git a/tests/test_bad.bio b/tests/test_bad.bio
new file mode 100644
index 0000000000000000000000000000000000000000..b3038cda387e3d2ef8a3525415db1094a7ed3cbb
--- /dev/null
+++ b/tests/test_bad.bio
@@ -0,0 +1,12 @@
+This B-
+file
+is O
+not O
+build O-
+as B-LOC B-PER
+a
+BIO
+file
+is
+expected
+.
diff --git a/tests/test_compute_matches.py b/tests/test_compute_matches.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5459fe397650f6a8cf73109968256c5e4d9434d
--- /dev/null
+++ b/tests/test_compute_matches.py
@@ -0,0 +1,60 @@
+import pytest
+from nerval import evaluate
+
+fake_annot_aligned = "Gérard de -N-erval was bo-rn in Paris in 1808 -."
+fake_predict_aligned = "G*rard de *N*erval ----bo*rn in Paris in 1833 *."
+
+fake_annot_tags_aligned = [
+            'PER', 'PER', 'PER', 'PER', 'PER', 'PER',
+            'PER',
+            'PER', 'PER',
+            'PER',
+            'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER',
+            'O',
+            'O','O','O',
+            'O',
+            'O', 'O', 'O','O','O',
+            'O',
+            'O','O',
+            'O',
+            'LOC','LOC','LOC','LOC','LOC',
+            'O',
+            'O','O',
+            'O',
+            'DAT','DAT','DAT','DAT',
+            'O',
+            'O', 'O'
+           ]
+
+fake_predict_tags_aligned = [
+            'PER', 'PER', 'PER', 'PER', 'PER', 'PER',
+            'PER',
+            'PER', 'PER',
+            'PER',
+            'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER','PER',
+            'O',
+            'O', 'O','O','O',
+            'O', 'O','O','O','O',
+            'O',
+            'O','O',
+            'O',
+            '***','***','***','***','***',
+            'O',
+            'O','O',
+            'O',
+            'DAT','DAT','DAT','DAT',
+            'O',
+            'O', 'O'
+           ]
+
+expected_matches = {'All': 1, 'PER': 1, 'LOC': 0, 'DAT': 0}
+
+@pytest.mark.parametrize("test_input, expected",
+    [((fake_annot_aligned, fake_predict_aligned, fake_annot_tags_aligned, fake_predict_tags_aligned), expected_matches)]
+)
+def test_compute_matches(test_input, expected) :
+    assert evaluate.compute_matches(*test_input) == expected
+
+def test_compute_matches_empty_entry() :
+    with pytest.raises(AssertionError) :
+        evaluate.compute_matches(None, None, None, None)
diff --git a/tests/test_compute_scores.py b/tests/test_compute_scores.py
new file mode 100644
index 0000000000000000000000000000000000000000..e90a348b9d84383f31b3b5e5b5d96b552c2f043c
--- /dev/null
+++ b/tests/test_compute_scores.py
@@ -0,0 +1,21 @@
+import pytest
+from nerval import evaluate
+
+fake_annot_entity_count = {'All': 3, 'DAT': 1, 'LOC': 1, 'PER': 1}
+fake_predict_entity_count = {'All': 3, 'DAT': 1, '***': 1, 'PER': 1}
+fake_matches = {'All': 1, 'PER': 1, 'LOC': 0, 'DAT': 0}
+
+expected_scores = {
+            '***': {'P': 0.0, 'R': None, 'F1': None},
+            'DAT': {'P': 0.0, 'R': 0.0, 'F1': 0},
+            'All': {'P': 0.3333333333333333, 'R': 0.3333333333333333, 'F1': 0.3333333333333333},
+            'PER': {'P': 1.0, 'R': 1.0, 'F1': 1.0},
+            'LOC': {'P': None, 'R': 0.0, 'F1': None}
+            }
+
+
+@pytest.mark.parametrize("test_input, expected",
+    [((fake_annot_entity_count, fake_predict_entity_count, fake_matches), expected_scores)]
+)
+def test_compute_scores(test_input, expected) :
+    assert evaluate.compute_scores(*test_input) == expected
diff --git a/tests/test_empty.bio b/tests/test_empty.bio
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tests/test_get_tags_aligned.py b/tests/test_get_tags_aligned.py
new file mode 100644
index 0000000000000000000000000000000000000000..c299c0cd9759d464f34db8ec04a9179ba03a2bee
--- /dev/null
+++ b/tests/test_get_tags_aligned.py
@@ -0,0 +1,104 @@
+import pytest
+from nerval import evaluate
+
+fake_annot_original = "Gérard de Nerval was born in Paris in 1808 ."
+fake_predict_original = "G*rard de *N*erval bo*rn in Paris in 1833 *."
+
+fake_annot_aligned = "Gérard de -N-erval was bo-rn in Paris in 1808 -."
+fake_predict_aligned = "G*rard de *N*erval ----bo*rn in Paris in 1833 *."
+
+fake_annot_tags_original = [
+           'PER', 'PER', 'PER', 'PER', 'PER', 'PER',
+           'PER',
+           'PER', 'PER',
+           'PER',
+           'PER', 'PER', 'PER', 'PER', 'PER', 'PER',
+           'O',
+           'O','O','O',
+           'O',
+           'O', 'O', 'O','O',
+           'O',
+           'O','O',
+           'O',
+           'LOC','LOC','LOC','LOC','LOC',
+           'O',
+           'O','O',
+           'O',
+           'DAT','DAT','DAT','DAT',
+           'O',
+           'O'
+           ]
+
+fake_predict_tags_original = [
+            'PER', 'PER', 'PER', 'PER', 'PER', 'PER',
+            'PER',
+            'PER', 'PER',
+            'PER',
+            'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER','PER',
+            'O',
+            'O','O','O','O','O',
+            'O',
+            'O','O',
+            'O',
+            '***','***','***','***','***',
+            'O',
+            'O','O',
+            'O',
+            'DAT','DAT','DAT','DAT',
+            'O',
+            'O', 'O'
+           ]
+
+expected_annot_tags_aligned = [
+            'PER', 'PER', 'PER', 'PER', 'PER', 'PER',
+            'PER',
+            'PER', 'PER',
+            'PER',
+            'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER',
+            'O',
+            'O','O','O',
+            'O',
+            'O', 'O', 'O','O','O',
+            'O',
+            'O','O',
+            'O',
+            'LOC','LOC','LOC','LOC','LOC',
+            'O',
+            'O','O',
+            'O',
+            'DAT','DAT','DAT','DAT',
+            'O',
+            'O', 'O'
+           ]
+
+expected_predict_tags_aligned = [
+            'PER', 'PER', 'PER', 'PER', 'PER', 'PER',
+            'PER',
+            'PER', 'PER',
+            'PER',
+            'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER','PER',
+            'O',
+            'O', 'O','O','O',
+            'O', 'O','O','O','O',
+            'O',
+            'O','O',
+            'O',
+            '***','***','***','***','***',
+            'O',
+            'O','O',
+            'O',
+            'DAT','DAT','DAT','DAT',
+            'O',
+            'O', 'O'
+           ]
+
+@pytest.mark.parametrize("test_input, expected",
+    [((fake_annot_original, fake_annot_aligned, fake_annot_tags_original), expected_annot_tags_aligned),
+     ((fake_predict_original, fake_predict_aligned, fake_predict_tags_original), expected_predict_tags_aligned)]
+)
+def test_get_tags_aligned (test_input, expected) :
+    assert evaluate.get_tags_aligned(*test_input) == expected
+
+def test_get_tags_aligned_empty_entry() :
+    with pytest.raises(AssertionError) :
+        evaluate.get_tags_aligned(None, None, None)
diff --git a/tests/test_parse_bio.py b/tests/test_parse_bio.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8a031b78136d40cc10b2ed3fe1224e65368ea1c
--- /dev/null
+++ b/tests/test_parse_bio.py
@@ -0,0 +1,72 @@
+import pytest
+from nerval import evaluate
+
+NO_EXIST_BIO = "no_exist.bio"
+EMPTY_BIO = "test_empty.bio"
+BAD_BIO = "test_bad.bio"
+FAKE_ANNOT_BIO = "test_annot.bio"
+FAKE_PREDICT_BIO = "test_predict.bio"
+
+expected_parsed_annot = {
+        'entity_count': {'All': 3, 'DAT': 1, 'LOC': 1, 'PER': 1},
+        'tags': ['PER', 'PER', 'PER', 'PER', 'PER', 'PER',
+                 'PER',
+                 'PER', 'PER',
+                 'PER',
+                 'PER', 'PER', 'PER', 'PER', 'PER', 'PER',
+                 'O',
+                 'O','O','O',
+                 'O',
+                 'O', 'O', 'O','O',
+                 'O',
+                 'O','O',
+                 'O',
+                 'LOC','LOC','LOC','LOC','LOC',
+                 'O',
+                 'O','O',
+                 'O',
+                 'DAT','DAT','DAT','DAT',
+                 'O',
+                 'O'
+                ],
+        'words': 'Gérard de Nerval was born in Paris in 1808 .'
+       }
+
+expected_parsed_predict = {
+            'entity_count': {'All': 3, 'DAT': 1, '***': 1, 'PER': 1},
+            'tags': ['PER', 'PER', 'PER', 'PER', 'PER', 'PER',
+                     'PER',
+                     'PER', 'PER',
+                     'PER',
+                     'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER','PER',
+                     'O',
+                     'O','O','O','O','O',
+                     'O',
+                     'O','O',
+                     'O',
+                     '***','***','***','***','***',
+                     'O',
+                     'O','O',
+                     'O',
+                     'DAT','DAT','DAT','DAT',
+                     'O',
+                     'O', 'O'
+                    ],
+            'words': 'G*rard de *N*erval bo*rn in Paris in 1833 *.'
+           }
+
+@pytest.mark.parametrize("test_input, expected",
+    [(FAKE_ANNOT_BIO, expected_parsed_annot),
+     (FAKE_PREDICT_BIO, expected_parsed_predict),
+     (EMPTY_BIO, None)],
+    )
+def test_parse_bio(test_input, expected) :
+    assert evaluate.parse_bio(test_input) == expected
+
+def test_parse_bio_bad_input() :
+    with pytest.raises(Exception) :
+        evaluate.parse_bio(BAD_BIO)
+
+def test_parse_bio_no_input() :
+    with pytest.raises(AssertionError) :
+        evaluate.parse_bio(NO_EXIST_BIO)
diff --git a/tests/test_predict.bio b/tests/test_predict.bio
new file mode 100644
index 0000000000000000000000000000000000000000..32fff7e08379cfc7e94a1127faa54ea9415afbc7
--- /dev/null
+++ b/tests/test_predict.bio
@@ -0,0 +1,9 @@
+G*rard B-PER
+de I-PER
+*N*erval I-PER
+bo*rn O
+in O
+Paris B-***
+in O
+1833 B-DAT
+*. O
diff --git a/tests/test_run.py b/tests/test_run.py
new file mode 100644
index 0000000000000000000000000000000000000000..47f3f5df0b4cbe5b71b792d540980d242241086d
--- /dev/null
+++ b/tests/test_run.py
@@ -0,0 +1,27 @@
+import pytest
+from nerval import evaluate
+
+FAKE_ANNOT_BIO = "test_annot.bio"
+FAKE_PREDICT_BIO = "test_predict.bio"
+EMPTY_BIO = "test_empty.bio"
+
+expected_scores = {
+            '***': {'P': 0.0, 'R': None, 'F1': None},
+            'DAT': {'P': 0.0, 'R': 0.0, 'F1': 0},
+            'All': {'P': 0.3333333333333333, 'R': 0.3333333333333333, 'F1': 0.3333333333333333},
+            'PER': {'P': 1.0, 'R': 1.0, 'F1': 1.0},
+            'LOC': {'P': None, 'R': 0.0, 'F1': None}
+            }
+
+@pytest.mark.parametrize("test_input, expected",
+    [((FAKE_ANNOT_BIO, FAKE_PREDICT_BIO), expected_scores)]
+)
+def test_run(test_input, expected) :
+    assert evaluate.run(*test_input) == expected
+
+def test_run_empty_bio() :
+    with pytest.raises(Exception) :
+        evaluate.run(EMPTY_BIO, EMPTY_BIO)
+
+def test_run_empty_entry() :
+    with pytest.raises(TypeError) :
+        evaluate.run(None, None)