diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000000000000000000000000000000000000..ed02b7691ddfbf487adb28a9ed875cbe45dd301c --- /dev/null +++ b/.flake8 @@ -0,0 +1,7 @@ +[flake8] +max-line-length = 120 +exclude=build,.cache,.eggs,.git,src/zeep,front +# Flake8 ignores multiple errors by default; +# the only interesting ignore is W503, which goes against PEP8. +# See https://lintlyci.github.io/Flake8Rules/rules/W503.html +ignore = E203,E501,W503 \ No newline at end of file diff --git a/.gitignore b/.gitignore index 21a2baed9dd04a0fdf033062db21300e707ec93a..142d9beaee82224a8925bb43fa8aeb7e20dce0e4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .env/ *.egg-info/ __pycache__ +.tox diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..85b7501646a942f3a6ad2abd1a3e3077e973a793 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,44 @@ +stages: + - test + - build + +variables: + PIP_CACHE_DIR: "$CI_PROJECT_DIR/.cache/pip" +cache: + paths: + - .cache/pip + +linter: + stage: test + image: python:3 + + cache: + paths: + - .cache/pip + - .cache/pre-commit + + except: + - schedules + + variables: + PRE_COMMIT_HOME: "$CI_PROJECT_DIR/.cache/pre-commit" + + before_script: + - pip install pre-commit + + script: + - pre-commit run -a + +tests: + stage: test + image: python:3.7 + + cache: + paths: + - .cache/pip + + before_script: + - pip install tox + + script: + - tox diff --git a/.isort.cfg b/.isort.cfg new file mode 100644 index 0000000000000000000000000000000000000000..0b0cc35b38de0362f837fcd41a23467538473aa8 --- /dev/null +++ b/.isort.cfg @@ -0,0 +1,10 @@ +[settings] +# Compatible with black +multi_line_output = 3 +include_trailing_comma = True +force_grid_wrap = 0 +use_parentheses = True +line_length = 120 + +default_section=FIRSTPARTY +known_third_party = editdistance,edlib,pytest,setuptools diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..734f69c54ecacde1acb0838717c933a7f7aa833f --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,50 @@ +repos: + - repo: https://github.com/asottile/seed-isort-config + rev: v2.2.0 + hooks: + - id: seed-isort-config + - repo: https://github.com/pre-commit/mirrors-isort + rev: v4.3.21 + hooks: + - id: isort + - repo: https://github.com/ambv/black + rev: 20.8b1 + hooks: + - id: black + - repo: https://gitlab.com/pycqa/flake8 + rev: 3.8.3 + hooks: + - id: flake8 + additional_dependencies: + - 'flake8-coding==1.3.1' + - 'flake8-copyright==0.2.2' + - 'flake8-debugger==3.1.0' + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v3.1.0 + hooks: + - id: check-ast + - id: check-docstring-first + - id: check-executables-have-shebangs + - id: check-merge-conflict + - id: check-symlinks + - id: debug-statements + - id: trailing-whitespace + - id: check-yaml + args: [--allow-multiple-documents] + - id: mixed-line-ending + - id: name-tests-test + args: ['--django'] + - id: check-json + - id: requirements-txt-fixer + - repo: https://github.com/codespell-project/codespell + rev: v1.17.1 + hooks: + - id: codespell + args: ['--write-changes'] + exclude: '\.bio$' + - repo: meta + hooks: + - id: check-useless-excludes + +default_language_version: + python: python3.7 diff --git a/README.md b/README.md index 7b8bd8ee4c8f2be8fddb32c1445fa598798c6801..36a175b8f44bdb4a63aae9264b7971bac7b3a63a 100644 --- a/README.md +++ b/README.md @@ -17,8 +17,8 @@ $ pip3 install . 
To run the tests and check that everything is fine: ``` -$ cd tests -$ pytest +$ pip3 install tox +$ tox ``` You can now use Nerval on the command line: @@ -44,7 +44,7 @@ $ nerval -a demo/toy_test_annot.bio -p demo/toy_test_predict.bio This metric uses string alignment at character level. -The automatic transcription is first aligned with the ground truth at character level, by minimising the Levenshtein distance between them. Each entity in the ground truth is then matched with a corresponding entity in the aligned transcription, with the same entity label, or an empty character string if no match is found. If the edit distance between the two entities is less than 30% of the ground truth entity length, the predicted entity is considered as recognised. For the purpose of matching detected entities to existing databases, we estimated that a 70% match between the entity texts was a fair threshold. +The automatic transcription is first aligned with the ground truth at character level, by minimising the Levenshtein distance between them. Each entity in the ground truth is then matched with a corresponding entity in the aligned transcription, with the same entity label, or with an empty character string if no match is found. If the edit distance between the two entities is less than 30% of the ground truth entity length, the predicted entity is considered recognised. For the purpose of matching detected entities to existing databases, we estimated that a 70% match between the entity texts was a fair threshold. #### Details: @@ -73,7 +73,7 @@ produces the following list of tags, one per character plus spaces: 'O', 'OCC','OCC','OCC','OCC','OCC','OCC', 'O', - 'O'] + 'O'] ``` And the prediction file could be: @@ -94,7 +94,7 @@ producing: 'O', 'OCC','OCC','OCC','OCC','OCC','OCC','OCC', 'O', - 'O','O'] + 'O','O'] ``` - Character-level alignment between annotation and prediction adds '-' characters to both strings so they are the same length @@ -110,16 +110,16 @@ the alignment result is: ``` annotation : Tolkie-n- was a writer- -. -prediction : Tolkieene xas --writear ,. +prediction : Tolkieene xas --writear ,. ``` -- Adapt character-level tag to aligned strings +- Adapt character-level tags to aligned strings - '-' characters in aligned strings get the same tag as the previous proper character in the string ``` PPPPPPPPPOOOOOOOCCCCCCCOOO annotation : Tolkie-n- was a writer- -. -prediction : Tolkieene xas --writear ,. +prediction : Tolkieene xas --writear ,. PPPPPPPPPOOOOOOOCCCCCCCOOO ``` - Search for a matching entity for each entity in the annotation @@ -184,3 +184,20 @@ P = 2/2 R = 2/2 F1 = 2*1*1/(1+1) ``` + +## Linting + +We use [pre-commit](https://pre-commit.com/) to check the Python source code syntax of this project. + +To be efficient, you should run pre-commit before committing (hence the name...). + +To do that, run once: + +``` +pip install pre-commit +pre-commit install +``` + +The linting workflow will now run on modified files before committing, and may fix issues for you. + +If you want to run the full workflow on all the files: `pre-commit run -a`. diff --git a/nerval/evaluate.py b/nerval/evaluate.py index 93b35c2d3b29ed40853bc86bd99f34025b85ef19..f0cea430ff0c8aa9b7f5de0869af281cea69d0b0 100644 --- a/nerval/evaluate.py +++ b/nerval/evaluate.py @@ -1,68 +1,78 @@ -import edlib +# -*- coding: utf-8 -*- +""" +This script takes two BIO files, annotation and prediction, and computes recall and precision for each NE label.
+""" + import argparse import logging -import editdistance -import pandas as pd import os import re -''' -This script takes two bio files of annotation and prediction, and compute recall and precision for each NE label. -''' +import editdistance +import edlib THRESHOLD = 0.30 NOT_ENTITY_TAG = "O" -def parse_bio(path : str) -> dict : - ''' Parse a BIO file to get text content, character-level NE tags and entity types count. + +def parse_bio(path: str) -> dict: + """Parse a BIO file to get text content, character-level NE tags and entity types count. Input : path to a valid BIO file Output format : { "words": str, "tags": list; "entity_count" : { tag : int} } - ''' + """ assert os.path.exists(path) words = [] tags = [] - entity_count = {'All':0} + entity_count = {"All": 0} last_tag = None - with open(path, 'r') as fd : + with open(path, "r") as fd: - for line in list(filter(lambda x:x!='\n', fd.readlines())) : + for line in list(filter(lambda x: x != "\n", fd.readlines())): word, label = line.split() # Preservation of '-' characters and avoid confusion with the dashes added later during the alignment - word = word.replace('-', '§') + word = word.replace("-", "§") words.append(word) - try : - tag = NOT_ENTITY_TAG if label == NOT_ENTITY_TAG else re.match(r"[BIES]-(.{3})", label)[1] - except TypeError as e: - raise(Exception(f"The file {path} given in input is not in BIO format.")) + try: + tag = ( + NOT_ENTITY_TAG + if label == NOT_ENTITY_TAG + else re.match(r"[BIES]-(.{3})", label)[1] + ) + except TypeError: + raise ( + Exception(f"The file {path} given in input is not in BIO format.") + ) # Spaces will be added between words and have to get a tag # If previous word has the same tag as current, the space also gets the tag - if last_tag is not None : - if last_tag == tag : + if last_tag is not None: + if last_tag == tag: tags.append(tag) - else : + else: tags.append(NOT_ENTITY_TAG) # Add a tag for each letter in the word - tags += [tag,] * len(word) + tags += [ + tag, + ] * len(word) # Count nb entity for each type - if (not label == 'O') and (not last_tag == tag) : + if (not label == "O") and (not last_tag == tag): entity_count[tag] = entity_count.get(tag, 0) + 1 - entity_count['All'] += 1 + entity_count["All"] += 1 last_tag = tag result = None - if words : + if words: # Make string out of words result = dict() result["words"] = " ".join(words) @@ -73,8 +83,10 @@ def parse_bio(path : str) -> dict : return result -def compute_matches(annotation : str, prediction : str, tags_annot : list, tags_predict : list) -> dict : - '''Compute prediction score from annotation string to prediction string. +def compute_matches( + annotation: str, prediction: str, tags_annot: list, tags_predict: list +) -> dict: + """Compute prediction score from annotation string to prediction string. Annotation and prediction strings should be the same length. @@ -101,29 +113,29 @@ def compute_matches(annotation : str, prediction : str, tags_annot : list, tags_ tags_predict : list of string , example : ['P','P','P','P','P','P','P','P','P','O', ...] 
Output : {TAG1 : nb_entity_matched, ...}, example : {'All': 1, 'OCC': 0, 'PER': 1} - ''' + """ assert annotation assert prediction assert tags_annot assert tags_predict - entity_count = {"All":0} + entity_count = {"All": 0} last_tag = NOT_ENTITY_TAG # Inspecting reference string - for i, char_annot in enumerate(annotation) : + for i, char_annot in enumerate(annotation): tag_ref = tags_annot[i] # If character not in entity - if tag_ref == NOT_ENTITY_TAG : - last_tag = NOT_ENTITY_TAG + if tag_ref == NOT_ENTITY_TAG: + last_tag = NOT_ENTITY_TAG # Else, in entity - else : + else: # If beginning new entity - if not tag_ref == last_tag : + if not tag_ref == last_tag: current_ref, current_compar = [], [] last_tag = tag_ref found_aligned_beginning = False @@ -132,31 +144,33 @@ def compute_matches(annotation : str, prediction : str, tags_annot : list, tags_ current_ref.append(char_annot) # Searching character string corresponding to tag - if not found_aligned_end and tags_predict[i] == tag_ref : + if not found_aligned_end and tags_predict[i] == tag_ref: # If just beginning new entity, backtrack tags on prediction string if len(current_ref) == 1: - j = i-1 - while j >= 0 and tags_predict[j] == tag_ref : + j = i - 1 + while j >= 0 and tags_predict[j] == tag_ref: j -= 1 - current_compar += prediction[j+1:i] + current_compar += prediction[j + 1 : i] found_aligned_beginning = True current_compar.append(prediction[i]) # If tags don't match and beginning was found : end of predicted entity - elif found_aligned_beginning : + elif found_aligned_beginning: found_aligned_end = True # Detect end of entity in annotation - if (i+1 == len(annotation)) or (i+1 < len(annotation) and not tags_annot[i+1] == last_tag) : + if (i + 1 == len(annotation)) or ( + i + 1 < len(annotation) and not tags_annot[i + 1] == last_tag + ): # Aligned entity may end further in prediction, so get the rest of the characters - if not found_aligned_end : - j = i+1 - while j < len(tags_predict) and tags_predict[j] == tag_ref : + if not found_aligned_end: + j = i + 1 + while j < len(tags_predict) and tags_predict[j] == tag_ref: j += 1 - for k in range(i+1, j) : + for k in range(i + 1, j): current_compar.append(prediction[k]) # Normalize found character strings @@ -167,20 +181,26 @@ def compute_matches(annotation : str, prediction : str, tags_annot : list, tags_ entity_compar = entity_compar.replace("-", "") # One entity is counted as recognized (score of 1) if the Levenshtein distance between the expected and predicted entities - # represents less than 30% (Treshold) of the length of the expected entity. + # represents less than 30% (Threshold) of the length of the expected entity. # Precision and recall will be computed for each category by comparing the numbers of recognized entities and expected entities - score = 1 if editdistance.eval(entity_ref, entity_compar)/len_entity < THRESHOLD else 0 + score = ( + 1 + if editdistance.eval(entity_ref, entity_compar) / len_entity + < THRESHOLD + else 0 + ) entity_count[last_tag] = entity_count.get(last_tag, 0) + score entity_count["All"] += score return entity_count -def get_tags_aligned(original : str, aligned : str, tags_original : list) -> list: - ''' Takes original string, original string tags and aligned string given by edlib.align. + +def get_tags_aligned(original: str, aligned: str, tags_original: list) -> list: + """Takes original string, original string tags and aligned string given by edlib.align. Returns a list of tags corresponding to the aligned string.
- Ouptut format : list of strings - ''' + Output format : list of strings + """ assert original assert aligned assert tags_original @@ -190,20 +210,20 @@ def get_tags_aligned(original : str, aligned : str, tags_original : list) -> lis last_tag = NOT_ENTITY_TAG # Inspecting aligned string - for i, char in enumerate(aligned) : + for i, char in enumerate(aligned): new_tag = "" # If original string has been fully processed, rest of tags are O ('-' characters at aligned end) - if index_original >= len(original) : + if index_original >= len(original): new_tag = NOT_ENTITY_TAG # If current aligned char does not match current original char ('-' characters in aligned) # Keep last_tag and don't increment index_original - elif not char == original[index_original] : + elif not char == original[index_original]: new_tag = last_tag # Otherwise characters match: take the tag from the original string and move forward in it - else : + else: new_tag = tags_original[index_original] last_tag = new_tag index_original += 1 @@ -212,8 +232,11 @@ def get_tags_aligned(original : str, aligned : str, tags_original : list) -> lis return tags_aligned -def compute_scores(annot_tags_count : dict, predict_tags_count : dict, matches : dict) -> dict : - ''' Compute Precision, Recall and F1 score for all entity types found in annotation and prediction. + +def compute_scores( + annot_tags_count: dict, predict_tags_count: dict, matches: dict +) -> dict: + """Compute Precision, Recall and F1 score for all entity types found in annotation and prediction. Each measure is given at document level; the global score is a micro-average over tag types. @@ -224,21 +247,27 @@ def compute_scores(annot_tags_count : dict, predict_tags_count : dict, matches : Output : scores : { TAG1(str) : {"P" : float, "R" : float, "F1" : float}, ... } - ''' + """ annot_tags = set(annot_tags_count.keys()) predict_tags = set(predict_tags_count.keys()) tags = annot_tags | predict_tags - scores = { tag : {"P" : None, "R" : None, "F1" : None} for tag in tags} + scores = {tag: {"P": None, "R": None, "F1": None} for tag in tags} - for tag in sorted(tags)[::-1] : + for tag in sorted(tags)[::-1]: nb_predict = predict_tags_count.get(tag) nb_annot = annot_tags_count.get(tag) - nb_match = matches.get(tag,0) + nb_match = matches.get(tag, 0) prec = None if not nb_predict else nb_match / nb_predict rec = None if not nb_annot else nb_match / nb_annot - f1 = None if (prec is None) or (rec is None) else 0 if (prec+rec==0) else 2 * (prec * rec) / (prec + rec) + f1 = ( + None + if (prec is None) or (rec is None) + else 0 + if (prec + rec == 0) + else 2 * (prec * rec) / (prec + rec) + ) scores[tag]["P"] = prec scores[tag]["R"] = rec @@ -246,18 +275,19 @@ def compute_scores(annot_tags_count : dict, predict_tags_count : dict, matches : return scores -def print_results(scores : dict) : - ''' Display final results. + +def print_results(scores: dict): + """Display final results. None values are kept to indicate the absence of a certain tag in either annotation or prediction.
- ''' + """ logging.info("-- Results --") - for tag in sorted(scores.keys())[::-1] : + for tag in sorted(scores.keys())[::-1]: - prec = None if scores[tag]["P"] is None else round(scores[tag]["P"],2) - rec = None if scores[tag]["R"] is None else round(scores[tag]["R"],2) - f1 = None if scores[tag]["F1"] is None else round(scores[tag]["F1"],2) + prec = None if scores[tag]["P"] is None else round(scores[tag]["P"], 2) + rec = None if scores[tag]["R"] is None else round(scores[tag]["R"], 2) + f1 = None if scores[tag]["F1"] is None else round(scores[tag]["F1"], 2) logging.info(f"{tag} :") logging.info(f"P = {prec}") @@ -265,32 +295,38 @@ def print_results(scores : dict) : logging.info(f"F1 = {f1}") -def run(annotation : str, prediction : str) -> dict : - ''' Compute recall and precision for each entity type found in annotation and/or prediction. +def run(annotation: str, prediction: str) -> dict: + """Compute recall and precision for each entity type found in annotation and/or prediction. Each measure is given at document level; the global score is a micro-average over tag types. - ''' + """ # Get string and list of tags per character annot = parse_bio(annotation) predict = parse_bio(prediction) - if not annot or not predict : + if not annot or not predict: raise Exception("No content found in annotation or prediction files.") # Align annotation and prediction align_result = edlib.align(annot["words"], predict["words"], task="path") - nice_alignment = edlib.getNiceAlignment(align_result, annot["words"], predict["words"]) + nice_alignment = edlib.getNiceAlignment( + align_result, annot["words"], predict["words"] + ) annot_aligned = nice_alignment["query_aligned"] predict_aligned = nice_alignment["target_aligned"] # Align tags from string alignment tags_annot_aligned = get_tags_aligned(annot["words"], annot_aligned, annot["tags"]) - tags_predict_aligned = get_tags_aligned(predict["words"], predict_aligned, predict["tags"]) + tags_predict_aligned = get_tags_aligned( + predict["words"], predict_aligned, predict["tags"] + ) # Get the number of matches per tag - matches = compute_matches(annot_aligned, predict_aligned, tags_annot_aligned, tags_predict_aligned) + matches = compute_matches( + annot_aligned, predict_aligned, tags_annot_aligned, tags_predict_aligned + ) # Compute scores scores = compute_scores(annot["entity_count"], predict["entity_count"], matches) @@ -300,26 +336,23 @@ def run(annotation : str, prediction : str) -> dict : return scores -def main() : - ''' Get arguments and run. 
- ''' + +def main(): + """Get arguments and run.""" logging.basicConfig(level=logging.INFO) parser = argparse.ArgumentParser(description="Compute score of NER on predict.") parser.add_argument( - "-a", "--annot", - help="Annotation in BIO format.", - required=True + "-a", "--annot", help="Annotation in BIO format.", required=True ) parser.add_argument( - "-p", "--predict", - help="Prediction in BIO format.", - required=True + "-p", "--predict", help="Prediction in BIO format.", required=True ) args = parser.parse_args() run(args.annot, args.predict) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/requirements.txt b/requirements.txt index f27173558a3f4d4903d543f87b35d6a559f86a84..65d95a67fb685786042dda7cccf8dc87b2389b78 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,2 @@ -edlib==1.3.8.post2 -pandas==1.2.3 editdistance==0.5.3 -pytest==6.2.2 +edlib==1.3.8.post2 diff --git a/setup.py b/setup.py index 51f34d7dfdcd5c2d2023c65262600eaf8e8fef9a..b3f9432c3f680b1ac2f6a6c947f66eb4c0fcae4d 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,15 @@ +# -*- coding: utf-8 -*- import os.path + from setuptools import setup + def requirements(path): assert os.path.exists(path), "Missing requirements {}".format(path) with open(path) as f: return list(map(str.strip, f.read().splitlines())) + install_requires = requirements("requirements.txt") setup( @@ -14,7 +18,7 @@ setup( description="Tool to evaluate NER on noisy text.", author="Teklia", author_email="contact@teklia.com", - packages=['nerval'], + packages=["nerval"], entry_points={"console_scripts": ["nerval=nerval.evaluate:main"]}, - install_requires=install_requires + install_requires=install_requires, ) diff --git a/tests/test_align.py b/tests/test_align.py index c9fd966117e7b1415b785abef4c1cba732d88086..d4320252dee783deedf84680f397f72b35b22e77 100644 --- a/tests/test_align.py +++ b/tests/test_align.py @@ -1,18 +1,22 @@ -import pytest +# -*- coding: utf-8 -*- import edlib -from nerval import evaluate +import pytest fake_annot_original = "Gérard de Nerval was born in Paris in 1808 ." fake_predict_original = "G*rard de *N*erval bo*rn in Paris in 1833 *." expected_alignment = { - 'query_aligned': "Gérard de -N-erval was bo-rn in Paris in 1808 -.", - 'matched_aligned': '|.||||||||-|-||||||----||-|||||||||||||||||..|-|', - 'target_aligned': "G*rard de *N*erval ----bo*rn in Paris in 1833 *."} + "query_aligned": "Gérard de -N-erval was bo-rn in Paris in 1808 -.", + "matched_aligned": "|.||||||||-|-||||||----||-|||||||||||||||||..|-|", + "target_aligned": "G*rard de *N*erval ----bo*rn in Paris in 1833 *.", +} + -@pytest.mark.parametrize("test_input, expected", - [((fake_annot_original, fake_predict_original), expected_alignment)]) -def test_align(test_input, expected) : +@pytest.mark.parametrize( + "test_input, expected", + [((fake_annot_original, fake_predict_original), expected_alignment)], +) +def test_align(test_input, expected): a = edlib.align(*test_input, task="path") result_alignment = edlib.getNiceAlignment(a, *test_input) assert result_alignment == expected diff --git a/tests/test_compute_matches.py b/tests/test_compute_matches.py index d5459fe397650f6a8cf73109968256c5e4d9434d..cf596dd680cd19e3aa8a8b60da7eaa1b7bab100c 100644 --- a/tests/test_compute_matches.py +++ b/tests/test_compute_matches.py @@ -1,60 +1,77 @@ +# -*- coding: utf-8 -*- import pytest + from nerval import evaluate fake_annot_aligned = "Gérard de -N-erval was bo-rn in Paris in 1808 -."
fake_predict_aligned = "G*rard de *N*erval ----bo*rn in Paris in 1833 *." +# fmt: off fake_annot_tags_aligned = [ - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', - 'PER', - 'PER', 'PER', - 'PER', - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', - 'O', - 'O','O','O', - 'O', - 'O', 'O', 'O','O','O', - 'O', - 'O','O', - 'O', - 'LOC','LOC','LOC','LOC','LOC', - 'O', - 'O','O', - 'O', - 'DAT','DAT','DAT','DAT', - 'O', - 'O', 'O' - ] + 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', + 'PER', + 'PER', 'PER', + 'PER', + 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', + 'O', + 'O', 'O', 'O', + 'O', + 'O', 'O', 'O', 'O', 'O', + 'O', + 'O', 'O', + 'O', + 'LOC', 'LOC', 'LOC', 'LOC', 'LOC', + 'O', + 'O', 'O', + 'O', + 'DAT', 'DAT', 'DAT', 'DAT', + 'O', + 'O', 'O' +] fake_predict_tags_aligned = [ - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', - 'PER', - 'PER', 'PER', - 'PER', - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER','PER', - 'O', - 'O', 'O','O','O', - 'O', 'O','O','O','O', - 'O', - 'O','O', - 'O', - '***','***','***','***','***', - 'O', - 'O','O', - 'O', - 'DAT','DAT','DAT','DAT', - 'O', - 'O', 'O' - ] - -expected_matches = {'All': 1, 'PER': 1, 'LOC': 0, 'DAT': 0} - -@pytest.mark.parametrize("test_input, expected", - [((fake_annot_aligned, fake_predict_aligned, fake_annot_tags_aligned, fake_predict_tags_aligned), expected_matches)] + 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', + 'PER', + 'PER', 'PER', + 'PER', + 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', + 'O', + 'O', 'O', 'O', 'O', + 'O', 'O', 'O', 'O', 'O', + 'O', + 'O', 'O', + 'O', + '***', '***', '***', '***', '***', + 'O', + 'O', 'O', + 'O', + 'DAT', 'DAT', 'DAT', 'DAT', + 'O', + 'O', 'O' +] +# fmt: on + +expected_matches = {"All": 1, "PER": 1, "LOC": 0, "DAT": 0} + + +@pytest.mark.parametrize( + "test_input, expected", + [ + ( + ( + fake_annot_aligned, + fake_predict_aligned, + fake_annot_tags_aligned, + fake_predict_tags_aligned, + ), + expected_matches, + ) + ], ) -def test_compute_matches(test_input, expected) : +def test_compute_matches(test_input, expected): assert evaluate.compute_matches(*test_input) == expected -def test_compute_matches_empty_entry() : - with pytest.raises(AssertionError) : + +def test_compute_matches_empty_entry(): + with pytest.raises(AssertionError): evaluate.compute_matches(None, None, None, None) diff --git a/tests/test_compute_scores.py b/tests/test_compute_scores.py index e90a348b9d84383f31b3b5e5b5d96b552c2f043c..a6c86a18fff59433b7a65d61d38602c52586adeb 100644 --- a/tests/test_compute_scores.py +++ b/tests/test_compute_scores.py @@ -1,21 +1,29 @@ +# -*- coding: utf-8 -*- import pytest + from nerval import evaluate -fake_annot_entity_count = {'All': 3, 'DAT': 1, 'LOC': 1, 'PER': 1} -fake_predict_entity_count = {'All': 3, 'DAT': 1, '***': 1, 'PER': 1} -fake_matches = {'All': 1, 'PER': 1, 'LOC': 0, 'DAT': 0} +fake_annot_entity_count = {"All": 3, "DAT": 1, "LOC": 1, "PER": 1} +fake_predict_entity_count = {"All": 3, "DAT": 1, "***": 1, "PER": 1} +fake_matches = {"All": 1, "PER": 1, "LOC": 0, "DAT": 0} expected_scores = { - '***': {'P': 0.0, 'R': None, 'F1': None}, - 'DAT': {'P': 0.0, 'R': 0.0, 'F1': 0}, - 'All': {'P': 0.3333333333333333, 'R': 0.3333333333333333, 'F1': 0.3333333333333333}, - 'PER': {'P': 1.0, 'R': 1.0, 'F1': 1.0}, - 'LOC': {'P': None, 'R': 0.0, 'F1': None} - } + "***": {"P": 0.0, "R": None, "F1": None}, + "DAT": {"P": 0.0, "R": 0.0, "F1": 0}, + "All": {"P": 0.3333333333333333, "R": 0.3333333333333333, "F1": 0.3333333333333333}, + "PER": {"P": 1.0, "R": 1.0, "F1": 1.0}, + "LOC": 
{"P": None, "R": 0.0, "F1": None}, +} -@pytest.mark.parametrize("test_input, expected", - [((fake_annot_entity_count, fake_predict_entity_count, fake_matches), expected_scores)] +@pytest.mark.parametrize( + "test_input, expected", + [ + ( + (fake_annot_entity_count, fake_predict_entity_count, fake_matches), + expected_scores, + ) + ], ) -def test_compute_scores(test_input, expected) : +def test_compute_scores(test_input, expected): assert evaluate.compute_scores(*test_input) == expected diff --git a/tests/test_get_tags_aligned.py b/tests/test_get_tags_aligned.py index c299c0cd9759d464f34db8ec04a9179ba03a2bee..a1070531bbcebba6906d3a97acf7234c5e1d0a76 100644 --- a/tests/test_get_tags_aligned.py +++ b/tests/test_get_tags_aligned.py @@ -1,4 +1,6 @@ +# -*- coding: utf-8 -*- import pytest + from nerval import evaluate fake_annot_original = "Gérard de Nerval was born in Paris in 1808 ." @@ -7,98 +9,111 @@ fake_predict_original = "G*rard de *N*erval bo*rn in Paris in 1833 *." fake_annot_aligned = "Gérard de -N-erval was bo-rn in Paris in 1808 -." fake_predict_aligned = "G*rard de *N*erval ----bo*rn in Paris in 1833 *." +# fmt: off fake_annot_tags_original = [ - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', - 'PER', - 'PER', 'PER', - 'PER', - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', - 'O', - 'O','O','O', - 'O', - 'O', 'O', 'O','O', - 'O', - 'O','O', - 'O', - 'LOC','LOC','LOC','LOC','LOC', - 'O', - 'O','O', - 'O', - 'DAT','DAT','DAT','DAT', - 'O', - 'O' - ] + 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', + 'PER', + 'PER', 'PER', + 'PER', + 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', + 'O', + 'O', 'O', 'O', + 'O', + 'O', 'O', 'O', 'O', + 'O', + 'O', 'O', + 'O', + 'LOC', 'LOC', 'LOC', 'LOC', 'LOC', + 'O', + 'O', 'O', + 'O', + 'DAT', 'DAT', 'DAT', 'DAT', + 'O', + 'O' +] fake_predict_tags_original = [ - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', - 'PER', - 'PER', 'PER', - 'PER', - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER','PER', - 'O', - 'O','O','O','O','O', - 'O', - 'O','O', - 'O', - '***','***','***','***','***', - 'O', - 'O','O', - 'O', - 'DAT','DAT','DAT','DAT', - 'O', - 'O', 'O' - ] + 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', + 'PER', + 'PER', 'PER', + 'PER', + 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', + 'O', + 'O', 'O', 'O', 'O', 'O', + 'O', + 'O', 'O', + 'O', + '***', '***', '***', '***', '***', + 'O', + 'O', 'O', + 'O', + 'DAT', 'DAT', 'DAT', 'DAT', + 'O', + 'O', 'O' +] expected_annot_tags_aligned = [ - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', - 'PER', - 'PER', 'PER', - 'PER', - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', - 'O', - 'O','O','O', - 'O', - 'O', 'O', 'O','O','O', - 'O', - 'O','O', - 'O', - 'LOC','LOC','LOC','LOC','LOC', - 'O', - 'O','O', - 'O', - 'DAT','DAT','DAT','DAT', - 'O', - 'O', 'O' - ] + 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', + 'PER', + 'PER', 'PER', + 'PER', + 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', + 'O', + 'O', 'O', 'O', + 'O', + 'O', 'O', 'O', 'O', 'O', + 'O', + 'O', 'O', + 'O', + 'LOC', 'LOC', 'LOC', 'LOC', 'LOC', + 'O', + 'O', 'O', + 'O', + 'DAT', 'DAT', 'DAT', 'DAT', + 'O', + 'O', 'O' +] expected_predict_tags_aligned = [ - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', - 'PER', - 'PER', 'PER', - 'PER', - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER','PER', - 'O', - 'O', 'O','O','O', - 'O', 'O','O','O','O', - 'O', - 'O','O', - 'O', - '***','***','***','***','***', - 'O', - 'O','O', - 'O', - 'DAT','DAT','DAT','DAT', - 'O', - 'O', 'O' - ] + 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', + 'PER', + 'PER', 'PER', + 'PER', + 'PER', 'PER', 
'PER', 'PER', 'PER', 'PER', 'PER', 'PER', + 'O', + 'O', 'O', 'O', 'O', + 'O', 'O', 'O', 'O', 'O', + 'O', + 'O', 'O', + 'O', + '***', '***', '***', '***', '***', + 'O', + 'O', 'O', + 'O', + 'DAT', 'DAT', 'DAT', 'DAT', + 'O', + 'O', 'O' +] +# fmt: on + -@pytest.mark.parametrize("test_input, expected", - [((fake_annot_original, fake_annot_aligned, fake_annot_tags_original), expected_annot_tags_aligned), - ((fake_predict_original, fake_predict_aligned, fake_predict_tags_original), expected_predict_tags_aligned)] +@pytest.mark.parametrize( + "test_input, expected", + [ + ( + (fake_annot_original, fake_annot_aligned, fake_annot_tags_original), + expected_annot_tags_aligned, + ), + ( + (fake_predict_original, fake_predict_aligned, fake_predict_tags_original), + expected_predict_tags_aligned, + ), + ], ) -def test_get_tags_aligned (test_input, expected) : +def test_get_tags_aligned(test_input, expected): assert evaluate.get_tags_aligned(*test_input) == expected -def test_get_tags_aligned_empty_entry() : - with pytest.raises(AssertionError) : + +def test_get_tags_aligned_empty_entry(): + with pytest.raises(AssertionError): evaluate.get_tags_aligned(None, None, None) diff --git a/tests/test_parse_bio.py b/tests/test_parse_bio.py index d8a031b78136d40cc10b2ed3fe1224e65368ea1c..543ec202c60c5ff7f65461cf806efee5f45b0500 100644 --- a/tests/test_parse_bio.py +++ b/tests/test_parse_bio.py @@ -1,72 +1,84 @@ +# -*- coding: utf-8 -*- import pytest + from nerval import evaluate NO_EXIST_BIO = "no_exist.bio" -EMPTY_BIO = "test_empty.bio" -BAD_BIO = "test_bad.bio" -FAKE_ANNOT_BIO = "test_annot.bio" -FAKE_PREDICT_BIO = "test_predict.bio" +EMPTY_BIO = "tests/test_empty.bio" +BAD_BIO = "tests/test_bad.bio" +FAKE_ANNOT_BIO = "tests/test_annot.bio" +FAKE_PREDICT_BIO = "tests/test_predict.bio" +# fmt: off expected_parsed_annot = { - 'entity_count': {'All': 3, 'DAT': 1, 'LOC': 1, 'PER': 1}, - 'tags': ['PER', 'PER', 'PER', 'PER', 'PER', 'PER', - 'PER', - 'PER', 'PER', - 'PER', - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', - 'O', - 'O','O','O', - 'O', - 'O', 'O', 'O','O', - 'O', - 'O','O', - 'O', - 'LOC','LOC','LOC','LOC','LOC', - 'O', - 'O','O', - 'O', - 'DAT','DAT','DAT','DAT', - 'O', - 'O' - ], - 'words': 'Gérard de Nerval was born in Paris in 1808 .' - } + 'entity_count': {'All': 3, 'DAT': 1, 'LOC': 1, 'PER': 1}, + 'tags': [ + 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', + 'PER', + 'PER', 'PER', + 'PER', + 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', + 'O', + 'O', 'O', 'O', + 'O', + 'O', 'O', 'O', 'O', + 'O', + 'O', 'O', + 'O', + 'LOC', 'LOC', 'LOC', 'LOC', 'LOC', + 'O', + 'O', 'O', + 'O', + 'DAT', 'DAT', 'DAT', 'DAT', + 'O', + 'O' + ], + 'words': 'Gérard de Nerval was born in Paris in 1808 .' +} expected_parsed_predict = { - 'entity_count': {'All': 3, 'DAT': 1, '***': 1, 'PER': 1}, - 'tags': ['PER', 'PER', 'PER', 'PER', 'PER', 'PER', - 'PER', - 'PER', 'PER', - 'PER', - 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER','PER', - 'O', - 'O','O','O','O','O', - 'O', - 'O','O', - 'O', - '***','***','***','***','***', - 'O', - 'O','O', - 'O', - 'DAT','DAT','DAT','DAT', - 'O', - 'O', 'O' - ], - 'words': 'G*rard de *N*erval bo*rn in Paris in 1833 *.' 
- } + 'entity_count': {'All': 3, 'DAT': 1, '***': 1, 'PER': 1}, + 'tags': [ + 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', + 'PER', + 'PER', 'PER', + 'PER', + 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', 'PER', + 'O', + 'O', 'O', 'O', 'O', 'O', + 'O', + 'O', 'O', + 'O', + '***', '***', '***', '***', '***', + 'O', + 'O', 'O', + 'O', + 'DAT', 'DAT', 'DAT', 'DAT', + 'O', + 'O', 'O' + ], + 'words': 'G*rard de *N*erval bo*rn in Paris in 1833 *.' +} +# fmt: on + -@pytest.mark.parametrize("test_input, expected", - [(FAKE_ANNOT_BIO, expected_parsed_annot), - (FAKE_PREDICT_BIO, expected_parsed_predict), - (EMPTY_BIO, None)], - ) -def test_parse_bio(test_input, expected) : +@pytest.mark.parametrize( + "test_input, expected", + [ + (FAKE_ANNOT_BIO, expected_parsed_annot), + (FAKE_PREDICT_BIO, expected_parsed_predict), + (EMPTY_BIO, None), + ], +) +def test_parse_bio(test_input, expected): assert evaluate.parse_bio(test_input) == expected -def test_parse_bio_bad_input() : - with pytest.raises(Exception) : + +def test_parse_bio_bad_input(): + with pytest.raises(Exception): evaluate.parse_bio(BAD_BIO) -def test_parse_bio_no_input() : - with pytest.raises(AssertionError) : + +def test_parse_bio_no_input(): + with pytest.raises(AssertionError): evaluate.parse_bio(NO_EXIST_BIO) diff --git a/tests/test_run.py b/tests/test_run.py index 47f3f5df0b4cbe5b71b792d540980d242241086d..fd426aa1e341cea0e4b1a5af153a341424766080 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -1,27 +1,33 @@ +# -*- coding: utf-8 -*- import pytest + from nerval import evaluate -FAKE_ANNOT_BIO = "test_annot.bio" -FAKE_PREDICT_BIO = "test_predict.bio" +FAKE_ANNOT_BIO = "tests/test_annot.bio" +FAKE_PREDICT_BIO = "tests/test_predict.bio" +EMPTY_BIO = "tests/test_empty.bio" expected_scores = { - '***': {'P': 0.0, 'R': None, 'F1': None}, - 'DAT': {'P': 0.0, 'R': 0.0, 'F1': 0}, - 'All': {'P': 0.3333333333333333, 'R': 0.3333333333333333, 'F1': 0.3333333333333333}, - 'PER': {'P': 1.0, 'R': 1.0, 'F1': 1.0}, - 'LOC': {'P': None, 'R': 0.0, 'F1': None} - } - -@pytest.mark.parametrize("test_input, expected", - [((FAKE_ANNOT_BIO, FAKE_PREDICT_BIO), expected_scores)] + "***": {"P": 0.0, "R": None, "F1": None}, + "DAT": {"P": 0.0, "R": 0.0, "F1": 0}, + "All": {"P": 0.3333333333333333, "R": 0.3333333333333333, "F1": 0.3333333333333333}, + "PER": {"P": 1.0, "R": 1.0, "F1": 1.0}, + "LOC": {"P": None, "R": 0.0, "F1": None}, +} + + +@pytest.mark.parametrize( + "test_input, expected", [((FAKE_ANNOT_BIO, FAKE_PREDICT_BIO), expected_scores)] ) -def test_run(test_input, expected) : +def test_run(test_input, expected): assert evaluate.run(*test_input) == expected -def test_run_empty_bio() : - with pytest.raises(Exception) : + +def test_run_empty_bio(): + with pytest.raises(Exception): evaluate.run(EMPTY_BIO, EMPTY_BIO) -def test_run_empty_entry() : - with pytest.raises(TypeError) : + +def test_run_empty_entry(): + with pytest.raises(TypeError): evaluate.run(None, None) diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000000000000000000000000000000000000..9d9d14c3c00c474a7ebc5999373c8d5e4cf24f0e --- /dev/null +++ b/tox.ini @@ -0,0 +1,11 @@ +[tox] +envlist = nerval +skipsdist=True + +[testenv:nerval] +commands = + pytest {posargs} + +deps = + pytest + -rrequirements.txt
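Note for reviewers: a minimal sketch of the matching rule implemented in `nerval/evaluate.py`, assuming the same `THRESHOLD` constant and the `editdistance` package pinned in `requirements.txt`. The helper `is_match` is hypothetical (it does not exist in the codebase), the sample strings reuse the README's `writer-`/`writear` example, and the exact point at which `compute_matches` measures the reference length may differ slightly.

```
# Illustrative sketch only -- not part of the diff above.
# A predicted entity counts as recognised when the edit distance to the
# reference entity is below 30% of the reference entity length.
import editdistance

THRESHOLD = 0.30  # same constant as in nerval/evaluate.py


def is_match(entity_ref: str, entity_predict: str) -> bool:
    # Strip the '-' characters inserted by the alignment, as compute_matches does
    entity_ref = entity_ref.replace("-", "")
    entity_predict = entity_predict.replace("-", "")
    return editdistance.eval(entity_ref, entity_predict) / len(entity_ref) < THRESHOLD


print(is_match("writer-", "writear"))  # 1 edit over 6 characters (~0.17) -> True
print(is_match("Paris", "Pxrxs"))  # 2 edits over 5 characters (0.40) -> False
```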
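The same scores are also reachable without the CLI, which is what `tests/test_run.py` exercises: `evaluate.run` returns a dict mapping each tag (plus `All`) to its `P`, `R` and `F1` values. A usage sketch, using the demo paths from the README's CLI example (adjust them to your own BIO files):

```
# Usage sketch: calling the evaluator from Python rather than the `nerval` CLI.
from nerval import evaluate

scores = evaluate.run("demo/toy_test_annot.bio", "demo/toy_test_predict.bio")

for tag, measures in scores.items():
    # P, R or F1 may be None when a tag is absent from one of the two files
    print(tag, measures["P"], measures["R"], measures["F1"])
```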