From 6544a3c7eb270a787129b0165103f9eafcf12784 Mon Sep 17 00:00:00 2001
From: Eva Bardou <bardou@teklia.com>
Date: Mon, 16 Dec 2024 12:35:18 +0000
Subject: [PATCH] Add support for Python 3.12 + Finish the migration to
 `pyproject.toml`

---
 .gitlab-ci.yml          | 11 +++++---
 .pre-commit-config.yaml | 32 ++++++++++++++---------
 nerval/cli.py           |  6 ++---
 nerval/evaluate.py      | 12 ++++-----
 nerval/parse.py         | 20 ++++++++------
 pyproject.toml          | 58 ++++++++++++++++++++++++++++++-----------
 requirements.txt        |  3 ---
 tests/conftest.py       | 22 ++++++++--------
 tests/test_parse_bio.py |  3 ++-
 tests/test_run.py       |  9 ++++---
 tox.ini                 |  2 --
 11 files changed, 109 insertions(+), 69 deletions(-)
 delete mode 100644 requirements.txt

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 56fc607..bd83af3 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -10,7 +10,7 @@ cache:
 
 linter:
   stage: test
-  image: python:3.10
+  image: python:3.12-slim
 
   cache:
     paths:
@@ -26,12 +26,15 @@ linter:
   before_script:
     - pip install pre-commit
 
+    # Install git: the slim image does not ship it, and pre-commit needs it to run
+    - apt-get update -q -y && apt-get install -q -y --no-install-recommends git
+
   script:
     - pre-commit run -a
 
 tests:
   stage: test
-  image: python:3.10
+  image: python:3.12-slim
 
   cache:
     paths:
@@ -54,11 +57,11 @@ bump-python-deps:
     - schedules
 
   script:
-    - devops python-deps requirements.txt
+    - devops python-deps pyproject.toml
 
 deploy-pypi:
   stage: release
-  image: python:3.10
+  image: python:3.12
 
   only:
     - tags
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9fe0e4e..6bc4567 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,15 @@
 repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    # Ruff version.
+    rev: v0.8.3
+    hooks:
+      # Run the linter.
+      - id: ruff
+        args: [--fix, --exit-non-zero-on-fix]
+      # Run the formatter.
+      - id: ruff-format
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v5.0.0
     hooks:
       - id: check-ast
       - id: check-docstring-first
@@ -16,22 +25,19 @@ repos:
       - id: name-tests-test
         args: ['--django']
       - id: check-json
-      - id: requirements-txt-fixer
-  - repo: https://github.com/astral-sh/ruff-pre-commit
-    # Ruff version.
-    rev: v0.1.6
-    hooks:
-      # Run the linter.
-      - id: ruff
-        args: [--fix, --exit-non-zero-on-fix]
-      # Run the formatter.
-      - id: ruff-format
+      - id: check-toml
   - repo: https://github.com/codespell-project/codespell
-    rev: v2.2.6
+    rev: v2.3.0
     hooks:
       - id: codespell
         args: ['--write-changes']
         exclude: '\.bio$'
   - repo: meta
     hooks:
-      - id: check-useless-excludes
\ No newline at end of file
+      - id: check-useless-excludes
+  - repo: https://gitlab.teklia.com/tools/pre-commit-hooks
+    rev: 0.1.0
+    hooks:
+      - id: long-test-files
+        args: ['1000']
+        files: '^tests\/(.*\/)?test_[^\/]*\.py$'
diff --git a/nerval/cli.py b/nerval/cli.py
index f632206..0b838a1 100644
--- a/nerval/cli.py
+++ b/nerval/cli.py
@@ -10,10 +10,10 @@ def threshold_float_type(arg):
     """Type function for argparse."""
     try:
         f = float(arg)
-    except ValueError:
-        raise argparse.ArgumentTypeError("Must be a floating point number.")
+    except ValueError as e:
+        raise argparse.ArgumentTypeError("Must be a floating point number.") from e
     if f < 0 or f > 1:
-        raise argparse.ArgumentTypeError("Must be between 0 and 1.")
+        raise argparse.ArgumentTypeError("Must be between 0 and 1.") from None
     return f
 
 
diff --git a/nerval/evaluate.py b/nerval/evaluate.py
index e89d621..4ca3cdc 100644
--- a/nerval/evaluate.py
+++ b/nerval/evaluate.py
@@ -212,16 +212,14 @@ def get_labels_aligned(original: str, aligned: str, labels_original: list) -> li
     last_label = NOT_ENTITY_TAG
 
     # Inspecting aligned string
-    for i, char in enumerate(aligned):
-        # new_label = ""
-
+    for char in aligned:
         # If original string has been fully processed, rest of labels are "O" ('-' characters at aligned end)
         if index_original >= len(original):
             new_label = NOT_ENTITY_TAG
 
         # If current aligned char does not match current original char ('-' characters in aligned)
         # Keep last_label and don't increment index_original
-        elif not char == original[index_original]:
+        elif char != original[index_original]:
             new_label = (
                 last_label
                 if get_position_label(last_label) not in BEGINNING_POS
@@ -321,7 +319,9 @@ def evaluate(annotation: dict, prediction: dict, threshold: int) -> dict:
 
     # Compute scores
     scores = compute_scores(
-        annotation["entity_count"], prediction["entity_count"], matches
+        annotation["entity_count"],
+        prediction["entity_count"],
+        matches,
     )
     return scores
 
@@ -389,7 +389,7 @@ def run_multiple(file_csv: Path, folder: Path, threshold: int, verbose: bool):
 
         if not (annot and predict):
             raise Exception(
-                f"No file found for files {row[ANNO_COLUMN]}, {row[PRED_COLUMN]}"
+                f"No file found for files {row[ANNO_COLUMN]}, {row[PRED_COLUMN]}",
             )
 
         count += 1
diff --git a/nerval/parse.py b/nerval/parse.py
index abd4100..f3553be 100644
--- a/nerval/parse.py
+++ b/nerval/parse.py
@@ -16,8 +16,10 @@ def get_type_label(label: str) -> str:
     """
     try:
         tag = NOT_ENTITY_TAG if label == NOT_ENTITY_TAG else REGEX_LABEL.match(label)[1]
-    except TypeError:
-        raise (Exception(f"The label {label} is not valid in BIOES/BIOLU format."))
+    except TypeError as e:
+        raise (
+            Exception(f"The label {label} is not valid in BIOES/BIOLU format.")
+        ) from e
 
     return tag
 
@@ -33,8 +35,8 @@ def get_position_label(label: str) -> str:
             if label == NOT_ENTITY_TAG
             else re.match(r"([BIESLU])-(.*)$", label)[1]
         )
-    except TypeError:
-        raise Exception(f"The label {label} is not valid in BIOES/BIOLU format.")
+    except TypeError as e:
+        raise Exception(f"The label {label} is not valid in BIOES/BIOLU format.") from e
 
     return pos
 
@@ -46,8 +48,10 @@ def parse_line(index: int, line: str):
         assert match_iob, f"Line {line} does not match IOB regex"
 
         return match_iob.group(1, 2)
-    except AssertionError:
-        raise Exception(f"The file is not in BIO format: check line {index} ({line})")
+    except AssertionError as e:
+        raise Exception(
+            f"The file is not in BIO format: check line {index} ({line})"
+        ) from e
 
 
 def parse_bio(lines: list[str]) -> dict:
@@ -64,7 +68,7 @@ def parse_bio(lines: list[str]) -> dict:
     if "§" in " ".join(lines):
         raise (
             Exception(
-                "§ found in input file. Since this character is used in a specific way during evaluation, please remove it from files."
+                "§ found in input file. Since this character is used in a specific way during evaluation, please remove it from files.",
             )
         )
 
@@ -154,7 +158,7 @@ def parse_bio(lines: list[str]) -> dict:
         result["entity_count"] = entity_count
 
         assert len(result["words"]) == len(
-            result["labels"]
+            result["labels"],
         ), f'Found {len(result["words"])} word(s) for {len(result["labels"])} label(s)'
         for tag in result["entity_count"]:
             if tag != ALL_ENTITIES:
diff --git a/pyproject.toml b/pyproject.toml
index b981d0f..9c72316 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,27 +6,39 @@ build-backend = "setuptools.build_meta"
 name = "teklia-nerval"
 version = "0.3.3rc3"
 description = "Tool to evaluate NER on noisy text."
-dynamic = ["dependencies"]
+dependencies = [
+    "editdistance==0.8.1",
+    "edlib==1.3.9.post1",
+    "prettytable==3.9.0",
+]
 authors = [
     { name = "Teklia", email = "contact@teklia.com" },
 ]
 maintainers = [
     { name = "Teklia", email = "contact@teklia.com" },
 ]
-readme = { file = "README.md", content-type = "text/markdown" }
 requires-python = ">=3.10"
+readme = { file = "README.md", content-type = "text/markdown" }
+keywords = ["python"]
+classifiers = [
+    # Specify the Python versions you support here.
+    "Programming Language :: Python :: 3 :: Only",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+]
 
 [project.scripts]
-nerval = "nerval.cli:main"
+"nerval" = "nerval.cli:main"
 
 [tool.setuptools]
 packages = ["nerval"]
 
-[tool.setuptools.dynamic]
-dependencies = { file = ["requirements.txt"] }
-
 [tool.ruff]
 exclude = [".git", "__pycache__"]
+target-version = "py312"
+
+[tool.ruff.lint]
 ignore = [
     "E501",
     # Conflicts with the formatter
@@ -42,26 +54,42 @@ select = [
     "T1",
     # Isort
     "I",
-    # Pyupgrade
+    # Implicit Optional
+    "RUF013",
+    # Invalid pyproject.toml
+    "RUF200",
+    # pyupgrade
     "UP",
-    # Pandas-vet
+    # pandas-vet
     "PD",
-    # Flake8-comprehension
-    "C4",
-    # Flake8-builtins
+    # flake8-bugbear
+    "B",
+    # flake8-builtins
     "A",
     # flake8-commas
     "COM",
+    # flake8-comprehension
+    "C4",
     # flake8-import-conventions
     "ICN",
-    # flake8-raise
-    "RSE",
     # flake8-quotes
     "Q",
+    # flake8-raise
+    "RSE",
+    # flake8-simplify
+    "SIM",
     # flake8-unused-arguments
     "ARG",
-    # flake8-use-pathlib
-    "PTH",
     # flake8-pytest-style
     "PT",
+    # flake8-use-pathlib
+    "PTH",
 ]
+
+[tool.ruff.lint.per-file-ignores]
+# Ignore `pytest-composite-assertion` rules of `flake8-pytest-style` linter for non-test files
+"nerval/**/*.py" = ["PT018"]
+
+[tool.ruff.lint.isort]
+known-first-party = []
+known-third-party = ["pytest", "setuptools", "editdistance", "edlib", "prettytable"]
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index ec0fabe..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-editdistance==0.6.2
-edlib==1.3.9
-prettytable==3.9.0
diff --git a/tests/conftest.py b/tests/conftest.py
index fbce7f1..7261742 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -5,56 +5,56 @@ import pytest
 FIXTURES = Path(__file__).parent / "fixtures"
 
 
-@pytest.fixture()
+@pytest.fixture
 def fake_annot_bio():
     return FIXTURES / "test_annot.bio"
 
 
-@pytest.fixture()
+@pytest.fixture
 def fake_annot_with_empty_lines_bio():
     return FIXTURES / "test_annot_with_empty_lines.bio"
 
 
-@pytest.fixture()
+@pytest.fixture
 def fake_predict_bio():
     return FIXTURES / "test_predict.bio"
 
 
-@pytest.fixture()
+@pytest.fixture
 def empty_bio():
     return FIXTURES / "test_empty.bio"
 
 
-@pytest.fixture()
+@pytest.fixture
 def bad_bio():
     return FIXTURES / "test_bad.bio"
 
 
-@pytest.fixture()
+@pytest.fixture
 def bioeslu_bio():
     return FIXTURES / "bioeslu.bio"
 
 
-@pytest.fixture()
+@pytest.fixture
 def end_of_file_bio():
     return FIXTURES / "end_of_file.bio"
 
 
-@pytest.fixture()
+@pytest.fixture
 def nested_bio():
     return FIXTURES / "test_nested.bio"
 
 
-@pytest.fixture()
+@pytest.fixture
 def folder_bio():
     return FIXTURES
 
 
-@pytest.fixture()
+@pytest.fixture
 def csv_file_error():
     return FIXTURES / "test_mapping_file_error.csv"
 
 
-@pytest.fixture()
+@pytest.fixture
 def csv_file():
     return FIXTURES / "test_mapping_file.csv"
diff --git a/tests/test_parse_bio.py b/tests/test_parse_bio.py
index 3266cea..cd7345e 100644
--- a/tests/test_parse_bio.py
+++ b/tests/test_parse_bio.py
@@ -188,7 +188,8 @@ def test_parse_bio(test_input, expected):
 def test_parse_bio_bad_input(bad_bio):
     lines = bad_bio.read_text().strip().splitlines()
     with pytest.raises(
-        Exception, match=re.escape("The file is not in BIO format: check line 1 (file)")
+        Exception,
+        match=re.escape("The file is not in BIO format: check line 1 (file)"),
     ):
         evaluate.parse_bio(lines)
 
diff --git a/tests/test_run.py b/tests/test_run.py
index 2e0c08f..421c51c 100644
--- a/tests/test_run.py
+++ b/tests/test_run.py
@@ -101,7 +101,8 @@ def test_run(annotation, prediction, expected):
 
 def test_run_empty_bio(empty_bio):
     with pytest.raises(
-        Exception, match="No content found in annotation or prediction files."
+        Exception,
+        match="No content found in annotation or prediction files.",
     ):
         evaluate.run(empty_bio, empty_bio, 0.3, False)
 
@@ -116,13 +117,15 @@ def test_run_empty_entry():
 
 def test_run_invalid_header(csv_file_error, folder_bio):
     with pytest.raises(
-        Exception, match="Columns in the CSV mapping should be: Annotation,Prediction"
+        Exception,
+        match="Columns in the CSV mapping should be: Annotation,Prediction",
     ):
         evaluate.run_multiple(csv_file_error, folder_bio, 0.3, False)
 
 
 def test_run_multiple(csv_file, folder_bio):
     with pytest.raises(
-        Exception, match="No file found for files demo_annot.bio, demo_predict.bio"
+        Exception,
+        match="No file found for files demo_annot.bio, demo_predict.bio",
     ):
         evaluate.run_multiple(csv_file, folder_bio, 0.3, False)
diff --git a/tox.ini b/tox.ini
index 2e05b97..7879514 100644
--- a/tox.ini
+++ b/tox.ini
@@ -6,7 +6,6 @@ testpaths = tests
 addopts =
     --cov-report=term-missing
 
-
 [testenv]
 commands =
   pytest --cov=nerval {posargs}
@@ -16,4 +15,3 @@ deps =
   pytest<8
   pytest-lazy-fixture
   pytest-cov
-  -rrequirements.txt
-- 
GitLab