Commit 6544a3c7 authored by Eva Bardou, committed by Yoann Schneider

Add support for Python 3.12 + Finish the migration to `pyproject.toml`

parent f97736ab
1 merge request: !57 Add support for Python 3.12 + Finish the migration to `pyproject.toml`
Pipeline #203647 passed
@@ -10,7 +10,7 @@ cache:
linter:
stage: test
image: python:3.10
image: python:3.12-slim
cache:
paths:
@@ -26,12 +26,15 @@ linter:
before_script:
- pip install pre-commit
# Install git
- apt-get update -q -y && apt-get install -q -y --no-install-recommends git
script:
- pre-commit run -a
tests:
stage: test
image: python:3.10
image: python:3.12-slim
cache:
paths:
@@ -54,11 +57,11 @@ bump-python-deps:
- schedules
script:
- devops python-deps requirements.txt
- devops python-deps pyproject.toml
deploy-pypi:
stage: release
image: python:3.10
image: python:3.12
only:
- tags
......
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.8.3
hooks:
# Run the linter.
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
# Run the formatter.
- id: ruff-format
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
rev: v5.0.0
hooks:
- id: check-ast
- id: check-docstring-first
@@ -16,22 +25,19 @@ repos:
- id: name-tests-test
args: ['--django']
- id: check-json
- id: requirements-txt-fixer
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.1.6
hooks:
# Run the linter.
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
# Run the formatter.
- id: ruff-format
- id: check-toml
- repo: https://github.com/codespell-project/codespell
rev: v2.2.6
rev: v2.3.0
hooks:
- id: codespell
args: ['--write-changes']
exclude: '\.bio$'
- repo: meta
hooks:
- id: check-useless-excludes
\ No newline at end of file
- id: check-useless-excludes
- repo: https://gitlab.teklia.com/tools/pre-commit-hooks
rev: 0.1.0
hooks:
- id: long-test-files
args: ['1000']
files: '^tests\/(.*\/)?test_[^\/]*\.py$'
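
Note: the `files` pattern of the `long-test-files` hook only targets test modules under `tests/`. A quick, illustrative sanity check of that regex (not part of the repository):

import re

PATTERN = re.compile(r"^tests\/(.*\/)?test_[^\/]*\.py$")

# Matches test modules at any depth under tests/
assert PATTERN.match("tests/test_align.py")
assert PATTERN.match("tests/unit/test_parse.py")
# Does not match conftest.py or modules outside tests/
assert not PATTERN.match("tests/conftest.py")
assert not PATTERN.match("nerval/test_utils.py")
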
@@ -10,10 +10,10 @@ def threshold_float_type(arg):
"""Type function for argparse."""
try:
f = float(arg)
except ValueError:
raise argparse.ArgumentTypeError("Must be a floating point number.")
except ValueError as e:
raise argparse.ArgumentTypeError("Must be a floating point number.") from e
if f < 0 or f > 1:
raise argparse.ArgumentTypeError("Must be between 0 and 1.")
raise argparse.ArgumentTypeError("Must be between 0 and 1.") from None
return f
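
The added `from e` / `from None` clauses make exception chaining explicit (ruff's flake8-bugbear rule B904 flags a bare `raise` of a new exception inside an `except` block). A minimal, standalone illustration of the two forms (hypothetical names, not project code):

def parse_ratio(text: str) -> float:
    try:
        return float(text)
    except ValueError as e:
        # Chaining: the original ValueError stays visible as __cause__
        raise RuntimeError(f"Not a number: {text!r}") from e

def parse_ratio_quiet(text: str) -> float:
    try:
        return float(text)
    except ValueError:
        # Suppressing: `from None` hides the implicit exception context
        raise RuntimeError(f"Not a number: {text!r}") from None
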
......
@@ -212,16 +212,14 @@ def get_labels_aligned(original: str, aligned: str, labels_original: list) -> li
last_label = NOT_ENTITY_TAG
# Inspecting aligned string
for i, char in enumerate(aligned):
# new_label = ""
for char in aligned:
# If original string has been fully processed, rest of labels are "O" ('-' characters at aligned end)
if index_original >= len(original):
new_label = NOT_ENTITY_TAG
# If current aligned char does not match current original char ('-' characters in aligned)
# Keep last_label and don't increment index_original
elif not char == original[index_original]:
elif char != original[index_original]:
new_label = (
last_label
if get_position_label(last_label) not in BEGINNING_POS
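
For context, the `aligned` string walked here is a gapped version of the input text, where '-' marks positions inserted by the sequence alignment. A rough sketch of how such a pair can be produced with edlib (one of the declared dependencies); exact usage inside nerval may differ:

import edlib

annotation = "Georges"
prediction = "Georges Washington"

result = edlib.align(annotation, prediction, task="path")
nice = edlib.getNiceAlignment(result, annotation, prediction)

# Both strings now have equal length; '-' fills the gaps introduced
# by the alignment, and labels are propagated across those positions.
print(nice["query_aligned"])
print(nice["target_aligned"])
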
@@ -321,7 +319,9 @@ def evaluate(annotation: dict, prediction: dict, threshold: int) -> dict:
# Compute scores
scores = compute_scores(
annotation["entity_count"], prediction["entity_count"], matches
annotation["entity_count"],
prediction["entity_count"],
matches,
)
return scores
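
For reference, `compute_scores` receives the per-type entity counts from both files plus the matches found above; entity-level precision, recall, and F1 are conventionally derived from those three quantities. A rough sketch of the arithmetic (illustrative only, not nerval's actual implementation):

def prf(annotated: int, predicted: int, matched: int) -> dict:
    # Guard against empty annotation or prediction sets
    precision = matched / predicted if predicted else 0.0
    recall = matched / annotated if annotated else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return {"P": precision, "R": recall, "F1": f1}
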
@@ -389,7 +389,7 @@ def run_multiple(file_csv: Path, folder: Path, threshold: int, verbose: bool):
if not (annot and predict):
raise Exception(
f"No file found for files {row[ANNO_COLUMN]}, {row[PRED_COLUMN]}"
f"No file found for files {row[ANNO_COLUMN]}, {row[PRED_COLUMN]}",
)
count += 1
......
@@ -16,8 +16,10 @@ def get_type_label(label: str) -> str:
"""
try:
tag = NOT_ENTITY_TAG if label == NOT_ENTITY_TAG else REGEX_LABEL.match(label)[1]
except TypeError:
raise (Exception(f"The label {label} is not valid in BIOES/BIOLU format."))
except TypeError as e:
raise (
Exception(f"The label {label} is not valid in BIOES/BIOLU format.")
) from e
return tag
@@ -33,8 +35,8 @@ def get_position_label(label: str) -> str:
if label == NOT_ENTITY_TAG
else re.match(r"([BIESLU])-(.*)$", label)[1]
)
except TypeError:
raise Exception(f"The label {label} is not valid in BIOES/BIOLU format.")
except TypeError as e:
raise Exception(f"The label {label} is not valid in BIOES/BIOLU format.") from e
return pos
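
Both helpers split a BIOES/BIOLU label into a position prefix (B, I, E, S, L, U) and an entity type; a label without a valid prefix makes `re.match` return `None`, hence the `TypeError` that gets re-raised as a clearer error. Roughly (illustrative; the project's `REGEX_LABEL` for the type part may be defined slightly differently):

import re

label = "B-PER"
position = re.match(r"([BIESLU])-(.*)$", label)[1]     # "B"
entity_type = re.match(r"([BIESLU])-(.*)$", label)[2]  # "PER"

# A malformed label such as "PER" does not match, re.match returns None,
# and subscripting None raises the TypeError handled above.
assert re.match(r"([BIESLU])-(.*)$", "PER") is None
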
@@ -46,8 +48,10 @@ def parse_line(index: int, line: str):
assert match_iob, f"Line {line} does not match IOB regex"
return match_iob.group(1, 2)
except AssertionError:
raise Exception(f"The file is not in BIO format: check line {index} ({line})")
except AssertionError as e:
raise Exception(
f"The file is not in BIO format: check line {index} ({line})"
) from e
def parse_bio(lines: list[str]) -> dict:
@@ -64,7 +68,7 @@ def parse_bio(lines: list[str]) -> dict:
if "§" in " ".join(lines):
raise (
Exception(
"§ found in input file. Since this character is used in a specific way during evaluation, please remove it from files."
"§ found in input file. Since this character is used in a specific way during evaluation, please remove it from files.",
)
)
@@ -154,7 +158,7 @@ def parse_bio(lines: list[str]) -> dict:
result["entity_count"] = entity_count
assert len(result["words"]) == len(
result["labels"]
result["labels"],
), f'Found {len(result["words"])} word(s) for {len(result["labels"])} label(s)'
for tag in result["entity_count"]:
if tag != ALL_ENTITIES:
......
@@ -6,27 +6,39 @@ build-backend = "setuptools.build_meta"
name = "teklia-nerval"
version = "0.3.3rc3"
description = "Tool to evaluate NER on noisy text."
dynamic = ["dependencies"]
dependencies = [
"editdistance==0.8.1",
"edlib==1.3.9.post1",
"prettytable==3.9.0",
]
authors = [
{ name = "Teklia", email = "contact@teklia.com" },
]
maintainers = [
{ name = "Teklia", email = "contact@teklia.com" },
]
readme = { file = "README.md", content-type = "text/markdown" }
requires-python = ">=3.10"
readme = { file = "README.md", content-type = "text/markdown" }
keywords = ["python"]
classifiers = [
# Specify the Python versions you support here.
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
]
[project.scripts]
nerval = "nerval.cli:main"
"nerval" = "nerval.cli:main"
[tool.setuptools]
packages = ["nerval"]
[tool.setuptools.dynamic]
dependencies = { file = ["requirements.txt"] }
[tool.ruff]
exclude = [".git", "__pycache__"]
target-version = "py312"
[tool.ruff.lint]
ignore = [
"E501",
# Conflicts with the formatter
@@ -42,26 +54,42 @@ select = [
"T1",
# Isort
"I",
# Pyupgrade
# Implicit Optional
"RUF013",
# Invalid pyproject.toml
"RUF200",
# pyupgrade
"UP",
# Pandas-vet
# pandas-vet
"PD",
# Flake8-comprehension
"C4",
# Flake8-builtins
# flake8-bugbear
"B",
# flake8-builtins
"A",
# flake8-commas
"COM",
# flake8-comprehension
"C4",
# flake8-import-conventions
"ICN",
# flake8-raise
"RSE",
# flake8-quotes
"Q",
# flake8-raise
"RSE",
# flake8-simplify
"SIM",
# flake8-unused-arguments
"ARG",
# flake8-use-pathlib
"PTH",
# flake8-pytest-style
"PT",
# flake8-use-pathlib
"PTH",
]
[tool.ruff.lint.per-file-ignores]
# Ignore `pytest-composite-assertion` rules of `flake8-pytest-style` linter for non-test files
"nerval/**/*.py" = ["PT018"]
[tool.ruff.lint.isort]
known-first-party = []
known-third-party = ["pytest", "setuptools", "editdistance", "edlib", "prettytable"]
editdistance==0.6.2
edlib==1.3.9
prettytable==3.9.0
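
The three pins above are the old `requirements.txt`, removed by this commit: dependencies are now declared statically under `[project].dependencies` in `pyproject.toml` (with updated versions), so the `dynamic = ["dependencies"]` / `[tool.setuptools.dynamic]` indirection is no longer needed and `pip install .` resolves everything from `pyproject.toml` alone. They can also be read programmatically with the standard library; a small illustration (requires Python 3.11+ for `tomllib`):

import tomllib
from pathlib import Path

pyproject = tomllib.loads(Path("pyproject.toml").read_text())
for requirement in pyproject["project"]["dependencies"]:
    print(requirement)
# editdistance==0.8.1
# edlib==1.3.9.post1
# prettytable==3.9.0
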
@@ -5,56 +5,56 @@ import pytest
FIXTURES = Path(__file__).parent / "fixtures"
@pytest.fixture()
@pytest.fixture
def fake_annot_bio():
return FIXTURES / "test_annot.bio"
@pytest.fixture()
@pytest.fixture
def fake_annot_with_empty_lines_bio():
return FIXTURES / "test_annot_with_empty_lines.bio"
@pytest.fixture()
@pytest.fixture
def fake_predict_bio():
return FIXTURES / "test_predict.bio"
@pytest.fixture()
@pytest.fixture
def empty_bio():
return FIXTURES / "test_empty.bio"
@pytest.fixture()
@pytest.fixture
def bad_bio():
return FIXTURES / "test_bad.bio"
@pytest.fixture()
@pytest.fixture
def bioeslu_bio():
return FIXTURES / "bioeslu.bio"
@pytest.fixture()
@pytest.fixture
def end_of_file_bio():
return FIXTURES / "end_of_file.bio"
@pytest.fixture()
@pytest.fixture
def nested_bio():
return FIXTURES / "test_nested.bio"
@pytest.fixture()
@pytest.fixture
def folder_bio():
return FIXTURES
@pytest.fixture()
@pytest.fixture
def csv_file_error():
return FIXTURES / "test_mapping_file_error.csv"
@pytest.fixture()
@pytest.fixture
def csv_file():
return FIXTURES / "test_mapping_file.csv"
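
The empty parentheses on `@pytest.fixture()` are dropped throughout: with no arguments the two spellings are equivalent, and ruff's flake8-pytest-style rule PT001 (part of the `PT` group selected above) flags the redundant parentheses by default in recent ruff versions. For example (illustrative, not a fixture from this repository):

import pytest

@pytest.fixture
def sample_bio(tmp_path):
    # Same behaviour as @pytest.fixture() with no arguments
    return tmp_path / "sample.bio"

def test_sample_bio(sample_bio):
    assert sample_bio.name == "sample.bio"
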
@@ -188,7 +188,8 @@ def test_parse_bio(test_input, expected):
def test_parse_bio_bad_input(bad_bio):
lines = bad_bio.read_text().strip().splitlines()
with pytest.raises(
Exception, match=re.escape("The file is not in BIO format: check line 1 (file)")
Exception,
match=re.escape("The file is not in BIO format: check line 1 (file)"),
):
evaluate.parse_bio(lines)
......
@@ -101,7 +101,8 @@ def test_run(annotation, prediction, expected):
def test_run_empty_bio(empty_bio):
with pytest.raises(
Exception, match="No content found in annotation or prediction files."
Exception,
match="No content found in annotation or prediction files.",
):
evaluate.run(empty_bio, empty_bio, 0.3, False)
@@ -116,13 +117,15 @@ def test_run_empty_entry():
def test_run_invalid_header(csv_file_error, folder_bio):
with pytest.raises(
Exception, match="Columns in the CSV mapping should be: Annotation,Prediction"
Exception,
match="Columns in the CSV mapping should be: Annotation,Prediction",
):
evaluate.run_multiple(csv_file_error, folder_bio, 0.3, False)
def test_run_multiple(csv_file, folder_bio):
with pytest.raises(
Exception, match="No file found for files demo_annot.bio, demo_predict.bio"
Exception,
match="No file found for files demo_annot.bio, demo_predict.bio",
):
evaluate.run_multiple(csv_file, folder_bio, 0.3, False)
@@ -6,7 +6,6 @@ testpaths = tests
addopts =
--cov-report=term-missing
[testenv]
commands =
pytest --cov=nerval {posargs}
@@ -16,4 +15,3 @@ deps =
pytest<8
pytest-lazy-fixture
pytest-cov
-rrequirements.txt