From 6544a3c7eb270a787129b0165103f9eafcf12784 Mon Sep 17 00:00:00 2001 From: Eva Bardou <bardou@teklia.com> Date: Mon, 16 Dec 2024 12:35:18 +0000 Subject: [PATCH] Add support for Python 3.12 + Finish the migration to `pyproject.toml` --- .gitlab-ci.yml | 11 +++++--- .pre-commit-config.yaml | 32 ++++++++++++++--------- nerval/cli.py | 6 ++--- nerval/evaluate.py | 12 ++++----- nerval/parse.py | 20 ++++++++------ pyproject.toml | 58 ++++++++++++++++++++++++++++++----------- requirements.txt | 3 --- tests/conftest.py | 22 ++++++++-------- tests/test_parse_bio.py | 3 ++- tests/test_run.py | 9 ++++--- tox.ini | 2 -- 11 files changed, 109 insertions(+), 69 deletions(-) delete mode 100644 requirements.txt diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 56fc607..bd83af3 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -10,7 +10,7 @@ cache: linter: stage: test - image: python:3.10 + image: python:3.12-slim cache: paths: @@ -26,12 +26,15 @@ linter: before_script: - pip install pre-commit + # Install git + - apt-get update -q -y && apt-get install -q -y --no-install-recommends git + script: - pre-commit run -a tests: stage: test - image: python:3.10 + image: python:3.12-slim cache: paths: @@ -54,11 +57,11 @@ bump-python-deps: - schedules script: - - devops python-deps requirements.txt + - devops python-deps pyproject.toml deploy-pypi: stage: release - image: python:3.10 + image: python:3.12 only: - tags diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9fe0e4e..6bc4567 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,15 @@ repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.8.3 + hooks: + # Run the linter. + - id: ruff + args: [--fix, --exit-non-zero-on-fix] + # Run the formatter. + - id: ruff-format - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.5.0 + rev: v5.0.0 hooks: - id: check-ast - id: check-docstring-first @@ -16,22 +25,19 @@ repos: - id: name-tests-test args: ['--django'] - id: check-json - - id: requirements-txt-fixer - - repo: https://github.com/astral-sh/ruff-pre-commit - # Ruff version. - rev: v0.1.6 - hooks: - # Run the linter. - - id: ruff - args: [--fix, --exit-non-zero-on-fix] - # Run the formatter. - - id: ruff-format + - id: check-toml - repo: https://github.com/codespell-project/codespell - rev: v2.2.6 + rev: v2.3.0 hooks: - id: codespell args: ['--write-changes'] exclude: '\.bio$' - repo: meta hooks: - - id: check-useless-excludes \ No newline at end of file + - id: check-useless-excludes + - repo: https://gitlab.teklia.com/tools/pre-commit-hooks + rev: 0.1.0 + hooks: + - id: long-test-files + args: ['1000'] + files: '^tests\/(.*\/)?test_[^\/]*\.py$' diff --git a/nerval/cli.py b/nerval/cli.py index f632206..0b838a1 100644 --- a/nerval/cli.py +++ b/nerval/cli.py @@ -10,10 +10,10 @@ def threshold_float_type(arg): """Type function for argparse.""" try: f = float(arg) - except ValueError: - raise argparse.ArgumentTypeError("Must be a floating point number.") + except ValueError as e: + raise argparse.ArgumentTypeError("Must be a floating point number.") from e if f < 0 or f > 1: - raise argparse.ArgumentTypeError("Must be between 0 and 1.") + raise argparse.ArgumentTypeError("Must be between 0 and 1.") from None return f diff --git a/nerval/evaluate.py b/nerval/evaluate.py index e89d621..4ca3cdc 100644 --- a/nerval/evaluate.py +++ b/nerval/evaluate.py @@ -212,16 +212,14 @@ def get_labels_aligned(original: str, aligned: str, labels_original: list) -> li last_label = NOT_ENTITY_TAG # Inspecting aligned string - for i, char in enumerate(aligned): - # new_label = "" - + for char in aligned: # If original string has been fully processed, rest of labels are "O" ('-' characters at aligned end) if index_original >= len(original): new_label = NOT_ENTITY_TAG # If current aligned char does not match current original char ('-' characters in aligned) # Keep last_label and don't increment index_original - elif not char == original[index_original]: + elif char != original[index_original]: new_label = ( last_label if get_position_label(last_label) not in BEGINNING_POS @@ -321,7 +319,9 @@ def evaluate(annotation: dict, prediction: dict, threshold: int) -> dict: # Compute scores scores = compute_scores( - annotation["entity_count"], prediction["entity_count"], matches + annotation["entity_count"], + prediction["entity_count"], + matches, ) return scores @@ -389,7 +389,7 @@ def run_multiple(file_csv: Path, folder: Path, threshold: int, verbose: bool): if not (annot and predict): raise Exception( - f"No file found for files {row[ANNO_COLUMN]}, {row[PRED_COLUMN]}" + f"No file found for files {row[ANNO_COLUMN]}, {row[PRED_COLUMN]}", ) count += 1 diff --git a/nerval/parse.py b/nerval/parse.py index abd4100..f3553be 100644 --- a/nerval/parse.py +++ b/nerval/parse.py @@ -16,8 +16,10 @@ def get_type_label(label: str) -> str: """ try: tag = NOT_ENTITY_TAG if label == NOT_ENTITY_TAG else REGEX_LABEL.match(label)[1] - except TypeError: - raise (Exception(f"The label {label} is not valid in BIOES/BIOLU format.")) + except TypeError as e: + raise ( + Exception(f"The label {label} is not valid in BIOES/BIOLU format.") + ) from e return tag @@ -33,8 +35,8 @@ def get_position_label(label: str) -> str: if label == NOT_ENTITY_TAG else re.match(r"([BIESLU])-(.*)$", label)[1] ) - except TypeError: - raise Exception(f"The label {label} is not valid in BIOES/BIOLU format.") + except TypeError as e: + raise Exception(f"The label {label} is not valid in BIOES/BIOLU format.") from e return pos @@ -46,8 +48,10 @@ def parse_line(index: int, line: str): assert match_iob, f"Line {line} does not match IOB regex" return match_iob.group(1, 2) - except AssertionError: - raise Exception(f"The file is not in BIO format: check line {index} ({line})") + except AssertionError as e: + raise Exception( + f"The file is not in BIO format: check line {index} ({line})" + ) from e def parse_bio(lines: list[str]) -> dict: @@ -64,7 +68,7 @@ def parse_bio(lines: list[str]) -> dict: if "§" in " ".join(lines): raise ( Exception( - "§ found in input file. Since this character is used in a specific way during evaluation, please remove it from files." + "§ found in input file. Since this character is used in a specific way during evaluation, please remove it from files.", ) ) @@ -154,7 +158,7 @@ def parse_bio(lines: list[str]) -> dict: result["entity_count"] = entity_count assert len(result["words"]) == len( - result["labels"] + result["labels"], ), f'Found {len(result["words"])} word(s) for {len(result["labels"])} label(s)' for tag in result["entity_count"]: if tag != ALL_ENTITIES: diff --git a/pyproject.toml b/pyproject.toml index b981d0f..9c72316 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,27 +6,39 @@ build-backend = "setuptools.build_meta" name = "teklia-nerval" version = "0.3.3rc3" description = "Tool to evaluate NER on noisy text." -dynamic = ["dependencies"] +dependencies = [ + "editdistance==0.8.1", + "edlib==1.3.9.post1", + "prettytable==3.9.0", +] authors = [ { name = "Teklia", email = "contact@teklia.com" }, ] maintainers = [ { name = "Teklia", email = "contact@teklia.com" }, ] -readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.10" +readme = { file = "README.md", content-type = "text/markdown" } +keywords = ["python"] +classifiers = [ + # Specify the Python versions you support here. + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] [project.scripts] -nerval = "nerval.cli:main" +"nerval" = "nerval.cli:main" [tool.setuptools] packages = ["nerval"] -[tool.setuptools.dynamic] -dependencies = { file = ["requirements.txt"] } - [tool.ruff] exclude = [".git", "__pycache__"] +target-version = "py312" + +[tool.ruff.lint] ignore = [ "E501", # Conflicts with the formatter @@ -42,26 +54,42 @@ select = [ "T1", # Isort "I", - # Pyupgrade + # Implicit Optional + "RUF013", + # Invalid pyproject.toml + "RUF200", + # pyupgrade "UP", - # Pandas-vet + # pandas-vet "PD", - # Flake8-comprehension - "C4", - # Flake8-builtins + # flake8-bugbear + "B", + # flake8-builtins "A", # flake8-commas "COM", + # flake8-comprehension + "C4", # flake8-import-conventions "ICN", - # flake8-raise - "RSE", # flake8-quotes "Q", + # flake8-raise + "RSE", + # flake8-simplify + "SIM", # flake8-unused-arguments "ARG", - # flake8-use-pathlib - "PTH", # flake8-pytest-style "PT", + # flake8-use-pathlib + "PTH", ] + +[tool.ruff.lint.per-file-ignores] +# Ignore `pytest-composite-assertion` rules of `flake8-pytest-style` linter for non-test files +"nerval/**/*.py" = ["PT018"] + +[tool.ruff.lint.isort] +known-first-party = [] +known-third-party = ["pytest", "setuptools", "editdistance", "edlib", "prettytable"] diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index ec0fabe..0000000 --- a/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -editdistance==0.6.2 -edlib==1.3.9 -prettytable==3.9.0 diff --git a/tests/conftest.py b/tests/conftest.py index fbce7f1..7261742 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,56 +5,56 @@ import pytest FIXTURES = Path(__file__).parent / "fixtures" -@pytest.fixture() +@pytest.fixture def fake_annot_bio(): return FIXTURES / "test_annot.bio" -@pytest.fixture() +@pytest.fixture def fake_annot_with_empty_lines_bio(): return FIXTURES / "test_annot_with_empty_lines.bio" -@pytest.fixture() +@pytest.fixture def fake_predict_bio(): return FIXTURES / "test_predict.bio" -@pytest.fixture() +@pytest.fixture def empty_bio(): return FIXTURES / "test_empty.bio" -@pytest.fixture() +@pytest.fixture def bad_bio(): return FIXTURES / "test_bad.bio" -@pytest.fixture() +@pytest.fixture def bioeslu_bio(): return FIXTURES / "bioeslu.bio" -@pytest.fixture() +@pytest.fixture def end_of_file_bio(): return FIXTURES / "end_of_file.bio" -@pytest.fixture() +@pytest.fixture def nested_bio(): return FIXTURES / "test_nested.bio" -@pytest.fixture() +@pytest.fixture def folder_bio(): return FIXTURES -@pytest.fixture() +@pytest.fixture def csv_file_error(): return FIXTURES / "test_mapping_file_error.csv" -@pytest.fixture() +@pytest.fixture def csv_file(): return FIXTURES / "test_mapping_file.csv" diff --git a/tests/test_parse_bio.py b/tests/test_parse_bio.py index 3266cea..cd7345e 100644 --- a/tests/test_parse_bio.py +++ b/tests/test_parse_bio.py @@ -188,7 +188,8 @@ def test_parse_bio(test_input, expected): def test_parse_bio_bad_input(bad_bio): lines = bad_bio.read_text().strip().splitlines() with pytest.raises( - Exception, match=re.escape("The file is not in BIO format: check line 1 (file)") + Exception, + match=re.escape("The file is not in BIO format: check line 1 (file)"), ): evaluate.parse_bio(lines) diff --git a/tests/test_run.py b/tests/test_run.py index 2e0c08f..421c51c 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -101,7 +101,8 @@ def test_run(annotation, prediction, expected): def test_run_empty_bio(empty_bio): with pytest.raises( - Exception, match="No content found in annotation or prediction files." + Exception, + match="No content found in annotation or prediction files.", ): evaluate.run(empty_bio, empty_bio, 0.3, False) @@ -116,13 +117,15 @@ def test_run_empty_entry(): def test_run_invalid_header(csv_file_error, folder_bio): with pytest.raises( - Exception, match="Columns in the CSV mapping should be: Annotation,Prediction" + Exception, + match="Columns in the CSV mapping should be: Annotation,Prediction", ): evaluate.run_multiple(csv_file_error, folder_bio, 0.3, False) def test_run_multiple(csv_file, folder_bio): with pytest.raises( - Exception, match="No file found for files demo_annot.bio, demo_predict.bio" + Exception, + match="No file found for files demo_annot.bio, demo_predict.bio", ): evaluate.run_multiple(csv_file, folder_bio, 0.3, False) diff --git a/tox.ini b/tox.ini index 2e05b97..7879514 100644 --- a/tox.ini +++ b/tox.ini @@ -6,7 +6,6 @@ testpaths = tests addopts = --cov-report=term-missing - [testenv] commands = pytest --cov=nerval {posargs} @@ -16,4 +15,3 @@ deps = pytest<8 pytest-lazy-fixture pytest-cov - -rrequirements.txt -- GitLab