diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 56fc6077e83fcc01dfa48f9d92ce370cbded0ca6..bd83af36e04214182b129f0807c7f16116b41e89 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -10,7 +10,7 @@ cache:
 
 linter:
   stage: test
-  image: python:3.10
+  image: python:3.12-slim
 
   cache:
     paths:
@@ -26,12 +26,15 @@ linter:
   before_script:
     - pip install pre-commit
 
+    # Install git
+    - apt-get update -q -y && apt-get install -q -y --no-install-recommends git
+
   script:
     - pre-commit run -a
 
 tests:
   stage: test
-  image: python:3.10
+  image: python:3.12-slim
 
   cache:
     paths:
@@ -54,11 +57,11 @@ bump-python-deps:
     - schedules
 
   script:
-    - devops python-deps requirements.txt
+    - devops python-deps pyproject.toml
 
 deploy-pypi:
   stage: release
-  image: python:3.10
+  image: python:3.12
 
   only:
     - tags
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9fe0e4ee1d907254b7b4f463f03aa617125b38c4..6bc45675cc8a4b9307a6e2efe6a507865ed4da5b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,15 @@
 repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    # Ruff version.
+    rev: v0.8.3
+    hooks:
+      # Run the linter.
+      - id: ruff
+        args: [--fix, --exit-non-zero-on-fix]
+      # Run the formatter.
+      - id: ruff-format
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v5.0.0
     hooks:
       - id: check-ast
       - id: check-docstring-first
@@ -16,22 +25,19 @@ repos:
       - id: name-tests-test
         args: ['--django']
       - id: check-json
-      - id: requirements-txt-fixer
-  - repo: https://github.com/astral-sh/ruff-pre-commit
-    # Ruff version.
-    rev: v0.1.6
-    hooks:
-      # Run the linter.
-      - id: ruff
-        args: [--fix, --exit-non-zero-on-fix]
-      # Run the formatter.
-      - id: ruff-format
+      - id: check-toml
   - repo: https://github.com/codespell-project/codespell
-    rev: v2.2.6
+    rev: v2.3.0
     hooks:
       - id: codespell
         args: ['--write-changes']
         exclude: '\.bio$'
   - repo: meta
     hooks:
-      - id: check-useless-excludes
\ No newline at end of file
+      - id: check-useless-excludes
+  - repo: https://gitlab.teklia.com/tools/pre-commit-hooks
+    rev: 0.1.0
+    hooks:
+      - id: long-test-files
+        args: ['1000']
+        files: '^tests\/(.*\/)?test_[^\/]*\.py$'
diff --git a/nerval/cli.py b/nerval/cli.py
index f63220640e0c4f4d3fcbc9ad118f2e15163dd73d..0b838a18b059cd3ea6a1af4fc6dda8b56b0c4c64 100644
--- a/nerval/cli.py
+++ b/nerval/cli.py
@@ -10,10 +10,10 @@ def threshold_float_type(arg):
     """Type function for argparse."""
     try:
         f = float(arg)
-    except ValueError:
-        raise argparse.ArgumentTypeError("Must be a floating point number.")
+    except ValueError as e:
+        raise argparse.ArgumentTypeError("Must be a floating point number.") from e
     if f < 0 or f > 1:
-        raise argparse.ArgumentTypeError("Must be between 0 and 1.")
+        raise argparse.ArgumentTypeError("Must be between 0 and 1.") from None
     return f
 
 
diff --git a/nerval/evaluate.py b/nerval/evaluate.py
index e89d62164f4a73544d1cafa5955452b5aef55e6a..4ca3cdcc9ac208261f35f5593fc8c049bb609359 100644
--- a/nerval/evaluate.py
+++ b/nerval/evaluate.py
@@ -212,16 +212,14 @@ def get_labels_aligned(original: str, aligned: str, labels_original: list) -> li
     last_label = NOT_ENTITY_TAG
 
     # Inspecting aligned string
-    for i, char in enumerate(aligned):
-        # new_label = ""
-
+    for char in aligned:
         # If original string has been fully processed, rest of labels are "O" ('-' characters at aligned end)
         if index_original >= len(original):
             new_label = NOT_ENTITY_TAG
 
         # If current aligned char does not match current original char ('-' characters in aligned)
         # Keep last_label and don't increment index_original
-        elif not char == original[index_original]:
+        elif char != original[index_original]:
             new_label = (
                 last_label
                 if get_position_label(last_label) not in BEGINNING_POS
@@ -321,7 +319,9 @@ def evaluate(annotation: dict, prediction: dict, threshold: int) -> dict:
 
     # Compute scores
     scores = compute_scores(
-        annotation["entity_count"], prediction["entity_count"], matches
+        annotation["entity_count"],
+        prediction["entity_count"],
+        matches,
     )
 
     return scores
@@ -389,7 +389,7 @@ def run_multiple(file_csv: Path, folder: Path, threshold: int, verbose: bool):
 
             if not (annot and predict):
                 raise Exception(
-                    f"No file found for files {row[ANNO_COLUMN]}, {row[PRED_COLUMN]}"
+                    f"No file found for files {row[ANNO_COLUMN]}, {row[PRED_COLUMN]}",
                 )
 
             count += 1
diff --git a/nerval/parse.py b/nerval/parse.py
index abd41008b6a705d57ac7612db71ae48b995d2b86..f3553bec155e1723bc2a85f749b0e9a527197d96 100644
--- a/nerval/parse.py
+++ b/nerval/parse.py
@@ -16,8 +16,10 @@ def get_type_label(label: str) -> str:
     """
     try:
         tag = NOT_ENTITY_TAG if label == NOT_ENTITY_TAG else REGEX_LABEL.match(label)[1]
-    except TypeError:
-        raise (Exception(f"The label {label} is not valid in BIOES/BIOLU format."))
+    except TypeError as e:
+        raise (
+            Exception(f"The label {label} is not valid in BIOES/BIOLU format.")
+        ) from e
     return tag
 
 
@@ -33,8 +35,8 @@
             if label == NOT_ENTITY_TAG
             else re.match(r"([BIESLU])-(.*)$", label)[1]
         )
-    except TypeError:
-        raise Exception(f"The label {label} is not valid in BIOES/BIOLU format.")
+    except TypeError as e:
+        raise Exception(f"The label {label} is not valid in BIOES/BIOLU format.") from e
     return pos
 
 
@@ -46,8 +48,10 @@ def parse_line(index: int, line: str):
 
         assert match_iob, f"Line {line} does not match IOB regex"
         return match_iob.group(1, 2)
-    except AssertionError:
-        raise Exception(f"The file is not in BIO format: check line {index} ({line})")
+    except AssertionError as e:
+        raise Exception(
+            f"The file is not in BIO format: check line {index} ({line})"
+        ) from e
 
 
 def parse_bio(lines: list[str]) -> dict:
@@ -64,7 +68,7 @@
     if "§" in " ".join(lines):
         raise (
             Exception(
-                "§ found in input file. Since this character is used in a specific way during evaluation, please remove it from files."
+                "§ found in input file. Since this character is used in a specific way during evaluation, please remove it from files.",
             )
         )
 
@@ -154,7 +158,7 @@
     result["entity_count"] = entity_count
 
     assert len(result["words"]) == len(
-        result["labels"]
+        result["labels"],
     ), f'Found {len(result["words"])} word(s) for {len(result["labels"])} label(s)'
     for tag in result["entity_count"]:
         if tag != ALL_ENTITIES:
diff --git a/pyproject.toml b/pyproject.toml
index b981d0f530bbde239c6cf00a9552ad8bc240f61a..9c72316ed6643839a2a6eef312a25da234cdbf68 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,27 +6,39 @@ build-backend = "setuptools.build_meta"
 name = "teklia-nerval"
 version = "0.3.3rc3"
 description = "Tool to evaluate NER on noisy text."
-dynamic = ["dependencies"]
+dependencies = [
+    "editdistance==0.8.1",
+    "edlib==1.3.9.post1",
+    "prettytable==3.9.0",
+]
 authors = [
     { name = "Teklia", email = "contact@teklia.com" },
 ]
 maintainers = [
     { name = "Teklia", email = "contact@teklia.com" },
 ]
-readme = { file = "README.md", content-type = "text/markdown" }
 requires-python = ">=3.10"
+readme = { file = "README.md", content-type = "text/markdown" }
+keywords = ["python"]
+classifiers = [
+    # Specify the Python versions you support here.
+    "Programming Language :: Python :: 3 :: Only",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+]
 
 [project.scripts]
-nerval = "nerval.cli:main"
+"nerval" = "nerval.cli:main"
 
 [tool.setuptools]
 packages = ["nerval"]
 
-[tool.setuptools.dynamic]
-dependencies = { file = ["requirements.txt"] }
-
 [tool.ruff]
 exclude = [".git", "__pycache__"]
+target-version = "py312"
+
+[tool.ruff.lint]
 ignore = [
     "E501",
     # Conflicts with the formatter
@@ -42,26 +54,42 @@ select = [
     "T1",
     # Isort
     "I",
-    # Pyupgrade
+    # Implicit Optional
+    "RUF013",
+    # Invalid pyproject.toml
+    "RUF200",
+    # pyupgrade
     "UP",
-    # Pandas-vet
+    # pandas-vet
     "PD",
-    # Flake8-comprehension
-    "C4",
-    # Flake8-builtins
+    # flake8-bugbear
+    "B",
+    # flake8-builtins
     "A",
     # flake8-commas
     "COM",
+    # flake8-comprehension
+    "C4",
     # flake8-import-conventions
     "ICN",
-    # flake8-raise
-    "RSE",
     # flake8-quotes
     "Q",
+    # flake8-raise
+    "RSE",
+    # flake8-simplify
+    "SIM",
     # flake8-unused-arguments
     "ARG",
-    # flake8-use-pathlib
-    "PTH",
     # flake8-pytest-style
     "PT",
+    # flake8-use-pathlib
+    "PTH",
 ]
+
+[tool.ruff.lint.per-file-ignores]
+# Ignore `pytest-composite-assertion` rules of `flake8-pytest-style` linter for non-test files
+"nerval/**/*.py" = ["PT018"]
+
+[tool.ruff.lint.isort]
+known-first-party = []
+known-third-party = ["pytest", "setuptools", "editdistance", "edlib", "prettytable"]
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index ec0fabeefc9613bf3a18807cedb95a4f5f2fb6a8..0000000000000000000000000000000000000000
--- a/requirements.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-editdistance==0.6.2
-edlib==1.3.9
-prettytable==3.9.0
diff --git a/tests/conftest.py b/tests/conftest.py
index fbce7f1a477bf0f4c128cc37d82aa53c902399d9..7261742f362ff6a92786aa5f839549793c83fc3b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -5,56 +5,56 @@ import pytest
 FIXTURES = Path(__file__).parent / "fixtures"
 
 
-@pytest.fixture()
+@pytest.fixture
 def fake_annot_bio():
     return FIXTURES / "test_annot.bio"
 
 
-@pytest.fixture()
+@pytest.fixture
 def fake_annot_with_empty_lines_bio():
     return FIXTURES / "test_annot_with_empty_lines.bio"
 
 
-@pytest.fixture()
+@pytest.fixture
 def fake_predict_bio():
     return FIXTURES / "test_predict.bio"
 
 
-@pytest.fixture()
+@pytest.fixture
 def empty_bio():
     return FIXTURES / "test_empty.bio"
 
 
-@pytest.fixture()
+@pytest.fixture
 def bad_bio():
     return FIXTURES / "test_bad.bio"
 
 
-@pytest.fixture()
+@pytest.fixture
 def bioeslu_bio():
     return FIXTURES / "bioeslu.bio"
 
 
-@pytest.fixture()
+@pytest.fixture
 def end_of_file_bio():
     return FIXTURES / "end_of_file.bio"
 
 
-@pytest.fixture()
+@pytest.fixture
 def nested_bio():
     return FIXTURES / "test_nested.bio"
 
 
-@pytest.fixture()
+@pytest.fixture
 def folder_bio():
     return FIXTURES
 
 
-@pytest.fixture()
+@pytest.fixture
 def csv_file_error():
     return FIXTURES / "test_mapping_file_error.csv"
 
 
-@pytest.fixture()
+@pytest.fixture
 def csv_file():
     return FIXTURES / "test_mapping_file.csv"
diff --git a/tests/test_parse_bio.py b/tests/test_parse_bio.py
index 3266cea690d778a1b955dcffce9c8fabebd58ace..cd7345e37a60036f56eba4af0c721f5fb83368e3 100644
--- a/tests/test_parse_bio.py
+++ b/tests/test_parse_bio.py
@@ -188,7 +188,8 @@ def test_parse_bio(test_input, expected):
 def test_parse_bio_bad_input(bad_bio):
     lines = bad_bio.read_text().strip().splitlines()
     with pytest.raises(
-        Exception, match=re.escape("The file is not in BIO format: check line 1 (file)")
+        Exception,
+        match=re.escape("The file is not in BIO format: check line 1 (file)"),
     ):
         evaluate.parse_bio(lines)
 
diff --git a/tests/test_run.py b/tests/test_run.py
index 2e0c08f69ac77c730813f19b647443457973246c..421c51cf7a681faa5b6c7ad3c16fc06cc12736f4 100644
--- a/tests/test_run.py
+++ b/tests/test_run.py
@@ -101,7 +101,8 @@ def test_run(annotation, prediction, expected):
 
 def test_run_empty_bio(empty_bio):
     with pytest.raises(
-        Exception, match="No content found in annotation or prediction files."
+        Exception,
+        match="No content found in annotation or prediction files.",
     ):
         evaluate.run(empty_bio, empty_bio, 0.3, False)
 
@@ -116,13 +117,15 @@ def test_run_empty_entry():
 
 def test_run_invalid_header(csv_file_error, folder_bio):
     with pytest.raises(
-        Exception, match="Columns in the CSV mapping should be: Annotation,Prediction"
+        Exception,
+        match="Columns in the CSV mapping should be: Annotation,Prediction",
     ):
         evaluate.run_multiple(csv_file_error, folder_bio, 0.3, False)
 
 
 def test_run_multiple(csv_file, folder_bio):
     with pytest.raises(
-        Exception, match="No file found for files demo_annot.bio, demo_predict.bio"
+        Exception,
+        match="No file found for files demo_annot.bio, demo_predict.bio",
     ):
         evaluate.run_multiple(csv_file, folder_bio, 0.3, False)
diff --git a/tox.ini b/tox.ini
index 2e05b9739e2c0288f142b93c0a6a9c10975569bb..78795149e72c15b3eee2aac75c6ec1e605935176 100644
--- a/tox.ini
+++ b/tox.ini
@@ -6,7 +6,6 @@ testpaths =
     tests
 addopts =
     --cov-report=term-missing
-
 [testenv]
 commands =
     pytest --cov=nerval {posargs}
@@ -16,4 +15,3 @@ deps =
     pytest<8
     pytest-lazy-fixture
     pytest-cov
-    -rrequirements.txt