From d13301ebaf8c868101cb7343d397ac58e8f01791 Mon Sep 17 00:00:00 2001 From: Yoann Schneider <yschneider@teklia.com> Date: Fri, 24 Nov 2023 11:12:11 +0100 Subject: [PATCH] Setup ruff for linting/formatting --- .flake8 | 7 -- .isort.cfg | 10 --- .pre-commit-config.yaml | 31 +++---- nerval/__init__.py | 1 - nerval/cli.py | 7 +- nerval/evaluate.py | 34 ++++--- nerval/parse.py | 5 +- nerval/utils.py | 5 +- pyproject.toml | 36 ++++++++ setup.py | 3 +- tests/conftest.py | 1 - tests/test_align.py | 3 +- tests/test_compute_matches.py | 137 ++++++++++++++-------------- tests/test_compute_scores.py | 3 +- tests/test_get_labels_aligned.py | 147 +++++++++++++++---------------- tests/test_parse_bio.py | 1 - tests/test_run.py | 1 - 17 files changed, 223 insertions(+), 209 deletions(-) delete mode 100644 .flake8 delete mode 100644 .isort.cfg create mode 100644 pyproject.toml diff --git a/.flake8 b/.flake8 deleted file mode 100644 index ed02b76..0000000 --- a/.flake8 +++ /dev/null @@ -1,7 +0,0 @@ -[flake8] -max-line-length = 120 -exclude=build,.cache,.eggs,.git,src/zeep,front -# Flake8 ignores multiple errors by default; -# the only interesting ignore is W503, which goes against PEP8. 
-# See https://lintlyci.github.io/Flake8Rules/rules/W503.html -ignore = E203,E501,W503 \ No newline at end of file diff --git a/.isort.cfg b/.isort.cfg deleted file mode 100644 index 4fec7ef..0000000 --- a/.isort.cfg +++ /dev/null @@ -1,10 +0,0 @@ -[settings] -# Compatible with black -multi_line_output = 3 -include_trailing_comma = True -force_grid_wrap = 0 -use_parentheses = True -line_length = 120 - -default_section=FIRSTPARTY -known_third_party = editdistance,edlib,pytest,setuptools,prettytable diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9a3c7b8..9fe0e4e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,21 +1,6 @@ repos: - - repo: https://github.com/PyCQA/isort - rev: 5.12.0 - hooks: - - id: isort - - repo: https://github.com/ambv/black - rev: 23.1.0 - hooks: - - id: black - - repo: https://github.com/pycqa/flake8 - rev: 6.0.0 - hooks: - - id: flake8 - additional_dependencies: - - 'flake8-coding==1.3.2' - - 'flake8-debugger==4.1.2' - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.5.0 hooks: - id: check-ast - id: check-docstring-first @@ -26,17 +11,27 @@ repos: - id: trailing-whitespace - id: check-yaml args: [--allow-multiple-documents] + - id: check-toml - id: mixed-line-ending - id: name-tests-test args: ['--django'] - id: check-json - id: requirements-txt-fixer + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.1.6 + hooks: + # Run the linter. + - id: ruff + args: [--fix, --exit-non-zero-on-fix] + # Run the formatter. 
+ - id: ruff-format - repo: https://github.com/codespell-project/codespell - rev: v2.2.2 + rev: v2.2.6 hooks: - id: codespell args: ['--write-changes'] exclude: '\.bio$' - repo: meta hooks: - - id: check-useless-excludes + - id: check-useless-excludes \ No newline at end of file diff --git a/nerval/__init__.py b/nerval/__init__.py index b74e888..22615ee 100644 --- a/nerval/__init__.py +++ b/nerval/__init__.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- import logging logging.basicConfig( diff --git a/nerval/cli.py b/nerval/cli.py index ef4dbb1..f632206 100644 --- a/nerval/cli.py +++ b/nerval/cli.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- import argparse from pathlib import Path @@ -70,18 +69,18 @@ def main(): if args.annot: if not args.predict: raise argparse.ArgumentTypeError( - "You need to specify the path to a predict file with -p" + "You need to specify the path to a predict file with -p", ) run(args.annot, args.predict, args.threshold, args.verbose) elif args.csv: if not args.folder: raise argparse.ArgumentTypeError( - "You need to specify the path to a folder of bio files with -f" + "You need to specify the path to a folder of bio files with -f", ) run_multiple(args.csv, args.folder, args.threshold, args.verbose) else: raise argparse.ArgumentTypeError( - "You need to specify the argument of input file" + "You need to specify the argument of input file", ) diff --git a/nerval/evaluate.py b/nerval/evaluate.py index 0efe5b4..e72fd08 100644 --- a/nerval/evaluate.py +++ b/nerval/evaluate.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - import logging import os from csv import reader @@ -138,13 +136,19 @@ def compute_matches( ): if not found_aligned_end: rest_predict, visited = look_for_further_entity_part( - i + 1, tag_ref, prediction, labels_predict + i + 1, + tag_ref, + prediction, + labels_predict, ) current_compar += rest_predict visited_predict += visited rest_annot, visited = look_for_further_entity_part( - i + 1, tag_ref, annotation, labels_annot + i + 1, + 
tag_ref, + annotation, + labels_annot, ) current_ref += rest_annot visited_annot += visited @@ -221,7 +225,9 @@ def get_labels_aligned(original: str, aligned: str, labels_original: list) -> li def compute_scores( - annot_tags_count: dict, predict_tags_count: dict, matches: dict + annot_tags_count: dict, + predict_tags_count: dict, + matches: dict, ) -> dict: """Compute Precision, Recall and F1 score for all entity types found in annotation and prediction. @@ -270,7 +276,9 @@ def evaluate(annotation: dict, prediction: dict, threshold: int) -> dict: # Align annotation and prediction align_result = edlib.align(annotation["words"], prediction["words"], task="path") nice_alignment = edlib.getNiceAlignment( - align_result, annotation["words"], prediction["words"] + align_result, + annotation["words"], + prediction["words"], ) annot_aligned = nice_alignment["query_aligned"] @@ -278,10 +286,14 @@ def evaluate(annotation: dict, prediction: dict, threshold: int) -> dict: # Align labels from string alignment labels_annot_aligned = get_labels_aligned( - annotation["words"], annot_aligned, annotation["labels"] + annotation["words"], + annot_aligned, + annotation["labels"], ) labels_predict_aligned = get_labels_aligned( - prediction["words"], predict_aligned, prediction["labels"] + prediction["words"], + predict_aligned, + prediction["labels"], ) # Get nb match @@ -334,7 +346,7 @@ def run(annotation: Path, prediction: Path, threshold: int, verbose: bool) -> di def run_multiple(file_csv, folder, threshold, verbose): """Run the program for multiple files (correlation indicated in the csv file)""" # Read the csv in a list - with open(file_csv, "r") as read_obj: + with open(file_csv) as read_obj: csv_reader = reader(read_obj) list_cor = list(csv_reader) @@ -375,8 +387,8 @@ def run_multiple(file_csv, folder, threshold, verbose): round(precision / count, 3), round(recall / count, 3), round(f1 / count, 3), - ] - ] + ], + ], ) print(table) else: diff --git a/nerval/parse.py 
b/nerval/parse.py index c4736fa..040d9fc 100644 --- a/nerval/parse.py +++ b/nerval/parse.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- import re from typing import List @@ -64,7 +63,7 @@ def parse_bio(lines: List[str]) -> dict: if "§" in " ".join(lines): raise ( Exception( - "§ found in input file. Since this character is used in a specific way during evaluation, prease remove it from files." + "§ found in input file. Since this character is used in a specific way during evaluation, please remove it from files." ) ) @@ -145,7 +144,7 @@ def parse_bio(lines: List[str]) -> dict: result = None if words: - result = dict() + result = {} result["words"] = " ".join(words) result["labels"] = labels result["entity_count"] = entity_count diff --git a/nerval/utils.py b/nerval/utils.py index 21c6c20..01d4f83 100644 --- a/nerval/utils.py +++ b/nerval/utils.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- from prettytable import MARKDOWN, PrettyTable @@ -23,7 +22,7 @@ def print_results(scores: dict): rec, f1, scores[tag]["Support"], - ] + ], ) table = PrettyTable() @@ -44,7 +43,7 @@ def print_result_compact(scores: dict): round(scores["All"]["R"], 3), round(scores["All"]["F1"], 3), scores["All"]["Support"], - ] + ], ] table = PrettyTable() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..447804e --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,36 @@ +[tool.ruff] +exclude = [".git", "__pycache__"] +ignore = [ + "E501", + # Conflicts with the formatter + "COM812" +] +select = [ + # pycodestyle + "E", + "W", + # Pyflakes + "F", + # Flake8 Debugger + "T1", + # Isort + "I", + # Pyupgrade + "UP", + # Pandas-vet + "PD", + # Flake8-comprehension + "C4", + # Flake8-builtins + "A", + # flake8-commas + "COM", + # flake8-import-conventions + "ICN", + # flake8-raise + "RSE", + # flake8-quotes + "Q", + # flake8-unused-arguments + "ARG", +] diff --git a/setup.py b/setup.py index 427ffda..c15ce58 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# 
-*- coding: utf-8 -*- from pathlib import Path from setuptools import find_packages, setup @@ -22,7 +21,7 @@ def parse_requirements(): path = Path(__file__).parent.resolve() / "requirements.txt" assert path.exists(), f"Missing requirements: {path}" return list( - map(parse_requirements_line, map(str.strip, path.read_text().splitlines())) + map(parse_requirements_line, map(str.strip, path.read_text().splitlines())), ) diff --git a/tests/conftest.py b/tests/conftest.py index d73df4f..5381d25 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- from pathlib import Path import pytest diff --git a/tests/test_align.py b/tests/test_align.py index a18c199..35be3ba 100644 --- a/tests/test_align.py +++ b/tests/test_align.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- import edlib import pytest @@ -9,7 +8,7 @@ import pytest ( "Gérard de Nerval was born in Paris in 1808 .", "G*rard de *N*erval bo*rn in Paris in 1833 *.", - ) + ), ], ) def test_align(query, target): diff --git a/tests/test_compute_matches.py b/tests/test_compute_matches.py index 1f62ce0..270c448 100644 --- a/tests/test_compute_matches.py +++ b/tests/test_compute_matches.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- import pytest from nerval import evaluate @@ -8,81 +7,81 @@ THRESHOLD = 0.30 # fmt: off fake_tags_aligned_nested_perfect = [ - 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', - 'I-PER', - 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', - 'I-PER', - 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', - 'O', - 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', - 'O', - 'O' + "B-PER", "I-PER", "I-PER", "I-PER", "I-PER", + "I-PER", + "I-PER", "I-PER", "I-PER", + "I-PER", + 
"I-PER", "I-PER", + "I-PER", + "I-PER", "I-PER", "I-PER", "I-PER", "I-PER", + "I-PER", + "I-PER", "I-PER", + "I-PER", + "I-PER", "I-PER", "I-PER", "I-PER", + "I-PER", + "I-PER", "I-PER", "I-PER", + "I-PER", + "I-PER", "I-PER", + "I-PER", + "B-LOC", "I-LOC", "I-LOC", "I-LOC", "I-LOC", "I-LOC", + "I-PER", + "I-PER", "I-PER", + "I-PER", + "I-PER", "I-PER", + "O", + "B-LOC", "I-LOC", "I-LOC", "I-LOC", "I-LOC", "I-LOC", "I-LOC", + "O", + "O", ] fake_tags_aligned_nested_false = [ - 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', - 'O', - 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', - 'O', - 'O' + "B-PER", "I-PER", "I-PER", "I-PER", "I-PER", + "I-PER", + "I-PER", "I-PER", "I-PER", + "I-PER", + "I-PER", "I-PER", + "I-PER", + "I-PER", "I-PER", "I-PER", "I-PER", "I-PER", + "I-PER", + "I-PER", "I-PER", + "I-PER", + "I-PER", "I-PER", "I-PER", "I-PER", + "I-PER", + "I-PER", "I-PER", "I-PER", + "I-PER", + "I-PER", "I-PER", + "I-PER", + "I-PER", "I-PER", "I-PER", "I-PER", "I-PER", "I-PER", + "I-PER", + "I-PER", "I-PER", + "I-PER", + "I-PER", "I-PER", + "O", + "B-LOC", "I-LOC", "I-LOC", "I-LOC", "I-LOC", "I-LOC", "I-LOC", + "O", + "O", ] fake_predict_tags_aligned = [ - 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', - 'O', - 'O', 'O', 'O', 'O', - 'O', 'O', 'O', 'O', 'O', - 'O', - 'O', 'O', - 'O', - 'B-***', 'I-***', 'I-***', 'I-***', 'I-***', - 'O', - 'O', 'O', - 'O', - 'B-DAT', 'I-DAT', 'I-DAT', 'I-DAT', - 'O', - 'O', 'O' + "B-PER", "I-PER", 
"I-PER", "I-PER", "I-PER", "I-PER", + "I-PER", + "I-PER", "I-PER", + "I-PER", + "I-PER", "I-PER", "I-PER", "I-PER", "I-PER", "I-PER", "I-PER", "I-PER", + "O", + "O", "O", "O", "O", + "O", "O", "O", "O", "O", + "O", + "O", "O", + "O", + "B-***", "I-***", "I-***", "I-***", "I-***", + "O", + "O", "O", + "O", + "B-DAT", "I-DAT", "I-DAT", "I-DAT", + "O", + "O", "O", ] # fmt: on diff --git a/tests/test_compute_scores.py b/tests/test_compute_scores.py index e4dc730..d51af08 100644 --- a/tests/test_compute_scores.py +++ b/tests/test_compute_scores.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- import pytest from nerval import evaluate @@ -11,7 +10,7 @@ from nerval import evaluate {"All": 3, "DAT": 1, "LOC": 1, "PER": 1}, {"All": 3, "DAT": 1, "***": 1, "PER": 1}, {"All": 1, "PER": 1, "LOC": 0, "DAT": 0}, - ) + ), ], ) def test_compute_scores(annot, predict, matches): diff --git a/tests/test_get_labels_aligned.py b/tests/test_get_labels_aligned.py index a2d45bb..b4daa39 100644 --- a/tests/test_get_labels_aligned.py +++ b/tests/test_get_labels_aligned.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- import pytest from nerval import evaluate @@ -11,88 +10,88 @@ fake_predict_aligned = "G*rard de *N*erval ----bo*rn in Paris in 1833 *." 
# fmt: off fake_annot_tags_original = [ - 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', - 'O', - 'O', 'O', 'O', - 'O', - 'O', 'O', 'O', 'O', - 'O', - 'O', 'O', - 'O', - 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', - 'O', - 'O', 'O', - 'O', - 'B-DAT', 'I-DAT', 'I-DAT', 'I-DAT', - 'O', - 'O' + "B-PER", "I-PER", "I-PER", "I-PER", "I-PER", "I-PER", + "I-PER", + "I-PER", "I-PER", + "I-PER", + "I-PER", "I-PER", "I-PER", "I-PER", "I-PER", "I-PER", + "O", + "O", "O", "O", + "O", + "O", "O", "O", "O", + "O", + "O", "O", + "O", + "B-LOC", "I-LOC", "I-LOC", "I-LOC", "I-LOC", + "O", + "O", "O", + "O", + "B-DAT", "I-DAT", "I-DAT", "I-DAT", + "O", + "O", ] fake_predict_tags_original = [ - 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', - 'O', - 'O', 'O', 'O', 'O', 'O', - 'O', - 'O', 'O', - 'O', - '***', '***', '***', '***', '***', - 'O', - 'O', 'O', - 'O', - 'B-DAT', 'I-DAT', 'I-DAT', 'I-DAT', - 'O', - 'O', 'O' + "B-PER", "I-PER", "I-PER", "I-PER", "I-PER", "I-PER", + "I-PER", + "I-PER", "I-PER", + "I-PER", + "I-PER", "I-PER", "I-PER", "I-PER", "I-PER", "I-PER", "I-PER", "I-PER", + "O", + "O", "O", "O", "O", "O", + "O", + "O", "O", + "O", + "***", "***", "***", "***", "***", + "O", + "O", "O", + "O", + "B-DAT", "I-DAT", "I-DAT", "I-DAT", + "O", + "O", "O", ] expected_annot_tags_aligned = [ - 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', - 'O', - 'O', 'O', 'O', - 'O', - 'O', 'O', 'O', 'O', 'O', - 'O', - 'O', 'O', - 'O', - 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', - 'O', - 'O', 'O', - 'O', - 'B-DAT', 'I-DAT', 'I-DAT', 'I-DAT', - 'O', - 'O', 'O' + "B-PER", "I-PER", "I-PER", "I-PER", "I-PER", "I-PER", + "I-PER", + "I-PER", "I-PER", + 
"I-PER", + "I-PER", "I-PER", "I-PER", "I-PER", "I-PER", "I-PER", "I-PER", "I-PER", + "O", + "O", "O", "O", + "O", + "O", "O", "O", "O", "O", + "O", + "O", "O", + "O", + "B-LOC", "I-LOC", "I-LOC", "I-LOC", "I-LOC", + "O", + "O", "O", + "O", + "B-DAT", "I-DAT", "I-DAT", "I-DAT", + "O", + "O", "O", ] expected_predict_tags_aligned = [ - 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', - 'I-PER', - 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', - 'O', - 'O', 'O', 'O', 'O', - 'O', 'O', 'O', 'O', 'O', - 'O', - 'O', 'O', - 'O', - '***', '***', '***', '***', '***', - 'O', - 'O', 'O', - 'O', - 'B-DAT', 'I-DAT', 'I-DAT', 'I-DAT', - 'O', - 'O', 'O' + "B-PER", "I-PER", "I-PER", "I-PER", "I-PER", "I-PER", + "I-PER", + "I-PER", "I-PER", + "I-PER", + "I-PER", "I-PER", "I-PER", "I-PER", "I-PER", "I-PER", "I-PER", "I-PER", + "O", + "O", "O", "O", "O", + "O", "O", "O", "O", "O", + "O", + "O", "O", + "O", + "***", "***", "***", "***", "***", + "O", + "O", "O", + "O", + "B-DAT", "I-DAT", "I-DAT", "I-DAT", + "O", + "O", "O", ] # fmt: on diff --git a/tests/test_parse_bio.py b/tests/test_parse_bio.py index 1089795..66f8957 100644 --- a/tests/test_parse_bio.py +++ b/tests/test_parse_bio.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- import pytest from nerval import evaluate diff --git a/tests/test_run.py b/tests/test_run.py index b88595a..2381dd6 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- import pytest from nerval import evaluate -- GitLab