diff --git a/bio_parser/__init__.py b/bio_parser/__init__.py index e142d5a75614f7be2c8b9f89afe60064e177db3c..7769488795dbb6de906c6e5895bd001f322c6f60 100644 --- a/bio_parser/__init__.py +++ b/bio_parser/__init__.py @@ -17,3 +17,6 @@ logging.basicConfig( # Add colorful tracebacks to crash with elegance # https://rich.readthedocs.io/en/latest/traceback.html traceback.install() + +# Reserved name for global statistics +GLOBAL_STAT_NAME = "total" diff --git a/bio_parser/parse/exceptions.py b/bio_parser/parse/exceptions.py new file mode 100644 index 0000000000000000000000000000000000000000..6009799d4369d4c47384ab365fb5cb9cf5bc8a47 --- /dev/null +++ b/bio_parser/parse/exceptions.py @@ -0,0 +1,38 @@ +"""Exceptions raised during file parsing.""" +from pathlib import Path + + +class FileProcessingError(Exception): + """Raised when a problem is encountered while parsing a file.""" + + filename: Path + """ + Path of the file being processed. + """ + + def __init__(self, filename: Path, *args: object) -> None: + super().__init__(*args) + self.filename = filename + + +class InvalidFile(FileProcessingError): + """Raised when the file is not valid.""" + + def __str__(self) -> str: + return f"BIO file {self.filename} is not valid" + + +class ForbiddenEntityName(FileProcessingError): + """Raised when an entity in the file uses the name reserved for global statistics.""" + + entity_name: str + """ + Forbidden entity name encountered. + """ + + def __init__(self, filename: Path, entity_name: str, *args: object) -> None: + super().__init__(filename=filename, *args) + self.entity_name = entity_name + + def __str__(self) -> str: + return f"Invalid entity name {self.entity_name}: reserved for global statistics ({self.filename})." 
diff --git a/bio_parser/utils.py b/bio_parser/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a5f485cb51097d800e7555cb8e2e9670b413f256 --- /dev/null +++ b/bio_parser/utils.py @@ -0,0 +1,112 @@ +"""Utility functions.""" + +import logging +from operator import attrgetter +from pathlib import Path + +from bio_parser.parse.document import Document +from bio_parser.parse.exceptions import ForbiddenEntityName, InvalidFile + +logger = logging.getLogger(__name__) + + +def check_complete(labels: list[Path], predictions: list[Path]): + """Check that each label BIO file has a corresponding prediction BIO file and each prediction BIO file has a corresponding label BIO file. Otherwise raise an error. + + Args: + labels: List of sorted label BIO files. + predictions: List of sorted prediction BIO files. + """ + # List filenames in prediction and label directories. + label_filenames = {label.name for label in labels} + prediction_filenames = {prediction.name for prediction in predictions} + + # Raise an error if there are any missing files. + if label_filenames != prediction_filenames: + messages = [] + missing_label_files = prediction_filenames.difference(label_filenames) + missing_pred_files = label_filenames.difference(prediction_filenames) + if len(missing_pred_files) > 0: + messages.append(f"Missing prediction files: {missing_pred_files}.") + if len(missing_label_files) > 0: + messages.append(f"Missing label files: {missing_label_files}.") + raise FileNotFoundError("\n".join(messages)) + + +def check_valid_bio( + bio_files: list[Path], global_stat_name: str | None = None +) -> list[Document]: + """Check that BIO files exist and are valid. + + Args: + bio_files (list[Path]): List of BIO files to check + global_stat_name (str | None, optional): Forbid an entity name. Defaults to None. + + Raises: + FileNotFoundError: A file could not be found. + InvalidFile: A file could not be parsed as a valid BIO document. + ForbiddenEntityName: Forbidden entity name used in a file. 
+ + Returns: + list[Document]: The parsed documents, in the same order as `bio_files`. + """ + parsed = [] + for filename in bio_files: + # Raise an error if the document does not exist + if not filename.exists(): + raise FileNotFoundError( + f"BIO file {filename} does not exist.", + ) + + # Raise an error if the document is not valid + try: + document = Document.from_file(filename) + except Exception as e: + raise InvalidFile(filename) from e + + # Raise an error if an entity is named global_stat_name + if global_stat_name and global_stat_name in { + entity[0] for entity in document.entities + }: + raise ForbiddenEntityName(filename=filename, entity_name=global_stat_name) + parsed.append(document) + return parsed + + +def load_dataset( + label_dir: Path, + prediction_dir: Path, +) -> list[tuple[Document, Document]]: + """Load BIO files for a given dataset. + + Args: + label_dir (Path): Path to the label directory. + prediction_dir (Path): Path to prediction directory. + + Returns: + list[tuple[Document, Document]]: A list of tuples containing the label and corresponding prediction Documents. 
+ """ + sorted_labels = sorted(label_dir.glob("*.bio"), key=attrgetter("name")) + sorted_predictions = sorted(prediction_dir.glob("*.bio"), key=attrgetter("name")) + + # Check if a directory is empty + if not (sorted_labels and sorted_predictions): + messages = [] + if not sorted_labels: + messages.append(f"Empty label directory: {label_dir}.") + if not sorted_predictions: + messages.append(f"Empty prediction directory: {prediction_dir}.") + raise FileNotFoundError("\n".join(messages)) + + # Check that the dataset is complete and valid + check_complete(sorted_labels, sorted_predictions) + + logger.info("Loading labels...") + labels = check_valid_bio(sorted_labels) + + logger.info("Loading prediction...") + predictions = check_valid_bio(sorted_predictions) + + logger.info("The dataset is complete and valid.") + # Return each label and prediction Document couple + return list(zip(labels, predictions)) diff --git a/docs/reference/parse/exceptions.md b/docs/reference/parse/exceptions.md new file mode 100644 index 0000000000000000000000000000000000000000..8f3ef960e179ed7f197186cb151d86cbc206cb0d --- /dev/null +++ b/docs/reference/parse/exceptions.md @@ -0,0 +1,3 @@ +# Exceptions + +::: bio_parser.parse.exceptions \ No newline at end of file diff --git a/docs/reference/utils.md b/docs/reference/utils.md new file mode 100644 index 0000000000000000000000000000000000000000..972dce9f72788955ca360f5a0be679e55fa83ea7 --- /dev/null +++ b/docs/reference/utils.md @@ -0,0 +1,3 @@ +# Utils + +::: bio_parser.utils \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index bc0dff87b554bb20d8d38e6086c895c4565695b0..e7eb4efe29d2dd5faf4a18e5051d2529ec044370 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,6 +53,8 @@ ignore = [ "D417", # May cause some conflicts "COM812", + # Missing docstring in __init__ and other magic methods + "D105", "D107", ] select = [ # pycodestyle diff --git a/tests/__init__.py b/tests/__init__.py index 
e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..c0a662153efc99ebddde92082e540481eb46e360 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -0,0 +1,3 @@ +from pathlib import Path + +FIXTURES = Path(__file__).with_name("fixtures") diff --git a/tests/conftest.py b/tests/conftest.py deleted file mode 100644 index c0a662153efc99ebddde92082e540481eb46e360..0000000000000000000000000000000000000000 --- a/tests/conftest.py +++ /dev/null @@ -1,3 +0,0 @@ -from pathlib import Path - -FIXTURES = Path(__file__).with_name("fixtures") diff --git a/tests/fixtures/utils/bad_entity_name.bio b/tests/fixtures/utils/bad_entity_name.bio new file mode 100644 index 0000000000000000000000000000000000000000..82e85d7c66ce7d7f5ce9c5b65c49d1ded28b3625 --- /dev/null +++ b/tests/fixtures/utils/bad_entity_name.bio @@ -0,0 +1,5 @@ +This 0 +entity 0 +type B-total +is I-total +reserved I-total \ No newline at end of file diff --git a/tests/fixtures/utils/bad_format.bio b/tests/fixtures/utils/bad_format.bio new file mode 100644 index 0000000000000000000000000000000000000000..416a0b6e5c55a12743303476ac8997da401bd6ca --- /dev/null +++ b/tests/fixtures/utils/bad_format.bio @@ -0,0 +1,5 @@ +This 0 +is 0 +a 0 +bad I-adj +format 0 \ No newline at end of file diff --git a/tests/fixtures/utils/labels/example_0.bio b/tests/fixtures/utils/labels/example_0.bio new file mode 100644 index 0000000000000000000000000000000000000000..a922b118f09a51046ef179df4b6b99a5bff2d4a0 --- /dev/null +++ b/tests/fixtures/utils/labels/example_0.bio @@ -0,0 +1,5 @@ +What O +a O +nice O +toolbox O +! 
O \ No newline at end of file diff --git a/tests/fixtures/utils/labels/example_1.bio b/tests/fixtures/utils/labels/example_1.bio new file mode 100644 index 0000000000000000000000000000000000000000..122f1c0d2c5e2d1b430d7debe4498b2a4187cb6b --- /dev/null +++ b/tests/fixtures/utils/labels/example_1.bio @@ -0,0 +1,37 @@ +Dissapte O +a O +5 O +rebere O +de O +Joan B-husband_name +Massuet B-husband_surname +pages B-husband_occupation +del O +regne B-husband_location +de I-husband_location +frança I-husband_location +ha= O +bitant O +en O +Collsabadell B-husband_location +ab O +Joana B-wife_name +donsella B-wife_state +filla O +de O +Bathomeu B-wifes_father_name +Pi= B-wifes_father_surname +joan I-wifes_father_surname +texidor B-wifes_father_occupation +de I-wifes_father_occupation +llana I-wifes_father_occupation +de O +Sta B-wifes_father_location +Maria I-wifes_father_location +de I-wifes_father_location +Palau I-wifes_father_location +tordera I-wifes_father_location +y O +de O +Elisabeth B-wifes_mother_name +defuncts O \ No newline at end of file diff --git a/tests/fixtures/utils/labels/example_2.bio b/tests/fixtures/utils/labels/example_2.bio new file mode 100644 index 0000000000000000000000000000000000000000..f9a0330e1c0c5b7026ecc2588b3f19271f88037f --- /dev/null +++ b/tests/fixtures/utils/labels/example_2.bio @@ -0,0 +1,88 @@ +So O +he O +put O +up O +for O +the O +night B-time +at O +The B-fac +Admiral's I-fac +Head I-fac +, O +that O +famous O +Portsmouth B-gpe +hostelry O +, O +second B-ordinal +only O +in O +historic O +interest O +to O +The B-fac +George I-fac +, O +unhappily O +destroyed O +by O +German B-norp +bombs O +during O +the O +last O +war O +. O +Having O +deposited O +his O +baggage O +and O +unpacked O +his O +overnight-bag O +he O +went O +in O +search O +of O +a O +drink O +. 
O +The O +lower O +bar O +was O +empty O +, O +save O +for O +the O +lady O +known O +by O +all O +habitue O +*?2s O +as O +' O +Seaweed B-per +' O +, O +and O +a O +youngish O +, O +sharp-eyed O +man O +who O +was O +staring O +moodily O +into O +a O +gin O +and O +tonic O +. O \ No newline at end of file diff --git a/tests/fixtures/utils/labels/example_3.bio b/tests/fixtures/utils/labels/example_3.bio new file mode 100644 index 0000000000000000000000000000000000000000..c8eecd1e5d6a06edd5c22282dc0dfa06701acbd5 --- /dev/null +++ b/tests/fixtures/utils/labels/example_3.bio @@ -0,0 +1,19 @@ +SAINT-LOUIS B-intitule +en I-intitule +l'ISLE I-intitule +(Les I-intitule +administrateurs I-intitule +de I-intitule +la I-intitule +compagnie I-intitule +de I-intitule +charité I-intitule +des I-intitule +pauvres I-intitule +de I-intitule +l'église I-intitule +royale I-intitule +de) I-intitule +X1A B-cote_serie +4701 B-cote_article +41 B-precisions_sur_cote \ No newline at end of file diff --git a/tests/fixtures/utils/labels/example_4.bio b/tests/fixtures/utils/labels/example_4.bio new file mode 100644 index 0000000000000000000000000000000000000000..d49037bcf492ed37a220cb0e1205ceced9549e22 --- /dev/null +++ b/tests/fixtures/utils/labels/example_4.bio @@ -0,0 +1,5 @@ +d° B-surname +Jeannine B-firstname +17 B-birth_date +P B-location_of_birth +f B-link \ No newline at end of file diff --git a/tests/fixtures/utils/predictions/example_0.bio b/tests/fixtures/utils/predictions/example_0.bio new file mode 100644 index 0000000000000000000000000000000000000000..a922b118f09a51046ef179df4b6b99a5bff2d4a0 --- /dev/null +++ b/tests/fixtures/utils/predictions/example_0.bio @@ -0,0 +1,5 @@ +What O +a O +nice O +toolbox O +! 
O \ No newline at end of file diff --git a/tests/fixtures/utils/predictions/example_1.bio b/tests/fixtures/utils/predictions/example_1.bio new file mode 100644 index 0000000000000000000000000000000000000000..122f1c0d2c5e2d1b430d7debe4498b2a4187cb6b --- /dev/null +++ b/tests/fixtures/utils/predictions/example_1.bio @@ -0,0 +1,37 @@ +Dissapte O +a O +5 O +rebere O +de O +Joan B-husband_name +Massuet B-husband_surname +pages B-husband_occupation +del O +regne B-husband_location +de I-husband_location +frança I-husband_location +ha= O +bitant O +en O +Collsabadell B-husband_location +ab O +Joana B-wife_name +donsella B-wife_state +filla O +de O +Bathomeu B-wifes_father_name +Pi= B-wifes_father_surname +joan I-wifes_father_surname +texidor B-wifes_father_occupation +de I-wifes_father_occupation +llana I-wifes_father_occupation +de O +Sta B-wifes_father_location +Maria I-wifes_father_location +de I-wifes_father_location +Palau I-wifes_father_location +tordera I-wifes_father_location +y O +de O +Elisabeth B-wifes_mother_name +defuncts O \ No newline at end of file diff --git a/tests/fixtures/utils/predictions/example_2.bio b/tests/fixtures/utils/predictions/example_2.bio new file mode 100644 index 0000000000000000000000000000000000000000..0da0ac3818975fd907922818cbb3791909044436 --- /dev/null +++ b/tests/fixtures/utils/predictions/example_2.bio @@ -0,0 +1,90 @@ +So O +he O +put O +up O +for O +the O +night O +at O +the O +Admiral's B-fac +Head O +, O +that O +famour O +Portsmarith B-gpe +hostelry O +, O +secand O +only O +in O +historic O +interest O +to O +the O +George B-gpe +, O +unhappily O +destrayed O +by O +German B-norp +lomber O +during O +the O +last O +war O +. O +Having O +deposited O +his O +buggage O +and O +cmpacked O +his O +overnight-leg O +he O +went O +in O +search O +af O +a O +drink O +. O +The O +lower O +bar O +was O +empty O +, O +save O +for O +the O +lady O +known O +by O +all O +hahitue O +? B-percent +? 
O +'s O +as O +' O +Seaweed B-work_of_art +, O +, O +anda O +youngish O +, O +sharp O +eyed O +man O +who O +was O +storing O +moodily O +into O +a O +gin O +and O +tonic O +. O \ No newline at end of file diff --git a/tests/fixtures/utils/predictions/example_3.bio b/tests/fixtures/utils/predictions/example_3.bio new file mode 100644 index 0000000000000000000000000000000000000000..2bb5920bd778e7efb5f27f68beea2caa2b625b1c --- /dev/null +++ b/tests/fixtures/utils/predictions/example_3.bio @@ -0,0 +1,22 @@ +SAINT-LOUIS B-intitule +ent I-intitule +ISLE I-intitule +(les I-intitule +administrateurs I-intitule +de I-intitule +la I-intitule +compagnie I-intitule +de I-intitule +Charité I-intitule +des I-intitule +pauvres I-intitule +de I-intitule +l'église I-intitule +royale I-intitule +de) I-intitule +8 B-date +janvier I-date +1771 I-date +X1A B-cote_serie +4701 B-cote_article +41 B-precisions_sur_cote \ No newline at end of file diff --git a/tests/fixtures/utils/predictions/example_4.bio b/tests/fixtures/utils/predictions/example_4.bio new file mode 100644 index 0000000000000000000000000000000000000000..2b27cabdcd18680d33b55390ad683aa10f45dbef --- /dev/null +++ b/tests/fixtures/utils/predictions/example_4.bio @@ -0,0 +1,5 @@ +d° B-surname +Jeaniine B-firstname +17 B-birth_date +P B-location_of_birth +f B-link \ No newline at end of file diff --git a/tests/parse/__init__.py b/tests/parse/__init__.py index 62facd4acb6d8fd508632266c4e06640cf7e892c..be6a932cc2da4165f3b8f519bdfbdddb155822d0 100644 --- a/tests/parse/__init__.py +++ b/tests/parse/__init__.py @@ -1,3 +1,3 @@ -from tests.conftest import FIXTURES +from tests import FIXTURES DATA_DIR = FIXTURES / "parse" diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0162bffb89ba6bbac3f07a2be5b2ba6d2f1f92e9 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,160 @@ +import pytest +from bio_parser.parse.document import Document +from bio_parser.utils import 
load_dataset, check_valid_bio, check_complete +from tests import FIXTURES + +DATA = FIXTURES / "utils" + + +@pytest.mark.parametrize( + "filenames", + ( + ( + [ + DATA / "bad_format.bio", + ] + ), + ( + [ + DATA / "bad_entity_name.bio", + ] + ), + ( + [ + DATA / "labels" / "example_0.bio", + DATA / "labels" / "example_1.bio", + DATA / "labels" / "example_2.bio", + DATA / "bad_entity_name.bio", + ] + ), + ), +) +def test_check_valid_bio_raise(filenames): + with pytest.raises(Exception): + check_valid_bio(filenames) + + +@pytest.mark.parametrize( + "filenames", + ( + ( + [ + DATA / "labels" / "example_0.bio", + DATA / "labels" / "example_1.bio", + DATA / "labels" / "example_2.bio", + ] + ), + ( + [ + DATA / "predictions" / "example_0.bio", + DATA / "predictions" / "example_1.bio", + DATA / "predictions" / "example_2.bio", + ] + ), + ([]), + ), +) +def test_check_valid_bio(filenames): + check_valid_bio(filenames) + + +@pytest.mark.parametrize( + "labels, predictions", + ( + ( + [ + DATA / "labels" / "example_0.bio", + DATA / "labels" / "example_1.bio", + DATA / "labels" / "example_2.bio", + ], + [ + DATA / "predictions" / "example_0.bio", + DATA / "predictions" / "example_1.bio", + DATA / "predictions" / "example_2.bio", + ], + ), + ( + [], + [], + ), + ), +) +def test_check_complete(labels, predictions): + check_complete(labels, predictions) + + +@pytest.mark.parametrize( + "labels, predictions, message", + ( + ( + [ + DATA / "labels" / "example_0.bio", + DATA / "labels" / "example_1.bio", + DATA / "labels" / "example_2.bio", + ], + [ + DATA / "predictions" / "example_0.bio", + DATA / "predictions" / "example_1.bio", + ], + "Missing prediction files: {'example_2.bio'}.", + ), + ( + [ + DATA / "labels" / "example_0.bio", + DATA / "labels" / "example_2.bio", + ], + [ + DATA / "predictions" / "example_0.bio", + DATA / "predictions" / "example_1.bio", + DATA / "predictions" / "example_2.bio", + ], + "Missing label files: {'example_1.bio'}.", + ), + ( + [ + DATA / 
"labels" / "example_0.bio", + DATA / "labels" / "example_2.bio", + ], + [ + DATA / "predictions" / "example_1.bio", + DATA / "predictions" / "example_2.bio", + ], + "Missing prediction files: {'example_0.bio'}.\nMissing label files: {'example_1.bio'}.", + ), + ), +) +def test_check_complete_raise(labels, predictions, message): + with pytest.raises(FileNotFoundError, match=message): + check_complete(labels, predictions) + + +def test_load_dataset(): + label_dir = DATA / "labels" + prediction_dir = DATA / "predictions" + documents = load_dataset(label_dir, prediction_dir) + for i in range(3): + filename = f"example_{i}.bio" + assert documents[i] == ( + Document.from_file(label_dir / filename), + Document.from_file(prediction_dir / filename), + ) + + +@pytest.mark.parametrize( + "label_dir, prediction_dir, message", + ( + ( + DATA / "labels_empty", + DATA / "predictions", + "Empty label directory", + ), + ( + DATA / "labels", + DATA / "predictions_empty", + "Empty prediction directory", + ), + ), +) +def test_load_empty_dataset(label_dir, prediction_dir, message): + with pytest.raises(FileNotFoundError, match=f"^{message}: .*"): + load_dataset(label_dir, prediction_dir)