Skip to content
Snippets Groups Projects
Verified Commit ecabf84a authored by Yoann Schneider's avatar Yoann Schneider :tennis:
Browse files

Port utils module

parent db2cf7bb
No related branches found
No related tags found
1 merge request!3Port utils module
Pipeline #151712 passed
Showing
with 487 additions and 3 deletions
......@@ -17,3 +17,6 @@ logging.basicConfig(
# Add colorful tracebacks to crash with elegance
# https://rich.readthedocs.io/en/latest/traceback.html
traceback.install()
# Reserved name for global statistics.
# NOTE(review): presumably the value meant to be passed as `global_stat_name`
# when validating BIO files, so that no entity may use it - confirm with callers.
GLOBAL_STAT_NAME = "total"
"""Exceptions raised during file parsing."""
from pathlib import Path
class FileProcessingError(Exception):
    """Base error for any problem met while parsing a file.

    The path of the offending file is kept on the instance; every other
    positional argument is forwarded unchanged to ``Exception``.
    """

    filename: Path
    """
    Path of the file being processed.
    """

    def __init__(self, filename: Path, *args: object) -> None:
        # Storing the path and initialising the base class are independent.
        self.filename = filename
        super().__init__(*args)
class InvalidFile(FileProcessingError):
    """Error signalling that a BIO file is not valid."""

    def __str__(self) -> str:
        # The message only depends on the path stored by the base class.
        message = f"BIO file {self.filename} is not valid"
        return message
class ForbiddenEntityName(FileProcessingError):
    """Raised when a file uses an entity name that is reserved.

    The rendered message reports the entity name, states that it is reserved
    for global statistics and gives the path of the file.
    """

    entity_name: str
    """
    Forbidden entity name encountered.
    """

    def __init__(self, filename: Path, entity_name: str, *args: object) -> None:
        # Pass ``filename`` positionally: with ``filename=filename, *args`` the
        # unpacked extras are bound to positional parameters first and collide
        # with the keyword, raising ``TypeError`` whenever ``args`` is non-empty.
        super().__init__(filename, *args)
        self.entity_name = entity_name

    def __str__(self) -> str:
        return f"Invalid entity name {self.entity_name}: reserved for global statistics ({self.filename})."
"""Utils functions."""
import logging
from operator import attrgetter
from pathlib import Path
from bio_parser.parse.document import Document
from bio_parser.parse.exceptions import ForbiddenEntityName, InvalidFile
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
def check_complete(labels: list[Path], predictions: list[Path]):
    """Ensure label and prediction BIO files pair up one-to-one by file name.

    Args:
        labels: List of sorted label BIO files.
        predictions: List of sorted prediction BIO files.

    Raises:
        FileNotFoundError: A file name is present on one side only. The
            message lists the missing prediction and/or label files.
    """
    # Only the base names are compared, whatever the parent directories are.
    expected_names = {path.name for path in labels}
    predicted_names = {path.name for path in predictions}

    # Same names on both sides: the dataset is complete.
    if expected_names == predicted_names:
        return

    missing_pred_files = expected_names - predicted_names
    missing_label_files = predicted_names - expected_names

    problems = []
    if missing_pred_files:
        problems.append(f"Missing prediction files: {missing_pred_files}.")
    if missing_label_files:
        problems.append(f"Missing label files: {missing_label_files}.")
    raise FileNotFoundError("\n".join(problems))
def check_valid_bio(
    bio_files: list[Path], global_stat_name: str | None = None
) -> list[Document]:
    """Parse every BIO file, stopping at the first missing or invalid one.

    Args:
        bio_files (list[Path]): List of BIO files to check.
        global_stat_name (str | None, optional): Forbid an entity name. Defaults to None.

    Raises:
        FileNotFoundError: A file could not be found.
        InvalidFile: ``Document.from_file`` failed on a file.
        ForbiddenEntityName: Forbidden entity name used in a file.

    Returns:
        list[Document]: The parsed documents, in the same order as ``bio_files``.
    """
    documents = []
    for filename in bio_files:
        # A missing file is reported as such, before any parsing attempt.
        if not filename.exists():
            raise FileNotFoundError(
                f"BIO file {filename} does not exist.",
            )

        # Any parsing failure is re-raised as InvalidFile, keeping the cause.
        try:
            document = Document.from_file(filename)
        except Exception as error:
            raise InvalidFile(filename) from error

        # The reserved-name check only runs when a non-empty name is given.
        if global_stat_name:
            entity_names = {entity[0] for entity in document.entities}
            if global_stat_name in entity_names:
                raise ForbiddenEntityName(
                    filename=filename, entity_name=global_stat_name
                )

        documents.append(document)
    return documents
def load_dataset(
    label_dir: Path,
    prediction_dir: Path,
) -> list[tuple[Document, Document]]:
    """Load and pair the label and prediction BIO files of a dataset.

    Only ``*.bio`` files located directly in each directory are considered;
    both listings are sorted by file name before being paired.

    Args:
        label_dir (Path): Path to the label directory.
        prediction_dir (Path): Path to prediction directory.

    Raises:
        FileNotFoundError: A directory holds no ``*.bio`` file.

    Returns:
        list[tuple[Document, Document]]: A list of tuple containing the label and corresponding prediction Documents.
    """
    by_name = attrgetter("name")
    sorted_labels = sorted(label_dir.glob("*.bio"), key=by_name)
    sorted_predictions = sorted(prediction_dir.glob("*.bio"), key=by_name)

    # Report every empty directory at once rather than only the first one.
    messages = []
    if not sorted_labels:
        messages.append(f"Empty label directory: {label_dir}.")
    if not sorted_predictions:
        messages.append(f"Empty prediction directory: {prediction_dir}.")
    if messages:
        raise FileNotFoundError("\n".join(messages))

    # Check that the dataset is complete before parsing anything.
    check_complete(sorted_labels, sorted_predictions)

    logger.info("Loading labels...")
    labels = check_valid_bio(sorted_labels)

    logger.info("Loading prediction...")
    predictions = check_valid_bio(sorted_predictions)

    logger.info("The dataset is complete and valid.")

    # Both lists follow the same file-name order, so zipping pairs them up.
    return list(zip(labels, predictions))
# Exceptions
::: bio_parser.parse.exceptions
\ No newline at end of file
# Utils
::: bio_parser.utils
\ No newline at end of file
......@@ -53,6 +53,8 @@ ignore = [
"D417",
# May cause some conflicts
"COM812",
# Missing docstring in __init__ and other magic methods
"D105", "D107",
]
select = [
# pycodestyle
......
from pathlib import Path
# Path named "fixtures" located next to this file.
FIXTURES = Path(__file__).with_name("fixtures")
from pathlib import Path
# Path named "fixtures" located next to this file.
FIXTURES = Path(__file__).with_name("fixtures")
This 0
entity 0
type B-total
is I-total
reserved I-total
\ No newline at end of file
This 0
is 0
a 0
bad I-adj
format 0
\ No newline at end of file
What O
a O
nice O
toolbox O
! O
\ No newline at end of file
Dissapte O
a O
5 O
rebere O
de O
Joan B-husband_name
Massuet B-husband_surname
pages B-husband_occupation
del O
regne B-husband_location
de I-husband_location
frança I-husband_location
ha= O
bitant O
en O
Collsabadell B-husband_location
ab O
Joana B-wife_name
donsella B-wife_state
filla O
de O
Bathomeu B-wifes_father_name
Pi= B-wifes_father_surname
joan I-wifes_father_surname
texidor B-wifes_father_occupation
de I-wifes_father_occupation
llana I-wifes_father_occupation
de O
Sta B-wifes_father_location
Maria I-wifes_father_location
de I-wifes_father_location
Palau I-wifes_father_location
tordera I-wifes_father_location
y O
de O
Elisabeth B-wifes_mother_name
defuncts O
\ No newline at end of file
So O
he O
put O
up O
for O
the O
night B-time
at O
The B-fac
Admiral's I-fac
Head I-fac
, O
that O
famous O
Portsmouth B-gpe
hostelry O
, O
second B-ordinal
only O
in O
historic O
interest O
to O
The B-fac
George I-fac
, O
unhappily O
destroyed O
by O
German B-norp
bombs O
during O
the O
last O
war O
. O
Having O
deposited O
his O
baggage O
and O
unpacked O
his O
overnight-bag O
he O
went O
in O
search O
of O
a O
drink O
. O
The O
lower O
bar O
was O
empty O
, O
save O
for O
the O
lady O
known O
by O
all O
habitue O
*?2s O
as O
' O
Seaweed B-per
' O
, O
and O
a O
youngish O
, O
sharp-eyed O
man O
who O
was O
staring O
moodily O
into O
a O
gin O
and O
tonic O
. O
\ No newline at end of file
SAINT-LOUIS B-intitule
en I-intitule
l'ISLE I-intitule
(Les I-intitule
administrateurs I-intitule
de I-intitule
la I-intitule
compagnie I-intitule
de I-intitule
charité I-intitule
des I-intitule
pauvres I-intitule
de I-intitule
l'église I-intitule
royale I-intitule
de) I-intitule
X1A B-cote_serie
4701 B-cote_article
41 B-precisions_sur_cote
\ No newline at end of file
d° B-surname
Jeannine B-firstname
17 B-birth_date
P B-location_of_birth
f B-link
\ No newline at end of file
What O
a O
nice O
toolbox O
! O
\ No newline at end of file
Dissapte O
a O
5 O
rebere O
de O
Joan B-husband_name
Massuet B-husband_surname
pages B-husband_occupation
del O
regne B-husband_location
de I-husband_location
frança I-husband_location
ha= O
bitant O
en O
Collsabadell B-husband_location
ab O
Joana B-wife_name
donsella B-wife_state
filla O
de O
Bathomeu B-wifes_father_name
Pi= B-wifes_father_surname
joan I-wifes_father_surname
texidor B-wifes_father_occupation
de I-wifes_father_occupation
llana I-wifes_father_occupation
de O
Sta B-wifes_father_location
Maria I-wifes_father_location
de I-wifes_father_location
Palau I-wifes_father_location
tordera I-wifes_father_location
y O
de O
Elisabeth B-wifes_mother_name
defuncts O
\ No newline at end of file
So O
he O
put O
up O
for O
the O
night O
at O
the O
Admiral's B-fac
Head O
, O
that O
famour O
Portsmarith B-gpe
hostelry O
, O
secand O
only O
in O
historic O
interest O
to O
the O
George B-gpe
, O
unhappily O
destrayed O
by O
German B-norp
lomber O
during O
the O
last O
war O
. O
Having O
deposited O
his O
buggage O
and O
cmpacked O
his O
overnight-leg O
he O
went O
in O
search O
af O
a O
drink O
. O
The O
lower O
bar O
was O
empty O
, O
save O
for O
the O
lady O
known O
by O
all O
hahitue O
? B-percent
? O
's O
as O
' O
Seaweed B-work_of_art
, O
, O
anda O
youngish O
, O
sharp O
eyed O
man O
who O
was O
storing O
moodily O
into O
a O
gin O
and O
tonic O
. O
\ No newline at end of file
SAINT-LOUIS B-intitule
ent I-intitule
ISLE I-intitule
(les I-intitule
administrateurs I-intitule
de I-intitule
la I-intitule
compagnie I-intitule
de I-intitule
Charité I-intitule
des I-intitule
pauvres I-intitule
de I-intitule
l'église I-intitule
royale I-intitule
de) I-intitule
8 B-date
janvier I-date
1771 I-date
X1A B-cote_serie
4701 B-cote_article
41 B-precisions_sur_cote
\ No newline at end of file
d° B-surname
Jeaniine B-firstname
17 B-birth_date
P B-location_of_birth
f B-link
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment