diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d0e03edff4763ef7ec4b616915b7447ea584c59d..77842d92a1ca6f9bd7f09f4fd0605e8961914930 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,6 @@ repos: rev: v4.5.0 hooks: - id: check-ast - - id: check-docstring-first - id: check-executables-have-shebangs - id: check-merge-conflict - id: check-symlinks diff --git a/README.md b/README.md index abf5283684939cea4d0f81d82fe5da51fbe08e00..05718c8d6512883458dc5233b5fad43ebd87a86a 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,6 @@ -# BIO Parser +# BIO2 Parser + +**Disclaimer**: This package only supports BIO2 and doesn't support BIO (yet). More on the distinction between formats in [Wikipedia](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)). ## Documentation diff --git a/bio_parser/__init__.py b/bio_parser/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..e142d5a75614f7be2c8b9f89afe60064e177db3c 100644 --- a/bio_parser/__init__.py +++ b/bio_parser/__init__.py @@ -0,0 +1,19 @@ +import logging +import sys + +from rich import traceback +from rich.console import Console +from rich.logging import RichHandler + +# Colorful logging +# https://rich.readthedocs.io/en/latest/logging.html +logging.basicConfig( + level=logging.INFO, + format="%(message)s", + datefmt="[%X]", + handlers=[RichHandler(console=Console(file=sys.stderr))], +) + +# Add colorful tracebacks to crash with elegance +# https://rich.readthedocs.io/en/latest/traceback.html +traceback.install() diff --git a/bio_parser/cli.py b/bio_parser/cli.py index b37574a16daca85d1089f0413744004d439063ab..15d01c6301f8367a1697d702d1c97a7d1cbaf5b2 100644 --- a/bio_parser/cli.py +++ b/bio_parser/cli.py @@ -1,4 +1,7 @@ import argparse +import errno + +from bio_parser.parse import add_validate_parser def main(): @@ -7,15 +10,17 @@ def main(): formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) - # To add a sub-command, you can un-comment this 
snippet - # More information on https://docs.python.org/3/library/argparse.html#sub-commands - # commands = parser.add_subparsers(help="Explain your sub commands globally here") - # my_command = commands.add_parser("commandX", help="Do something") - # my_command.set_defaults(func=command_main) - # my_command.add_argument("element_id", type=uuid.UUID) + commands = parser.add_subparsers() + add_validate_parser(commands) args = vars(parser.parse_args()) if "func" in args: - args.pop("func")(**args) + # Run the subcommand's function + try: + status = args.pop("func")(**args) + parser.exit(status=status) + except KeyboardInterrupt: + # Just quit silently on ^C instead of displaying a long traceback + parser.exit(status=errno.EOWNERDEAD) else: parser.print_help() diff --git a/bio_parser/parse/__init__.py b/bio_parser/parse/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..06ae06dc92286f48f1a6652f6d3d3244933c70d4 --- /dev/null +++ b/bio_parser/parse/__init__.py @@ -0,0 +1,27 @@ +""" +Validate a given BIO file. 
+""" + +from argparse import ArgumentParser +from pathlib import Path + +from bio_parser.parse.validate import run + + +def _check_bio_ext(filename: str) -> Path: + filepath = Path(filename) + assert filepath.suffix == ".bio" + return filepath + + +def add_validate_parser(subcommands): + parser: ArgumentParser = subcommands.add_parser( + "validate", + help=__doc__, + description=__doc__, + ) + parser.set_defaults(func=run) + + parser.add_argument( + "filepaths", help="Files to validate.", type=_check_bio_ext, nargs="*" + ) diff --git a/bio_parser/parse/document.py b/bio_parser/parse/document.py new file mode 100644 index 0000000000000000000000000000000000000000..983c5ae551bb55d85164961e7f9271bff3ed2b0f --- /dev/null +++ b/bio_parser/parse/document.py @@ -0,0 +1,393 @@ +"""Parse BIO files.""" +import logging +import re +from dataclasses import dataclass, field +from enum import Enum +from itertools import pairwise +from operator import attrgetter +from pathlib import Path + +PARSE_TOKEN = re.compile(r"(?P<text>[^\s]+) (?P<tag>(I|O|B))(\-(?P<ent>[^\s]+))?") +"""Regex that parses a line of a BIO file""" + +_logger = logging.getLogger(__name__) + + +class Tag(Enum): + """Supported Beginning-Inside-Outside tags.""" + + BEGINNING = "B" + INSIDE = "I" + OUTSIDE = "O" + + +def _make_ner_label(tag: Tag, label: str | None) -> str: + """Create the corresponding IOB label from the given tag and label. + + Args: + tag (Tag): Beginning-Inside-Outside tag. + label (str | None): Label of the token. + + Returns: + str: Corresponding IOB label. 
+ + Examples: + >>> _make_ner_label(tag=Tag.BEGINNING, label="GPE") + 'B-GPE' + >>> _make_ner_label(tag=Tag.INSIDE, label="GPE") + 'I-GPE' + >>> _make_ner_label(tag=Tag.OUTSIDE, label=None) + 'O' + + """ + if tag == Tag.OUTSIDE: + assert label is None, f"Invalid label `{label}` with tag `{tag.value}`" + return tag.value + + assert label, f"No named entity label found with tag `{tag.value}`" + + return f"{tag.value}-{label}" + + +@dataclass(slots=True) +class Token: + """Token as tokenized in the BIO document.""" + + idx: int + """Index of the token in the document.""" + text: str + """Text representation of the token.""" + + @property + def _data(self) -> re.Match: + parsed = PARSE_TOKEN.match(self.text) + assert parsed is not None, "Could not parse annotation." + return parsed + + @property + def word(self) -> str: + """Text content of the token. + + Examples: + >>> Token(idx=0, text="Chicken B-Animal").word + 'Chicken' + """ + return self._data.group("text") + + @property + def label(self) -> str | None: + """Named entity type of this token. + + Examples: + >>> Token(idx=0, text="Chicken B-Animal").label + 'Animal' + """ + return self._data.group("ent") + + @property + def tag(self) -> Tag: + """IOB code of named entity tag. + + Examples: + >>> Token(idx=0, text="Chicken B-Animal").tag + <Tag.BEGINNING: 'B'> + """ + return Tag(self._data.group("tag")) + + @property + def iob_label(self) -> str: + """IOB label (Tag + Entity). + + Examples: + >>> Token(idx=0, text="Chicken B-Animal").iob_label + 'B-Animal' + """ + return _make_ner_label(tag=self.tag, label=self.label) + + @property + def labels(self) -> list[str]: + """Character-level IOB labels. + + Examples: + >>> Token(idx=0, text="Some B-PER").labels + ['B-PER', 'I-PER', 'I-PER', 'I-PER'] + + >>> Token(idx=1, text="one I-PER").labels + ['I-PER', 'I-PER', 'I-PER']. 
+ """ + if self.tag == Tag.OUTSIDE: + return [self.iob_label] * len(self.word) + return [self.iob_label] + [ + _make_ner_label(tag=Tag.INSIDE, label=self.label), + ] * (len(self.word) - 1) + + @property + def chars(self) -> list[str]: + """The list of characters making up the token. + + Examples: + >>> Token(idx=0, text="Chicken B-Animal").chars + ['C', 'h', 'i', 'c', 'k', 'e', 'n'] + """ + return list(self.word) + + +@dataclass(slots=True) +class Span: + """Representation of a Named Entity Span.""" + + tokens: list[Token] = field(default_factory=list) + """List of tokens in the Span""" + + @property + def text(self) -> str: + """Join every word of the span by a whitespace. + + Examples: + >>> Span(tokens=[ + ... Token(idx=0, text="Chicken B-Animal"), + ... Token(idx=1, text="run I-Animal") + ... ]).text + 'Chicken run' + """ + return " ".join(map(attrgetter("word"), self.tokens)) + + @property + def label(self) -> str | None: + """The named entity type of this span. All tokens composing the span have the same. + + Examples: + >>> Span(tokens=[ + ... Token(idx=0, text="Chicken B-Animal"), + ... Token(idx=1, text="run I-Animal") + ... ]).label + 'Animal' + """ + if not self.tokens: + return + return self.tokens[0].label + + @property + def idx(self) -> int | None: + """The index of the first token of the span. + + Examples: + >>> Span(tokens=[ + ... Token(idx=0, text="Chicken B-Animal"), + ... Token(idx=1, text="run I-Animal") + ... ]).idx + 0 + """ + if not self.tokens: + return None + return self.tokens[0].idx + + @property + def end(self) -> int | None: + """The index of the first token after the span. + + Examples: + >>> Span(tokens=[ + ... Token(idx=0, text="Chicken B-Animal"), + ... Token(idx=1, text="run I-Animal") + ... ]).end + 2 + """ + if not self.tokens: + return + return self.tokens[-1].idx + 1 + + def add_token(self, token: Token) -> None: + """Add the provided token to this span. The token's label must match the Span's. 
+ + Args: + token (Token): Token to add to this span. + """ + if self.label: + assert ( + token.label == self.label + ), "This token doesn't have the same label as this span." + self.tokens.append(token) + + @property + def labels(self) -> list[str]: + """Character-level IOB labels. + + Examples: + >>> Span(tokens=[ + ... Token(idx=0, text="Chicken B-Animal"), + ... Token(idx=1, text="run I-Animal") + ... ]).labels + ['B-Animal', 'I-Animal', 'I-Animal', 'I-Animal', 'I-Animal', 'I-Animal', 'I-Animal', 'I-Animal', 'I-Animal', 'I-Animal', 'I-Animal'] + """ + if not self.tokens: + return [] + + return [_make_ner_label(tag=Tag.BEGINNING, label=self.label)] + [ + _make_ner_label(tag=Tag.INSIDE, label=self.label), + ] * (len(self.text) - 1) + + @property + def chars(self) -> list[str]: + """Characters making up the span. + + Examples: + >>> Span( + ... tokens=[ + ... Token(idx=0, text="Chicken B-Animal"), + ... Token(idx=1, text="run I-Animal") + ... ] + ... ).chars + ['C', 'h', 'i', 'c', 'k', 'e', 'n', ' ', 'r', 'u', 'n'] + """ + return list(self.text) + + +@dataclass(slots=True) +class Document: + """Representation of a BIO document.""" + + bio_repr: str + """Full BIO representation of the Document""" + tokens: list[Token] = field(default_factory=list) + """List of the tokens in the Document""" + + spans: list[Span] = field(default_factory=list) + """List of the spans in the Document""" + + def __post_init__(self): + """Parses the tokens and the entity spans in the document.""" + span: Span | None = None + for idx, line in enumerate(self.bio_repr.splitlines()): + try: + token = Token(idx=idx, text=line) + self.tokens.append(token) + # Build spans + match token.tag: + case Tag.OUTSIDE: + # Close current span if present + if span: + self.spans.append(span) + span = None + case Tag.INSIDE: + assert span, f"Found `{Tag.INSIDE}` before `{Tag.BEGINNING}`." 
+ span.add_token(token) + case Tag.BEGINNING: + # Close current span if present + if span: + self.spans.append(span) + # Start new one + span = Span() + span.add_token(token) + except AssertionError as e: + _logger.error(f"Error on token n°{token.idx}: {e}") + raise Exception from e + + # Last span + if span and span.tokens: + self.spans.append(span) + + @property + def words(self) -> list[str]: + """List of words making up the document.""" + return list(map(attrgetter("word"), self.tokens)) + + @property + def entities(self) -> list[tuple[str, str]]: + """List of entities making up the document.""" + return list( + map(attrgetter("label", "text"), filter(attrgetter("label"), self.spans)), + ) + + @property + def word_entities(self) -> list[tuple[str, str]]: + """List of entities in the words making up the document.""" + return list( + map(attrgetter("label", "word"), filter(attrgetter("label"), self.tokens)), + ) + + @property + def text(self) -> str: + """Join every word of the span by a whitespace.""" + return " ".join(map(attrgetter("word"), self.tokens)) + + @property + def char_labels(self) -> list[str]: + r"""Character-level IOB labels. + + Spaces between two tokens with the same label get the same label, others get 'O'. + + Examples: + The space between 'I' and 'run' is tagged as 'I-Animal', because it's the same named entity label. + >>> Document(bio_repr="I B-Animal\nrun I-Animal").char_labels + ['B-Animal', 'I-Animal', 'I-Animal', 'I-Animal', 'I-Animal'] + + The space between 'run' and 'fast' is tagged as 'O', because it's not the same label. 
+ >>> Document(bio_repr="run B-Animal\nfast O").char_labels + ['B-Animal', 'I-Animal', 'I-Animal', 'O', 'O', 'O', 'O', 'O'] + """ + tags = [] + for token, next_token in pairwise(self.tokens + [None]): + # Add token tags + tags.extend(token.labels) + if next_token and token.label == next_token.label: + tags.append(next_token.iob_label) + elif next_token: + tags.append(Tag.OUTSIDE.value) + return tags + + @property + def word_labels(self) -> list[str]: + r"""Word-level IOB labels. + + Spaces between two tokens with the same label get the same label, others get 'O'. + + Examples: + The space between 'I' and 'run' is tagged as 'I-Animal', because it's the same named entity label. + >>> Document(bio_repr="I B-Animal\nrun I-Animal").word_labels + ['Animal', 'Animal', 'Animal'] + + The space between 'run' and 'fast' is tagged as 'O', because it's not the same label. + >>> Document(bio_repr="run B-Animal\nfast O").word_labels + ['Animal', 'O', 'O'] + """ + tags = [] + for token, next_token in pairwise(self.tokens + [None]): + # Add token tags + tags.append(token.label or Tag.OUTSIDE.value) + + # Token of the next space + if ( + # This is not the last token + next_token + # This token is not tagged as O + and token.tag != Tag.OUTSIDE + # Same label between consecutive tokens + and token.label == next_token.label + ): + tags.append(token.label) + elif next_token: + tags.append(Tag.OUTSIDE.value) + return tags + + @property + def chars(self) -> list[str]: + r"""Characters making up the token. + + Examples: + >>> Document(bio_repr="I B-Animal\nrun I-Animal").chars + ['I', ' ', 'r', 'u', 'n'] + """ + return list(self.text) + + @classmethod + def from_file(cls, filepath: Path) -> "Document": + """Load a Document from a IOB file. + + Args: + filepath (Path): Path to the file to load. 
"""Validates the construction of the BIO file."""
import json
import logging
from dataclasses import asdict
from pathlib import Path

from bio_parser.parse.document import Document

logger = logging.getLogger(__name__)


def run(filepaths: list[Path]) -> None:
    """Validate the construction of multiple BIO files.

    Each file that parses successfully also gets a `<name>.json` dump of its
    parsed representation written next to it.

    Args:
        filepaths (list[Path]): Files to check.
    """
    for filepath in filepaths:
        logger.info("Parsing file @ `%s`", filepath)
        try:
            doc = Document.from_file(filepath)
            filepath.with_suffix(".json").write_text(json.dumps(asdict(doc), indent=2))
        except Exception as e:  # best-effort: report and keep checking the other files
            logger.error("Could not load the file @ `%s`: %s", filepath, e)
        else:
            # Previously this ran unconditionally, so an invalid file was still
            # reported as valid right after the error message.
            logger.info("The file @ `%s` is valid!", filepath)
environment, you may use the following commands: + +`bio-parser validate` +: To parse and validate the structure of one or more BIO files. More details in the [dedicated page](./validate.md). diff --git a/docs/usage/validate.md b/docs/usage/validate.md new file mode 100644 index 0000000000000000000000000000000000000000..1bf35b1467c5c414b95703e38bed7a1b22aec320 --- /dev/null +++ b/docs/usage/validate.md @@ -0,0 +1,56 @@ +# Validation + +Use the `bio-parser validate` command to parse and validate the structure of one or more BIO2 files. + +## Supported format + +The BIO2 format is a common tagging format in NER (Named entities recognition) tasks. More details about it on [Wikipedia](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)). + +An example of such a tagging format is given below. +```plaintext +Alex B-PER +is O +going O +to O +Los B-LOC +Angeles I-LOC +in O +California B-LOC +``` + +## Usage + +You can specify one or more paths to your BIO files. The extension used has to be `.bio`. +The parser will check them one by one and report the first error encountered. + +```shell +$ bio-parser validate input.bio +[12:37:20] INFO Parsing file @ `input.bio` validate.py:19 + INFO The file @ `input.bio` is valid! validate.py:25 +``` + +With multiple files: +```shell +$ bio-parser validate input1.bio input2.bio +[12:37:20] INFO Parsing file @ `input1.bio` validate.py:19 + INFO The file @ `input1.bio` is valid! validate.py:25 +[12:37:20] INFO Parsing file @ `input2.bio` validate.py:19 + INFO The file @ `input2.bio` is valid! validate.py:25 +``` + +With an invalid file. +```shell +$ bio-parser validate invalid.bio +[12:41:16] INFO Parsing file @ `invalid.bio` validate.py:19 + ERROR Error on token n°0: Found `Tag.INSIDE` before `Tag.BEGINNING`. document.py:283 + ERROR Could not load the file @ `invalid.bio`: validate.py:24 + INFO The file @ `invalid.bio` is valid! 
validate.py:25 +``` + +In addition to validating the structure of the file, a JSON representation of the BIO file is also saved at the same location. + +This JSON file has three keys: + +- `bio_repr`: The string in BIO format passed to the command, +- `tokens`: the list of tokens in the file, with their index and text, +- `spans`: the list of NER entities found and their tokens. diff --git a/mkdocs.yml b/mkdocs.yml index d4caef47259954802b7299fca1824909552ef532..d12830219d4a8bf05be273048a587c34f9825d58 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -61,6 +61,9 @@ nav: - Get started: - get_started/index.md - Development: get_started/development.md + - Usage: + - usage/index.md + - Validation: usage/validate.md # defer to literate-nav - Code Reference: reference/ diff --git a/pyproject.toml b/pyproject.toml index ecad63fdd828e504d5ce8dcf2138a6784f467bf2..bc0dff87b554bb20d8d38e6086c895c4565695b0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,6 +51,8 @@ ignore = [ # On top of the Google convention, disable `D417`, which requires # documentation for every function parameter. 
"""Tests for `bio_parser.parse.document`."""
import pytest

from bio_parser.parse.document import Document, Span, Tag, Token, _make_ner_label
from tests.parse import DATA_DIR

FILEPATH = DATA_DIR / "valid.bio"


@pytest.fixture
def document() -> Document:
    """Parsed version of the `valid.bio` fixture."""
    return Document.from_file(FILEPATH)


@pytest.mark.parametrize(
    "tag, label, output",
    (
        (Tag.OUTSIDE, None, "O"),
        (Tag.BEGINNING, "GPE", "B-GPE"),
        (Tag.INSIDE, "GPE", "I-GPE"),
    ),
)
def test_make_ner_label(tag: Tag, label: str, output: str):
    assert _make_ner_label(tag=tag, label=label) == output


@pytest.mark.parametrize(
    "tag, label, error",
    (
        (Tag.OUTSIDE, "GPE", "Invalid label `GPE` with tag `O`"),
        (Tag.BEGINNING, None, "No named entity label found with tag `B`"),
        (Tag.INSIDE, None, "No named entity label found with tag `I`"),
    ),
)
def test_make_ner_label_invalid(tag: Tag, label: str, error: str):
    with pytest.raises(AssertionError, match=error):
        _ = _make_ner_label(tag=tag, label=label)


def test_parse_document(document: Document):
    sentence = "San Francisco considers banning sidewalk delivery robots"

    # Check words
    assert document.words == sentence.split()

    # Check entities
    assert document.entities == [
        ("GPE", "San Francisco"),
        ("VERB", "banning"),
    ]

    # Check word entities
    assert document.word_entities == [
        ("GPE", "San"),
        ("GPE", "Francisco"),
        ("VERB", "banning"),
    ]

    # Check text
    assert document.text == sentence

    # Check labels
    assert document.char_labels == (
        ["B-GPE"]
        + ["I-GPE"] * len("an Francisco")
        + ["O"] * len(" considers ")
        + ["B-VERB"]
        + ["I-VERB"] * len("anning")
        + ["O"] * len(" sidewalk delivery robots")
    )
    # 7 word labels interleaved with 6 space labels
    assert document.word_labels == ["GPE"] * 3 + ["O"] * 3 + ["VERB"] + ["O"] * 6

    # Check chars
    assert document.chars == list(sentence)


def test_parse_span(document: Document):
    span: Span = document.spans[0]

    # Check text
    assert span.text == "San Francisco"

    # Check label
    assert span.label == "GPE"

    # Check idx
    assert span.idx == 0

    # Check end
    assert span.end == 2

    # Check chars
    assert span.chars == list("San Francisco")

    # Check labels
    assert span.labels == ["B-GPE"] + ["I-GPE"] * len("an Francisco")


def test_parse_token(document: Document):
    # B- token
    token: Token = document.spans[0].tokens[0]
    assert token.word == "San"
    assert token.label == "GPE"
    assert token.tag == Tag.BEGINNING
    assert token.iob_label == "B-GPE"
    assert token.labels == ["B-GPE", "I-GPE", "I-GPE"]
    assert token.chars == ["S", "a", "n"]

    # I- token
    token: Token = document.spans[0].tokens[1]
    assert token.word == "Francisco"
    assert token.label == "GPE"
    assert token.tag == Tag.INSIDE
    assert token.iob_label == "I-GPE"
    assert token.labels == ["I-GPE"] * len("Francisco")
    assert token.chars == ["F", "r", "a", "n", "c", "i", "s", "c", "o"]

    # O token
    token: Token = document.tokens[-1]
    assert token.word == "robots"
    assert token.label is None
    assert token.tag == Tag.OUTSIDE
    assert token.iob_label == "O"
    assert token.labels == ["O"] * len("robots")
    assert token.chars == ["r", "o", "b", "o", "t", "s"]


@pytest.mark.parametrize(
    "annotation",
    ("Something something", "Something A-GPE", "Something GPE-A", "Something A"),
)
def test_invalid_token(annotation: str):
    with pytest.raises(AssertionError, match="Could not parse annotation"):
        _ = Token(idx=0, text=annotation).word
"""Tests for the `validate` command."""
import json

from bio_parser.parse.validate import run as validate
from tests.parse import DATA_DIR


def test_valid():
    filepath = DATA_DIR / "valid.bio"
    validate([filepath])

    # A JSON file should have been generated
    json_path = filepath.with_suffix(".json")
    assert json_path.exists()

    # Rebuild the expected payload from the fixture's lines
    bio_lines = [
        "San B-GPE",
        "Francisco I-GPE",
        "considers O",
        "banning B-VERB",
        "sidewalk O",
        "delivery O",
        "robots O",
    ]
    expected = {
        "bio_repr": "\n".join(bio_lines),
        "tokens": [{"idx": i, "text": line} for i, line in enumerate(bio_lines)],
        "spans": [
            {
                "tokens": [
                    {"idx": 0, "text": "San B-GPE"},
                    {"idx": 1, "text": "Francisco I-GPE"},
                ]
            },
            {"tokens": [{"idx": 3, "text": "banning B-VERB"}]},
        ],
    }

    # Check content of JSON
    assert json.loads(json_path.read_text()) == expected

    # Cleanup
    json_path.unlink()