Commit 56e6e8d0 authored by Yoann Schneider, committed by Solene Tarride

Implementation of the IOB2 parser

parent 26219d22
1 merge request: !2 Implementation of the IOB2 parser
Pipeline #146881 passed
Showing 804 additions and 9 deletions
@@ -3,7 +3,6 @@ repos:
rev: v4.5.0
hooks:
- id: check-ast
- id: check-docstring-first
- id: check-executables-have-shebangs
- id: check-merge-conflict
- id: check-symlinks
......
# BIO Parser
# BIO2 Parser
**Disclaimer**: This package only supports BIO2 and doesn't support BIO (yet). More on the distinction between the two formats on [Wikipedia](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)).
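To illustrate the difference (a hand-made example, not taken from this repository): in BIO2 every entity opens with a `B-` tag, while in the original BIO scheme `B-` is only used when an entity immediately follows another entity of the same type.

```plaintext
# BIO2 (supported): every entity opens with B-
Los B-LOC
Angeles I-LOC

# BIO (not supported): entities open with I- unless they
# directly follow another entity of the same type
Los I-LOC
Angeles I-LOC
```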
## Documentation
......
import logging
import sys
from rich import traceback
from rich.console import Console
from rich.logging import RichHandler
# Colorful logging
# https://rich.readthedocs.io/en/latest/logging.html
logging.basicConfig(
level=logging.INFO,
format="%(message)s",
datefmt="[%X]",
handlers=[RichHandler(console=Console(file=sys.stderr))],
)
# Add colorful tracebacks to crash with elegance
# https://rich.readthedocs.io/en/latest/traceback.html
traceback.install()
import argparse
import errno
from bio_parser.parse import add_validate_parser
def main():
@@ -7,15 +10,17 @@ def main():
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
# To add a sub-command, you can un-comment this snippet
# More information on https://docs.python.org/3/library/argparse.html#sub-commands
# commands = parser.add_subparsers(help="Explain your sub commands globally here")
# my_command = commands.add_parser("commandX", help="Do something")
# my_command.set_defaults(func=command_main)
# my_command.add_argument("element_id", type=uuid.UUID)
commands = parser.add_subparsers()
add_validate_parser(commands)
args = vars(parser.parse_args())
if "func" in args:
args.pop("func")(**args)
# Run the subcommand's function
try:
status = args.pop("func")(**args)
parser.exit(status=status)
except KeyboardInterrupt:
# Just quit silently on ^C instead of displaying a long traceback
parser.exit(status=errno.EOWNERDEAD)
else:
parser.print_help()
"""
Validate a given BIO file.
"""
from argparse import ArgumentParser
from pathlib import Path
from bio_parser.parse.validate import run
def _check_bio_ext(filename: str) -> Path:
filepath = Path(filename)
assert filepath.suffix == ".bio"
return filepath
def add_validate_parser(subcommands):
parser: ArgumentParser = subcommands.add_parser(
"validate",
help=__doc__,
description=__doc__,
)
parser.set_defaults(func=run)
parser.add_argument(
"filepaths", help="Files to validate.", type=_check_bio_ext, nargs="*"
)
"""Parse BIO files."""
import logging
import re
from dataclasses import dataclass, field
from enum import Enum
from itertools import pairwise
from operator import attrgetter
from pathlib import Path
PARSE_TOKEN = re.compile(r"(?P<text>[^\s]+) (?P<tag>(I|O|B))(\-(?P<ent>[^\s]+))?")
"""Regex that parses a line of a BIO file"""
_logger = logging.getLogger(__name__)
class Tag(Enum):
"""Supported Beginning-Inside-Outside tags."""
BEGINNING = "B"
INSIDE = "I"
OUTSIDE = "O"
def _make_ner_label(tag: Tag, label: str | None) -> str:
"""Create the corresponding IOB label from the given tag and label.
Args:
tag (Tag): Beginning-Inside-Outside tag.
label (str | None): Label of the token.
Returns:
str: Corresponding IOB label.
Examples:
>>> _make_ner_label(tag=Tag.BEGINNING, label="GPE")
'B-GPE'
>>> _make_ner_label(tag=Tag.INSIDE, label="GPE")
'I-GPE'
>>> _make_ner_label(tag=Tag.OUTSIDE, label=None)
'O'
"""
if tag == Tag.OUTSIDE:
assert label is None, f"Invalid label `{label}` with tag `{tag.value}`"
return tag.value
assert label, f"No named entity label found with tag `{tag.value}`"
return f"{tag.value}-{label}"
@dataclass(slots=True)
class Token:
"""Token as tokenized in the BIO document."""
idx: int
"""Index of the token in the document."""
text: str
"""Text representation of the token."""
@property
def _data(self) -> re.Match:
parsed = PARSE_TOKEN.match(self.text)
assert parsed is not None, "Could not parse annotation."
return parsed
@property
def word(self) -> str:
"""Text content of the token.
Examples:
>>> Token(idx=0, text="Chicken B-Animal").word
'Chicken'
"""
return self._data.group("text")
@property
def label(self) -> str | None:
"""Named entity type of this token.
Examples:
>>> Token(idx=0, text="Chicken B-Animal").label
'Animal'
"""
return self._data.group("ent")
@property
def tag(self) -> Tag:
"""IOB code of named entity tag.
Examples:
>>> Token(idx=0, text="Chicken B-Animal").tag
<Tag.BEGINNING: 'B'>
"""
return Tag(self._data.group("tag"))
@property
def iob_label(self) -> str:
"""IOB label (Tag + Entity).
Examples:
>>> Token(idx=0, text="Chicken B-Animal").iob_label
'B-Animal'
"""
return _make_ner_label(tag=self.tag, label=self.label)
@property
def labels(self) -> list[str]:
"""Character-level IOB labels.
Examples:
>>> Token(idx=0, text="Some B-PER").labels
['B-PER', 'I-PER', 'I-PER', 'I-PER']
>>> Token(idx=1, text="one I-PER").labels
['I-PER', 'I-PER', 'I-PER']
"""
if self.tag == Tag.OUTSIDE:
return [self.iob_label] * len(self.word)
return [self.iob_label] + [
_make_ner_label(tag=Tag.INSIDE, label=self.label),
] * (len(self.word) - 1)
@property
def chars(self) -> list[str]:
"""The list of characters making up the token.
Examples:
>>> Token(idx=0, text="Chicken B-Animal").chars
['C', 'h', 'i', 'c', 'k', 'e', 'n']
"""
return list(self.word)
@dataclass(slots=True)
class Span:
"""Representation of a Named Entity Span."""
tokens: list[Token] = field(default_factory=list)
"""List of tokens in the Span"""
@property
def text(self) -> str:
"""Join every word of the span by a whitespace.
Examples:
>>> Span(tokens=[
... Token(idx=0, text="Chicken B-Animal"),
... Token(idx=1, text="run I-Animal")
... ]).text
'Chicken run'
"""
return " ".join(map(attrgetter("word"), self.tokens))
@property
def label(self) -> str | None:
"""The named entity type of this span. All tokens composing the span have the same.
Examples:
>>> Span(tokens=[
... Token(idx=0, text="Chicken B-Animal"),
... Token(idx=1, text="run I-Animal")
... ]).label
'Animal'
"""
if not self.tokens:
return None
return self.tokens[0].label
@property
def idx(self) -> int | None:
"""The index of the first token of the span.
Examples:
>>> Span(tokens=[
... Token(idx=0, text="Chicken B-Animal"),
... Token(idx=1, text="run I-Animal")
... ]).idx
0
"""
if not self.tokens:
return None
return self.tokens[0].idx
@property
def end(self) -> int | None:
"""The index of the first token after the span.
Examples:
>>> Span(tokens=[
... Token(idx=0, text="Chicken B-Animal"),
... Token(idx=1, text="run I-Animal")
... ]).end
2
"""
if not self.tokens:
return None
return self.tokens[-1].idx + 1
def add_token(self, token: Token) -> None:
"""Add the provided token to this span. The token's label must match the Span's.
Args:
token (Token): Token to add to this span.
"""
if self.label:
assert (
token.label == self.label
), "This token doesn't have the same label as this span."
self.tokens.append(token)
@property
def labels(self) -> list[str]:
"""Character-level IOB labels.
Examples:
>>> Span(tokens=[
... Token(idx=0, text="Chicken B-Animal"),
... Token(idx=1, text="run I-Animal")
... ]).labels
['B-Animal', 'I-Animal', 'I-Animal', 'I-Animal', 'I-Animal', 'I-Animal', 'I-Animal', 'I-Animal', 'I-Animal', 'I-Animal', 'I-Animal']
"""
if not self.tokens:
return []
return [_make_ner_label(tag=Tag.BEGINNING, label=self.label)] + [
_make_ner_label(tag=Tag.INSIDE, label=self.label),
] * (len(self.text) - 1)
@property
def chars(self) -> list[str]:
"""Characters making up the span.
Examples:
>>> Span(
... tokens=[
... Token(idx=0, text="Chicken B-Animal"),
... Token(idx=1, text="run I-Animal")
... ]
... ).chars
['C', 'h', 'i', 'c', 'k', 'e', 'n', ' ', 'r', 'u', 'n']
"""
return list(self.text)
@dataclass(slots=True)
class Document:
"""Representation of a BIO document."""
bio_repr: str
"""Full BIO representation of the Document"""
tokens: list[Token] = field(default_factory=list)
"""List of the tokens in the Document"""
spans: list[Span] = field(default_factory=list)
"""List of the spans in the Document"""
def __post_init__(self):
"""Parses the tokens and the entity spans in the document."""
span: Span | None = None
for idx, line in enumerate(self.bio_repr.splitlines()):
try:
token = Token(idx=idx, text=line)
self.tokens.append(token)
# Build spans
match token.tag:
case Tag.OUTSIDE:
# Close current span if present
if span:
self.spans.append(span)
span = None
case Tag.INSIDE:
assert span, f"Found `{Tag.INSIDE}` before `{Tag.BEGINNING}`."
span.add_token(token)
case Tag.BEGINNING:
# Close current span if present
if span:
self.spans.append(span)
# Start new one
span = Span()
span.add_token(token)
except AssertionError as e:
_logger.error(f"Error on token n°{token.idx}: {e}")
raise Exception(str(e)) from e
# Last span
if span and span.tokens:
self.spans.append(span)
@property
def words(self) -> list[str]:
"""List of words making up the document."""
return list(map(attrgetter("word"), self.tokens))
@property
def entities(self) -> list[tuple[str, str]]:
"""List of entities making up the document."""
return list(
map(attrgetter("label", "text"), filter(attrgetter("label"), self.spans)),
)
@property
def word_entities(self) -> list[tuple[str, str]]:
"""List of entities in the words making up the document."""
return list(
map(attrgetter("label", "word"), filter(attrgetter("label"), self.tokens)),
)
@property
def text(self) -> str:
"""Join every word of the span by a whitespace."""
return " ".join(map(attrgetter("word"), self.tokens))
@property
def char_labels(self) -> list[str]:
r"""Character-level IOB labels.
Spaces between two tokens with the same label get the same label, others get 'O'.
Examples:
The space between 'I' and 'run' is tagged as 'I-Animal', because it's the same named entity label.
>>> Document(bio_repr="I B-Animal\nrun I-Animal").char_labels
['B-Animal', 'I-Animal', 'I-Animal', 'I-Animal', 'I-Animal']
The space between 'run' and 'fast' is tagged as 'O', because it's not the same label.
>>> Document(bio_repr="run B-Animal\nfast O").char_labels
['B-Animal', 'I-Animal', 'I-Animal', 'O', 'O', 'O', 'O', 'O']
"""
tags = []
for token, next_token in pairwise(self.tokens + [None]):
# Add token tags
tags.extend(token.labels)
if next_token and token.label == next_token.label:
tags.append(next_token.iob_label)
elif next_token:
tags.append(Tag.OUTSIDE.value)
return tags
@property
def word_labels(self) -> list[str]:
r"""Word-level IOB labels.
Spaces between two tokens with the same label get the same label, others get 'O'.
Examples:
The space between 'I' and 'run' is tagged as 'Animal', because it's the same named entity label.
>>> Document(bio_repr="I B-Animal\nrun I-Animal").word_labels
['Animal', 'Animal', 'Animal']
The space between 'run' and 'fast' is tagged as 'O', because it's not the same label.
>>> Document(bio_repr="run B-Animal\nfast O").word_labels
['Animal', 'O', 'O']
"""
tags = []
for token, next_token in pairwise(self.tokens + [None]):
# Add token tags
tags.append(token.label or Tag.OUTSIDE.value)
# Token of the next space
if (
# This is not the last token
next_token
# This token is not tagged as O
and token.tag != Tag.OUTSIDE
# Same label between consecutive tokens
and token.label == next_token.label
):
tags.append(token.label)
elif next_token:
tags.append(Tag.OUTSIDE.value)
return tags
@property
def chars(self) -> list[str]:
r"""Characters making up the token.
Examples:
>>> Document(bio_repr="I B-Animal\nrun I-Animal").chars
['I', ' ', 'r', 'u', 'n']
"""
return list(self.text)
@classmethod
def from_file(cls, filepath: Path) -> "Document":
"""Load a Document from a IOB file.
Args:
filepath (Path): Path to the file to load.
Returns:
Document: Parsed document
"""
return Document(filepath.read_text())
"""Validates the construction of the BIO file."""
import json
import logging
from dataclasses import asdict
from pathlib import Path
from bio_parser.parse.document import Document
logger = logging.getLogger(__name__)
def run(filepaths: list[Path]) -> None:
"""Validate the construction of multiple BIO files.
Args:
filepaths (list[Path]): Files to check.
"""
for filepath in filepaths:
logger.info(f"Parsing file @ `{filepath}`")
try:
doc = Document.from_file(filepath)
filepath.with_suffix(".json").write_text(json.dumps(asdict(doc), indent=2))
except Exception as e:
logger.error(f"Could not load the file @ `{filepath}`: {e}")
continue
logger.info(f"The file @ `{filepath}` is valid!")
# Document
::: bio_parser.parse.document
# Validate
::: bio_parser.parse.validate
# Usage
When `bio-parser` is installed in your environment, you may use the following commands:
`bio-parser validate`
: To parse and validate the structure of one or more BIO files. More details in the [dedicated page](./validate.md).
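For example, to validate a single file (the file name is illustrative):

```shell
$ bio-parser validate input.bio
```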
# Validation
Use the `bio-parser validate` command to parse and validate the structure of one or more BIO2 files.
## Supported format
The BIO2 format is a common tagging format for NER (named entity recognition) tasks. More details about it on [Wikipedia](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)).
An example of such a tagging format is given below.
```plaintext
Alex B-PER
is O
going O
to O
Los B-LOC
Angeles I-LOC
in O
California B-LOC
```
## Usage
You can specify one or more paths to your BIO files. The extension used has to be `.bio`.
The parser will check them one by one and report the first error encountered.
```shell
$ bio-parser validate input.bio
[12:37:20] INFO Parsing file @ `input.bio` validate.py:19
INFO The file @ `input.bio` is valid! validate.py:25
```
With multiple files:
```shell
$ bio-parser validate input1.bio input2.bio
[12:37:20] INFO Parsing file @ `input1.bio` validate.py:19
INFO The file @ `input1.bio` is valid! validate.py:25
[12:37:20] INFO Parsing file @ `input2.bio` validate.py:19
INFO The file @ `input2.bio` is valid! validate.py:25
```
With an invalid file:
```shell
$ bio-parser validate invalid.bio
[12:41:16] INFO Parsing file @ `invalid.bio` validate.py:19
ERROR Error on token n°0: Found `Tag.INSIDE` before `Tag.BEGINNING`. document.py:283
ERROR Could not load the file @ `invalid.bio`: Found `Tag.INSIDE` before `Tag.BEGINNING`. validate.py:24
```
In addition to validating the structure of the file, a JSON representation of the BIO file is saved next to it, with the same name and a `.json` extension.
This JSON file has three keys, as in the sample below:
- `bio_repr`: the string in BIO format passed to the command,
- `tokens`: the list of tokens in the file, with their index and text,
- `spans`: the list of NER entities found and their tokens.
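As a minimal sketch, here is the JSON produced for the `valid.bio` fixture used in the test suite (truncated to the first entity for brevity):

```json
{
  "bio_repr": "San B-GPE\nFrancisco I-GPE\nconsiders O\n...",
  "tokens": [
    {"idx": 0, "text": "San B-GPE"},
    {"idx": 1, "text": "Francisco I-GPE"}
  ],
  "spans": [
    {
      "tokens": [
        {"idx": 0, "text": "San B-GPE"},
        {"idx": 1, "text": "Francisco I-GPE"}
      ]
    }
  ]
}
```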
@@ -61,6 +61,9 @@ nav:
- Get started:
- get_started/index.md
- Development: get_started/development.md
- Usage:
- usage/index.md
- Validation: usage/validate.md
# defer to literate-nav
- Code Reference: reference/
......
@@ -51,6 +51,8 @@ ignore = [
# On top of the Google convention, disable `D417`, which requires
# documentation for every function parameter.
"D417",
# May conflict with the formatter
"COM812",
]
select = [
# pycodestyle
......
rich==13.7.0
from pathlib import Path
FIXTURES = Path(__file__).with_name("fixtures")
San B-GPE
Francisco I-GPE
considers O
banning B-VERB
sidewalk O
delivery O
robots O
from tests.conftest import FIXTURES
DATA_DIR = FIXTURES / "parse"
from bio_parser.parse.document import Document, Span, Tag, Token
from tests.parse import DATA_DIR
import pytest
from bio_parser.parse.document import _make_ner_label
FILEPATH = DATA_DIR / "valid.bio"
@pytest.fixture
def document():
return Document.from_file(FILEPATH)
@pytest.mark.parametrize(
"tag, label, output",
(
(Tag.OUTSIDE, None, "O"),
(Tag.BEGINNING, "GPE", "B-GPE"),
(Tag.INSIDE, "GPE", "I-GPE"),
),
)
def test_make_ner_label(tag: Tag, label: str | None, output: str):
assert _make_ner_label(tag=tag, label=label) == output
@pytest.mark.parametrize(
"tag, label, error",
(
(Tag.OUTSIDE, "GPE", "Invalid label `GPE` with tag `O`"),
(Tag.BEGINNING, None, "No named entity label found with tag `B`"),
(Tag.INSIDE, None, "No named entity label found with tag `I`"),
),
)
def test_make_ner_label_invalid(tag: Tag, label: str | None, error: str):
with pytest.raises(AssertionError, match=error):
_ = _make_ner_label(tag=tag, label=label)
def test_parse_document(document: Document):
# Check words
assert document.words == [
"San",
"Francisco",
"considers",
"banning",
"sidewalk",
"delivery",
"robots",
]
# Check entities
assert document.entities == [
("GPE", "San Francisco"),
("VERB", "banning"),
]
# Check word entities
assert document.word_entities == [
("GPE", "San"),
("GPE", "Francisco"),
("VERB", "banning"),
]
# Check text
assert document.text == "San Francisco considers banning sidewalk delivery robots"
# Check labels
assert document.char_labels == ["B-GPE"] + ["I-GPE"] * len("an Francisco") + [
"O"
] * len(" considers ") + ["B-VERB"] + ["I-VERB"] * len("anning") + ["O"] * len(
" sidewalk delivery robots"
)
assert document.word_labels == [
"GPE",
"GPE",
"GPE",
"O",
"O",
"O",
"VERB",
"O",
"O",
"O",
"O",
"O",
"O",
]
# Check chars
assert document.chars == list(
"San Francisco considers banning sidewalk delivery robots"
)
def test_parse_span(document: Document):
span: Span = document.spans[0]
# Check text
assert span.text == "San Francisco"
# Check label
assert span.label == "GPE"
# Check idx
assert span.idx == 0
# Check end
assert span.end == 2
# Check chars
assert span.chars == list("San Francisco")
# Check labels
assert span.labels == ["B-GPE"] + ["I-GPE"] * len("an Francisco")
def test_parse_token(document: Document):
# B- token
token: Token = document.spans[0].tokens[0]
# Check word
assert token.word == "San"
# Check label
assert token.label == "GPE"
# Check tag
assert token.tag == Tag.BEGINNING
# Check IOB Label
assert token.iob_label == "B-GPE"
# Check labels
assert token.labels == ["B-GPE", "I-GPE", "I-GPE"]
# Check chars
assert token.chars == ["S", "a", "n"]
# I- token
token: Token = document.spans[0].tokens[1]
# Check word
assert token.word == "Francisco"
# Check label
assert token.label == "GPE"
# Check tag
assert token.tag == Tag.INSIDE
# Check IOB Label
assert token.iob_label == "I-GPE"
# Check labels
assert token.labels == [
"I-GPE",
"I-GPE",
"I-GPE",
"I-GPE",
"I-GPE",
"I-GPE",
"I-GPE",
"I-GPE",
"I-GPE",
]
# Check chars
assert token.chars == ["F", "r", "a", "n", "c", "i", "s", "c", "o"]
# O token
token: Token = document.tokens[-1]
# Check word
assert token.word == "robots"
# Check label
assert token.label is None
# Check tag
assert token.tag == Tag.OUTSIDE
# Check IOB Label
assert token.iob_label == "O"
# Check labels
assert token.labels == ["O", "O", "O", "O", "O", "O"]
# Check chars
assert token.chars == ["r", "o", "b", "o", "t", "s"]
@pytest.mark.parametrize(
"annotation",
("Something something", "Something A-GPE", "Something GPE-A", "Something A"),
)
def test_invalid_token(annotation: str):
with pytest.raises(AssertionError, match="Could not parse annotation"):
_ = Token(idx=0, text=annotation).word
import json
from bio_parser.parse.validate import run as validate
from tests.parse import DATA_DIR
def test_valid():
filepath = DATA_DIR / "valid.bio"
validate([filepath])
# A JSON file should have been generated
output = filepath.with_suffix(".json")
assert output.exists()
# Check content of JSON
assert json.loads(output.read_text()) == {
"bio_repr": "San B-GPE\nFrancisco I-GPE\nconsiders O\nbanning B-VERB\nsidewalk O\ndelivery O\nrobots O",
"tokens": [
{"idx": 0, "text": "San B-GPE"},
{"idx": 1, "text": "Francisco I-GPE"},
{"idx": 2, "text": "considers O"},
{"idx": 3, "text": "banning B-VERB"},
{"idx": 4, "text": "sidewalk O"},
{"idx": 5, "text": "delivery O"},
{"idx": 6, "text": "robots O"},
],
"spans": [
{
"tokens": [
{"idx": 0, "text": "San B-GPE"},
{"idx": 1, "text": "Francisco I-GPE"},
]
},
{"tokens": [{"idx": 3, "text": "banning B-VERB"}]},
],
}
# Cleanup
output.unlink()