Commit 56e6e8d0 authored by Yoann Schneider, committed by Solene Tarride

Implementation of the IOB2 parser

parent 26219d22
1 merge request: !2 Implementation of the IOB2 parser
Pipeline #146881 passed
Showing 804 additions and 9 deletions
@@ -3,7 +3,6 @@ repos:
rev: v4.5.0
hooks:
- id: check-ast
- id: check-docstring-first
- id: check-executables-have-shebangs
- id: check-merge-conflict
- id: check-symlinks
......
# BIO Parser
# BIO2 Parser
**Disclaimer**: This package only supports BIO2 and doesn't support BIO (yet). More on the distinction between the two formats on [Wikipedia](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)).
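To illustrate the difference (a hand-made example, not taken from this repository): in BIO2 every entity opens with a `B-` tag, while in the original BIO scheme `B-` is only used when an entity immediately follows another entity of the same type.

```plaintext
# BIO2 (supported): every entity opens with B-
Los B-LOC
Angeles I-LOC

# BIO (not supported): entities open with I- unless they
# directly follow another entity of the same type
Los I-LOC
Angeles I-LOC
```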
## Documentation
......
import logging
import sys
from rich import traceback
from rich.console import Console
from rich.logging import RichHandler
# Colorful logging
# https://rich.readthedocs.io/en/latest/logging.html
logging.basicConfig(
level=logging.INFO,
format="%(message)s",
datefmt="[%X]",
handlers=[RichHandler(console=Console(file=sys.stderr))],
)
# Add colorful tracebacks to crash with elegance
# https://rich.readthedocs.io/en/latest/traceback.html
traceback.install()
import argparse
import errno
from bio_parser.parse import add_validate_parser
def main():
@@ -7,15 +10,17 @@ def main():
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
# To add a sub-command, you can un-comment this snippet
# More information on https://docs.python.org/3/library/argparse.html#sub-commands
# commands = parser.add_subparsers(help="Explain your sub commands globally here")
# my_command = commands.add_parser("commandX", help="Do something")
# my_command.set_defaults(func=command_main)
# my_command.add_argument("element_id", type=uuid.UUID)
commands = parser.add_subparsers()
add_validate_parser(commands)
args = vars(parser.parse_args())
if "func" in args:
args.pop("func")(**args)
# Run the subcommand's function
try:
status = args.pop("func")(**args)
parser.exit(status=status)
except KeyboardInterrupt:
# Just quit silently on ^C instead of displaying a long traceback
parser.exit(status=errno.EOWNERDEAD)
else:
parser.print_help()
"""
Validate a given BIO file.
"""
from argparse import ArgumentParser
from pathlib import Path
from bio_parser.parse.validate import run
def _check_bio_ext(filename: str) -> Path:
filepath = Path(filename)
assert filepath.suffix == ".bio"
return filepath
def add_validate_parser(subcommands):
parser: ArgumentParser = subcommands.add_parser(
"validate",
help=__doc__,
description=__doc__,
)
parser.set_defaults(func=run)
parser.add_argument(
"filepaths", help="Files to validate.", type=_check_bio_ext, nargs="*"
)
"""Parse BIO files."""
import logging
import re
from dataclasses import dataclass, field
from enum import Enum
from itertools import pairwise
from operator import attrgetter
from pathlib import Path
PARSE_TOKEN = re.compile(r"(?P<text>[^\s]+) (?P<tag>(I|O|B))(\-(?P<ent>[^\s]+))?")
"""Regex that parses a line of a BIO file"""
_logger = logging.getLogger(__name__)
class Tag(Enum):
"""Supported Beginning-Inside-Outside tags."""
BEGINNING = "B"
INSIDE = "I"
OUTSIDE = "O"
def _make_ner_label(tag: Tag, label: str | None) -> str:
"""Create the corresponding IOB label from the given tag and label.
Args:
tag (Tag): Beginning-Inside-Outside tag.
label (str | None): Label of the token.
Returns:
str: Corresponding IOB label.
Examples:
>>> _make_ner_label(tag=Tag.BEGINNING, label="GPE")
'B-GPE'
>>> _make_ner_label(tag=Tag.INSIDE, label="GPE")
'I-GPE'
>>> _make_ner_label(tag=Tag.OUTSIDE, label=None)
'O'
"""
if tag == Tag.OUTSIDE:
assert label is None, f"Invalid label `{label}` with tag `{tag.value}`"
return tag.value
assert label, f"No named entity label found with tag `{tag.value}`"
return f"{tag.value}-{label}"
@dataclass(slots=True)
class Token:
"""Token as tokenized in the BIO document."""
idx: int
"""Index of the token in the document."""
text: str
"""Text representation of the token."""
@property
def _data(self) -> re.Match:
parsed = PARSE_TOKEN.match(self.text)
assert parsed is not None, "Could not parse annotation."
return parsed
@property
def word(self) -> str:
"""Text content of the token.
Examples:
>>> Token(idx=0, text="Chicken B-Animal").word
'Chicken'
"""
return self._data.group("text")
@property
def label(self) -> str | None:
"""Named entity type of this token.
Examples:
>>> Token(idx=0, text="Chicken B-Animal").label
'Animal'
"""
return self._data.group("ent")
@property
def tag(self) -> Tag:
"""IOB code of named entity tag.
Examples:
>>> Token(idx=0, text="Chicken B-Animal").tag
<Tag.BEGINNING: 'B'>
"""
return Tag(self._data.group("tag"))
@property
def iob_label(self) -> str:
"""IOB label (Tag + Entity).
Examples:
>>> Token(idx=0, text="Chicken B-Animal").iob_label
'B-Animal'
"""
return _make_ner_label(tag=self.tag, label=self.label)
@property
def labels(self) -> list[str]:
"""Character-level IOB labels.
Examples:
>>> Token(idx=0, text="Some B-PER").labels
['B-PER', 'I-PER', 'I-PER', 'I-PER']
>>> Token(idx=1, text="one I-PER").labels
['I-PER', 'I-PER', 'I-PER']
"""
if self.tag == Tag.OUTSIDE:
return [self.iob_label] * len(self.word)
return [self.iob_label] + [
_make_ner_label(tag=Tag.INSIDE, label=self.label),
] * (len(self.word) - 1)
@property
def chars(self) -> list[str]:
"""The list of characters making up the token.
Examples:
>>> Token(idx=0, text="Chicken B-Animal").chars
['C', 'h', 'i', 'c', 'k', 'e', 'n']
"""
return list(self.word)
@dataclass(slots=True)
class Span:
"""Representation of a Named Entity Span."""
tokens: list[Token] = field(default_factory=list)
"""List of tokens in the Span"""
@property
def text(self) -> str:
"""Join every word of the span by a whitespace.
Examples:
>>> Span(tokens=[
... Token(idx=0, text="Chicken B-Animal"),
... Token(idx=1, text="run I-Animal")
... ]).text
'Chicken run'
"""
return " ".join(map(attrgetter("word"), self.tokens))
@property
def label(self) -> str | None:
"""The named entity type of this span. All tokens composing the span have the same.
Examples:
>>> Span(tokens=[
... Token(idx=0, text="Chicken B-Animal"),
... Token(idx=1, text="run I-Animal")
... ]).label
'Animal'
"""
if not self.tokens:
return None
return self.tokens[0].label
@property
def idx(self) -> int | None:
"""The index of the first token of the span.
Examples:
>>> Span(tokens=[
... Token(idx=0, text="Chicken B-Animal"),
... Token(idx=1, text="run I-Animal")
... ]).idx
0
"""
if not self.tokens:
return None
return self.tokens[0].idx
@property
def end(self) -> int | None:
"""The index of the first token after the span.
Examples:
>>> Span(tokens=[
... Token(idx=0, text="Chicken B-Animal"),
... Token(idx=1, text="run I-Animal")
... ]).end
2
"""
if not self.tokens:
return None
return self.tokens[-1].idx + 1
def add_token(self, token: Token) -> None:
"""Add the provided token to this span. The token's label must match the Span's.
Args:
token (Token): Token to add to this span.
"""
if self.label:
assert (
token.label == self.label
), "This token doesn't have the same label as this span."
self.tokens.append(token)
@property
def labels(self) -> list[str]:
"""Character-level IOB labels.
Examples:
>>> Span(tokens=[
... Token(idx=0, text="Chicken B-Animal"),
... Token(idx=1, text="run I-Animal")
... ]).labels
['B-Animal', 'I-Animal', 'I-Animal', 'I-Animal', 'I-Animal', 'I-Animal', 'I-Animal', 'I-Animal', 'I-Animal', 'I-Animal', 'I-Animal']
"""
if not self.tokens:
return []
return [_make_ner_label(tag=Tag.BEGINNING, label=self.label)] + [
_make_ner_label(tag=Tag.INSIDE, label=self.label),
] * (len(self.text) - 1)
@property
def chars(self) -> list[str]:
"""Characters making up the span.
Examples:
>>> Span(
... tokens=[
... Token(idx=0, text="Chicken B-Animal"),
... Token(idx=1, text="run I-Animal")
... ]
... ).chars
['C', 'h', 'i', 'c', 'k', 'e', 'n', ' ', 'r', 'u', 'n']
"""
return list(self.text)
@dataclass(slots=True)
class Document:
"""Representation of a BIO document."""
bio_repr: str
"""Full BIO representation of the Document"""
tokens: list[Token] = field(default_factory=list)
"""List of the tokens in the Document"""
spans: list[Span] = field(default_factory=list)
"""List of the spans in the Document"""
def __post_init__(self):
"""Parses the tokens and the entity spans in the document."""
span: Span | None = None
for idx, line in enumerate(self.bio_repr.splitlines()):
try:
token = Token(idx=idx, text=line)
self.tokens.append(token)
# Build spans
match token.tag:
case Tag.OUTSIDE:
# Close current span if present
if span:
self.spans.append(span)
span = None
case Tag.INSIDE:
assert span, f"Found `{Tag.INSIDE}` before `{Tag.BEGINNING}`."
span.add_token(token)
case Tag.BEGINNING:
# Close current span if present
if span:
self.spans.append(span)
# Start new one
span = Span()
span.add_token(token)
except AssertionError as e:
_logger.error(f"Error on token n°{token.idx}: {e}")
raise Exception(str(e)) from e
# Last span
if span and span.tokens:
self.spans.append(span)
@property
def words(self) -> list[str]:
"""List of words making up the document."""
return list(map(attrgetter("word"), self.tokens))
@property
def entities(self) -> list[tuple[str, str]]:
"""List of entities making up the document."""
return list(
map(attrgetter("label", "text"), filter(attrgetter("label"), self.spans)),
)
@property
def word_entities(self) -> list[tuple[str, str]]:
"""List of entities in the words making up the document."""
return list(
map(attrgetter("label", "word"), filter(attrgetter("label"), self.tokens)),
)
@property
def text(self) -> str:
"""Join every word of the span by a whitespace."""
return " ".join(map(attrgetter("word"), self.tokens))
@property
def char_labels(self) -> list[str]:
r"""Character-level IOB labels.
Spaces between two tokens with the same label get the same label, others get 'O'.
Examples:
The space between 'I' and 'run' is tagged as 'I-Animal', because it's the same named entity label.
>>> Document(bio_repr="I B-Animal\nrun I-Animal").char_labels
['B-Animal', 'I-Animal', 'I-Animal', 'I-Animal', 'I-Animal']
The space between 'run' and 'fast' is tagged as 'O', because it's not the same label.
>>> Document(bio_repr="run B-Animal\nfast O").char_labels
['B-Animal', 'I-Animal', 'I-Animal', 'O', 'O', 'O', 'O', 'O']
"""
tags = []
for token, next_token in pairwise(self.tokens + [None]):
# Add token tags
tags.extend(token.labels)
if next_token and token.label == next_token.label:
tags.append(next_token.iob_label)
elif next_token:
tags.append(Tag.OUTSIDE.value)
return tags
@property
def word_labels(self) -> list[str]:
r"""Word-level IOB labels.
Spaces between two tokens with the same label get the same label, others get 'O'.
Examples:
The space between 'I' and 'run' is tagged as 'Animal', because it's the same named entity label.
>>> Document(bio_repr="I B-Animal\nrun I-Animal").word_labels
['Animal', 'Animal', 'Animal']
The space between 'run' and 'fast' is tagged as 'O', because it's not the same label.
>>> Document(bio_repr="run B-Animal\nfast O").word_labels
['Animal', 'O', 'O']
"""
tags = []
for token, next_token in pairwise(self.tokens + [None]):
# Add token tags
tags.append(token.label or Tag.OUTSIDE.value)
# Token of the next space
if (
# This is not the last token
next_token
# This token is not tagged as O
and token.tag != Tag.OUTSIDE
# Same label between consecutive tokens
and token.label == next_token.label
):
tags.append(token.label)
elif next_token:
tags.append(Tag.OUTSIDE.value)
return tags
@property
def chars(self) -> list[str]:
r"""Characters making up the token.
Examples:
>>> Document(bio_repr="I B-Animal\nrun I-Animal").chars
['I', ' ', 'r', 'u', 'n']
"""
return list(self.text)
@classmethod
def from_file(cls, filepath: Path) -> "Document":
"""Load a Document from a IOB file.
Args:
filepath (Path): Path to the file to load.
Returns:
Document: Parsed document
"""
return Document(filepath.read_text())
"""Validates the construction of the BIO file."""
import json
import logging
from dataclasses import asdict
from pathlib import Path
from bio_parser.parse.document import Document
logger = logging.getLogger(__name__)
def run(filepaths: list[Path]) -> None:
"""Validate the construction of multiple BIO files.
Args:
filepaths (list[Path]): Files to check.
"""
for filepath in filepaths:
logger.info(f"Parsing file @ `{filepath}`")
try:
doc = Document.from_file(filepath)
filepath.with_suffix(".json").write_text(json.dumps(asdict(doc), indent=2))
except Exception as e:
logger.error(f"Could not load the file @ `{filepath}`: {e}")
continue
logger.info(f"The file @ `{filepath}` is valid!")
# Document
::: bio_parser.parse.document
# Validate
::: bio_parser.parse.validate
# Usage
When `bio-parser` is installed in your environment, you may use the following commands:
`bio-parser validate`
: To parse and validate the structure of one or more BIO files. More details in the [dedicated page](./validate.md).
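For example, to validate a single file (the file name is illustrative):

```shell
$ bio-parser validate input.bio
```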
# Validation
Use the `bio-parser validate` command to parse and validate the structure of one or more BIO2 files.
## Supported format
The BIO2 format is a common tagging format for NER (named entity recognition) tasks. More details about it on [Wikipedia](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)).
An example of such a tagging format is given below.
```plaintext
Alex B-PER
is O
going O
to O
Los B-LOC
Angeles I-LOC
in O
California B-LOC
```
## Usage
You can specify one or more paths to your BIO files. The extension used has to be `.bio`.
The parser will check them one by one and report the first error encountered.
```shell
$ bio-parser validate input.bio
[12:37:20] INFO Parsing file @ `input.bio` validate.py:19
INFO The file @ `input.bio` is valid! validate.py:25
```
With multiple files:
```shell
$ bio-parser validate input1.bio input2.bio
[12:37:20] INFO Parsing file @ `input1.bio` validate.py:19
INFO The file @ `input1.bio` is valid! validate.py:25
[12:37:20] INFO Parsing file @ `input2.bio` validate.py:19
INFO The file @ `input2.bio` is valid! validate.py:25
```
With an invalid file:
```shell
$ bio-parser validate invalid.bio
[12:41:16] INFO Parsing file @ `invalid.bio` validate.py:19
ERROR Error on token n°0: Found `Tag.INSIDE` before `Tag.BEGINNING`. document.py:283
ERROR Could not load the file @ `invalid.bio`: Found `Tag.INSIDE` before `Tag.BEGINNING`. validate.py:24
```
In addition to validating the structure of the file, a JSON representation of the BIO file is saved next to it, with the same name and a `.json` extension.
This JSON file has three keys, as in the sample below:
- `bio_repr`: the string in BIO format passed to the command,
- `tokens`: the list of tokens in the file, with their index and text,
- `spans`: the list of NER entities found and their tokens.
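As a minimal sketch, here is the JSON produced for the `valid.bio` fixture used in the test suite (truncated to the first entity for brevity):

```json
{
  "bio_repr": "San B-GPE\nFrancisco I-GPE\nconsiders O\n...",
  "tokens": [
    {"idx": 0, "text": "San B-GPE"},
    {"idx": 1, "text": "Francisco I-GPE"}
  ],
  "spans": [
    {
      "tokens": [
        {"idx": 0, "text": "San B-GPE"},
        {"idx": 1, "text": "Francisco I-GPE"}
      ]
    }
  ]
}
```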
@@ -61,6 +61,9 @@ nav:
- Get started:
- get_started/index.md
- Development: get_started/development.md
- Usage:
- usage/index.md
- Validation: usage/validate.md
# defer to literate-nav
- Code Reference: reference/
......
@@ -51,6 +51,8 @@ ignore = [
# On top of the Google convention, disable `D417`, which requires
# documentation for every function parameter.
"D417",
# May conflict with the formatter
"COM812",
]
select = [
# pycodestyle
......
rich==13.7.0
from pathlib import Path
FIXTURES = Path(__file__).with_name("fixtures")
San B-GPE
Francisco I-GPE
considers O
banning B-VERB
sidewalk O
delivery O
robots O
from tests.conftest import FIXTURES
DATA_DIR = FIXTURES / "parse"
from bio_parser.parse.document import Document, Span, Tag, Token
from tests.parse import DATA_DIR
import pytest
from bio_parser.parse.document import _make_ner_label
FILEPATH = DATA_DIR / "valid.bio"
@pytest.fixture
def document():
return Document.from_file(FILEPATH)
@pytest.mark.parametrize(
"tag, label, output",
(
(Tag.OUTSIDE, None, "O"),
(Tag.BEGINNING, "GPE", "B-GPE"),
(Tag.INSIDE, "GPE", "I-GPE"),
),
)
def test_make_ner_label(tag: Tag, label: str | None, output: str):
assert _make_ner_label(tag=tag, label=label) == output
@pytest.mark.parametrize(
"tag, label, error",
(
(Tag.OUTSIDE, "GPE", "Invalid label `GPE` with tag `O`"),
(Tag.BEGINNING, None, "No named entity label found with tag `B`"),
(Tag.INSIDE, None, "No named entity label found with tag `I`"),
),
)
def test_make_ner_label_invalid(tag: Tag, label: str | None, error: str):
with pytest.raises(AssertionError, match=error):
_ = _make_ner_label(tag=tag, label=label)
def test_parse_document(document: Document):
# Check words
assert document.words == [
"San",
"Francisco",
"considers",
"banning",
"sidewalk",
"delivery",
"robots",
]
# Check entities
assert document.entities == [
("GPE", "San Francisco"),
("VERB", "banning"),
]
# Check word entities
assert document.word_entities == [
("GPE", "San"),
("GPE", "Francisco"),
("VERB", "banning"),
]
# Check text
assert document.text == "San Francisco considers banning sidewalk delivery robots"
# Check labels
assert document.char_labels == ["B-GPE"] + ["I-GPE"] * len("an Francisco") + [
"O"
] * len(" considers ") + ["B-VERB"] + ["I-VERB"] * len("anning") + ["O"] * len(
" sidewalk delivery robots"
)
assert document.word_labels == [
"GPE",
"GPE",
"GPE",
"O",
"O",
"O",
"VERB",
"O",
"O",
"O",
"O",
"O",
"O",
]
# Check chars
assert document.chars == list(
"San Francisco considers banning sidewalk delivery robots"
)
def test_parse_span(document: Document):
span: Span = document.spans[0]
# Check text
assert span.text == "San Francisco"
# Check label
assert span.label == "GPE"
# Check idx
assert span.idx == 0
# Check end
assert span.end == 2
# Check chars
assert span.chars == list("San Francisco")
# Check labels
assert span.labels == ["B-GPE"] + ["I-GPE"] * len("an Francisco")
def test_parse_token(document: Document):
# B- token
token: Token = document.spans[0].tokens[0]
# Check word
assert token.word == "San"
# Check label
assert token.label == "GPE"
# Check tag
assert token.tag == Tag.BEGINNING
# Check IOB Label
assert token.iob_label == "B-GPE"
# Check labels
assert token.labels == ["B-GPE", "I-GPE", "I-GPE"]
# Check chars
assert token.chars == ["S", "a", "n"]
# I- token
token: Token = document.spans[0].tokens[1]
# Check word
assert token.word == "Francisco"
# Check label
assert token.label == "GPE"
# Check tag
assert token.tag == Tag.INSIDE
# Check IOB Label
assert token.iob_label == "I-GPE"
# Check labels
assert token.labels == [
"I-GPE",
"I-GPE",
"I-GPE",
"I-GPE",
"I-GPE",
"I-GPE",
"I-GPE",
"I-GPE",
"I-GPE",
]
# Check chars
assert token.chars == ["F", "r", "a", "n", "c", "i", "s", "c", "o"]
# O token
token: Token = document.tokens[-1]
# Check word
assert token.word == "robots"
# Check label
assert token.label is None
# Check tag
assert token.tag == Tag.OUTSIDE
# Check IOB Label
assert token.iob_label == "O"
# Check labels
assert token.labels == ["O", "O", "O", "O", "O", "O"]
# Check chars
assert token.chars == ["r", "o", "b", "o", "t", "s"]
@pytest.mark.parametrize(
"annotation",
("Something something", "Something A-GPE", "Something GPE-A", "Something A"),
)
def test_invalid_token(annotation: str):
with pytest.raises(AssertionError, match="Could not parse annotation"):
_ = Token(idx=0, text=annotation).word
import json
from bio_parser.parse.validate import run as validate
from tests.parse import DATA_DIR
def test_valid():
filepath = DATA_DIR / "valid.bio"
validate([filepath])
# A JSON file should have been generated
output = filepath.with_suffix(".json")
assert output.exists()
# Check content of JSON
assert json.loads(output.read_text()) == {
"bio_repr": "San B-GPE\nFrancisco I-GPE\nconsiders O\nbanning B-VERB\nsidewalk O\ndelivery O\nrobots O",
"tokens": [
{"idx": 0, "text": "San B-GPE"},
{"idx": 1, "text": "Francisco I-GPE"},
{"idx": 2, "text": "considers O"},
{"idx": 3, "text": "banning B-VERB"},
{"idx": 4, "text": "sidewalk O"},
{"idx": 5, "text": "delivery O"},
{"idx": 6, "text": "robots O"},
],
"spans": [
{
"tokens": [
{"idx": 0, "text": "San B-GPE"},
{"idx": 1, "text": "Francisco I-GPE"},
]
},
{"tokens": [{"idx": 3, "text": "banning B-VERB"}]},
],
}
# Cleanup
output.unlink()