Skip to content
Snippets Groups Projects
Commit 9a1ec89c authored by Solene Tarride's avatar Solene Tarride
Browse files

Build hierarchy and nested spans

parent 7bc3eb2c
No related branches found
No related tags found
1 merge request!8Draft: Support nested entities
Pipeline #204053 failed
......@@ -5,6 +5,8 @@ Validate a given BIO file.
from argparse import ArgumentParser
from pathlib import Path
import yaml
from bio_parser.parse.validate import run
......@@ -14,6 +16,11 @@ def _check_bio_ext(filename: str) -> Path:
return filepath
def _load_yaml(config: str) -> dict:
    """Read the YAML file at ``config`` and return its parsed content.

    The content is parsed with ``yaml.safe_load``. The previous ``-> Path``
    annotation was wrong: the parsed document is returned, not a path.

    Args:
        config: Path to the YAML configuration file.

    Returns:
        The parsed YAML document. NOTE(review): expected to be a mapping of
        hierarchy level to entity names, but nothing here enforces it.
    """
    # Explicit encoding so parsing does not depend on the platform locale.
    with Path(config).open(encoding="utf-8") as file:
        return yaml.safe_load(file)
def add_validate_parser(subcommands):
parser: ArgumentParser = subcommands.add_parser(
"validate",
......@@ -25,6 +32,7 @@ def add_validate_parser(subcommands):
parser.add_argument(
"filepaths", help="Files to validate.", type=_check_bio_ext, nargs="*"
)
parser.add_argument("config", help="Config with entity hierarchy.", type=_load_yaml)
parser.add_argument(
"--allow-nested",
help="Whether to allow nested entities.",
......
......@@ -4,6 +4,7 @@ import re
from dataclasses import dataclass, field
from operator import attrgetter
from pathlib import Path
from typing import Any
from bio_parser.parse.document import Span, Tag, Token
......@@ -110,16 +111,26 @@ class NestedDocument:
filename: str
"""Document filename"""
bio_repr: str
"""Full BIO representation of the Document"""
entity_hierarchy: dict[int, list[str]]
"""Hierarchy between entities"""
nested_tokens: list[NestedToken] = field(default_factory=list)
"""List of the nested tokens in the Document"""
spans: list[Span] = field(default_factory=list)
"""List of the spans in the Document"""
def __post_init__(self):
"""Parses the tokens and the entity spans in the document."""
nested_spans: list[dict[str, Any]] = field(default_factory=list)
"""List of the nested spans in the Document"""
hierarchy: list[dict[str, Any]] = field(default_factory=list)
"""Hierarchy required for metrics"""
def _build_spans(self):
current_spans: dict[str, Span] = {} # Keep track of current spans by category
for idx, line in enumerate(self.bio_repr.splitlines()):
try:
......@@ -160,6 +171,55 @@ class NestedDocument:
for span in current_spans.values():
self.spans.append(span)
def _build_nested_spans(self) -> None:
    """Group each top-level span with the spans located inside it.

    A span is top-level when its label is listed under level ``0`` of
    ``self.entity_hierarchy``. Its children are all the other spans of
    ``self.spans`` whose ``idx`` is at least the parent's ``idx`` and whose
    ``end`` is at most the parent's ``end``.

    The result is stored in ``self.nested_spans`` as a list of
    ``{"parent": span, "children": [span, ...]}`` dicts. Nothing is
    returned (the previous return annotation claimed a list).
    """

    def get_span_level(span) -> int | None:
        """Return the first hierarchy level listing the span's label."""
        for level, categories in self.entity_hierarchy.items():
            if span.label in categories:
                return level
        # Label not declared in the hierarchy: no level.
        return None

    def is_inside(span, parent_span) -> bool:
        """Tell whether ``span`` fits within ``parent_span`` without being it."""
        return (
            (span.idx >= parent_span.idx)
            and (span.end <= parent_span.end)
            and (parent_span != span)
        )

    def get_children(parent, candidates) -> list:
        """Return the candidates located inside ``parent``."""
        return [span for span in candidates if is_inside(span, parent)]

    self.nested_spans = [
        {"parent": parent, "children": get_children(parent, self.spans)}
        for parent in self.spans
        if get_span_level(parent) == 0
    ]
def _build_hierarchy(self) -> None:
    """Derive a label/text-only view of ``self.nested_spans``.

    Each entry of ``self.nested_spans`` becomes a dict holding the parent
    label under ``"category"`` and, under ``"children"``, one
    ``{"category": label, "children": text}`` dict per child span. The
    resulting list is stored in ``self.hierarchy``.
    """
    simplified = []
    for entry in self.nested_spans:
        parent_label = entry["parent"].label
        leaves = []
        for child in entry["children"]:
            leaves.append({"category": child.label, "children": child.text})
        simplified.append({"category": parent_label, "children": leaves})
    self.hierarchy = simplified
def __post_init__(self):
    """Run the build steps that populate the document after creation."""
    # Order matters: nested spans are computed from the spans, and the
    # simple hierarchy is computed from the nested spans.
    build_steps = (
        self._build_spans,
        self._build_nested_spans,
        self._build_hierarchy,
    )
    for build_step in build_steps:
        build_step()
@property
def words(self) -> list[str]:
"""List of words making up the document."""
......@@ -198,7 +258,7 @@ class NestedDocument:
return list(self.text)
@classmethod
def from_file(cls, filepath: Path) -> "NestedDocument":
def from_file(cls, filepath: Path, config: dict) -> "NestedDocument":
"""Load a Document from a IOB file.
Args:
......@@ -207,4 +267,4 @@ class NestedDocument:
Returns:
Document: Parsed document
"""
return NestedDocument(filepath.stem, filepath.read_text())
return NestedDocument(filepath.stem, filepath.read_text(), config)
......@@ -10,7 +10,7 @@ from bio_parser.parse.nested_document import NestedDocument
logger = logging.getLogger(__name__)
def run(filepaths: list[Path], allow_nested=False) -> None:
def run(filepaths: list[Path], config={}, allow_nested=False) -> None:
"""Validate the construction of multiple BIO files.
Args:
......@@ -20,11 +20,12 @@ def run(filepaths: list[Path], allow_nested=False) -> None:
logger.info(f"Parsing file @ `{filepath}`")
try:
doc = (
NestedDocument.from_file(filepath)
NestedDocument.from_file(filepath, config)
if allow_nested
else Document.from_file(filepath)
)
filepath.with_suffix(".json").write_text(json.dumps(asdict(doc), indent=2))
except Exception as e:
logger.error(f"Could not load the file @ `{filepath}`: {e}")
logger.info(f"The file @ `{filepath}` is valid!")
0:
- child
- father
- mother
1:
- name
- surname
- location
- occupation
- date
pyaml==24.12.1
rich==13.7.0
......@@ -5,11 +5,15 @@ from bio_parser.parse.nested_document import NestedDocument, NestedToken
from tests.parse import DATA_DIR
FILEPATH = DATA_DIR / "valid_nested.bio"
CONFIG = {
0: ["child", "father", "mother"],
1: ["name", "surname", "occupation", "location", "date"],
}
@pytest.fixture()
def nested_document() -> NestedDocument:
return NestedDocument.from_file(FILEPATH)
return NestedDocument.from_file(FILEPATH, CONFIG)
def test_parse_document(nested_document: NestedDocument):
......
......@@ -39,3 +39,215 @@ def test_valid():
# Cleanup
output.unlink()
def test_valid_nested():
    """Validating the nested fixture with nesting enabled writes the full JSON.

    The generated file is removed in a ``finally`` clause so that a failing
    assertion does not leave ``valid_nested.json`` behind in the data
    directory, where other tests write to the same path.
    """
    filepath = DATA_DIR / "valid_nested.bio"
    config = {
        0: ["child", "father", "mother"],
        1: ["name", "surname", "occupation", "location", "date"],
    }
    expected = {
        "filename": "valid_nested",
        "bio_repr": "Charles B-child B-name\nné I-child\nà I-child\nBeaune I-child B-location\nen I-child\n1836 I-child B-date\npère O\nJean B-father B-name\nBigre I-father B-surname\ncharpentier I-father B-occupation\nde I-father\ncette I-father B-location\nparoisse I-father I-location\nmère O\nMarie B-mother B-name\n",
        "entity_hierarchy": {
            "0": ["child", "father", "mother"],
            "1": ["name", "surname", "occupation", "location", "date"],
        },
        "nested_tokens": [
            {"idx": 0, "text": "Charles B-child B-name"},
            {"idx": 1, "text": "né I-child"},
            {"idx": 2, "text": "à I-child"},
            {"idx": 3, "text": "Beaune I-child B-location"},
            {"idx": 4, "text": "en I-child"},
            {"idx": 5, "text": "1836 I-child B-date"},
            {"idx": 6, "text": "père O"},
            {"idx": 7, "text": "Jean B-father B-name"},
            {"idx": 8, "text": "Bigre I-father B-surname"},
            {"idx": 9, "text": "charpentier I-father B-occupation"},
            {"idx": 10, "text": "de I-father"},
            {"idx": 11, "text": "cette I-father B-location"},
            {"idx": 12, "text": "paroisse I-father I-location"},
            {"idx": 13, "text": "mère O"},
            {"idx": 14, "text": "Marie B-mother B-name"},
        ],
        "spans": [
            {
                "tokens": [
                    {"idx": 0, "text": "Charles B-child"},
                    {"idx": 1, "text": "né I-child"},
                    {"idx": 2, "text": "à I-child"},
                    {"idx": 3, "text": "Beaune I-child"},
                    {"idx": 4, "text": "en I-child"},
                    {"idx": 5, "text": "1836 I-child"},
                ]
            },
            {"tokens": [{"idx": 0, "text": "Charles B-name"}]},
            {"tokens": [{"idx": 3, "text": "Beaune B-location"}]},
            {"tokens": [{"idx": 5, "text": "1836 B-date"}]},
            {
                "tokens": [
                    {"idx": 7, "text": "Jean B-father"},
                    {"idx": 8, "text": "Bigre I-father"},
                    {"idx": 9, "text": "charpentier I-father"},
                    {"idx": 10, "text": "de I-father"},
                    {"idx": 11, "text": "cette I-father"},
                    {"idx": 12, "text": "paroisse I-father"},
                ]
            },
            {"tokens": [{"idx": 7, "text": "Jean B-name"}]},
            {"tokens": [{"idx": 8, "text": "Bigre B-surname"}]},
            {"tokens": [{"idx": 9, "text": "charpentier B-occupation"}]},
            {
                "tokens": [
                    {"idx": 11, "text": "cette B-location"},
                    {"idx": 12, "text": "paroisse I-location"},
                ]
            },
            {"tokens": [{"idx": 14, "text": "Marie B-mother"}]},
            {"tokens": [{"idx": 14, "text": "Marie B-name"}]},
        ],
        "nested_spans": [
            {
                "parent": {
                    "tokens": [
                        {"idx": 0, "text": "Charles B-child"},
                        {"idx": 1, "text": "né I-child"},
                        {"idx": 2, "text": "à I-child"},
                        {"idx": 3, "text": "Beaune I-child"},
                        {"idx": 4, "text": "en I-child"},
                        {"idx": 5, "text": "1836 I-child"},
                    ]
                },
                "children": [
                    {"tokens": [{"idx": 0, "text": "Charles B-name"}]},
                    {"tokens": [{"idx": 3, "text": "Beaune B-location"}]},
                    {"tokens": [{"idx": 5, "text": "1836 B-date"}]},
                ],
            },
            {
                "parent": {
                    "tokens": [
                        {"idx": 7, "text": "Jean B-father"},
                        {"idx": 8, "text": "Bigre I-father"},
                        {"idx": 9, "text": "charpentier I-father"},
                        {"idx": 10, "text": "de I-father"},
                        {"idx": 11, "text": "cette I-father"},
                        {"idx": 12, "text": "paroisse I-father"},
                    ]
                },
                "children": [
                    {"tokens": [{"idx": 7, "text": "Jean B-name"}]},
                    {"tokens": [{"idx": 8, "text": "Bigre B-surname"}]},
                    {"tokens": [{"idx": 9, "text": "charpentier B-occupation"}]},
                    {
                        "tokens": [
                            {"idx": 11, "text": "cette B-location"},
                            {"idx": 12, "text": "paroisse I-location"},
                        ]
                    },
                ],
            },
            {
                "parent": {"tokens": [{"idx": 14, "text": "Marie B-mother"}]},
                "children": [{"tokens": [{"idx": 14, "text": "Marie B-name"}]}],
            },
        ],
        "hierarchy": [
            {
                "category": "child",
                "children": [
                    {"category": "name", "children": "Charles"},
                    {"category": "location", "children": "Beaune"},
                    {"category": "date", "children": "1836"},
                ],
            },
            {
                "category": "father",
                "children": [
                    {"category": "name", "children": "Jean"},
                    {"category": "surname", "children": "Bigre"},
                    {"category": "occupation", "children": "charpentier"},
                    {"category": "location", "children": "cette paroisse"},
                ],
            },
            {
                "category": "mother",
                "children": [{"category": "name", "children": "Marie"}],
            },
        ],
    }

    # The JSON file is expected next to the input file.
    output = filepath.with_suffix(".json")
    try:
        validate([filepath], config, allow_nested=True)

        # A JSON file should have been generated
        assert output.exists()

        # Check content of JSON
        assert json.loads(output.read_text()) == expected
    finally:
        # Cleanup, even when an assertion above fails
        output.unlink(missing_ok=True)
def test_valid_not_nested():
    """Validating the nested fixture with nesting disabled writes flat spans.

    The generated file is removed in a ``finally`` clause so that a failing
    assertion does not leave ``valid_nested.json`` behind in the data
    directory, where other tests write to the same path.
    """
    filepath = DATA_DIR / "valid_nested.bio"
    config = {
        0: ["child", "father", "mother"],
        1: ["name", "surname", "occupation", "location", "date"],
    }
    expected = {
        "filename": "valid_nested",
        "bio_repr": "Charles B-child B-name\nné I-child\nà I-child\nBeaune I-child B-location\nen I-child\n1836 I-child B-date\npère O\nJean B-father B-name\nBigre I-father B-surname\ncharpentier I-father B-occupation\nde I-father\ncette I-father B-location\nparoisse I-father I-location\nmère O\nMarie B-mother B-name\n",
        "tokens": [
            {"idx": 0, "text": "Charles B-child B-name"},
            {"idx": 1, "text": "né I-child"},
            {"idx": 2, "text": "à I-child"},
            {"idx": 3, "text": "Beaune I-child B-location"},
            {"idx": 4, "text": "en I-child"},
            {"idx": 5, "text": "1836 I-child B-date"},
            {"idx": 6, "text": "père O"},
            {"idx": 7, "text": "Jean B-father B-name"},
            {"idx": 8, "text": "Bigre I-father B-surname"},
            {"idx": 9, "text": "charpentier I-father B-occupation"},
            {"idx": 10, "text": "de I-father"},
            {"idx": 11, "text": "cette I-father B-location"},
            {"idx": 12, "text": "paroisse I-father I-location"},
            {"idx": 13, "text": "mère O"},
            {"idx": 14, "text": "Marie B-mother B-name"},
        ],
        "spans": [
            {
                "tokens": [
                    {"idx": 0, "text": "Charles B-child B-name"},
                    {"idx": 1, "text": "né I-child"},
                    {"idx": 2, "text": "à I-child"},
                    {"idx": 3, "text": "Beaune I-child B-location"},
                    {"idx": 4, "text": "en I-child"},
                    {"idx": 5, "text": "1836 I-child B-date"},
                ]
            },
            {
                "tokens": [
                    {"idx": 7, "text": "Jean B-father B-name"},
                    {"idx": 8, "text": "Bigre I-father B-surname"},
                    {"idx": 9, "text": "charpentier I-father B-occupation"},
                    {"idx": 10, "text": "de I-father"},
                    {"idx": 11, "text": "cette I-father B-location"},
                    {"idx": 12, "text": "paroisse I-father I-location"},
                ]
            },
            {"tokens": [{"idx": 14, "text": "Marie B-mother B-name"}]},
        ],
    }

    # The JSON file is expected next to the input file.
    output = filepath.with_suffix(".json")
    try:
        validate([filepath], config, allow_nested=False)

        # A JSON file should have been generated
        assert output.exists()

        # Check content of JSON
        assert json.loads(output.read_text()) == expected
    finally:
        # Cleanup, even when an assertion above fails
        output.unlink(missing_ok=True)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment