diff --git a/bio_parser/parse/nested_document.py b/bio_parser/parse/nested_document.py index 664d65d961631c2d1b06576c7a0ba3f17951f199..e4a2e08cdfb020eeafde4e8ddd57c535477d8640 100644 --- a/bio_parser/parse/nested_document.py +++ b/bio_parser/parse/nested_document.py @@ -208,16 +208,48 @@ class NestedDocument: nested_tokens: list[NestedToken] = field(default_factory=list) """List of the nested tokens in the Document""" - spans: list[Span] = field(default_factory=list) - """List of the spans in the Document""" - nested_spans: list[NestedSpan] = field(default_factory=list) """List of the nested spans in the Document""" hierarchy: list[dict[str, Any]] = field(default_factory=list) """Hierarchy required for metrics""" + def __post_init__(self): + """Parses the tokens and the entity spans in the document.""" + # Build nested spans with hierarchy + self.nested_spans = self._build_nested_spans() + + # Build JSON hierarchy + self.hierarchy = self._build_hierarchy() + + def _build_nested_spans(self) -> list[dict[str, Span | list[Span]]]: + """Build span hierarchy based on token position in the BIO file.""" + + def is_inside(span, parent_span): + return ( + (span.idx >= parent_span.idx) + and (span.end <= parent_span.end) + and (parent_span != span) + ) + + flat_spans = self._build_spans() + nested_spans = [] + parents = [span for span in flat_spans if span.level == 0] + for parent in parents: + children = [ + span + for span in flat_spans + if is_inside(span, parent) and parent.level < span.level + ] + nested_spans.append(NestedSpan([parent] + children)) + return nested_spans + + def _build_hierarchy(self) -> list[dict]: + return [nested_span.hierarchy for nested_span in self.nested_spans] + def _build_spans(self): + """Build spans.""" + spans = [] current_spans: dict[str, Span] = {} # Keep track of current spans by category for idx, line in enumerate(self.bio_repr.splitlines()): try: @@ -230,7 +262,7 @@ class NestedDocument: case Tag.OUTSIDE: # Close all current spans for span in current_spans.values(): - self.spans.append(span) + spans.append(span) current_spans = {} case Tag.INSIDE: @@ -244,7 +276,7 @@ class NestedDocument: # End existing span if necessary if token.label in current_spans: span = current_spans.pop(token.label) - self.spans.append(span) + spans.append(span) # Start a new span current_spans[token.label] = Span() @@ -256,42 +288,8 @@ class NestedDocument: # Last spans for span in current_spans.values(): - self.spans.append(span) - - def _build_nested_spans(self) -> list[dict[str, Span | list[Span]]]: - """Build span hierarchy based on token position in the BIO file.""" - - def is_inside(span, parent_span): - return ( - (span.idx >= parent_span.idx) - and (span.end <= parent_span.end) - and (parent_span != span) - ) - - self.nested_spans = [] - parents = [span for span in self.spans if span.level == 0] - for parent in parents: - children = [ - span - for span in self.spans - if is_inside(span, parent) and parent.level < span.level - ] - self.nested_spans.append(NestedSpan([parent] + children)) - - def _build_hierarchy(self) -> list[dict]: - self.hierarchy = [nested_span.hierarchy for nested_span in self.nested_spans] - return self.hierarchy - - def __post_init__(self): - """Parses the tokens and the entity spans in the document.""" - # Build spans - self._build_spans() - - # Build nested spans with hierarchy - self._build_nested_spans() - - # Build a simple hierarchy - self._build_hierarchy() + spans.append(span) + return spans @property def words(self) -> list[str]: @@ -302,7 +300,17 @@ class NestedDocument: def entities(self) -> list[tuple[str, str]]: """List of entities making up the document.""" return list( - map(attrgetter("label", "text"), filter(attrgetter("label"), self.spans)), + map( + attrgetter("label", "text"), + filter( + attrgetter("label"), + [ + span + for nested_span in self.nested_spans + for span in nested_span.spans + ], + ), + ), ) @property diff --git a/bio_parser/parse/validate.py b/bio_parser/parse/validate.py index ca097852ab6a5f3cdc0791531c45086b273d8fe4..ebf26b6709ec404ac6aad20d5c6a26783a6334e1 100644 --- a/bio_parser/parse/validate.py +++ b/bio_parser/parse/validate.py @@ -25,7 +25,7 @@ def run(filepaths: list[Path], allow_nested=False) -> None: else Document.from_file(filepath) ) filepath.with_suffix(".json").write_text(json.dumps(asdict(doc), indent=2)) + logger.info(f"The file @ `{filepath}` is valid!") except Exception as e: logger.error(f"Could not load the file @ `{filepath}`: {e}") - logger.info(f"The file @ `{filepath}` is valid!") diff --git a/tests/parse/test_validate.py b/tests/parse/test_validate.py index 646d0ef398b23eef73bc6eae7e0eb5b1daf05e2b..56fda91a35986a5b8427d2366efb65f84fc1c79e 100644 --- a/tests/parse/test_validate.py +++ b/tests/parse/test_validate.py @@ -71,42 +71,6 @@ def test_valid_nested(): {"idx": 13, "text": "mère O"}, {"idx": 14, "text": "Marie B-mother B-name"}, ], - "spans": [ - { - "tokens": [ - {"idx": 0, "text": "Charles B-child", "level": 0}, - {"idx": 1, "text": "né I-child", "level": 0}, - {"idx": 2, "text": "à I-child", "level": 0}, - {"idx": 3, "text": "Beaune I-child", "level": 0}, - {"idx": 4, "text": "en I-child", "level": 0}, - {"idx": 5, "text": "1836 I-child", "level": 0}, - ] - }, - {"tokens": [{"idx": 0, "text": "Charles B-name", "level": 1}]}, - {"tokens": [{"idx": 3, "text": "Beaune B-location", "level": 1}]}, - {"tokens": [{"idx": 5, "text": "1836 B-date", "level": 1}]}, - { - "tokens": [ - {"idx": 7, "text": "Jean B-father", "level": 0}, - {"idx": 8, "text": "Bigre I-father", "level": 0}, - {"idx": 9, "text": "charpentier I-father", "level": 0}, - {"idx": 10, "text": "de I-father", "level": 0}, - {"idx": 11, "text": "cette I-father", "level": 0}, - {"idx": 12, "text": "paroisse I-father", "level": 0}, - ] - }, - {"tokens": [{"idx": 7, "text": "Jean B-name", "level": 1}]}, - {"tokens": [{"idx": 8, "text": "Bigre B-surname", "level": 1}]}, - {"tokens": [{"idx": 9, "text": "charpentier B-occupation", "level": 1}]}, - { - "tokens": [ - {"idx": 11, "text": "cette B-location", "level": 1}, - {"idx": 12, "text": "paroisse I-location", "level": 1}, - ] - }, - {"tokens": [{"idx": 14, "text": "Marie B-mother", "level": 0}]}, - {"tokens": [{"idx": 14, "text": "Marie B-name", "level": 1}]}, - ], "nested_spans": [ { "spans": [