From c1aa261bb2b0a589c43ca838c0b7007fedf7ea7e Mon Sep 17 00:00:00 2001
From: Yoann Schneider <yschneider@teklia.com>
Date: Mon, 18 Mar 2024 12:39:53 +0000
Subject: [PATCH] Implement BIO convert command

---
 dan/bio.py                      | 63 ++++++++++++++++++++++++++++++-
 dan/cli.py                      |  2 +
 docs/get_started/development.md |  6 +++
 docs/usage/convert/index.md     | 67 +++++++++++++++++++++++++++++++++
 docs/usage/index.md             |  3 ++
 mkdocs.yml                      |  1 +
 6 files changed, 141 insertions(+), 1 deletion(-)
 create mode 100644 docs/usage/convert/index.md

diff --git a/dan/bio.py b/dan/bio.py
index d676d937..e9151d60 100644
--- a/dan/bio.py
+++ b/dan/bio.py
@@ -1,12 +1,49 @@
+"""
+Convert DAN predictions to BIO format.
+"""
+
+import json
 import logging
 import re
+from pathlib import Path
 from typing import Dict, List
 
-from dan.utils import EntityType
+from tqdm import tqdm
+
+from dan.utils import EntityType, parse_tokens
 
 logger = logging.getLogger(__name__)
 
 
+def add_convert_bio_parser(subcommands):
+    parser = subcommands.add_parser(
+        "convert",
+        description=__doc__,
+        help=__doc__,
+    )
+    parser.set_defaults(func=run)
+
+    parser.add_argument(
+        "predictions",
+        type=Path,
+        help="Path to a folder of DAN predictions.",
+    )
+
+    parser.add_argument(
+        "--tokens",
+        type=Path,
+        help="Mapping between starting tokens and end tokens to extract text with their entities.",
+        required=True,
+    )
+
+    parser.add_argument(
+        "--output",
+        type=Path,
+        help="Where BIO files are saved. Will be created if missing.",
+        required=True,
+    )
+
+
 def convert(text: str, ner_tokens: Dict[str, EntityType]) -> str:
     # Mapping to find a starting token for an ending token efficiently
     mapping_end_start: Dict[str, str] = {
@@ -83,3 +120,27 @@ def convert(text: str, ner_tokens: Dict[str, EntityType]) -> str:
 
     # Concatenating all formatted iob strings
     return "\n".join(iob)
+
+
+def run(
+    predictions: Path,
+    tokens: Path,
+    output: Path,
+):
+    # Create output folder
+    output.mkdir(parents=True, exist_ok=True)
+
+    # Load tokens
+    ner_tokens = parse_tokens(tokens)
+
+    for prediction in tqdm(
+        list(predictions.glob("*.json")), desc="Converting predictions"
+    ):
+        data = json.loads(prediction.read_text())
+        try:
+            bio_representation = convert(data["text"], ner_tokens)
+        except Exception as e:
+            logger.error(f"Failed to convert {prediction.name}: {e}")
+            continue
+
+        (output / prediction.stem).with_suffix(".bio").write_text(bio_representation)
diff --git a/dan/cli.py b/dan/cli.py
index 1e80f88c..0df8ee85 100644
--- a/dan/cli.py
+++ b/dan/cli.py
@@ -2,6 +2,7 @@
 import argparse
 import errno
 
+from dan.bio import add_convert_bio_parser
 from dan.datasets import add_dataset_parser
 from dan.ocr import add_evaluate_parser, add_predict_parser, add_train_parser
 
@@ -10,6 +11,7 @@ def get_parser():
     parser = argparse.ArgumentParser(prog="teklia-dan")
     subcommands = parser.add_subparsers(metavar="subcommand")
 
+    add_convert_bio_parser(subcommands)
     add_dataset_parser(subcommands)
     add_train_parser(subcommands)
     add_evaluate_parser(subcommands)
diff --git a/docs/get_started/development.md b/docs/get_started/development.md
index ee7739d8..0c2ff165 100644
--- a/docs/get_started/development.md
+++ b/docs/get_started/development.md
@@ -52,6 +52,12 @@ The library already has all the documents needed to run the [evaluation command]
 teklia-dan evaluate --config configs/eval.json
 ```
 
+If you want to evaluate a NER model with your own scripts, you can convert DAN's predictions to [BIO](<https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)>) format using the [convert command](../usage/convert/index.md).
+
+```shell
+teklia-dan convert /tmp/dan-predict --tokens tokens.yml --output /tmp/dan-convert
+```
+
 ## Documentation
 
 This documentation uses [Sphinx](http://www.sphinx-doc.org/) and was generated using [MkDocs](https://mkdocs.org/) and [mkdocstrings](https://mkdocstrings.github.io/).
diff --git a/docs/usage/convert/index.md b/docs/usage/convert/index.md
new file mode 100644
index 00000000..46812639
--- /dev/null
+++ b/docs/usage/convert/index.md
@@ -0,0 +1,67 @@
+# Convert
+
+Use the `teklia-dan convert` command to convert DAN predictions to BIO format. The same conversion code is used during [evaluation](../evaluate/index.md).
+
+## BIO format
+
+The BIO (or IOB) [format](<https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)>) is a representation used for the Named Entity Recognition task.
+
+## Description
+
+This command is meant to be used on DAN predictions. Make sure the [predict](../predict/index.md) command has been used first.
+The first argument of this command is the path to a folder with the predictions in JSON format. The other **required** arguments are described in the table below.
+
+| Parameter  | Description                                                                          | Type           | Default |
+| ---------- | ------------------------------------------------------------------------------------ | -------------- | ------- |
+| `--output` | Where BIO files are saved. Will be created if missing.                               | `pathlib.Path` |         |
+| `--tokens` | Mapping between starting tokens and end tokens to extract text with their entities. | `pathlib.Path` |         |
+
+!!! note
+    The file passed to `--tokens` is the same one used during [dataset extraction](../datasets/extract.md#description), generated by the [tokens subcommand](../datasets/tokens.md).
+
+## Examples
+
+Take a simple prediction from DAN.
+
+```json title="predictions/image.json"
+{
+  "text": "Ⓐ27 aout 1858\nⒶ27 aout 1858\nⒶ27 aout 1858\nⒶ28 aout 1858\nⒶ30 aout 1858",
+  "confidences": {},
+  "language_model": {},
+  "objects": [...]
+}
+```
+
+With this tokens map:
+
+```yaml title="tokens.yml"
+Date:
+  start: Ⓐ
+  end:
+```
+
+Then you can create the corresponding BIO file using:
+
+```sh
+teklia-dan convert predictions --tokens tokens.yml --output bio
+```
+
+The folder pointed to by `--output` will be created if missing. This command will generate one BIO file per JSON prediction, with the same name.
+
+```title="bio/image.bio"
+27 B-Date
+aout I-Date
+1858 I-Date
+27 B-Date
+aout I-Date
+1858 I-Date
+27 B-Date
+aout I-Date
+1858 I-Date
+28 B-Date
+aout I-Date
+1858 I-Date
+30 B-Date
+aout I-Date
+1858 I-Date
+```
diff --git a/docs/usage/index.md b/docs/usage/index.md
index eefe9610..53e4bb34 100644
--- a/docs/usage/index.md
+++ b/docs/usage/index.md
@@ -13,3 +13,6 @@ When `teklia-dan` is installed in your environment, you may use the following co
 
 `teklia-dan predict`
 : To predict an image using a trained DAN model. More details in the [dedicated page](./predict/index.md).
+
+`teklia-dan convert`
+: To convert DAN predictions to BIO format. Mostly useful when training DAN for the NER task. More details in the [dedicated page](./convert/index.md).
diff --git a/mkdocs.yml b/mkdocs.yml
index 3ac3aea2..f6595421 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -74,6 +74,7 @@ nav:
       - Jean Zay tutorial: usage/train/jeanzay.md
     - Evaluation: usage/evaluate/index.md
     - Prediction: usage/predict/index.md
+    - Convert: usage/convert/index.md
   - Python Reference:
     - Datasets:
-- 
GitLab
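
P.S. For reviewers who want to exercise the new conversion outside the CLI, here is a minimal sketch (not part of the patch) that simply mirrors what `run()` above does. It assumes `dan` is installed and that the hypothetical paths `/tmp/dan-predict`, `/tmp/dan-convert` and `tokens.yml` exist.

```python
import json
from pathlib import Path

from dan.bio import convert
from dan.utils import parse_tokens

# Hypothetical locations: a folder of `teklia-dan predict` JSON outputs and a
# destination folder for the BIO files.
predictions = Path("/tmp/dan-predict")
output = Path("/tmp/dan-convert")
output.mkdir(parents=True, exist_ok=True)

# Same tokens.yml as the one used during dataset extraction.
ner_tokens = parse_tokens(Path("tokens.yml"))

for prediction in predictions.glob("*.json"):
    data = json.loads(prediction.read_text())
    # convert() returns the BIO representation of the predicted text.
    bio_representation = convert(data["text"], ner_tokens)
    (output / prediction.stem).with_suffix(".bio").write_text(bio_representation)
```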