diff --git a/dan/bio.py b/dan/bio.py index d676d937cc652fa88c4d18919c38b9c53e3d1a39..e9151d605e16cadd9299525df3364b0d4bef0455 100644 --- a/dan/bio.py +++ b/dan/bio.py @@ -1,12 +1,49 @@ +""" +Convert DAN predictions to BIO format. +""" + +import json import logging import re +from pathlib import Path from typing import Dict, List -from dan.utils import EntityType +from tqdm import tqdm + +from dan.utils import EntityType, parse_tokens logger = logging.getLogger(__name__) +def add_convert_bio_parser(subcommands): + parser = subcommands.add_parser( + "convert", + description=__doc__, + help=__doc__, + ) + parser.set_defaults(func=run) + + parser.add_argument( + "predictions", + type=Path, + help="Path to a folder of DAN predictions.", + ) + + parser.add_argument( + "--tokens", + type=Path, + help="Mapping between starting tokens and end tokens to extract text with their entities.", + required=True, + ) + + parser.add_argument( + "--output", + type=Path, + help="Where BIO files are saved. Will be created if missing.", + required=True, + ) + + def convert(text: str, ner_tokens: Dict[str, EntityType]) -> str: # Mapping to find a starting token for an ending token efficiently mapping_end_start: Dict[str, str] = { @@ -83,3 +120,27 @@ def convert(text: str, ner_tokens: Dict[str, EntityType]) -> str: # Concatenating all formatted iob strings return "\n".join(iob) + + +def run( + predictions: Path, + tokens: Path, + output: Path, +): + # Create output folder + output.mkdir(parents=True, exist_ok=True) + + # Load tokens + ner_tokens = parse_tokens(tokens) + + for prediction in tqdm( + list(predictions.glob("*.json")), desc="Converting predictions" + ): + data = json.loads(prediction.read_text()) + try: + bio_representation = convert(data["text"], ner_tokens) + except Exception as e: + logger.error(f"Failed to convert {prediction.name}: {e}") + continue + + (output / prediction.stem).with_suffix(".bio").write_text(bio_representation) diff --git a/dan/cli.py b/dan/cli.py index 
1e80f88c5a3ee1178932195b545d2c4c693c5b93..0df8ee8593a416b8cf8843a28dd51b39fe3b7037 100644 --- a/dan/cli.py +++ b/dan/cli.py @@ -2,6 +2,7 @@ import argparse import errno +from dan.bio import add_convert_bio_parser from dan.datasets import add_dataset_parser from dan.ocr import add_evaluate_parser, add_predict_parser, add_train_parser @@ -10,6 +11,7 @@ def get_parser(): parser = argparse.ArgumentParser(prog="teklia-dan") subcommands = parser.add_subparsers(metavar="subcommand") + add_convert_bio_parser(subcommands) add_dataset_parser(subcommands) add_train_parser(subcommands) add_evaluate_parser(subcommands) diff --git a/docs/get_started/development.md b/docs/get_started/development.md index ee7739d82da238ca8eb8971faf906cf3a01614cf..0c2ff165a36786216378c28e4ec1a7d249b8f368 100644 --- a/docs/get_started/development.md +++ b/docs/get_started/development.md @@ -52,6 +52,12 @@ The library already has all the documents needed to run the [evaluation command] teklia-dan evaluate --config configs/eval.json ``` +If you want to evaluate a NER model with your own scripts, you can convert DAN's predictions to the [BIO](<https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)>) format, using the [convert command](../usage/convert/index.md). + +```shell +teklia-dan convert /tmp/dan-predict --tokens tokens.yml --output /tmp/dan-convert +``` + ## Documentation This documentation uses [Sphinx](http://www.sphinx-doc.org/) and was generated using [MkDocs](https://mkdocs.org/) and [mkdocstrings](https://mkdocstrings.github.io/). diff --git a/docs/usage/convert/index.md b/docs/usage/convert/index.md new file mode 100644 index 0000000000000000000000000000000000000000..46812639d47596d412a04252af3bfe468553b30a --- /dev/null +++ b/docs/usage/convert/index.md @@ -0,0 +1,67 @@ +# Convert + +Use the `teklia-dan convert` command to convert DAN predictions to BIO format. This is also the code used during [evaluation](../evaluate/index.md). 
+ +## BIO format + +The BIO (or IOB) [format](<https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)>) is a representation used for the Named Entity Recognition task. + +## Description + +This command is meant to be used on DAN predictions. Make sure the [predict](../predict/index.md) command has been used first. +The first argument of this command is the path to a folder with the predictions in JSON format. The other **required** arguments are described in the table below. + +| Parameter | Description | Type | Default | +| ---------- | ----------------------------------------------------------------------------------- | -------------- | ------- | +| `--output` | Where BIO files are saved. Will be created if missing | `pathlib.Path` | | +| `--tokens` | Mapping between starting tokens and end tokens to extract text with their entities. | `pathlib.Path` | | + +!!! note + The `--tokens` argument is the same file used during [dataset extraction](../datasets/extract.md#description), generated by the [tokens subcommand](../datasets/tokens.md). + +## Examples + +Take a simple prediction from DAN. + +```json title="predictions/image.json" +{ + "text": "â’¶27 aout 1858\nâ’¶27 aout 1858\nâ’¶27 aout 1858\nâ’¶28 aout 1858\nâ’¶30 aout 1858", + "confidences": {}, + "language_model": {}, + "objects": [...] +} +``` + +With this tokens map: + +```yaml title="tokens.yml" +Date: + start: â’¶ + end: +``` + +Then you can create the corresponding BIO file using + +```sh +teklia-dan convert predictions --tokens tokens.yml --output bio +``` + +The folder pointed by `--output` will be created if missing. This command will generate one BIO file per JSON prediction, under the same name. 
+ +```title="bio/image.bio" +27 B-Date +aout I-Date +1858 I-Date +27 B-Date +aout I-Date +1858 I-Date +27 B-Date +aout I-Date +1858 I-Date +28 B-Date +aout I-Date +1858 I-Date +30 B-Date +aout I-Date +1858 I-Date +``` diff --git a/docs/usage/index.md b/docs/usage/index.md index eefe961027dcf2345ac703fe2ef79915d39dfa0b..53e4bb34e33e196dcde2b333205ed762fb97df76 100644 --- a/docs/usage/index.md +++ b/docs/usage/index.md @@ -13,3 +13,6 @@ When `teklia-dan` is installed in your environment, you may use the following co `teklia-dan predict` : To predict an image using a trained DAN model. More details in the [dedicated page](./predict/index.md). + +`teklia-dan convert` +: To convert DAN predictions to BIO format. Mostly useful when training DAN for the NER task. More details in the [dedicated page](./convert/index.md). diff --git a/mkdocs.yml b/mkdocs.yml index 3ac3aea259d4d272fc8975909b3f723d811cddef..f6595421e2fe0f94dd0f7b609ee3bb3f70953c8a 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -74,6 +74,7 @@ nav: - Jean Zay tutorial: usage/train/jeanzay.md - Evaluation: usage/evaluate/index.md - Prediction: usage/predict/index.md + - Convert: usage/convert/index.md - Python Reference: - Datasets: