Commit c1aa261b authored by Yoann Schneider, committed by Manon Blanco

Implement BIO convert command

parent ed481c6f
Merge request !391: Implement BIO convert command
"""
Convert DAN predictions to BIO format.
"""
import json
import logging
import re
from pathlib import Path
from typing import Dict, List
from dan.utils import EntityType
from tqdm import tqdm
from dan.utils import EntityType, parse_tokens
logger = logging.getLogger(__name__)
def add_convert_bio_parser(subcommands):
    parser = subcommands.add_parser(
        "convert",
        description=__doc__,
        help=__doc__,
    )
    parser.set_defaults(func=run)

    parser.add_argument(
        "predictions",
        type=Path,
        help="Path to a folder of DAN predictions.",
    )

    parser.add_argument(
        "--tokens",
        type=Path,
        help="Mapping between starting tokens and end tokens to extract text with their entities.",
        required=True,
    )

    parser.add_argument(
        "--output",
        type=Path,
        help="Where BIO files are saved. Will be created if missing.",
        required=True,
    )
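
# (Illustrative note, not part of this commit.) The elided body of `convert`
# below is expected to walk the predicted text word by word: a word opening
# with a start token such as Ⓐ begins an entity and is tagged "B-<name>",
# following words of the same entity are tagged "I-<name>", and words outside
# any entity are tagged "O", matching the documentation example further down.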
def convert(text: str, ner_tokens: Dict[str, EntityType]) -> str:
    # Mapping to find a starting token for an ending token efficiently
    mapping_end_start: Dict[str, str] = {
@@ -83,3 +120,27 @@ def convert(text: str, ner_tokens: Dict[str, EntityType]) -> str:

    # Concatenating all formatted IOB strings
    return "\n".join(iob)
def run(
    predictions: Path,
    tokens: Path,
    output: Path,
):
    # Create output folder
    output.mkdir(parents=True, exist_ok=True)

    # Load tokens
    ner_tokens = parse_tokens(tokens)

    for prediction in tqdm(
        list(predictions.glob("*.json")), desc="Converting predictions"
    ):
        data = json.loads(prediction.read_text())

        try:
            bio_representation = convert(data["text"], ner_tokens)
        except Exception as e:
            logger.error(f"Failed to convert {prediction.name}: {e}")
            continue

        (output / prediction.stem).with_suffix(".bio").write_text(bio_representation)
@@ -2,6 +2,7 @@
import argparse
import errno

from dan.bio import add_convert_bio_parser
from dan.datasets import add_dataset_parser
from dan.ocr import add_evaluate_parser, add_predict_parser, add_train_parser

@@ -10,6 +11,7 @@ def get_parser():
    parser = argparse.ArgumentParser(prog="teklia-dan")
    subcommands = parser.add_subparsers(metavar="subcommand")

    add_convert_bio_parser(subcommands)
    add_dataset_parser(subcommands)
    add_train_parser(subcommands)
    add_evaluate_parser(subcommands)
@@ -52,6 +52,12 @@ The library already has all the documents needed to run the [evaluation command]

```shell
teklia-dan evaluate --config configs/eval.json
```
If you want to evaluate a NER model with your own scripts, you can convert DAN's predictions to the [BIO](<https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)>) format using the [convert command](../usage/convert/index.md).
```shell
teklia-dan convert /tmp/dan-predict --tokens tokens.yml --output /tmp/dan-convert
```
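
The converted files can then be scored with any BIO-aware tooling. The snippet below is one possible sketch, assuming the third-party `seqeval` library and illustrative paths (ground-truth BIO files in `/tmp/labels`, aligned with the converted predictions in `/tmp/dan-convert`); it is not part of DAN itself.

```python
from pathlib import Path

from seqeval.metrics import classification_report


def read_tags(path: Path) -> list:
    # Each line of a BIO file is "<word> <tag>": keep only the tag
    return [line.split()[-1] for line in path.read_text().splitlines() if line]


# One tag sequence per document, ground truth and predictions in the same order
references = [read_tags(path) for path in sorted(Path("/tmp/labels").glob("*.bio"))]
predictions = [read_tags(path) for path in sorted(Path("/tmp/dan-convert").glob("*.bio"))]

print(classification_report(references, predictions))
```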
## Documentation
This documentation uses [Sphinx](http://www.sphinx-doc.org/) and was generated using [MkDocs](https://mkdocs.org/) and [mkdocstrings](https://mkdocstrings.github.io/).
# Convert
Use the `teklia-dan convert` command to convert DAN predictions to BIO format. This is also the code used during [evaluation](../evaluate/index.md).
## BIO format
The BIO (or IOB) [format](<https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)>) is a representation used for the Named Entity Recognition task.
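
Each line pairs one word with one tag: `B-X` opens an entity of type `X`, `I-X` continues it, and `O` marks words outside any entity. For instance, with a hypothetical `City` entity:

```
Born O
in O
Paris B-City
```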
## Description
This command is meant to be used on DAN predictions. Make sure the [predict](../predict/index.md) command has been used first.
The first argument of this command is the path to a folder with the predictions in JSON format. The other **required** arguments are described in the table below.
| Parameter | Description | Type | Default |
| ---------- | ----------------------------------------------------------------------------------- | -------------- | ------- |
| `--output` | Where BIO files are saved. Will be created if missing.                              | `pathlib.Path` |         |
| `--tokens` | Mapping between starting tokens and end tokens to extract text with their entities. | `pathlib.Path` |         |
!!! note

    The `--tokens` argument is the same file used during [dataset extraction](../datasets/extract.md#description), generated by the [tokens subcommand](../datasets/tokens.md).
## Examples
Take a simple prediction from DAN.
```json title="predictions/image.json"
{
"text": "Ⓐ27 aout 1858\nⒶ27 aout 1858\nⒶ27 aout 1858\nⒶ28 aout 1858\nⒶ30 aout 1858",
"confidences": {},
"language_model": {},
"objects": [...]
}
```
With this tokens map:
```yaml title="tokens.yml"
Date:
start: Ⓐ
end:
```
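
This is the file loaded by `parse_tokens` in the code above. As a quick sanity check, you can inspect the parsed mapping from Python; a minimal sketch, assuming `EntityType` exposes the `start` and `end` fields suggested by the commit:

```python
from pathlib import Path

from dan.utils import parse_tokens

ner_tokens = parse_tokens(Path("tokens.yml"))
# Assumed shape: one EntityType per entity name
print(ner_tokens["Date"].start)  # Ⓐ
```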
Then you can create the corresponding BIO file using:
```sh
teklia-dan convert predictions --tokens tokens.yml --output bio
```
The folder pointed to by `--output` will be created if it is missing. The command generates one BIO file per JSON prediction, with the same base name.
```title="bio/image.bio"
27 B-Date
aout I-Date
1858 I-Date
27 B-Date
aout I-Date
1858 I-Date
27 B-Date
aout I-Date
1858 I-Date
28 B-Date
aout I-Date
1858 I-Date
30 B-Date
aout I-Date
1858 I-Date
```
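
The same conversion is also available from Python through the functions added in this commit, which can be convenient in your own scripts; a minimal sketch:

```python
import json
from pathlib import Path

from dan.bio import convert
from dan.utils import parse_tokens

ner_tokens = parse_tokens(Path("tokens.yml"))
data = json.loads(Path("predictions/image.json").read_text())

# Produces the BIO representation shown above
print(convert(data["text"], ner_tokens))
```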
@@ -13,3 +13,6 @@ When `teklia-dan` is installed in your environment, you may use the following commands

`teklia-dan predict`
: To predict an image using a trained DAN model. More details in the [dedicated page](./predict/index.md).

`teklia-dan convert`
: To convert DAN predictions to BIO format. Mostly useful when training DAN for the NER task. More details in the [dedicated page](./convert/index.md).
@@ -74,6 +74,7 @@ nav:
        - Jean Zay tutorial: usage/train/jeanzay.md
      - Evaluation: usage/evaluate/index.md
      - Prediction: usage/predict/index.md
      - Convert: usage/convert/index.md
  - Python Reference:
      - Datasets: