From c1aa261bb2b0a589c43ca838c0b7007fedf7ea7e Mon Sep 17 00:00:00 2001
From: Yoann Schneider <yschneider@teklia.com>
Date: Mon, 18 Mar 2024 12:39:53 +0000
Subject: [PATCH] Implement BIO convert command

---
 dan/bio.py                      | 63 ++++++++++++++++++++++++++++++-
 dan/cli.py                      |  2 +
 docs/get_started/development.md |  6 +++
 docs/usage/convert/index.md     | 67 +++++++++++++++++++++++++++++++++
 docs/usage/index.md             |  3 ++
 mkdocs.yml                      |  1 +
 6 files changed, 141 insertions(+), 1 deletion(-)
 create mode 100644 docs/usage/convert/index.md

diff --git a/dan/bio.py b/dan/bio.py
index d676d937..e9151d60 100644
--- a/dan/bio.py
+++ b/dan/bio.py
@@ -1,12 +1,49 @@
+"""
+Convert DAN predictions to BIO format.
+"""
+
+import json
 import logging
 import re
+from pathlib import Path
 from typing import Dict, List
 
-from dan.utils import EntityType
+from tqdm import tqdm
+
+from dan.utils import EntityType, parse_tokens
 
 logger = logging.getLogger(__name__)
 
 
+def add_convert_bio_parser(subcommands):
+    parser = subcommands.add_parser(
+        "convert",
+        description=__doc__,
+        help=__doc__,
+    )
+    parser.set_defaults(func=run)
+
+    parser.add_argument(
+        "predictions",
+        type=Path,
+        help="Path to a folder of DAN predictions.",
+    )
+
+    parser.add_argument(
+        "--tokens",
+        type=Path,
+        help="Mapping between starting tokens and end tokens to extract text with their entities.",
+        required=True,
+    )
+
+    parser.add_argument(
+        "--output",
+        type=Path,
+        help="Where BIO files are saved. Will be created if missing.",
+        required=True,
+    )
+
+
 def convert(text: str, ner_tokens: Dict[str, EntityType]) -> str:
     # Mapping to find a starting token for an ending token efficiently
     mapping_end_start: Dict[str, str] = {
@@ -83,3 +120,27 @@ def convert(text: str, ner_tokens: Dict[str, EntityType]) -> str:
 
     # Concatenating all formatted iob strings
     return "\n".join(iob)
+
+
+def run(
+    predictions: Path,
+    tokens: Path,
+    output: Path,
+):
+    # Create output folder
+    output.mkdir(parents=True, exist_ok=True)
+
+    # Load tokens
+    ner_tokens = parse_tokens(tokens)
+
+    for prediction in tqdm(
+        list(predictions.glob("*.json")), desc="Converting predictions"
+    ):
+        data = json.loads(prediction.read_text())
+        try:
+            bio_representation = convert(data["text"], ner_tokens)
+        except Exception as e:
+            logger.error(f"Failed to convert {prediction.name}: {e}")
+            continue
+
+        (output / prediction.stem).with_suffix(".bio").write_text(bio_representation)
diff --git a/dan/cli.py b/dan/cli.py
index 1e80f88c..0df8ee85 100644
--- a/dan/cli.py
+++ b/dan/cli.py
@@ -2,6 +2,7 @@
 import argparse
 import errno
 
+from dan.bio import add_convert_bio_parser
 from dan.datasets import add_dataset_parser
 from dan.ocr import add_evaluate_parser, add_predict_parser, add_train_parser
 
@@ -10,6 +11,7 @@ def get_parser():
     parser = argparse.ArgumentParser(prog="teklia-dan")
     subcommands = parser.add_subparsers(metavar="subcommand")
 
+    add_convert_bio_parser(subcommands)
     add_dataset_parser(subcommands)
     add_train_parser(subcommands)
     add_evaluate_parser(subcommands)
diff --git a/docs/get_started/development.md b/docs/get_started/development.md
index ee7739d8..0c2ff165 100644
--- a/docs/get_started/development.md
+++ b/docs/get_started/development.md
@@ -52,6 +52,12 @@ The library already has all the documents needed to run the [evaluation command]
 teklia-dan evaluate --config configs/eval.json
 ```
 
+If you want to evaluate a NER model with your own scripts, you can convert DAN's predictions to the [BIO](<https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)>) format using the [convert command](../usage/convert/index.md).
+
+```shell
+teklia-dan convert /tmp/dan-predict --tokens tokens.yml --output /tmp/dan-convert
+```
+
 ## Documentation
 
 This documentation uses [Sphinx](http://www.sphinx-doc.org/) and was generated using [MkDocs](https://mkdocs.org/) and [mkdocstrings](https://mkdocstrings.github.io/).
diff --git a/docs/usage/convert/index.md b/docs/usage/convert/index.md
new file mode 100644
index 00000000..46812639
--- /dev/null
+++ b/docs/usage/convert/index.md
@@ -0,0 +1,67 @@
+# Convert
+
+Use the `teklia-dan convert` command to convert DAN predictions to BIO format. The same conversion is applied internally during [evaluation](../evaluate/index.md).
+
+## BIO format
+
+The BIO (or IOB) [format](<https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)>) is a representation used for the Named Entity Recognition task.
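+
+Each token is written on its own line followed by its tag: `B-<type>` marks the beginning of an entity, `I-<type>` its continuation, and `O` a token outside any entity. For example, with a generic `Person` entity type:
+
+```
+George B-Person
+Washington I-Person
+was O
+born O
+```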
+
+## Description
+
+This command is meant to be used on DAN predictions. Make sure the [predict](../predict/index.md) command has been used first.
+The first positional argument is the path to a folder containing the predictions in JSON format. The other **required** arguments are described in the table below.
+
+| Parameter  | Description                                                                             | Type           | Default |
+| ---------- | --------------------------------------------------------------------------------------- | -------------- | ------- |
+| `--output` | Where BIO files are saved. Will be created if missing.                                   | `pathlib.Path` |         |
+| `--tokens` | Mapping between starting and ending tokens, used to extract the text and its entities.   | `pathlib.Path` |         |
+
+!!! note
+    The `--tokens` argument is the same file used during [dataset extraction](../datasets/extract.md#description), generated by the [tokens subcommand](../datasets/tokens.md).
+
+## Examples
+
+Take a simple prediction from DAN.
+
+```json title="predictions/image.json"
+{
+  "text": "â’¶27 aout 1858\nâ’¶27 aout 1858\nâ’¶27 aout 1858\nâ’¶28 aout 1858\nâ’¶30 aout 1858",
+  "confidences": {},
+  "language_model": {},
+  "objects": [...]
+}
+```
+
+With this tokens map:
+
+```yaml title="tokens.yml"
+Date:
+  start: â’¶
+  end:
+```
+
+Then you can create the corresponding BIO file using:
+
+```sh
+teklia-dan convert predictions --tokens tokens.yml --output bio
+```
+
+The folder pointed to by `--output` will be created if missing. The command generates one BIO file per JSON prediction, with the same name.
+
+```title="bio/image.bio"
+27 B-Date
+aout I-Date
+1858 I-Date
+27 B-Date
+aout I-Date
+1858 I-Date
+27 B-Date
+aout I-Date
+1858 I-Date
+28 B-Date
+aout I-Date
+1858 I-Date
+30 B-Date
+aout I-Date
+1858 I-Date
+```
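+
+You can then feed these files to your own evaluation scripts. Below is a minimal sketch that reads the generated BIO files back into tag sequences and scores them with the third-party [seqeval](https://github.com/chakki-works/seqeval) library. The `references/` folder holding ground-truth BIO files is an assumption for illustration, not something this command produces.
+
+```python
+from pathlib import Path
+
+from seqeval.metrics import classification_report
+
+
+def read_tags(path: Path) -> list[str]:
+    # Each non-empty BIO line is "<word> <tag>": keep only the tag column
+    return [line.split()[-1] for line in path.read_text().splitlines() if line.strip()]
+
+
+# Hypothetical layout: converted predictions in `bio/`, ground truth in `references/`
+predictions, references = [], []
+for bio_file in sorted(Path("bio").glob("*.bio")):
+    predictions.append(read_tags(bio_file))
+    references.append(read_tags(Path("references") / bio_file.name))
+
+print(classification_report(references, predictions))
+```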
diff --git a/docs/usage/index.md b/docs/usage/index.md
index eefe9610..53e4bb34 100644
--- a/docs/usage/index.md
+++ b/docs/usage/index.md
@@ -13,3 +13,6 @@ When `teklia-dan` is installed in your environment, you may use the following co
 
 `teklia-dan predict`
 : To predict an image using a trained DAN model. More details in the [dedicated page](./predict/index.md).
+
+`teklia-dan convert`
+: To convert DAN predictions to BIO format. Mostly useful when training DAN for the NER task. More details in the [dedicated page](./convert/index.md).
diff --git a/mkdocs.yml b/mkdocs.yml
index 3ac3aea2..f6595421 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -74,6 +74,7 @@ nav:
       - Jean Zay tutorial: usage/train/jeanzay.md
     - Evaluation: usage/evaluate/index.md
     - Prediction: usage/predict/index.md
+    - Convert: usage/convert/index.md
 
   - Python Reference:
     - Datasets:
-- 
GitLab