From 0aece446ffe32f58c760721553b95f47b666475b Mon Sep 17 00:00:00 2001
From: manonBlanco <blanco@teklia.com>
Date: Mon, 24 Jul 2023 09:12:47 +0200
Subject: [PATCH] Remove "allow-unknown-entities" parameters + Check entity
 separators

---
 dan/datasets/extract/__init__.py   | 16 ++++++++++------
 dan/datasets/extract/exceptions.py | 15 ---------------
 dan/datasets/extract/extract.py    | 14 +++++---------
 docs/usage/datasets/extract.md     |  1 -
 tests/test_extract.py              | 25 ++-----------------------
 5 files changed, 17 insertions(+), 54 deletions(-)

diff --git a/dan/datasets/extract/__init__.py b/dan/datasets/extract/__init__.py
index 10e3b45c..2a17a75c 100644
--- a/dan/datasets/extract/__init__.py
+++ b/dan/datasets/extract/__init__.py
@@ -40,6 +40,15 @@ def validate_probability(proba):
     return proba
 
 
+def validate_char(char):
+    if len(char) != 1:
+        raise argparse.ArgumentTypeError(
+            f"`{char}` (of length {len(char)}) is not a valid character. Must be a string of length 1."
+        )
+
+    return char
+
+
 def add_extract_parser(subcommands) -> None:
     parser = subcommands.add_parser(
         "extract",
@@ -87,14 +96,9 @@ def add_extract_parser(subcommands) -> None:
         action="store_true",
         help="Extract text with their entities.",
     )
-    parser.add_argument(
-        "--allow-unknown-entities",
-        action="store_true",
-        help="Ignore entities that do not appear in the list of tokens.",
-    )
     parser.add_argument(
         "--entity-separators",
-        type=str,
+        type=validate_char,
         nargs="+",
         help="Removes all text that does not appear in an entity or in the list of given characters. Do not give any arguments for keeping the whole text.",
         required=False,
diff --git a/dan/datasets/extract/exceptions.py b/dan/datasets/extract/exceptions.py
index da8fba65..93c8d1ae 100644
--- a/dan/datasets/extract/exceptions.py
+++ b/dan/datasets/extract/exceptions.py
@@ -49,21 +49,6 @@ class NoTranscriptionError(ElementProcessingError):
         return f"No transcriptions found on element ({self.element_id}) with this config. Skipping."
 
 
-class UnknownLabelError(ProcessingError):
-    """
-    Raised when the specified label is not known
-    """
-
-    label: str
-
-    def __init__(self, label: str, *args: object) -> None:
-        super().__init__(*args)
-        self.label = label
-
-    def __str__(self) -> str:
-        return f"Label `{self.label}` is missing in the NER configuration."
-
-
 class NoEndTokenError(ProcessingError):
     """
     Raised when the specified label has no end token and there is potentially additional text around the labels
diff --git a/dan/datasets/extract/extract.py b/dan/datasets/extract/extract.py
index 55b120dc..05105f9e 100644
--- a/dan/datasets/extract/extract.py
+++ b/dan/datasets/extract/extract.py
@@ -21,7 +21,6 @@ from dan.datasets.extract.exceptions import (
     NoEndTokenError,
     NoTranscriptionError,
     ProcessingError,
-    UnknownLabelError,
 )
 from dan.datasets.extract.utils import (
     EntityType,
@@ -50,7 +49,6 @@ class ArkindexExtractor:
         parent_element_type: str = None,
         output: Path = None,
         load_entities: bool = False,
-        allow_unknown_entities: bool = False,
         entity_separators: list = [],
         tokens: Path = None,
         use_existing_split: bool = None,
@@ -65,7 +63,6 @@ class ArkindexExtractor:
         self.parent_element_type = parent_element_type
         self.output = output
         self.load_entities = load_entities
-        self.allow_unknown_entities = allow_unknown_entities
         self.entity_separators = entity_separators
         self.tokens = parse_tokens(tokens) if self.load_entities else None
         self.use_existing_split = use_existing_split
@@ -121,11 +118,12 @@ class ArkindexExtractor:
             text += "".join(filter(keep_char, full_text[text_offset : entity.offset]))
 
             entity_type: EntityType = self.tokens.get(entity.type)
-            # Unknown entities are not allowed
-            if not entity_type and not self.allow_unknown_entities:
-                raise UnknownLabelError(entity.type)
+            if not entity_type:
+                logger.warning(
+                    f"Label `{entity.type}` is missing in the NER configuration."
+                )
             # We keep the whole text, so we need an end token for each entity to know exactly when an entity begins and ends.
-            if entity_type and not entity_type.end and keep_all_text:
+            elif not entity_type.end and keep_all_text:
                 raise NoEndTokenError(entity.type)
 
             # Entity text:
@@ -265,7 +263,6 @@ def run(
     parent_element_type: str,
     output: Path,
     load_entities: bool,
-    allow_unknown_entities: bool,
     entity_separators: list,
     tokens: Path,
     use_existing_split: bool,
@@ -318,7 +315,6 @@ def run(
         parent_element_type=parent_element_type,
         output=output,
         load_entities=load_entities,
-        allow_unknown_entities=allow_unknown_entities,
         entity_separators=entity_separators,
         tokens=tokens,
         use_existing_split=use_existing_split,
diff --git a/docs/usage/datasets/extract.md b/docs/usage/datasets/extract.md
index fce4f564..bf3b4ef2 100644
--- a/docs/usage/datasets/extract.md
+++ b/docs/usage/datasets/extract.md
@@ -12,7 +12,6 @@ Use the `teklia-dan dataset extract` command to extract a dataset from an Arkind
 | `--parent-element-type`          | Type of the parent element containing the data.                                                                                              | `str`           | `page`  |
 | `--output`                       | Folder where the data will be generated.                                                                                                     | `Path`          |         |
 | `--load-entities`                | Extract text with their entities. Needed for NER tasks.                                                                                      | `bool`          | `False` |
-| `--allow-unknown-entities`       | Ignore entities that do not appear in the list of tokens.                                                                                    | `bool`          | `False` |
 | `--entity-separators`            | Removes all text that does not appear in an entity or in the list of given characters. Do not give any arguments for keeping the whole text. | `str`           |         |
 | `--tokens`                       | Mapping between starting tokens and end tokens. Needed for NER tasks.                                                                        | `Path`          |         |
 | `--use-existing-split`           | Use the specified folder IDs for the dataset split.                                                                                          | `bool`          |         |
diff --git a/tests/test_extract.py b/tests/test_extract.py
index c20c4674..3c6cadac 100644
--- a/tests/test_extract.py
+++ b/tests/test_extract.py
@@ -4,7 +4,7 @@ from typing import NamedTuple
 
 import pytest
 
-from dan.datasets.extract.exceptions import NoEndTokenError, UnknownLabelError
+from dan.datasets.extract.exceptions import NoEndTokenError
 from dan.datasets.extract.extract import ArkindexExtractor
 from dan.datasets.extract.utils import EntityType, insert_token
 
@@ -36,25 +36,6 @@ def test_insert_token(text, offset, length, expected):
     )
 
 
-def test_reconstruct_text_unknown_label_error():
-    arkindex_extractor = ArkindexExtractor()
-    arkindex_extractor.tokens = TOKENS
-    with pytest.raises(
-        UnknownLabelError, match="Label `X` is missing in the NER configuration."
-    ):
-        arkindex_extractor.reconstruct_text(
-            "n°1 x 16 janvier 1611",
-            [
-                Entity(
-                    offset=0,
-                    length=3,
-                    type="X",
-                    value="n°1",
-                ),
-            ],
-        )
-
-
 def test_reconstruct_text_no_end_token_error():
     arkindex_extractor = ArkindexExtractor()
     arkindex_extractor.tokens = {
@@ -100,9 +81,7 @@ def test_reconstruct_text_no_end_token_error():
 @pytest.mark.parametrize("text_before", ("", "text before "))
 @pytest.mark.parametrize("text_after", ("", " text after"))
 def test_reconstruct_text(entity_separators, tokens, expected, text_before, text_after):
-    arkindex_extractor = ArkindexExtractor(
-        allow_unknown_entities=True, entity_separators=entity_separators
-    )
+    arkindex_extractor = ArkindexExtractor(entity_separators=entity_separators)
     arkindex_extractor.tokens = tokens
     assert arkindex_extractor.reconstruct_text(
         text_before + "n°1 x 16 janvier 1611\nMichou" + text_after,
-- 
GitLab