Skip to content
Snippets Groups Projects
Commit 0aece446 authored by Manon Blanco's avatar Manon Blanco Committed by Manon Blanco
Browse files

Remove "allow-unknown-entities" parameters + Check entity separators

parent b57d5979
No related branches found
No related tags found
1 merge request: !214 "Filter entities by name when extracting data from Arkindex"
This commit is part of merge request !214. Comments created here will be created in the context of that merge request.
......@@ -40,6 +40,15 @@ def validate_probability(proba):
return proba
def validate_char(char):
if len(char) != 1:
raise argparse.ArgumentTypeError(
f"`{char}` (of length {len(char)}) is not a valid character. Must be a string of length 1."
)
return char
def add_extract_parser(subcommands) -> None:
parser = subcommands.add_parser(
"extract",
......@@ -87,14 +96,9 @@ def add_extract_parser(subcommands) -> None:
action="store_true",
help="Extract text with their entities.",
)
parser.add_argument(
"--allow-unknown-entities",
action="store_true",
help="Ignore entities that do not appear in the list of tokens.",
)
parser.add_argument(
"--entity-separators",
type=str,
type=validate_char,
nargs="+",
help="Removes all text that does not appear in an entity or in the list of given characters. Do not give any arguments for keeping the whole text.",
required=False,
......
......@@ -49,21 +49,6 @@ class NoTranscriptionError(ElementProcessingError):
return f"No transcriptions found on element ({self.element_id}) with this config. Skipping."
class UnknownLabelError(ProcessingError):
"""
Raised when the specified label is not known
"""
label: str
def __init__(self, label: str, *args: object) -> None:
super().__init__(*args)
self.label = label
def __str__(self) -> str:
return f"Label `{self.label}` is missing in the NER configuration."
class NoEndTokenError(ProcessingError):
"""
Raised when the specified label has no end token and there is potentially additional text around the labels
......
......@@ -21,7 +21,6 @@ from dan.datasets.extract.exceptions import (
NoEndTokenError,
NoTranscriptionError,
ProcessingError,
UnknownLabelError,
)
from dan.datasets.extract.utils import (
EntityType,
......@@ -50,7 +49,6 @@ class ArkindexExtractor:
parent_element_type: str = None,
output: Path = None,
load_entities: bool = False,
allow_unknown_entities: bool = False,
entity_separators: list = [],
tokens: Path = None,
use_existing_split: bool = None,
......@@ -65,7 +63,6 @@ class ArkindexExtractor:
self.parent_element_type = parent_element_type
self.output = output
self.load_entities = load_entities
self.allow_unknown_entities = allow_unknown_entities
self.entity_separators = entity_separators
self.tokens = parse_tokens(tokens) if self.load_entities else None
self.use_existing_split = use_existing_split
......@@ -121,11 +118,12 @@ class ArkindexExtractor:
text += "".join(filter(keep_char, full_text[text_offset : entity.offset]))
entity_type: EntityType = self.tokens.get(entity.type)
# Unknown entities are not allowed
if not entity_type and not self.allow_unknown_entities:
raise UnknownLabelError(entity.type)
if not entity_type:
logger.warning(
f"Label `{entity.type}` is missing in the NER configuration."
)
# We keep the whole text, so we need an end token for each entity to know exactly when an entity begins and ends.
if entity_type and not entity_type.end and keep_all_text:
elif not entity_type.end and keep_all_text:
raise NoEndTokenError(entity.type)
# Entity text:
......@@ -265,7 +263,6 @@ def run(
parent_element_type: str,
output: Path,
load_entities: bool,
allow_unknown_entities: bool,
entity_separators: list,
tokens: Path,
use_existing_split: bool,
......@@ -318,7 +315,6 @@ def run(
parent_element_type=parent_element_type,
output=output,
load_entities=load_entities,
allow_unknown_entities=allow_unknown_entities,
entity_separators=entity_separators,
tokens=tokens,
use_existing_split=use_existing_split,
......
......@@ -12,7 +12,6 @@ Use the `teklia-dan dataset extract` command to extract a dataset from an Arkind
| `--parent-element-type` | Type of the parent element containing the data. | `str` | `page` |
| `--output` | Folder where the data will be generated. | `Path` | |
| `--load-entities` | Extract text with their entities. Needed for NER tasks. | `bool` | `False` |
| `--allow-unknown-entities` | Ignore entities that do not appear in the list of tokens. | `bool` | `False` |
| `--entity-separators` | Removes all text that does not appear in an entity or in the list of given characters. Do not give any arguments for keeping the whole text. | `str` | |
| `--tokens` | Mapping between starting tokens and end tokens. Needed for NER tasks. | `Path` | |
| `--use-existing-split` | Use the specified folder IDs for the dataset split. | `bool` | |
......
......@@ -4,7 +4,7 @@ from typing import NamedTuple
import pytest
from dan.datasets.extract.exceptions import NoEndTokenError, UnknownLabelError
from dan.datasets.extract.exceptions import NoEndTokenError
from dan.datasets.extract.extract import ArkindexExtractor
from dan.datasets.extract.utils import EntityType, insert_token
......@@ -36,25 +36,6 @@ def test_insert_token(text, offset, length, expected):
)
def test_reconstruct_text_unknown_label_error():
arkindex_extractor = ArkindexExtractor()
arkindex_extractor.tokens = TOKENS
with pytest.raises(
UnknownLabelError, match="Label `X` is missing in the NER configuration."
):
arkindex_extractor.reconstruct_text(
"n°1 x 16 janvier 1611",
[
Entity(
offset=0,
length=3,
type="X",
value="n°1",
),
],
)
def test_reconstruct_text_no_end_token_error():
arkindex_extractor = ArkindexExtractor()
arkindex_extractor.tokens = {
......@@ -100,9 +81,7 @@ def test_reconstruct_text_no_end_token_error():
@pytest.mark.parametrize("text_before", ("", "text before "))
@pytest.mark.parametrize("text_after", ("", " text after"))
def test_reconstruct_text(entity_separators, tokens, expected, text_before, text_after):
arkindex_extractor = ArkindexExtractor(
allow_unknown_entities=True, entity_separators=entity_separators
)
arkindex_extractor = ArkindexExtractor(entity_separators=entity_separators)
arkindex_extractor.tokens = tokens
assert arkindex_extractor.reconstruct_text(
text_before + "n°1 x 16 janvier 1611\nMichou" + text_after,
......
0% Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.