From 0aece446ffe32f58c760721553b95f47b666475b Mon Sep 17 00:00:00 2001 From: manonBlanco <blanco@teklia.com> Date: Mon, 24 Jul 2023 09:12:47 +0200 Subject: [PATCH] Remove "allow-unknown-entities" parameters + Check entity separators --- dan/datasets/extract/__init__.py | 16 ++++++++++------ dan/datasets/extract/exceptions.py | 15 --------------- dan/datasets/extract/extract.py | 14 +++++--------- docs/usage/datasets/extract.md | 1 - tests/test_extract.py | 25 ++----------------------- 5 files changed, 17 insertions(+), 54 deletions(-) diff --git a/dan/datasets/extract/__init__.py b/dan/datasets/extract/__init__.py index 10e3b45c..2a17a75c 100644 --- a/dan/datasets/extract/__init__.py +++ b/dan/datasets/extract/__init__.py @@ -40,6 +40,15 @@ def validate_probability(proba): return proba +def validate_char(char): + if len(char) != 1: + raise argparse.ArgumentTypeError( + f"`{char}` (of length {len(char)}) is not a valid character. Must be a string of length 1." + ) + + return char + + def add_extract_parser(subcommands) -> None: parser = subcommands.add_parser( "extract", @@ -87,14 +96,9 @@ def add_extract_parser(subcommands) -> None: action="store_true", help="Extract text with their entities.", ) - parser.add_argument( - "--allow-unknown-entities", - action="store_true", - help="Ignore entities that do not appear in the list of tokens.", - ) parser.add_argument( "--entity-separators", - type=str, + type=validate_char, nargs="+", help="Removes all text that does not appear in an entity or in the list of given characters. Do not give any arguments for keeping the whole text.", required=False, diff --git a/dan/datasets/extract/exceptions.py b/dan/datasets/extract/exceptions.py index da8fba65..93c8d1ae 100644 --- a/dan/datasets/extract/exceptions.py +++ b/dan/datasets/extract/exceptions.py @@ -49,21 +49,6 @@ class NoTranscriptionError(ElementProcessingError): return f"No transcriptions found on element ({self.element_id}) with this config. Skipping." -class UnknownLabelError(ProcessingError): - """ - Raised when the specified label is not known - """ - - label: str - - def __init__(self, label: str, *args: object) -> None: - super().__init__(*args) - self.label = label - - def __str__(self) -> str: - return f"Label `{self.label}` is missing in the NER configuration." - - class NoEndTokenError(ProcessingError): """ Raised when the specified label has no end token and there is potentially additional text around the labels diff --git a/dan/datasets/extract/extract.py b/dan/datasets/extract/extract.py index 55b120dc..05105f9e 100644 --- a/dan/datasets/extract/extract.py +++ b/dan/datasets/extract/extract.py @@ -21,7 +21,6 @@ from dan.datasets.extract.exceptions import ( NoEndTokenError, NoTranscriptionError, ProcessingError, - UnknownLabelError, ) from dan.datasets.extract.utils import ( EntityType, @@ -50,7 +49,6 @@ class ArkindexExtractor: parent_element_type: str = None, output: Path = None, load_entities: bool = False, - allow_unknown_entities: bool = False, entity_separators: list = [], tokens: Path = None, use_existing_split: bool = None, @@ -65,7 +63,6 @@ class ArkindexExtractor: self.parent_element_type = parent_element_type self.output = output self.load_entities = load_entities - self.allow_unknown_entities = allow_unknown_entities self.entity_separators = entity_separators self.tokens = parse_tokens(tokens) if self.load_entities else None self.use_existing_split = use_existing_split @@ -121,11 +118,12 @@ class ArkindexExtractor: text += "".join(filter(keep_char, full_text[text_offset : entity.offset])) entity_type: EntityType = self.tokens.get(entity.type) - # Unknown entities are not allowed - if not entity_type and not self.allow_unknown_entities: - raise UnknownLabelError(entity.type) + if not entity_type: + logger.warning( + f"Label `{entity.type}` is missing in the NER configuration." + ) # We keep the whole text, so we need an end token for each entity to know exactly when an entity begins and ends. - if entity_type and not entity_type.end and keep_all_text: + elif not entity_type.end and keep_all_text: raise NoEndTokenError(entity.type) # Entity text: @@ -265,7 +263,6 @@ def run( parent_element_type: str, output: Path, load_entities: bool, - allow_unknown_entities: bool, entity_separators: list, tokens: Path, use_existing_split: bool, @@ -318,7 +315,6 @@ def run( parent_element_type=parent_element_type, output=output, load_entities=load_entities, - allow_unknown_entities=allow_unknown_entities, entity_separators=entity_separators, tokens=tokens, use_existing_split=use_existing_split, diff --git a/docs/usage/datasets/extract.md b/docs/usage/datasets/extract.md index fce4f564..bf3b4ef2 100644 --- a/docs/usage/datasets/extract.md +++ b/docs/usage/datasets/extract.md @@ -12,7 +12,6 @@ Use the `teklia-dan dataset extract` command to extract a dataset from an Arkind | `--parent-element-type` | Type of the parent element containing the data. | `str` | `page` | | `--output` | Folder where the data will be generated. | `Path` | | | `--load-entities` | Extract text with their entities. Needed for NER tasks. | `bool` | `False` | -| `--allow-unknown-entities` | Ignore entities that do not appear in the list of tokens. | `bool` | `False` | | `--entity-separators` | Removes all text that does not appear in an entity or in the list of given characters. Do not give any arguments for keeping the whole text. | `str` | | | `--tokens` | Mapping between starting tokens and end tokens. Needed for NER tasks. | `Path` | | | `--use-existing-split` | Use the specified folder IDs for the dataset split. | `bool` | | diff --git a/tests/test_extract.py b/tests/test_extract.py index c20c4674..3c6cadac 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -4,7 +4,7 @@ from typing import NamedTuple import pytest -from dan.datasets.extract.exceptions import NoEndTokenError, UnknownLabelError +from dan.datasets.extract.exceptions import NoEndTokenError from dan.datasets.extract.extract import ArkindexExtractor from dan.datasets.extract.utils import EntityType, insert_token @@ -36,25 +36,6 @@ def test_insert_token(text, offset, length, expected): ) -def test_reconstruct_text_unknown_label_error(): - arkindex_extractor = ArkindexExtractor() - arkindex_extractor.tokens = TOKENS - with pytest.raises( - UnknownLabelError, match="Label `X` is missing in the NER configuration." - ): - arkindex_extractor.reconstruct_text( - "n°1 x 16 janvier 1611", - [ - Entity( - offset=0, - length=3, - type="X", - value="n°1", - ), - ], - ) - - def test_reconstruct_text_no_end_token_error(): arkindex_extractor = ArkindexExtractor() arkindex_extractor.tokens = { @@ -100,9 +81,7 @@ def test_reconstruct_text_no_end_token_error(): @pytest.mark.parametrize("text_before", ("", "text before ")) @pytest.mark.parametrize("text_after", ("", " text after")) def test_reconstruct_text(entity_separators, tokens, expected, text_before, text_after): - arkindex_extractor = ArkindexExtractor( - allow_unknown_entities=True, entity_separators=entity_separators - ) + arkindex_extractor = ArkindexExtractor(entity_separators=entity_separators) arkindex_extractor.tokens = tokens assert arkindex_extractor.reconstruct_text( text_before + "n°1 x 16 janvier 1611\nMichou" + text_after, -- GitLab