diff --git a/dan/datasets/extract/__init__.py b/dan/datasets/extract/__init__.py index 10e3b45cb803b74f846c161195d437fbeb103316..2a17a75c7a86e9c4578058a57f7de0706c28f12e 100644 --- a/dan/datasets/extract/__init__.py +++ b/dan/datasets/extract/__init__.py @@ -40,6 +40,15 @@ def validate_probability(proba): return proba +def validate_char(char): + if len(char) != 1: + raise argparse.ArgumentTypeError( + f"`{char}` (of length {len(char)}) is not a valid character. Must be a string of length 1." + ) + + return char + + def add_extract_parser(subcommands) -> None: parser = subcommands.add_parser( "extract", @@ -87,14 +96,9 @@ def add_extract_parser(subcommands) -> None: action="store_true", help="Extract text with their entities.", ) - parser.add_argument( - "--allow-unknown-entities", - action="store_true", - help="Ignore entities that do not appear in the list of tokens.", - ) parser.add_argument( "--entity-separators", - type=str, + type=validate_char, nargs="+", help="Removes all text that does not appear in an entity or in the list of given characters. Do not give any arguments for keeping the whole text.", required=False, diff --git a/dan/datasets/extract/exceptions.py b/dan/datasets/extract/exceptions.py index da8fba65541f8df9c075cba963b3e23481f6aae0..93c8d1ae5983edbb3ad3eaeafee0e46c0c863411 100644 --- a/dan/datasets/extract/exceptions.py +++ b/dan/datasets/extract/exceptions.py @@ -49,21 +49,6 @@ class NoTranscriptionError(ElementProcessingError): return f"No transcriptions found on element ({self.element_id}) with this config. Skipping." -class UnknownLabelError(ProcessingError): - """ - Raised when the specified label is not known - """ - - label: str - - def __init__(self, label: str, *args: object) -> None: - super().__init__(*args) - self.label = label - - def __str__(self) -> str: - return f"Label `{self.label}` is missing in the NER configuration." - - class NoEndTokenError(ProcessingError): """ Raised when the specified label has no end token and there is potentially additional text around the labels diff --git a/dan/datasets/extract/extract.py b/dan/datasets/extract/extract.py index 55b120dc1ce774067da55515f4935ed9174fc29d..05105f9e2eeab7b3121c45d3d23fa612139345e7 100644 --- a/dan/datasets/extract/extract.py +++ b/dan/datasets/extract/extract.py @@ -21,7 +21,6 @@ from dan.datasets.extract.exceptions import ( NoEndTokenError, NoTranscriptionError, ProcessingError, - UnknownLabelError, ) from dan.datasets.extract.utils import ( EntityType, @@ -50,7 +49,6 @@ class ArkindexExtractor: parent_element_type: str = None, output: Path = None, load_entities: bool = False, - allow_unknown_entities: bool = False, entity_separators: list = [], tokens: Path = None, use_existing_split: bool = None, @@ -65,7 +63,6 @@ class ArkindexExtractor: self.parent_element_type = parent_element_type self.output = output self.load_entities = load_entities - self.allow_unknown_entities = allow_unknown_entities self.entity_separators = entity_separators self.tokens = parse_tokens(tokens) if self.load_entities else None self.use_existing_split = use_existing_split @@ -121,11 +118,12 @@ class ArkindexExtractor: text += "".join(filter(keep_char, full_text[text_offset : entity.offset])) entity_type: EntityType = self.tokens.get(entity.type) - # Unknown entities are not allowed - if not entity_type and not self.allow_unknown_entities: - raise UnknownLabelError(entity.type) + if not entity_type: + logger.warning( + f"Label `{entity.type}` is missing in the NER configuration." + ) # We keep the whole text, so we need an end token for each entity to know exactly when an entity begins and ends. - if entity_type and not entity_type.end and keep_all_text: + elif not entity_type.end and keep_all_text: raise NoEndTokenError(entity.type) # Entity text: @@ -265,7 +263,6 @@ def run( parent_element_type: str, output: Path, load_entities: bool, - allow_unknown_entities: bool, entity_separators: list, tokens: Path, use_existing_split: bool, @@ -318,7 +315,6 @@ def run( parent_element_type=parent_element_type, output=output, load_entities=load_entities, - allow_unknown_entities=allow_unknown_entities, entity_separators=entity_separators, tokens=tokens, use_existing_split=use_existing_split, diff --git a/docs/usage/datasets/extract.md b/docs/usage/datasets/extract.md index fce4f564afadd57e1a88975daef44c2b9de345a2..bf3b4ef266829a7e878cf425f8fc4a4640f9418d 100644 --- a/docs/usage/datasets/extract.md +++ b/docs/usage/datasets/extract.md @@ -12,7 +12,6 @@ Use the `teklia-dan dataset extract` command to extract a dataset from an Arkind | `--parent-element-type` | Type of the parent element containing the data. | `str` | `page` | | `--output` | Folder where the data will be generated. | `Path` | | | `--load-entities` | Extract text with their entities. Needed for NER tasks. | `bool` | `False` | -| `--allow-unknown-entities` | Ignore entities that do not appear in the list of tokens. | `bool` | `False` | | `--entity-separators` | Removes all text that does not appear in an entity or in the list of given characters. Do not give any arguments for keeping the whole text. | `str` | | | `--tokens` | Mapping between starting tokens and end tokens. Needed for NER tasks. | `Path` | | | `--use-existing-split` | Use the specified folder IDs for the dataset split. | `bool` | | diff --git a/tests/test_extract.py b/tests/test_extract.py index c20c4674e1158b1be8ef5d84a232a38faf589c52..3c6cadac48feb9fa48bc0605f2e315b47467e408 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -4,7 +4,7 @@ from typing import NamedTuple import pytest -from dan.datasets.extract.exceptions import NoEndTokenError, UnknownLabelError +from dan.datasets.extract.exceptions import NoEndTokenError from dan.datasets.extract.extract import ArkindexExtractor from dan.datasets.extract.utils import EntityType, insert_token @@ -36,25 +36,6 @@ def test_insert_token(text, offset, length, expected): ) -def test_reconstruct_text_unknown_label_error(): - arkindex_extractor = ArkindexExtractor() - arkindex_extractor.tokens = TOKENS - with pytest.raises( - UnknownLabelError, match="Label `X` is missing in the NER configuration." - ): - arkindex_extractor.reconstruct_text( - "n°1 x 16 janvier 1611", - [ - Entity( - offset=0, - length=3, - type="X", - value="n°1", - ), - ], - ) - - def test_reconstruct_text_no_end_token_error(): arkindex_extractor = ArkindexExtractor() arkindex_extractor.tokens = { @@ -100,9 +81,7 @@ def test_reconstruct_text_no_end_token_error(): @pytest.mark.parametrize("text_before", ("", "text before ")) @pytest.mark.parametrize("text_after", ("", " text after")) def test_reconstruct_text(entity_separators, tokens, expected, text_before, text_after): - arkindex_extractor = ArkindexExtractor( - allow_unknown_entities=True, entity_separators=entity_separators - ) + arkindex_extractor = ArkindexExtractor(entity_separators=entity_separators) arkindex_extractor.tokens = tokens assert arkindex_extractor.reconstruct_text( text_before + "n°1 x 16 janvier 1611\nMichou" + text_after,