Skip to content
Snippets Groups Projects
Commit 0aece446 authored by Manon Blanco's avatar Manon Blanco Committed by Manon Blanco
Browse files

Remove "allow-unknown-entities" parameters + Check entity separators

parent b57d5979
No related branches found
No related tags found
1 merge request: !214 "Filter entities by name when extracting data from Arkindex"
This commit is part of merge request !214. Comments created here will be created in the context of that merge request.
......@@ -40,6 +40,15 @@ def validate_probability(proba):
return proba
def validate_char(char):
if len(char) != 1:
raise argparse.ArgumentTypeError(
f"`{char}` (of length {len(char)}) is not a valid character. Must be a string of length 1."
)
return char
def add_extract_parser(subcommands) -> None:
parser = subcommands.add_parser(
"extract",
......@@ -87,14 +96,9 @@ def add_extract_parser(subcommands) -> None:
action="store_true",
help="Extract text with their entities.",
)
parser.add_argument(
"--allow-unknown-entities",
action="store_true",
help="Ignore entities that do not appear in the list of tokens.",
)
parser.add_argument(
"--entity-separators",
type=str,
type=validate_char,
nargs="+",
help="Removes all text that does not appear in an entity or in the list of given characters. Do not give any arguments for keeping the whole text.",
required=False,
......
......@@ -49,21 +49,6 @@ class NoTranscriptionError(ElementProcessingError):
return f"No transcriptions found on element ({self.element_id}) with this config. Skipping."
class UnknownLabelError(ProcessingError):
"""
Raised when the specified label is not known
"""
label: str
def __init__(self, label: str, *args: object) -> None:
super().__init__(*args)
self.label = label
def __str__(self) -> str:
return f"Label `{self.label}` is missing in the NER configuration."
class NoEndTokenError(ProcessingError):
"""
Raised when the specified label has no end token and there is potentially additional text around the labels
......
......@@ -21,7 +21,6 @@ from dan.datasets.extract.exceptions import (
NoEndTokenError,
NoTranscriptionError,
ProcessingError,
UnknownLabelError,
)
from dan.datasets.extract.utils import (
EntityType,
......@@ -50,7 +49,6 @@ class ArkindexExtractor:
parent_element_type: str = None,
output: Path = None,
load_entities: bool = False,
allow_unknown_entities: bool = False,
entity_separators: list = [],
tokens: Path = None,
use_existing_split: bool = None,
......@@ -65,7 +63,6 @@ class ArkindexExtractor:
self.parent_element_type = parent_element_type
self.output = output
self.load_entities = load_entities
self.allow_unknown_entities = allow_unknown_entities
self.entity_separators = entity_separators
self.tokens = parse_tokens(tokens) if self.load_entities else None
self.use_existing_split = use_existing_split
......@@ -121,11 +118,12 @@ class ArkindexExtractor:
text += "".join(filter(keep_char, full_text[text_offset : entity.offset]))
entity_type: EntityType = self.tokens.get(entity.type)
# Unknown entities are not allowed
if not entity_type and not self.allow_unknown_entities:
raise UnknownLabelError(entity.type)
if not entity_type:
logger.warning(
f"Label `{entity.type}` is missing in the NER configuration."
)
# We keep the whole text, so we need an end token for each entity to know exactly when an entity begins and ends.
if entity_type and not entity_type.end and keep_all_text:
elif not entity_type.end and keep_all_text:
raise NoEndTokenError(entity.type)
# Entity text:
......@@ -265,7 +263,6 @@ def run(
parent_element_type: str,
output: Path,
load_entities: bool,
allow_unknown_entities: bool,
entity_separators: list,
tokens: Path,
use_existing_split: bool,
......@@ -318,7 +315,6 @@ def run(
parent_element_type=parent_element_type,
output=output,
load_entities=load_entities,
allow_unknown_entities=allow_unknown_entities,
entity_separators=entity_separators,
tokens=tokens,
use_existing_split=use_existing_split,
......
......@@ -12,7 +12,6 @@ Use the `teklia-dan dataset extract` command to extract a dataset from an Arkind
| `--parent-element-type` | Type of the parent element containing the data. | `str` | `page` |
| `--output` | Folder where the data will be generated. | `Path` | |
| `--load-entities` | Extract text with their entities. Needed for NER tasks. | `bool` | `False` |
| `--allow-unknown-entities` | Ignore entities that do not appear in the list of tokens. | `bool` | `False` |
| `--entity-separators` | Removes all text that does not appear in an entity or in the list of given characters. Do not give any arguments for keeping the whole text. | `str` | |
| `--tokens` | Mapping between starting tokens and end tokens. Needed for NER tasks. | `Path` | |
| `--use-existing-split` | Use the specified folder IDs for the dataset split. | `bool` | |
......
......@@ -4,7 +4,7 @@ from typing import NamedTuple
import pytest
from dan.datasets.extract.exceptions import NoEndTokenError, UnknownLabelError
from dan.datasets.extract.exceptions import NoEndTokenError
from dan.datasets.extract.extract import ArkindexExtractor
from dan.datasets.extract.utils import EntityType, insert_token
......@@ -36,25 +36,6 @@ def test_insert_token(text, offset, length, expected):
)
def test_reconstruct_text_unknown_label_error():
arkindex_extractor = ArkindexExtractor()
arkindex_extractor.tokens = TOKENS
with pytest.raises(
UnknownLabelError, match="Label `X` is missing in the NER configuration."
):
arkindex_extractor.reconstruct_text(
"n°1 x 16 janvier 1611",
[
Entity(
offset=0,
length=3,
type="X",
value="n°1",
),
],
)
def test_reconstruct_text_no_end_token_error():
arkindex_extractor = ArkindexExtractor()
arkindex_extractor.tokens = {
......@@ -100,9 +81,7 @@ def test_reconstruct_text_no_end_token_error():
@pytest.mark.parametrize("text_before", ("", "text before "))
@pytest.mark.parametrize("text_after", ("", " text after"))
def test_reconstruct_text(entity_separators, tokens, expected, text_before, text_after):
arkindex_extractor = ArkindexExtractor(
allow_unknown_entities=True, entity_separators=entity_separators
)
arkindex_extractor = ArkindexExtractor(entity_separators=entity_separators)
arkindex_extractor.tokens = tokens
assert arkindex_extractor.reconstruct_text(
text_before + "n°1 x 16 janvier 1611\nMichou" + text_after,
......
0% Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.