Skip to content
Snippets Groups Projects

Filter entities by name when extracting data from Arkindex

Merged Manon Blanco requested to merge allow-unknown-entities into main
8 files
+ 312
79
Compare changes
  • Side-by-side
  • Inline
Files
8
@@ -40,6 +40,15 @@ def validate_probability(proba):
return proba
def validate_char(char: str) -> str:
    """Validate that a command-line value is exactly one character long.

    Written to be used as an ``argparse`` ``type=`` callable: the raw string
    is returned unchanged when valid, and an ``ArgumentTypeError`` is raised
    otherwise.

    Args:
        char: The raw string to check.

    Returns:
        The same string, unchanged, when its length is exactly 1.

    Raises:
        argparse.ArgumentTypeError: If ``char`` is empty or longer than one
            character.
    """
    if len(char) != 1:
        raise argparse.ArgumentTypeError(
            f"`{char}` (of length {len(char)}) is not a valid character. Must be a string of length 1."
        )
    return char
def add_extract_parser(subcommands) -> None:
parser = subcommands.add_parser(
"extract",
@@ -83,7 +92,20 @@ def add_extract_parser(subcommands) -> None:
# Optional arguments.
parser.add_argument(
"--load-entities", action="store_true", help="Extract text with their entities."
"--load-entities",
action="store_true",
help="Extract text with their entities.",
)
parser.add_argument(
"--entity-separators",
type=validate_char,
nargs="+",
help="""
Removes all text that does not appear in an entity or in the list of given ordered characters.
If several separators follow each other, keep only the first to appear in the list.
Do not give any arguments to keep the whole text.
""",
required=False,
)
parser.add_argument(
"--tokens",
Loading