Skip to content
Snippets Groups Projects

Implement extraction command

Merged Yoann Schneider requested to merge implement-extraction-command into main
22 files
+ 799
310
Compare changes
  • Side-by-side
  • Inline
Files
22
# -*- coding: utf-8 -*-
"""
Extract dataset from Arkindex using API.
"""
import pathlib
import uuid
from dan.datasets.extract.extract_from_arkindex import run
# Command-line keyword that the worker-version options accept in place of a
# worker version ID (their help text calls this "manual filtering").
MANUAL_SOURCE = "manual"


def parse_worker_version(worker_version_id):
    """
    Translate a worker-version command-line value into a filter value.

    The ``MANUAL_SOURCE`` keyword is mapped to ``False``; any other value is
    handed back unchanged.
    """
    return False if worker_version_id == MANUAL_SOURCE else worker_version_id
def validate_probability(value):
    """
    Parse a command-line value as a probability in the closed range [0, 1].

    Meant to be used as an argparse ``type=`` callable so that invalid split
    sizes are reported when the command line is parsed.

    :param value: Raw value received from the command line.
    :returns: The value converted to ``float``.
    :raises argparse.ArgumentTypeError: If the value is not a number or lies
        outside [0, 1].
    """
    # Local import: keeps this helper independent of the module import block.
    import argparse

    try:
        probability = float(value)
    except (TypeError, ValueError):
        raise argparse.ArgumentTypeError(f"`{value}` is not a valid float.") from None

    # NaN fails this chained comparison, so it is rejected as well.
    if not 0.0 <= probability <= 1.0:
        raise argparse.ArgumentTypeError(
            f"`{value}` must be a probability between 0 and 1."
        )
    return probability


def add_extract_parser(subcommands) -> None:
    """
    Register the ``extract`` subcommand and all of its arguments.

    The module docstring is used as both description and help text of the
    subcommand, and ``run`` is stored as the ``func`` default of the parser.

    :param subcommands: Object exposing ``add_parser`` (presumably the result
        of ``ArgumentParser.add_subparsers()``; confirm against the caller).
    """
    parser = subcommands.add_parser(
        "extract",
        description=__doc__,
        help=__doc__,
    )

    # Data selection and output location.
    # Only --element-type and --output are mandatory; --parent is optional and
    # --parent-element-type falls back to "page".
    parser.add_argument(
        "--parent",
        type=uuid.UUID,
        nargs="+",
        help="ID of the parent folder to import from Arkindex.",
        required=False,
    )
    parser.add_argument(
        "--element-type",
        nargs="+",
        type=str,
        help="Type of elements to retrieve",
        required=True,
    )
    parser.add_argument(
        "--parent-element-type",
        type=str,
        help="Type of the parent element containing the data.",
        required=False,
        default="page",
    )
    parser.add_argument(
        "--output",
        type=pathlib.Path,
        help="Path where the data will be generated.",
        required=True,
    )

    # Optional arguments: entity extraction.
    parser.add_argument(
        "--load-entities", action="store_true", help="Extract text with their entities"
    )
    parser.add_argument(
        "--tokens",
        type=pathlib.Path,
        help="Mapping between starting tokens and end tokens. Needed for entities.",
        required=False,
    )

    # Optional arguments: reuse an existing train/val/test split.
    parser.add_argument(
        "--use-existing-split",
        action="store_true",
        help="Use the specified folder IDs for the dataset split.",
    )
    parser.add_argument(
        "--train-folder",
        type=uuid.UUID,
        help="ID of the training folder to import from Arkindex.",
        required=False,
    )
    parser.add_argument(
        "--val-folder",
        type=uuid.UUID,
        help="ID of the validation folder to import from Arkindex.",
        required=False,
    )
    parser.add_argument(
        "--test-folder",
        type=uuid.UUID,
        help="ID of the testing folder to import from Arkindex.",
        required=False,
    )

    # Optional arguments: worker version filters.
    # argparse runs string defaults through ``type``, so the MANUAL_SOURCE
    # default is also converted by parse_worker_version.
    parser.add_argument(
        "--transcription-worker-version",
        type=parse_worker_version,
        help=f"Filter transcriptions by worker_version. Use {MANUAL_SOURCE} for manual filtering.",
        required=False,
        default=MANUAL_SOURCE,
    )
    parser.add_argument(
        "--entity-worker-version",
        type=parse_worker_version,
        help=f"Filter transcriptions entities by worker_version. Use {MANUAL_SOURCE} for manual filtering.",
        required=False,
        default=MANUAL_SOURCE,
    )

    # Optional arguments: split sizes, validated as probabilities at parse time.
    parser.add_argument(
        "--train-prob",
        type=validate_probability,
        default=0.7,
        help="Training set split size.",
    )
    parser.add_argument(
        "--val-prob",
        type=validate_probability,
        default=0.15,
        help="Validation set split size",
    )

    parser.set_defaults(func=run)
Loading