diff --git a/dan/datasets/extract/__init__.py b/dan/datasets/extract/__init__.py index efd50c9a5a1acc973f33f0fd42768156161c4d56..6ea479c602b3557e90d2d256e598472d9574c3a5 100644 --- a/dan/datasets/extract/__init__.py +++ b/dan/datasets/extract/__init__.py @@ -107,6 +107,7 @@ def add_extract_parser(subcommands) -> None: Do not give any arguments to keep the whole text. """, required=False, + default=list(map(validate_char, ("\n", " "))), ) parser.add_argument( "--tokens", diff --git a/dan/datasets/extract/extract.py b/dan/datasets/extract/extract.py index 89869e6bed6eb4dc11a798d81632724f4bc2fab7..440986ec32bd1e1119a98be34aa02482013691ca 100644 --- a/dan/datasets/extract/extract.py +++ b/dan/datasets/extract/extract.py @@ -63,7 +63,7 @@ class ArkindexExtractor: element_type: List[str] = [], parent_element_type: str = None, output: Path = None, - entity_separators: list = [], + entity_separators: List[str] = ["\n", " "], tokens: Path = None, transcription_worker_version: Optional[Union[str, bool]] = None, entity_worker_version: Optional[Union[str, bool]] = None, @@ -389,7 +389,7 @@ def run( element_type: List[str], parent_element_type: str, output: Path, - entity_separators: list, + entity_separators: List[str], tokens: Path, train_folder: UUID, val_folder: UUID, diff --git a/docs/usage/datasets/extract.md b/docs/usage/datasets/extract.md index f050a12fc2dfa2253390b7aa79e0aa9b8b886d29..ae0c9059b21fc6bccc98de1e684a68bd946fdcf9 100644 --- a/docs/usage/datasets/extract.md +++ b/docs/usage/datasets/extract.md @@ -10,23 +10,23 @@ Use the `teklia-dan dataset extract` command to extract a dataset from an Arkind If an image download fails for whatever reason, it won't appear in the transcriptions file. The reason will be printed to stdout at the end of the process. Before trying to download the image, it checks that it wasn't downloaded previously. It is thus safe to run this command twice if a few images failed. 
-| Parameter | Description | Type | Default | -| -------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------- | ------------------------------------ | -| `database` | Path to an Arkindex export database in SQLite format. | `Path` | | -| `--element-type` | Type of the elements to extract. You may specify multiple types. | `str` | | -| `--parent-element-type` | Type of the parent element containing the data. | `str` | `page` | -| `--output` | Folder where the data will be generated. | `Path` | | -| `--entity-separators` | Removes all text that does not appear in an entity or in the list of given ordered characters. If several separators follow each other, keep only the first to appear in the list. Do not give any arguments to keep the whole text. | `str` | (see [dedicated section](#examples)) | -| `--tokens` | Mapping between starting tokens and end tokens to extract text with their entities. | `Path` | | -| `--train-folder` | ID of the training folder to import from Arkindex. | `uuid` | | -| `--val-folder` | ID of the validation folder to import from Arkindex. | `uuid` | | -| `--test-folder` | ID of the training folder to import from Arkindex. | `uuid` | | -| `--transcription-worker-version` | Filter transcriptions by worker_version. Use `manual` for manual filtering. | `str` or `uuid` | | -| `--entity-worker-version` | Filter transcriptions entities by worker_version. Use `manual` for manual filtering | `str` or `uuid` | | -| `--max-width` | Images larger than this width will be resized to this width. | `int` | | -| `--max-height` | Images larger than this height will be resized to this height. | `int` | | -| `--keep-spaces` | Transcriptions are trimmed by default. Use this flag to disable this behaviour. 
| `bool` | False | -| `--image-format` | Images will be saved under this format. | `str` | `.jpg` | +| Parameter | Description | Type | Default | +| -------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------- | -------------------------------------------------- | +| `database` | Path to an Arkindex export database in SQLite format. | `Path` | | +| `--element-type` | Type of the elements to extract. You may specify multiple types. | `str` | | +| `--parent-element-type` | Type of the parent element containing the data. | `str` | `page` | +| `--output` | Folder where the data will be generated. | `Path` | | +| `--entity-separators` | Removes all text that does not appear in an entity or in the list of given ordered characters. If several separators follow each other, keep only the first to appear in the list. Do not give any arguments to keep the whole text. | `str` | `["\n", " "]` (see [dedicated section](#examples)) | +| `--tokens` | Mapping between starting tokens and end tokens to extract text with their entities. | `Path` | | +| `--train-folder` | ID of the training folder to import from Arkindex. | `uuid` | | +| `--val-folder` | ID of the validation folder to import from Arkindex. | `uuid` | | +| `--test-folder` | ID of the testing folder to import from Arkindex. | `uuid` | | +| `--transcription-worker-version` | Filter transcriptions by worker_version. Use `manual` for manual filtering. | `str` or `uuid` | | +| `--entity-worker-version` | Filter transcriptions entities by worker_version. Use `manual` for manual filtering. | `str` or `uuid` | | +| `--max-width` | Images larger than this width will be resized to this width. | `int` | | +| `--max-height` | Images larger than this height will be resized to this height. 
| `int` | | +| `--keep-spaces` | Transcriptions are trimmed by default. Use this flag to disable this behaviour. | `bool` | False | +| `--image-format` | Images will be saved under this format. | `str` | `.jpg` | The `--tokens` argument expects a YAML-formatted file with a specific format. A list of entries with each entry describing a NER entity. The label of the entity is the key to a dict mapping the starting and ending tokens respectively. diff --git a/tests/test_extract.py b/tests/test_extract.py index 6e9808774b40c4b089974a14c43c54a2f081f994..24ec2905aeb60176b6a83105abf81309a29e4e06 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -69,7 +69,7 @@ def test_insert_token(text, offset, length, expected): def test_reconstruct_text_no_end_token_error(): - arkindex_extractor = ArkindexExtractor() + arkindex_extractor = ArkindexExtractor(entity_separators=[]) arkindex_extractor.tokens = { "X": EntityType(start="â“§"), }