Add default values for entity separators

fe0f6eb4 · Yoann Schneider · 9bba97a2 · fe0f6eb4 · fe0f6eb4 · fe0f6eb4
Commit fe0f6eb4 authored 1 year ago by Yoann Schneider
--- a/dan/datasets/extract/__init__.py
+++ b/dan/datasets/extract/__init__.py
@@ -107,6 +107,7 @@ def add_extract_parser(subcommands) -> None:
            Do not give any arguments to keep the whole text.
        """,
        required=False,
+        default=list(map(validate_char, ("\n", " "))),
    )
    parser.add_argument(
        "--tokens",

--- a/dan/datasets/extract/extract.py
+++ b/dan/datasets/extract/extract.py
@@ -63,7 +63,7 @@ class ArkindexExtractor:
        element_type: List[str] = [],
        parent_element_type: str = None,
        output: Path = None,
-        entity_separators: list = [],
+        entity_separators: List[str] = ["\n", " "],
        tokens: Path = None,
        transcription_worker_version: Optional[Union[str, bool]] = None,
        entity_worker_version: Optional[Union[str, bool]] = None,
@@ -389,7 +389,7 @@ def run(
    element_type: List[str],
    parent_element_type: str,
    output: Path,
-    entity_separators: list,
+    entity_separators: List[str],
    tokens: Path,
    train_folder: UUID,
    val_folder: UUID,

--- a/docs/usage/datasets/extract.md
+++ b/docs/usage/datasets/extract.md
@@ -10,23 +10,23 @@ Use the `teklia-dan dataset extract` command to extract a dataset from an Arkind

 If an image download fails for whatever reason, it won't appear in the transcriptions file. The reason will be printed to stdout at the end of the process. Before trying to download the image, it checks that it wasn't downloaded previously. It is thus safe to run this command twice if a few images failed.

-| Parameter                        | Description                                                                                                                                                                                                                          | Type            | Default                              |
-| -------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------- | ------------------------------------ |
-| `database`                       | Path to an Arkindex export database in SQLite format.                                                                                                                                                                                | `Path`          |                                      |
-| `--element-type`                 | Type of the elements to extract. You may specify multiple types.                                                                                                                                                                     | `str`           |                                      |
-| `--parent-element-type`          | Type of the parent element containing the data.                                                                                                                                                                                      | `str`           | `page`                               |
-| `--output`                       | Folder where the data will be generated.                                                                                                                                                                                             | `Path`          |                                      |
-| `--entity-separators`            | Removes all text that does not appear in an entity or in the list of given ordered characters. If several separators follow each other, keep only the first to appear in the list. Do not give any arguments to keep the whole text. | `str`           | (see [dedicated section](#examples)) |
-| `--tokens`                       | Mapping between starting tokens and end tokens to extract text with their entities.                                                                                                                                                  | `Path`          |                                      |
-| `--train-folder`                 | ID of the training folder to import from Arkindex.                                                                                                                                                                                   | `uuid`          |                                      |
-| `--val-folder`                   | ID of the validation folder to import from Arkindex.                                                                                                                                                                                 | `uuid`          |                                      |
-| `--test-folder`                  | ID of the training folder to import from Arkindex.                                                                                                                                                                                   | `uuid`          |                                      |
-| `--transcription-worker-version` | Filter transcriptions by worker_version. Use `manual` for manual filtering.                                                                                                                                                          | `str` or `uuid` |                                      |
-| `--entity-worker-version`        | Filter transcriptions entities by worker_version. Use `manual` for manual filtering                                                                                                                                                  | `str` or `uuid` |                                      |
-| `--max-width`                    | Images larger than this width will be resized to this width.                                                                                                                                                                         | `int`           |                                      |
-| `--max-height`                   | Images larger than this height will be resized to this height.                                                                                                                                                                       | `int`           |                                      |
-| `--keep-spaces`                  | Transcriptions are trimmed by default. Use this flag to disable this behaviour.                                                                                                                                                      | `bool`          | False                                |
-| `--image-format`                 | Images will be saved under this format.                                                                                                                                                                                              | `str`           | `.jpg`                               |
+| Parameter                        | Description                                                                                                                                                                                                                          | Type            | Default                                            |
+| -------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------- | -------------------------------------------------- |
+| `database`                       | Path to an Arkindex export database in SQLite format.                                                                                                                                                                                | `Path`          |                                                    |
+| `--element-type`                 | Type of the elements to extract. You may specify multiple types.                                                                                                                                                                     | `str`           |                                                    |
+| `--parent-element-type`          | Type of the parent element containing the data.                                                                                                                                                                                      | `str`           | `page`                                             |
+| `--output`                       | Folder where the data will be generated.                                                                                                                                                                                             | `Path`          |                                                    |
+| `--entity-separators`            | Removes all text that does not appear in an entity or in the list of given ordered characters. If several separators follow each other, keep only the first to appear in the list. Do not give any arguments to keep the whole text. | `str`           | `["\n", " "]` (see [dedicated section](#examples)) |
+| `--tokens`                       | Mapping between starting tokens and end tokens to extract text with their entities.                                                                                                                                                  | `Path`          |                                                    |
+| `--train-folder`                 | ID of the training folder to import from Arkindex.                                                                                                                                                                                   | `uuid`          |                                                    |
+| `--val-folder`                   | ID of the validation folder to import from Arkindex.                                                                                                                                                                                 | `uuid`          |                                                    |
+| `--test-folder`                  | ID of the training folder to import from Arkindex.                                                                                                                                                                                   | `uuid`          |                                                    |
+| `--transcription-worker-version` | Filter transcriptions by worker_version. Use `manual` for manual filtering.                                                                                                                                                          | `str` or `uuid` |                                                    |
+| `--entity-worker-version`        | Filter transcriptions entities by worker_version. Use `manual` for manual filtering                                                                                                                                                  | `str` or `uuid` |                                                    |
+| `--max-width`                    | Images larger than this width will be resized to this width.                                                                                                                                                                         | `int`           |                                                    |
+| `--max-height`                   | Images larger than this height will be resized to this height.                                                                                                                                                                       | `int`           |                                                    |
+| `--keep-spaces`                  | Transcriptions are trimmed by default. Use this flag to disable this behaviour.                                                                                                                                                      | `bool`          | False                                              |
+| `--image-format`                 | Images will be saved under this format.                                                                                                                                                                                              | `str`           | `.jpg`                                             |

 The `--tokens` argument expects a YAML-formatted file with a specific format: a list of entries, each describing a NER entity. The label of the entity is the key to a dict mapping the starting and ending tokens, respectively.


--- a/tests/test_extract.py
+++ b/tests/test_extract.py
@@ -69,7 +69,7 @@ def test_insert_token(text, offset, length, expected):


 def test_reconstruct_text_no_end_token_error():
-    arkindex_extractor = ArkindexExtractor()
+    arkindex_extractor = ArkindexExtractor(entity_separators=[])
    arkindex_extractor.tokens = {
        "X": EntityType(start="ⓧ"),
    }